From 801b989af942e48a594c6896e4f5f2116a838afe Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Mon, 27 Apr 2026 14:57:16 +0200
Subject: [PATCH 01/54] fix: CR CRD, adding missing files (#764)
---
.../v1alpha1/zz_generated.deepcopy.go | 2 +-
.../crds/cortex.cloud_committedresources.yaml | 267 ++++++++++++++++++
2 files changed, 268 insertions(+), 1 deletion(-)
create mode 100644 helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
diff --git a/api/external/ironcore/v1alpha1/zz_generated.deepcopy.go b/api/external/ironcore/v1alpha1/zz_generated.deepcopy.go
index b6e6b7bce..e098f8bfb 100644
--- a/api/external/ironcore/v1alpha1/zz_generated.deepcopy.go
+++ b/api/external/ironcore/v1alpha1/zz_generated.deepcopy.go
@@ -9,7 +9,7 @@ package v1alpha1
import (
corev1alpha1 "github.com/ironcore-dev/ironcore/api/core/v1alpha1"
- runtime "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
diff --git a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
new file mode 100644
index 000000000..73cc8f9a2
--- /dev/null
+++ b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
@@ -0,0 +1,267 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+ annotations:
+ controller-gen.kubebuilder.io/version: v0.20.1
+ name: committedresources.cortex.cloud
+spec:
+ group: cortex.cloud
+ names:
+ kind: CommittedResource
+ listKind: CommittedResourceList
+ plural: committedresources
+ singular: committedresource
+ scope: Cluster
+ versions:
+ - additionalPrinterColumns:
+ - jsonPath: .spec.projectID
+ name: Project
+ type: string
+ - jsonPath: .spec.flavorGroupName
+ name: FlavorGroup
+ type: string
+ - jsonPath: .spec.resourceType
+ name: ResourceType
+ type: string
+ - jsonPath: .spec.availabilityZone
+ name: AZ
+ type: string
+ - jsonPath: .spec.amount
+ name: Amount
+ type: string
+ - jsonPath: .status.acceptedAmount
+ name: AcceptedAmount
+ type: string
+ - jsonPath: .status.usedAmount
+ name: UsedAmount
+ type: string
+ - jsonPath: .spec.state
+ name: State
+ type: string
+ - jsonPath: .status.conditions[?(@.type=='Ready')].status
+ name: Ready
+ type: string
+ - jsonPath: .spec.startTime
+ name: StartTime
+ priority: 1
+ type: date
+ - jsonPath: .spec.endTime
+ name: EndTime
+ priority: 1
+ type: date
+ name: v1alpha1
+ schema:
+ openAPIV3Schema:
+ description: CommittedResource is the Schema for the committedresources API
+ properties:
+ apiVersion:
+ description: |-
+ APIVersion defines the versioned schema of this representation of an object.
+ Servers should convert recognized schemas to the latest internal value, and
+ may reject unrecognized values.
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+ type: string
+ kind:
+ description: |-
+ Kind is a string value representing the REST resource this object represents.
+ Servers may infer this from the endpoint the client submits requests to.
+ Cannot be updated.
+ In CamelCase.
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+ type: string
+ metadata:
+ type: object
+ spec:
+      description: CommittedResourceSpec defines the desired state of CommittedResource.
+ properties:
+ amount:
+ anyOf:
+ - type: integer
+ - type: string
+ description: |-
+ Amount is the total committed quantity.
+ memory: MiB expressed in K8s binary SI notation (e.g. "1280Gi", "640Mi").
+ cores: integer core count (e.g. "40").
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+ x-kubernetes-int-or-string: true
+ availabilityZone:
+ description: AvailabilityZone specifies the availability zone for
+ this commitment.
+ type: string
+ commitmentUUID:
+ description: UUID of the commitment this resource corresponds to.
+ type: string
+ confirmedAt:
+ description: ConfirmedAt is when the commitment was confirmed.
+ format: date-time
+ type: string
+ domainID:
+ description: DomainID of the OpenStack domain this commitment belongs
+ to.
+ type: string
+ endTime:
+ description: EndTime is when Reservation slots expire. Nil for unbounded
+ commitments with no expiry.
+ format: date-time
+ type: string
+ flavorGroupName:
+ description: FlavorGroupName identifies the flavor group this commitment
+ targets, e.g. "kvm_v2_hana_s".
+ type: string
+ projectID:
+ description: ProjectID of the OpenStack project this commitment belongs
+ to.
+ type: string
+ resourceType:
+ description: 'ResourceType identifies the kind of resource committed:
+ memory drives Reservation slots; cores uses an arithmetic check
+ only.'
+ enum:
+ - memory
+ - cores
+ type: string
+ schedulingDomain:
+ description: SchedulingDomain specifies the scheduling domain for
+ this committed resource (e.g., "nova", "ironcore").
+ type: string
+ startTime:
+ description: |-
+ StartTime is the activation time for Reservation slots.
+ Nil for guaranteed commitments (slots are active from creation); set to ConfirmedAt for confirmed ones.
+ format: date-time
+ type: string
+ state:
+ description: State is the lifecycle state of the commitment.
+ enum:
+ - planned
+ - pending
+ - guaranteed
+ - confirmed
+ - superseded
+ - expired
+ type: string
+ required:
+ - amount
+ - availabilityZone
+ - commitmentUUID
+ - domainID
+ - flavorGroupName
+ - projectID
+ - resourceType
+ - state
+ type: object
+ status:
+ description: CommittedResourceStatus defines the observed state of CommittedResource.
+ properties:
+ acceptedAmount:
+ anyOf:
+ - type: integer
+ - type: string
+ description: |-
+ AcceptedAmount is the quantity the controller last successfully provisioned as Reservation slots.
+ Nil if the spec has never been successfully reconciled.
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+ x-kubernetes-int-or-string: true
+ acceptedAt:
+ description: AcceptedAt is when the controller last successfully reconciled
+ the spec into Reservation slots.
+ format: date-time
+ type: string
+ assignedVMs:
+ description: |-
+ AssignedVMs holds the UUIDs of VMs deterministically assigned to this committed resource.
+ Populated by the usage reconciler; used to compute UsedAmount and drive the quota controller.
+ items:
+ type: string
+ type: array
+ conditions:
+ description: Conditions holds the current status conditions.
+ items:
+ description: Condition contains details for one aspect of the current
+ state of this API Resource.
+ properties:
+ lastTransitionTime:
+ description: |-
+ lastTransitionTime is the last time the condition transitioned from one status to another.
+ This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
+ format: date-time
+ type: string
+ message:
+ description: |-
+ message is a human readable message indicating details about the transition.
+ This may be an empty string.
+ maxLength: 32768
+ type: string
+ observedGeneration:
+ description: |-
+ observedGeneration represents the .metadata.generation that the condition was set based upon.
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+ with respect to the current state of the instance.
+ format: int64
+ minimum: 0
+ type: integer
+ reason:
+ description: |-
+ reason contains a programmatic identifier indicating the reason for the condition's last transition.
+ Producers of specific condition types may define expected values and meanings for this field,
+ and whether the values are considered a guaranteed API.
+ The value should be a CamelCase string.
+ This field may not be empty.
+ maxLength: 1024
+ minLength: 1
+ pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+ type: string
+ status:
+ description: status of the condition, one of True, False, Unknown.
+ enum:
+ - "True"
+ - "False"
+ - Unknown
+ type: string
+ type:
+ description: type of condition in CamelCase or in foo.example.com/CamelCase.
+ maxLength: 316
+ pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+ type: string
+ required:
+ - lastTransitionTime
+ - message
+ - reason
+ - status
+ - type
+ type: object
+ type: array
+ lastChanged:
+ description: |-
+ LastChanged is when the spec was last written by the syncer.
+ When AcceptedAt is older than LastChanged, the controller has pending work.
+ format: date-time
+ type: string
+ lastReconcileAt:
+ description: LastReconcileAt is when the controller last ran its reconcile
+ loop for this resource.
+ format: date-time
+ type: string
+ lastUsageReconcileAt:
+ description: LastUsageReconcileAt is when the usage reconciler last
+ updated AssignedVMs and UsedAmount.
+ format: date-time
+ type: string
+ usedAmount:
+ anyOf:
+ - type: integer
+ - type: string
+ description: |-
+ UsedAmount is the sum of assigned VM resources expressed in the same units as Spec.Amount.
+ Populated by the usage reconciler.
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+ x-kubernetes-int-or-string: true
+ type: object
+ required:
+ - spec
+ type: object
+ served: true
+ storage: true
+ subresources:
+ status: {}
From 1ee94125fd1961ccbe1306eccad33e165f13b498 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 27 Apr 2026 15:06:10 +0200
Subject: [PATCH 02/54] Ensure only complete scaffolds are committed
---
.github/workflows/lint.yaml | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index b0bd45e41..41f83e1c3 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -21,6 +21,20 @@ jobs:
uses: actions/setup-go@v6
with:
go-version-file: 'go.mod'
+ - name: Run make crds deepcopy lint-fix
+ run: make crds deepcopy lint-fix
+ - name: Check for diff
+ run: |
+ set -e
+ if ! git diff --exit-code; then
+ echo "::error::Generated files are out of date. Run 'make crds deepcopy lint-fix' and commit the changes."
+ exit 1
+ fi
+ if git ls-files --others --exclude-standard | grep -q .; then
+ echo "::error::Untracked files after generation:"
+ git ls-files --others --exclude-standard
+ exit 1
+ fi
- name: Run golangci-lint
run: |
set -eux
From f54cd16b7ab502a1769fcb9e5c6d5e91d6499253 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 27 Apr 2026 13:15:39 +0000
Subject: [PATCH 03/54] Bump cortex chart appVersions to sha-1ee94125 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 84bf46232..d00071154 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-28311dec"
+appVersion: "sha-1ee94125"
icon: "https://example.com/icon.png"
dependencies: []
From 92eb783cc3c0c0c50ac1a2ead80364c71e6b6918 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Tue, 28 Apr 2026 16:22:31 +0200
Subject: [PATCH 04/54] Exercise all three feature modes in placement shim e2e
tests via header override (#767)
The middleware now reads X-Cortex-Feature-Mode from every request and,
when valid, stores it in context. featureModeFromConfOrHeader resolves
the effective mode per handler (override if present, configured default
otherwise). All handlers use this helper instead of reading config
directly.
Each e2e test is wrapped with e2eWrapWithModes which iterates
passthrough, hybrid, and crd. A custom RoundTripper (e2eModeTransport)
auto-injects the header from context so individual tests need no
changes. Passthrough-only endpoints probe for 501 via
e2eProbeUnimplemented and skip gracefully. Tests that require CRD
infrastructure (traits CRUD, RP traits writes) gate on the configured
mode. The resourceLocker is now always initialized to avoid nil panics
from mode overrides.
---
.../handle_allocation_candidates_e2e.go | 11 +-
.../shim/placement/handle_allocations_e2e.go | 11 +-
.../shim/placement/handle_reshaper_e2e.go | 11 +-
.../placement/handle_resource_classes_e2e.go | 11 +-
...handle_resource_provider_aggregates_e2e.go | 11 +-
...andle_resource_provider_allocations_e2e.go | 11 +-
...andle_resource_provider_inventories_e2e.go | 11 +-
.../handle_resource_provider_traits.go | 11 +-
.../handle_resource_provider_traits_e2e.go | 11 +-
.../handle_resource_provider_usages_e2e.go | 11 +-
.../placement/handle_resource_providers.go | 22 +--
.../handle_resource_providers_e2e.go | 20 ++-
internal/shim/placement/handle_root.go | 2 +-
internal/shim/placement/handle_root_e2e.go | 2 +-
internal/shim/placement/handle_traits.go | 8 +-
internal/shim/placement/handle_traits_e2e.go | 93 +++++++-----
internal/shim/placement/handle_usages_e2e.go | 11 +-
internal/shim/placement/shim.go | 49 +++++--
internal/shim/placement/shim_e2e.go | 101 ++++++++++++-
internal/shim/placement/shim_io.go | 8 ++
internal/shim/placement/shim_test.go | 134 ++++++++++++++++++
21 files changed, 475 insertions(+), 85 deletions(-)
diff --git a/internal/shim/placement/handle_allocation_candidates_e2e.go b/internal/shim/placement/handle_allocation_candidates_e2e.go
index e90193224..c2dd4aacd 100644
--- a/internal/shim/placement/handle_allocation_candidates_e2e.go
+++ b/internal/shim/placement/handle_allocation_candidates_e2e.go
@@ -47,6 +47,15 @@ func e2eTestAllocationCandidates(ctx context.Context, _ client.Client) error {
const testRC = "CUSTOM_CORTEX_E2E_CAND_RC"
const apiVersion = "placement 1.26"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/allocation_candidates?resources=VCPU:1")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete leftover test resources from a prior run.
log.Info("Pre-cleanup: deleting leftover test resources")
for _, cleanup := range []struct {
@@ -296,5 +305,5 @@ func e2eTestAllocationCandidates(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "allocation_candidates", run: e2eTestAllocationCandidates})
+ e2eTests = append(e2eTests, e2eTest{name: "allocation_candidates", run: e2eWrapWithModes(e2eTestAllocationCandidates)})
}
diff --git a/internal/shim/placement/handle_allocations_e2e.go b/internal/shim/placement/handle_allocations_e2e.go
index 7f09a507b..27887ca7f 100644
--- a/internal/shim/placement/handle_allocations_e2e.go
+++ b/internal/shim/placement/handle_allocations_e2e.go
@@ -56,6 +56,15 @@ func e2eTestAllocations(ctx context.Context, _ client.Client) error {
const userID = "e2e50000-0000-0000-0000-000000000001"
const apiVersion = "placement 1.28"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/allocations/"+consumerUUID1)
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete allocations, resource provider, and resource class.
log.Info("Pre-cleanup: deleting leftover test resources")
for _, cleanup := range []struct {
@@ -476,5 +485,5 @@ func e2eTestAllocations(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "allocations", run: e2eTestAllocations})
+ e2eTests = append(e2eTests, e2eTest{name: "allocations", run: e2eWrapWithModes(e2eTestAllocations)})
}
diff --git a/internal/shim/placement/handle_reshaper_e2e.go b/internal/shim/placement/handle_reshaper_e2e.go
index f43809de4..29f84be34 100644
--- a/internal/shim/placement/handle_reshaper_e2e.go
+++ b/internal/shim/placement/handle_reshaper_e2e.go
@@ -57,6 +57,15 @@ func e2eTestReshaper(ctx context.Context, _ client.Client) error {
const userID = "e2e50000-0000-0000-0000-000000000001"
const apiVersion = "placement 1.30"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/allocations/"+consumerUUID)
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete allocation, both RPs, and custom resource class.
log.Info("Pre-cleanup: deleting leftover test resources")
for _, cleanup := range []struct {
@@ -571,5 +580,5 @@ func e2eTestReshaper(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "reshaper", run: e2eTestReshaper})
+ e2eTests = append(e2eTests, e2eTest{name: "reshaper", run: e2eWrapWithModes(e2eTestReshaper)})
}
diff --git a/internal/shim/placement/handle_resource_classes_e2e.go b/internal/shim/placement/handle_resource_classes_e2e.go
index 360e1ef80..e848ee034 100644
--- a/internal/shim/placement/handle_resource_classes_e2e.go
+++ b/internal/shim/placement/handle_resource_classes_e2e.go
@@ -42,6 +42,15 @@ func e2eTestResourceClasses(ctx context.Context, _ client.Client) error {
const testRC = "CUSTOM_CORTEX_E2E_RC"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_classes")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete any leftover test resource class from a prior run.
log.Info("Pre-cleanup: deleting leftover test resource class", "class", testRC)
req, err := http.NewRequestWithContext(ctx,
@@ -226,5 +235,5 @@ func e2eTestResourceClasses(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_classes", run: e2eTestResourceClasses})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_classes", run: e2eWrapWithModes(e2eTestResourceClasses)})
}
diff --git a/internal/shim/placement/handle_resource_provider_aggregates_e2e.go b/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
index b673c75f6..7eb6ba089 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
@@ -48,6 +48,15 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
const testAggUUID1 = "e2e30000-0000-0000-0000-000000000001"
const testAggUUID2 = "e2e30000-0000-0000-0000-000000000002"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete any leftover test resource provider from a prior run.
log.Info("Pre-cleanup: deleting leftover test resource provider", "uuid", testRPUUID)
req, err := http.NewRequestWithContext(ctx,
@@ -346,5 +355,5 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_provider_aggregates", run: e2eTestResourceProviderAggregates})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_provider_aggregates", run: e2eWrapWithModes(e2eTestResourceProviderAggregates)})
}
diff --git a/internal/shim/placement/handle_resource_provider_allocations_e2e.go b/internal/shim/placement/handle_resource_provider_allocations_e2e.go
index a63c8cb4f..aea84ec06 100644
--- a/internal/shim/placement/handle_resource_provider_allocations_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_allocations_e2e.go
@@ -44,6 +44,15 @@ func e2eTestResourceProviderAllocations(ctx context.Context, _ client.Client) er
const testRPUUID = "e2e10000-0000-0000-0000-000000000006"
const testRPName = "cortex-e2e-test-rp-alloc-view"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_providers/"+testRPUUID+"/allocations")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete any leftover test resource provider from a prior run.
log.Info("Pre-cleanup: deleting leftover test resource provider", "uuid", testRPUUID)
req, err := http.NewRequestWithContext(ctx,
@@ -227,5 +236,5 @@ func e2eTestResourceProviderAllocations(ctx context.Context, _ client.Client) er
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_provider_allocations", run: e2eTestResourceProviderAllocations})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_provider_allocations", run: e2eWrapWithModes(e2eTestResourceProviderAllocations)})
}
diff --git a/internal/shim/placement/handle_resource_provider_inventories_e2e.go b/internal/shim/placement/handle_resource_provider_inventories_e2e.go
index 354460e81..1462ca87b 100644
--- a/internal/shim/placement/handle_resource_provider_inventories_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_inventories_e2e.go
@@ -53,6 +53,15 @@ func e2eTestResourceProviderInventories(ctx context.Context, _ client.Client) er
const testRC = "CUSTOM_CORTEX_E2E_INV_RC"
const apiVersion = "placement 1.26"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_providers/"+testRPUUID+"/inventories")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete the resource provider (cascades inventories), then
// the custom resource class. Ignore 404/409.
log.Info("Pre-cleanup: deleting leftover test resources")
@@ -488,5 +497,5 @@ func e2eTestResourceProviderInventories(ctx context.Context, _ client.Client) er
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_provider_inventories", run: e2eTestResourceProviderInventories})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_provider_inventories", run: e2eWrapWithModes(e2eTestResourceProviderInventories)})
}
diff --git a/internal/shim/placement/handle_resource_provider_traits.go b/internal/shim/placement/handle_resource_provider_traits.go
index 16978a593..b23ac8e59 100644
--- a/internal/shim/placement/handle_resource_provider_traits.go
+++ b/internal/shim/placement/handle_resource_provider_traits.go
@@ -4,7 +4,6 @@
package placement
import (
- "fmt"
"net/http"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -33,7 +32,7 @@ func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.R
if !ok {
return
}
- switch s.config.Features.ResourceProviderTraits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
@@ -92,13 +91,13 @@ func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http
if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
return
}
- switch s.config.Features.ResourceProviderTraits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
s.forward(w, r)
case FeatureModeCRD:
- http.Error(w, fmt.Sprintf("%s mode is not yet implemented for resource provider trait writes", s.config.Features.ResourceProviderTraits), http.StatusNotImplemented)
+ http.Error(w, "crd mode is not yet implemented for resource provider trait writes", http.StatusNotImplemented)
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
}
@@ -117,13 +116,13 @@ func (s *Shim) HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http
if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
return
}
- switch s.config.Features.ResourceProviderTraits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
s.forward(w, r)
case FeatureModeCRD:
- http.Error(w, fmt.Sprintf("%s mode is not yet implemented for resource provider trait writes", s.config.Features.ResourceProviderTraits), http.StatusNotImplemented)
+ http.Error(w, "crd mode is not yet implemented for resource provider trait writes", http.StatusNotImplemented)
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
}
diff --git a/internal/shim/placement/handle_resource_provider_traits_e2e.go b/internal/shim/placement/handle_resource_provider_traits_e2e.go
index c697ca7ff..4acd665b0 100644
--- a/internal/shim/placement/handle_resource_provider_traits_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_traits_e2e.go
@@ -42,6 +42,15 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
}
log.Info("Successfully created openstack client for resource provider traits e2e test")
+ // Resource provider trait writes (PUT/DELETE) are not yet implemented in
+ // crd mode, and the test RP created via POST won't exist as a Hypervisor
+ // CRD either, so skip the entire test in crd mode.
+ rpTraitsMode := e2eCurrentMode(ctx)
+ if rpTraitsMode == FeatureModeCRD {
+ log.Info("Skipping resource provider traits e2e test because mode is crd (writes not implemented)")
+ return nil
+ }
+
const testRPUUID = "e2e10000-0000-0000-0000-000000000003"
const testRPName = "cortex-e2e-test-rp-traits"
const testTrait = "CUSTOM_CORTEX_E2E_RP_TRAIT"
@@ -382,5 +391,5 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_provider_traits", run: e2eTestResourceProviderTraits})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_provider_traits", run: e2eWrapWithModes(e2eTestResourceProviderTraits)})
}
diff --git a/internal/shim/placement/handle_resource_provider_usages_e2e.go b/internal/shim/placement/handle_resource_provider_usages_e2e.go
index c548162ba..05965b23c 100644
--- a/internal/shim/placement/handle_resource_provider_usages_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_usages_e2e.go
@@ -44,6 +44,15 @@ func e2eTestResourceProviderUsages(ctx context.Context, _ client.Client) error {
const testRPUUID = "e2e10000-0000-0000-0000-000000000005"
const testRPName = "cortex-e2e-test-rp-usages"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_providers/"+testRPUUID+"/usages")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Pre-cleanup: delete any leftover test resource provider from a prior run.
log.Info("Pre-cleanup: deleting leftover test resource provider", "uuid", testRPUUID)
req, err := http.NewRequestWithContext(ctx,
@@ -227,5 +236,5 @@ func e2eTestResourceProviderUsages(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_provider_usages", run: e2eTestResourceProviderUsages})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_provider_usages", run: e2eWrapWithModes(e2eTestResourceProviderUsages)})
}
diff --git a/internal/shim/placement/handle_resource_providers.go b/internal/shim/placement/handle_resource_providers.go
index 7d4dd1fff..04955fd72 100644
--- a/internal/shim/placement/handle_resource_providers.go
+++ b/internal/shim/placement/handle_resource_providers.go
@@ -115,7 +115,8 @@ func (s *Shim) HandleCreateResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.ResourceProviders.orDefault() {
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ switch mode {
case FeatureModePassthrough:
s.forward(w, r)
return
@@ -184,7 +185,7 @@ func (s *Shim) HandleCreateResourceProvider(w http.ResponseWriter, r *http.Reque
}
// No conflict — forward to upstream placement (hybrid) or reject (crd).
- if s.config.Features.ResourceProviders.orDefault() == FeatureModeCRD {
+ if mode == FeatureModeCRD {
log.Info("crd mode: non-kvm resource provider create not supported", "name", req.Name)
http.Error(w, "resource provider not found", http.StatusNotFound)
return
@@ -209,7 +210,8 @@ func (s *Shim) HandleShowResourceProvider(w http.ResponseWriter, r *http.Request
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.ResourceProviders.orDefault() {
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ switch mode {
case FeatureModePassthrough:
s.forward(w, r)
return
@@ -229,7 +231,7 @@ func (s *Shim) HandleShowResourceProvider(w http.ResponseWriter, r *http.Request
var hvs hv1.HypervisorList
err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
- if s.config.Features.ResourceProviders.orDefault() == FeatureModeCRD {
+ if mode == FeatureModeCRD {
log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
http.Error(w, "resource provider not found", http.StatusNotFound)
return
@@ -278,7 +280,8 @@ func (s *Shim) HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.ResourceProviders.orDefault() {
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ switch mode {
case FeatureModePassthrough:
s.forward(w, r)
return
@@ -315,7 +318,7 @@ func (s *Shim) HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Reque
var hvs hv1.HypervisorList
err = s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
- if s.config.Features.ResourceProviders.orDefault() == FeatureModeCRD {
+ if mode == FeatureModeCRD {
log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
http.Error(w, "resource provider not found", http.StatusNotFound)
return
@@ -373,7 +376,8 @@ func (s *Shim) HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.ResourceProviders.orDefault() {
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ switch mode {
case FeatureModePassthrough:
s.forward(w, r)
return
@@ -393,7 +397,7 @@ func (s *Shim) HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Reque
var hvs hv1.HypervisorList
err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
- if s.config.Features.ResourceProviders.orDefault() == FeatureModeCRD {
+ if mode == FeatureModeCRD {
log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
http.Error(w, "resource provider not found", http.StatusNotFound)
return
@@ -448,7 +452,7 @@ type listResourceProvidersResponse struct {
//
// See: https://docs.openstack.org/api-ref/placement/#list-resource-providers
func (s *Shim) HandleListResourceProviders(w http.ResponseWriter, r *http.Request) {
- switch s.config.Features.ResourceProviders.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
diff --git a/internal/shim/placement/handle_resource_providers_e2e.go b/internal/shim/placement/handle_resource_providers_e2e.go
index 90b850369..2faca2fc2 100644
--- a/internal/shim/placement/handle_resource_providers_e2e.go
+++ b/internal/shim/placement/handle_resource_providers_e2e.go
@@ -58,14 +58,24 @@ func e2eTestResourceProviders(ctx context.Context, cl client.Client) error {
// ==================== Phase 1: VMware path ====================
- log.Info("=== VMware path: passthrough resource provider tests ===")
- if err := e2eVMwareResourceProviders(ctx, sc); err != nil {
- return fmt.Errorf("VMware path: %w", err)
+ // The VMware path creates synthetic test RPs against upstream placement.
+ // In crd mode there is no upstream, so skip it.
+ mode := e2eCurrentMode(ctx)
+ if mode == "" {
+ mode = config.Features.ResourceProviders.orDefault()
+ }
+ if mode != FeatureModeCRD {
+ log.Info("=== VMware path: passthrough resource provider tests ===")
+ if err := e2eVMwareResourceProviders(ctx, sc); err != nil {
+ return fmt.Errorf("VMware path: %w", err)
+ }
+ } else {
+ log.Info("Skipping VMware path because mode is crd (no upstream placement)")
}
// ==================== Phase 2: KVM path ====================
- if config.Features.ResourceProviders.orDefault() == FeatureModePassthrough {
+ if mode == FeatureModePassthrough {
log.Info("Skipping KVM resource provider e2e tests because resourceProviders mode is passthrough")
} else {
log.Info("=== KVM path: hypervisor-backed resource provider tests ===")
@@ -506,5 +516,5 @@ func e2eKVMResourceProviders(ctx context.Context, sc *gophercloud.ServiceClient,
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "resource_providers", run: e2eTestResourceProviders})
+ e2eTests = append(e2eTests, e2eTest{name: "resource_providers", run: e2eWrapWithModes(e2eTestResourceProviders)})
}
diff --git a/internal/shim/placement/handle_root.go b/internal/shim/placement/handle_root.go
index acad69dcb..9f9b510e7 100644
--- a/internal/shim/placement/handle_root.go
+++ b/internal/shim/placement/handle_root.go
@@ -50,7 +50,7 @@ func (s *Shim) HandleGetRoot(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.Root.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Root) {
case FeatureModePassthrough:
log.Info("forwarding GET / to upstream placement")
s.forward(w, r)
diff --git a/internal/shim/placement/handle_root_e2e.go b/internal/shim/placement/handle_root_e2e.go
index 2e558705a..e4a785606 100644
--- a/internal/shim/placement/handle_root_e2e.go
+++ b/internal/shim/placement/handle_root_e2e.go
@@ -51,5 +51,5 @@ func e2eTestGetRoot(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "root", run: e2eTestGetRoot})
+ e2eTests = append(e2eTests, e2eTest{name: "root", run: e2eWrapWithModes(e2eTestGetRoot)})
}
diff --git a/internal/shim/placement/handle_traits.go b/internal/shim/placement/handle_traits.go
index b509b2599..429b87cfc 100644
--- a/internal/shim/placement/handle_traits.go
+++ b/internal/shim/placement/handle_traits.go
@@ -62,7 +62,7 @@ func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.Traits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -132,7 +132,7 @@ func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.Traits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -174,7 +174,7 @@ func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.Traits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -298,7 +298,7 @@ func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.config.Features.Traits.orDefault() {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
diff --git a/internal/shim/placement/handle_traits_e2e.go b/internal/shim/placement/handle_traits_e2e.go
index 8a904b935..4a5831f72 100644
--- a/internal/shim/placement/handle_traits_e2e.go
+++ b/internal/shim/placement/handle_traits_e2e.go
@@ -83,7 +83,11 @@ func e2eTestTraits(ctx context.Context, _ client.Client) error {
// When traits are served locally (hybrid or crd mode) the static list may
// be empty. Only require at least one trait when forwarding to upstream
// placement, which always has standard traits.
- if config.Features.Traits.orDefault() == FeatureModePassthrough && len(listResp.Traits) == 0 {
+ traitsMode := e2eCurrentMode(ctx)
+ if traitsMode == "" {
+ traitsMode = config.Features.Traits.orDefault()
+ }
+ if traitsMode == FeatureModePassthrough && len(listResp.Traits) == 0 {
return errors.New("GET /traits: expected at least one trait, got 0")
}
log.Info("Successfully retrieved traits", "count", len(listResp.Traits))
@@ -133,8 +137,13 @@ func e2eTestTraits(ctx context.Context, _ client.Client) error {
// ==================== Phase 2: CRUD tests (feature-gated) ====================
- if config.Features.Traits.orDefault() == FeatureModePassthrough {
- log.Info("Skipping trait CRUD e2e tests because traits mode is passthrough")
+ // CRUD tests require traits ConfigMaps which are only created when the
+ // configured traits mode is hybrid or crd. The override header changes
+ // handler routing but cannot create ConfigMaps that don't exist.
+ configuredTraitsMode := config.Features.Traits.orDefault()
+ if traitsMode == FeatureModePassthrough || configuredTraitsMode == FeatureModePassthrough {
+ log.Info("Skipping trait CRUD e2e tests",
+ "overrideMode", traitsMode, "configuredMode", configuredTraitsMode)
return nil
}
@@ -287,47 +296,53 @@ func e2eTestTraits(ctx context.Context, _ client.Client) error {
}
log.Info("Verified test trait was deleted", "trait", testTrait)
- // Test PUT /traits/{name} with bad prefix → 400.
- log.Info("Testing PUT /traits/{name} with non-CUSTOM_ prefix")
- req, err = http.NewRequestWithContext(ctx,
- http.MethodPut, sc.Endpoint+"/traits/HW_CORTEX_E2E_BAD", http.NoBody)
- if err != nil {
- return fmt.Errorf("failed to create bad-prefix PUT request: %w", err)
- }
- req.Header.Set("X-Auth-Token", sc.TokenID)
- req.Header.Set("OpenStack-API-Version", "placement 1.6")
- resp, err = sc.HTTPClient.Do(req)
- if err != nil {
- return fmt.Errorf("failed to send bad-prefix PUT request: %w", err)
- }
- defer resp.Body.Close()
- if resp.StatusCode != http.StatusBadRequest {
- return fmt.Errorf("PUT /traits/HW_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
- }
- log.Info("Correctly received 400 for PUT with non-CUSTOM_ prefix")
+ // Bad-prefix validation is only enforced by the shim in crd mode.
+ // In hybrid mode, writes forward to upstream which has different behavior.
+ if traitsMode == FeatureModeCRD {
+ // Test PUT /traits/{name} with bad prefix → 400.
+ log.Info("Testing PUT /traits/{name} with non-CUSTOM_ prefix")
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodPut, sc.Endpoint+"/traits/HW_CORTEX_E2E_BAD", http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create bad-prefix PUT request: %w", err)
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return fmt.Errorf("failed to send bad-prefix PUT request: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusBadRequest {
+ return fmt.Errorf("PUT /traits/HW_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
+ }
+ log.Info("Correctly received 400 for PUT with non-CUSTOM_ prefix")
- // Test DELETE /traits/{name} with bad prefix → 400.
- log.Info("Testing DELETE /traits/{name} with non-CUSTOM_ prefix")
- req, err = http.NewRequestWithContext(ctx,
- http.MethodDelete, sc.Endpoint+"/traits/HW_CORTEX_E2E_BAD", http.NoBody)
- if err != nil {
- return fmt.Errorf("failed to create bad-prefix DELETE request: %w", err)
- }
- req.Header.Set("X-Auth-Token", sc.TokenID)
- req.Header.Set("OpenStack-API-Version", "placement 1.6")
- resp, err = sc.HTTPClient.Do(req)
- if err != nil {
- return fmt.Errorf("failed to send bad-prefix DELETE request: %w", err)
- }
- defer resp.Body.Close()
- if resp.StatusCode != http.StatusBadRequest {
- return fmt.Errorf("DELETE /traits/HW_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
+ // Test DELETE /traits/{name} with bad prefix → 400.
+ log.Info("Testing DELETE /traits/{name} with non-CUSTOM_ prefix")
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodDelete, sc.Endpoint+"/traits/HW_CORTEX_E2E_BAD", http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create bad-prefix DELETE request: %w", err)
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return fmt.Errorf("failed to send bad-prefix DELETE request: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusBadRequest {
+ return fmt.Errorf("DELETE /traits/HW_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
+ }
+ log.Info("Correctly received 400 for DELETE with non-CUSTOM_ prefix")
+ } else {
+ log.Info("Skipping bad-prefix validation tests (only enforced in crd mode)")
}
- log.Info("Correctly received 400 for DELETE with non-CUSTOM_ prefix")
return nil
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "traits", run: e2eTestTraits})
+ e2eTests = append(e2eTests, e2eTest{name: "traits", run: e2eWrapWithModes(e2eTestTraits)})
}
diff --git a/internal/shim/placement/handle_usages_e2e.go b/internal/shim/placement/handle_usages_e2e.go
index c7ac8c965..66f5c40a9 100644
--- a/internal/shim/placement/handle_usages_e2e.go
+++ b/internal/shim/placement/handle_usages_e2e.go
@@ -39,6 +39,15 @@ func e2eTestUsages(ctx context.Context, _ client.Client) error {
const apiVersion = "placement 1.9"
+ // Probe: for non-passthrough modes, verify endpoint returns 501.
+ unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/usages?project_id=test")
+ if err != nil {
+ return fmt.Errorf("probe: %w", err)
+ }
+ if unimplemented {
+ return nil
+ }
+
// Get the list of projects from the identity service, so that we can test
// the /usages endpoint with a valid project id.
log.Info("Getting list of projects from identity service for usages e2e test")
@@ -113,5 +122,5 @@ func e2eTestUsages(ctx context.Context, _ client.Client) error {
}
func init() {
- e2eTests = append(e2eTests, e2eTest{name: "usages", run: e2eTestUsages})
+ e2eTests = append(e2eTests, e2eTest{name: "usages", run: e2eWrapWithModes(e2eTestUsages)})
}
diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go
index 273273983..b46546b63 100644
--- a/internal/shim/placement/shim.go
+++ b/internal/shim/placement/shim.go
@@ -52,6 +52,18 @@ type requestIDContextKey struct{}
// header value through the request lifecycle for tracing.
var requestIDKey = requestIDContextKey{}
+// featureModeOverrideContextKey is a separate type for the per-request feature
+// mode override injected via the X-Cortex-Feature-Mode header.
+type featureModeOverrideContextKey struct{}
+
+// featureModeOverrideKey is the context key used to propagate the feature mode
+// override from the middleware to handlers.
+var featureModeOverrideKey = featureModeOverrideContextKey{}
+
+// headerFeatureModeOverride is the HTTP header that allows e2e tests to
+// override the configured feature mode on a per-request basis.
+const headerFeatureModeOverride = "X-Cortex-Feature-Mode"
+
// FeatureMode controls how an endpoint group interacts with upstream
// placement and the hypervisor CRD.
type FeatureMode string
@@ -90,16 +102,38 @@ func (m FeatureMode) valid() bool {
// dispatchPassthroughOnly forwards in passthrough mode, returns 501 for
// hybrid/crd, and 500 for unknown modes.
func (s *Shim) dispatchPassthroughOnly(w http.ResponseWriter, r *http.Request, mode FeatureMode) {
- switch mode.orDefault() {
+ resolved := s.featureModeFromConfOrHeader(r, mode)
+ switch resolved {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid, FeatureModeCRD:
- http.Error(w, fmt.Sprintf("%s mode is not yet implemented for this endpoint", mode), http.StatusNotImplemented)
+ http.Error(w, fmt.Sprintf("%s mode is not yet implemented for this endpoint", resolved), http.StatusNotImplemented)
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
}
}
+// featureModeFromConfOrHeader returns the effective feature mode for the
+// current request. If a valid override is present in the request context
+// (injected by wrapHandler from the X-Cortex-Feature-Mode header), the
+// override takes precedence — unless it resolves to hybrid or crd while
+// neither the Versioning nor the Traits backing config is present (i.e. no
+// backing config was validated at startup). In that case the override is
+// ignored and the configured default is returned.
+func (s *Shim) featureModeFromConfOrHeader(r *http.Request, configured FeatureMode) FeatureMode {
+ override, ok := r.Context().Value(featureModeOverrideKey).(FeatureMode)
+ if !ok {
+ return configured.orDefault()
+ }
+ resolved := override.orDefault()
+ if resolved == FeatureModeHybrid || resolved == FeatureModeCRD {
+ if s.config.Versioning == nil && s.config.Traits == nil {
+ return configured.orDefault()
+ }
+ }
+ return resolved
+}
+
// featuresConfig controls the feature mode for each endpoint group.
// Every field defaults to passthrough (zero value) when omitted.
type featuresConfig struct {
@@ -472,13 +506,10 @@ func (s *Shim) SetupWithManager(ctx context.Context, mgr ctrl.Manager) (err erro
Buckets: prometheus.DefBuckets,
}, []string{"method", "pattern", "responsecode"})
- traitsMode := s.config.Features.Traits.orDefault()
- if traitsMode == FeatureModeHybrid || traitsMode == FeatureModeCRD {
- s.resourceLocker = resourcelock.NewResourceLocker(
- s.Client,
- os.Getenv("POD_NAMESPACE"),
- )
- }
+ s.resourceLocker = resourcelock.NewResourceLocker(
+ s.Client,
+ os.Getenv("POD_NAMESPACE"),
+ )
// Check that the provided client is a multicluster client, since we need
// that to watch for hypervisors across clusters.
diff --git a/internal/shim/placement/shim_e2e.go b/internal/shim/placement/shim_e2e.go
index e7f9e30f3..d839751a5 100644
--- a/internal/shim/placement/shim_e2e.go
+++ b/internal/shim/placement/shim_e2e.go
@@ -66,7 +66,7 @@ func makeE2EServiceClient(ctx context.Context, rc e2eRootConfig) (*gophercloud.S
log.Info("No SSO config provided, using plain transport for placement API")
transport = &http.Transport{}
}
- provider.HTTPClient.Transport = transport
+ provider.HTTPClient.Transport = &e2eModeTransport{base: transport}
if err := openstack.Authenticate(ctx, provider, authOpts); err != nil {
log.Error(err, "Failed to authenticate with keystone")
return nil, fmt.Errorf("failed to authenticate with keystone: %w", err)
@@ -88,6 +88,105 @@ type e2eTest struct {
// e2eTests is populated by init() functions in the handle_*_e2e.go files.
var e2eTests []e2eTest
+// e2eAllModes is the list of feature modes exercised by e2e tests when
+// AllowModeOverride is enabled.
+var e2eAllModes = []FeatureMode{
+ FeatureModePassthrough,
+ FeatureModeHybrid,
+ FeatureModeCRD,
+}
+
+// setFeatureModeHeader sets the X-Cortex-Feature-Mode override header on the
+// request so the shim dispatches to the specified mode regardless of its
+// configured mode.
+func setFeatureModeHeader(req *http.Request, mode FeatureMode) {
+ if mode != "" {
+ req.Header.Set(headerFeatureModeOverride, string(mode))
+ }
+}
+
+// e2eModeContextKey is used to pass the current test mode through context.
+type e2eModeContextKey struct{}
+
+// e2eCurrentMode retrieves the feature mode from context (set by
+// e2eWrapWithModes). Returns empty string if not set.
+func e2eCurrentMode(ctx context.Context) FeatureMode {
+ if m, ok := ctx.Value(e2eModeContextKey{}).(FeatureMode); ok {
+ return m
+ }
+ return ""
+}
+
+// e2eWrapWithModes returns a test function that iterates over all feature
+// modes. For each mode it injects the mode into context (retrievable via
+// e2eCurrentMode) so that the e2eModeTransport sets the override header on
+// every outgoing request.
+func e2eWrapWithModes(fn func(ctx context.Context, cl client.Client) error) func(ctx context.Context, cl client.Client) error {
+ return func(ctx context.Context, cl client.Client) error {
+ log := logf.FromContext(ctx)
+ for _, mode := range e2eAllModes {
+ modeLog := log.WithName(string(mode))
+ modeCtx := context.WithValue(ctx, e2eModeContextKey{}, mode)
+ modeCtx = logf.IntoContext(modeCtx, modeLog)
+ modeLog.Info("Starting mode")
+ if err := fn(modeCtx, cl); err != nil {
+ return fmt.Errorf("mode %s: %w", mode, err)
+ }
+ modeLog.Info("Mode passed")
+ }
+ return nil
+ }
+}
+
+// e2eProbeUnimplemented sends a single GET request with the mode override
+// header to verify the endpoint returns 501 Not Implemented. Returns true if
+// the endpoint is unimplemented for this mode (test should skip). Returns
+// false if the endpoint returned a success status (test should continue).
+// Returns an error for unexpected status codes (4xx/5xx other than 501).
+func e2eProbeUnimplemented(ctx context.Context, sc *gophercloud.ServiceClient, probeURL string) (bool, error) {
+ log := logf.FromContext(ctx)
+ mode := e2eCurrentMode(ctx)
+ if mode == "" || mode == FeatureModePassthrough {
+ return false, nil
+ }
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, http.NoBody)
+ if err != nil {
+ return false, err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ setFeatureModeHeader(req, mode)
+ resp, err := sc.HTTPClient.Do(req)
+ if err != nil {
+ return false, err
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode == http.StatusNotImplemented {
+ log.Info("Endpoint correctly returns 501 for unimplemented mode", "mode", mode)
+ return true, nil
+ }
+ if resp.StatusCode >= http.StatusBadRequest {
+ return false, fmt.Errorf("probe %s in mode %s returned unexpected status %d", probeURL, mode, resp.StatusCode)
+ }
+ return false, nil
+}
+
+// e2eModeTransport wraps an http.RoundTripper to automatically inject the
+// X-Cortex-Feature-Mode header based on the mode stored in the request's
+// context (via e2eModeContextKey). This avoids manually calling
+// setFeatureModeHeader on every request in every e2e test.
+type e2eModeTransport struct {
+ base http.RoundTripper
+}
+
+func (t *e2eModeTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+ if mode := e2eCurrentMode(req.Context()); mode != "" {
+ req = req.Clone(req.Context())
+ req.Header.Set(headerFeatureModeOverride, string(mode))
+ }
+ return t.base.RoundTrip(req)
+}
+
// RunE2E executes end-to-end tests for all placement shim handlers.
// It stops on the first failure and returns the error.
func RunE2E(ctx context.Context, cl client.Client) error {
diff --git a/internal/shim/placement/shim_io.go b/internal/shim/placement/shim_io.go
index 98d5ba0bc..792f8edc3 100644
--- a/internal/shim/placement/shim_io.go
+++ b/internal/shim/placement/shim_io.go
@@ -113,6 +113,14 @@ func (s *Shim) wrapHandler(pattern string, next http.HandlerFunc) http.HandlerFu
log = log.WithValues("requestID", reqID)
ctx = context.WithValue(ctx, requestIDKey, reqID)
}
+
+ // Read the feature mode override header and store in context.
+ if raw := r.Header.Get(headerFeatureModeOverride); raw != "" {
+ if fm := FeatureMode(raw); fm.valid() && fm != "" {
+ ctx = context.WithValue(ctx, featureModeOverrideKey, fm)
+ }
+ }
+
ctx = logf.IntoContext(ctx, log)
r = r.WithContext(ctx)
diff --git a/internal/shim/placement/shim_test.go b/internal/shim/placement/shim_test.go
index 503b94c72..ffc31e954 100644
--- a/internal/shim/placement/shim_test.go
+++ b/internal/shim/placement/shim_test.go
@@ -561,3 +561,137 @@ func TestWrapHandlerWithAuth(t *testing.T) {
}
})
}
+
+func TestFeatureModeFromConfOrHeader(t *testing.T) {
+ s := &Shim{config: config{
+ Traits: &traitsConfig{ConfigMapName: "test"},
+ }}
+
+ t.Run("returns configured mode when no override", func(t *testing.T) {
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ if got != FeatureModeHybrid {
+ t.Fatalf("got %q, want %q", got, FeatureModeHybrid)
+ }
+ })
+
+ t.Run("defaults empty configured mode to passthrough", func(t *testing.T) {
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ got := s.featureModeFromConfOrHeader(req, "")
+ if got != FeatureModePassthrough {
+ t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
+ }
+ })
+
+ t.Run("returns override when present in context and backing config exists", func(t *testing.T) {
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModeCRD)
+ req = req.WithContext(ctx)
+ got := s.featureModeFromConfOrHeader(req, FeatureModePassthrough)
+ if got != FeatureModeCRD {
+ t.Fatalf("got %q, want %q", got, FeatureModeCRD)
+ }
+ })
+
+ t.Run("override to hybrid/crd ignored when no backing config", func(t *testing.T) {
+ bare := &Shim{}
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModeCRD)
+ req = req.WithContext(ctx)
+ got := bare.featureModeFromConfOrHeader(req, FeatureModePassthrough)
+ if got != FeatureModePassthrough {
+ t.Fatalf("got %q, want %q (override should be rejected without backing config)", got, FeatureModePassthrough)
+ }
+ })
+
+ t.Run("override to passthrough always allowed", func(t *testing.T) {
+ bare := &Shim{}
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModePassthrough)
+ req = req.WithContext(ctx)
+ got := bare.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ if got != FeatureModePassthrough {
+ t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
+ }
+ })
+
+ t.Run("override defaults empty to passthrough", func(t *testing.T) {
+ req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
+ ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureMode(""))
+ req = req.WithContext(ctx)
+ got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ if got != FeatureModePassthrough {
+ t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
+ }
+ })
+}
+
+func TestWrapHandlerFeatureModeOverride(t *testing.T) {
+ t.Run("valid header injects override into context", func(t *testing.T) {
+ var gotMode FeatureMode
+ down, up := newTestTimers()
+ s := &Shim{
+ config: config{PlacementURL: "http://unused"},
+ maxBodyLogSize: 4096,
+ downstreamRequestTimer: down,
+ upstreamRequestTimer: up,
+ }
+ wrapped := s.wrapHandler("/test", func(w http.ResponseWriter, r *http.Request) {
+ if override, ok := r.Context().Value(featureModeOverrideKey).(FeatureMode); ok {
+ gotMode = override
+ }
+ w.WriteHeader(http.StatusOK)
+ })
+ req := httptest.NewRequest(http.MethodGet, "/test", http.NoBody)
+ req.Header.Set(headerFeatureModeOverride, string(FeatureModeCRD))
+ w := httptest.NewRecorder()
+ wrapped(w, req)
+ if gotMode != FeatureModeCRD {
+ t.Fatalf("context override = %q, want %q", gotMode, FeatureModeCRD)
+ }
+ })
+
+ t.Run("invalid header value is ignored", func(t *testing.T) {
+ var gotOverride bool
+ down, up := newTestTimers()
+ s := &Shim{
+ config: config{PlacementURL: "http://unused"},
+ maxBodyLogSize: 4096,
+ downstreamRequestTimer: down,
+ upstreamRequestTimer: up,
+ }
+ wrapped := s.wrapHandler("/test", func(w http.ResponseWriter, r *http.Request) {
+ _, gotOverride = r.Context().Value(featureModeOverrideKey).(FeatureMode)
+ w.WriteHeader(http.StatusOK)
+ })
+ req := httptest.NewRequest(http.MethodGet, "/test", http.NoBody)
+ req.Header.Set(headerFeatureModeOverride, "bogus")
+ w := httptest.NewRecorder()
+ wrapped(w, req)
+ if gotOverride {
+ t.Fatal("override should not be set for invalid mode value")
+ }
+ })
+
+ t.Run("empty header value is ignored", func(t *testing.T) {
+ var gotOverride bool
+ down, up := newTestTimers()
+ s := &Shim{
+ config: config{PlacementURL: "http://unused"},
+ maxBodyLogSize: 4096,
+ downstreamRequestTimer: down,
+ upstreamRequestTimer: up,
+ }
+ wrapped := s.wrapHandler("/test", func(w http.ResponseWriter, r *http.Request) {
+ _, gotOverride = r.Context().Value(featureModeOverrideKey).(FeatureMode)
+ w.WriteHeader(http.StatusOK)
+ })
+ req := httptest.NewRequest(http.MethodGet, "/test", http.NoBody)
+ req.Header.Set(headerFeatureModeOverride, "")
+ w := httptest.NewRecorder()
+ wrapped(w, req)
+ if gotOverride {
+ t.Fatal("override should not be set for empty header")
+ }
+ })
+}
From 063e91c174245ed855f04cee90d2ad562ad35d3a Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:11:38 +0200
Subject: [PATCH 05/54] feat: CommittedResource CRD controller added (#765)
Introduces committed_resource_controller.go that watches CommittedResource CRDs and owns all child Reservation CRUD.
---
api/v1alpha1/committed_resource_types.go | 18 +
cmd/manager/main.go | 9 +
.../committed-resource-reservations.md | 152 ++--
.../crds/cortex.cloud_committedresources.yaml | 12 +
.../committed_resource_controller.go | 279 ++++++
.../committed_resource_controller_test.go | 441 ++++++++++
.../committed_resource_integration_test.go | 331 ++++++++
.../reservations/commitments/field_index.go | 43 +
...ontroller.go => reservation_controller.go} | 0
...test.go => reservation_controller_test.go} | 181 +---
.../commitments/reservation_manager.go | 12 +-
.../commitments/reservation_manager_test.go | 798 ++++--------------
.../reservations/commitments/state.go | 4 +
.../reservations/commitments/syncer_test.go | 9 +-
14 files changed, 1467 insertions(+), 822 deletions(-)
create mode 100644 internal/scheduling/reservations/commitments/committed_resource_controller.go
create mode 100644 internal/scheduling/reservations/commitments/committed_resource_controller_test.go
create mode 100644 internal/scheduling/reservations/commitments/committed_resource_integration_test.go
create mode 100644 internal/scheduling/reservations/commitments/field_index.go
rename internal/scheduling/reservations/commitments/{controller.go => reservation_controller.go} (100%)
rename internal/scheduling/reservations/commitments/{controller_test.go => reservation_controller_test.go} (75%)
diff --git a/api/v1alpha1/committed_resource_types.go b/api/v1alpha1/committed_resource_types.go
index 5ed61a11a..a6f1bd217 100644
--- a/api/v1alpha1/committed_resource_types.go
+++ b/api/v1alpha1/committed_resource_types.go
@@ -90,6 +90,18 @@ type CommittedResourceSpec struct {
// +kubebuilder:validation:Enum=planned;pending;guaranteed;confirmed;superseded;expired
// +kubebuilder:validation:Required
State CommitmentStatus `json:"state"`
+
+ // AllowRejection controls what the CommittedResource controller does when placement fails
+ // for a guaranteed or confirmed commitment.
+ // true — controller may reject: on failure, child Reservations are rolled back and the CR
+ // is marked Rejected. Use this when the caller is making a first-time placement
+ // decision and a "no" answer is acceptable (e.g. the change-commitments API).
+ // false — controller must retry: on failure, existing child Reservations are kept and the
+ // CR is set to Reserving so the controller retries later. Use this when the caller
+ // is restoring already-committed state that Cortex must honour (e.g. the syncer).
+ // Only meaningful for state=guaranteed or state=confirmed; ignored for all other states.
+ // +kubebuilder:validation:Optional
+ AllowRejection bool `json:"allowRejection,omitempty"`
}
// CommittedResourceStatus defines the observed state of CommittedResource.
@@ -131,6 +143,12 @@ type CommittedResourceStatus struct {
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
}
+const (
+ // CommittedResourceConditionReady indicates whether the CommittedResource has been
+ // successfully reconciled into active Reservation CRDs.
+ CommittedResourceConditionReady = "Ready"
+)
+
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Cluster
diff --git a/cmd/manager/main.go b/cmd/manager/main.go
index b74b21d1b..ba1cd52e8 100644
--- a/cmd/manager/main.go
+++ b/cmd/manager/main.go
@@ -548,6 +548,15 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "CommitmentReservation")
os.Exit(1)
}
+
+ if err := (&commitments.CommittedResourceController{
+ Client: multiclusterClient,
+ Scheme: mgr.GetScheme(),
+ Conf: commitmentsConfig,
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
+ setupLog.Error(err, "unable to create controller", "controller", "CommittedResource")
+ os.Exit(1)
+ }
}
if slices.Contains(mainConfig.EnabledControllers, "datasource-controllers") {
setupLog.Info("enabling controller", "controller", "datasource-controllers")
diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md
index 52890bf75..95f8b8bd5 100644
--- a/docs/reservations/committed-resource-reservations.md
+++ b/docs/reservations/committed-resource-reservations.md
@@ -7,17 +7,20 @@ Cortex reserves hypervisor capacity for customers who pre-commit resources (comm
- [Configuration and Observability](#configuration-and-observability)
- [Lifecycle Management](#lifecycle-management)
- [State (CRDs)](#state-crds)
- - [CR Reservation Lifecycle](#cr-reservation-lifecycle)
- - [VM Lifecycle](#vm-lifecycle)
- - [Capacity Blocking](#capacity-blocking)
+ - [CR Commitment Lifecycle](#cr-commitment-lifecycle)
+ - [CommittedResource Controller](#committedresource-controller)
+ - [Reservation Lifecycle](#reservation-lifecycle)
+ - [VM Lifecycle](#vm-lifecycle)
+ - [Capacity Blocking](#capacity-blocking)
+ - [Reservation Controller](#reservation-controller)
- [Change-Commitments API](#change-commitments-api)
- [Syncer Task](#syncer-task)
- - [Controller (Reconciliation)](#controller-reconciliation)
- [Usage API](#usage-api)
The CR reservation implementation is located in `internal/scheduling/reservations/commitments/`. Key components include:
-- Controller logic (`controller.go`)
-- API handlers in the `api/` subpackage (`change_commitments.go`, `report_capacity.go`, `report_usage.go`)
+- `CommittedResource` controller (`committed_resource_controller.go`) — acceptance, rejection, child Reservation CRUD
+- `Reservation` controller (`reservation_controller.go`) — placement, VM allocation verification
+- API endpoints (`api_*.go`)
- Capacity and usage calculation logic (`capacity.go`, `usage.go`)
- Syncer for periodic state sync (`syncer.go`)
@@ -35,47 +38,103 @@ The CR reservation implementation is located in `internal/scheduling/reservation
## Lifecycle Management
-### State (CRDs)
-Defined in `api/v1alpha1/reservation_types.go`, which contains definitions for CR reservations and failover reservations (see [./failover-reservations.md](./failover-reservations.md)).
-
-A reservation CRD represents a single reservation slot on a hypervisor, which holds multiple VMs.
-A single CR entry typically refers to multiple reservation CRDs (slots).
-
-
-### CR Reservation Lifecycle
+The system is organized around two CRD types and two controllers. `CommittedResource` CRDs represent customer commitments; `Reservation` CRDs represent individual hypervisor capacity slots. Each has its own controller with a well-defined responsibility boundary.
```mermaid
flowchart LR
subgraph State
+ CR[(CommittedResource CRDs)]
Res[(Reservation CRDs)]
end
-
+
Syncer[Syncer Task]
ChangeAPI[Change API]
CapacityAPI[Capacity API]
- Controller[Controller]
+ CRCtrl[CommittedResource Controller]
+ ResCtrl[Reservation Controller]
UsageAPI[Usage API]
Scheduler[Scheduler API]
-
- ChangeAPI -->|CRUD| Res
- Syncer -->|CRUD| Res
+
+ ChangeAPI -->|CRUD| CR
+ Syncer -->|CRUD| CR
+ UsageAPI -->|read| CR
UsageAPI -->|read| Res
CapacityAPI -->|read| Res
CapacityAPI -->|capacity request| Scheduler
- Res -->|watch| Controller
- Controller -->|update spec/status| Res
- Controller -->|reservation placement request| Scheduler
+ CR -->|watch| CRCtrl
+ CRCtrl -->|CRUD child Reservation slots| Res
+ CRCtrl -->|update status| CR
+ Res -->|watch| CRCtrl
+ Res -->|watch| ResCtrl
+ ResCtrl -->|placement request| Scheduler
+ ResCtrl -->|update status| Res
+```
+
+### State (CRDs)
+
+**`CommittedResource` CRD** (`committed_resource_types.go`) — primary source of truth for a commitment accepted by Cortex. One CRD per commitment UUID. Spec holds the commitment identity (project, flavor group, ...). Status holds the acceptance outcome (`Ready` condition with reason `Planned`/`Reserving`/`Rejected`) and the accepted amount.
+
+**`Reservation` CRD** (`reservation_types.go`) — a single reservation slot on a hypervisor, owned by a `CommittedResource`. One `CommittedResource` typically drives multiple `Reservation` CRDs (one per flavor-sized slot). See [./failover-reservations.md](./failover-reservations.md) for the failover reservation type.
+
+### CR Commitment Lifecycle
+
+The CR commitment lifecycle covers everything from the moment Limes reports a commitment to the point where Cortex confirms or rejects it. The `CommittedResource` CRD is the entry point; the `CommittedResource` controller owns the acceptance decision.
+
+**Limes state → Cortex action:**
+
+| Limes State | Meaning | Cortex action |
+|---|---|---|
+| `planned` | Future start, no guarantee yet | No Reservations — capacity not blocked |
+| `pending` | Limes asking for a yes/no decision now | One-shot attempt — accept or reject; no retry |
+| `guaranteed` / `confirmed` | Capacity must be honoured | Place Reservations and keep them in sync; see failure handling below |
+| `superseded` / `expired` | Commitment no longer active | Remove all child Reservations |
+
+**CommittedResource status conditions (Cortex-side):**
+
+```mermaid
+stateDiagram-v2
+ direction LR
+ state "Planned (Ready=False)" as Planned
+ state "Reserving (Ready=False)" as Reserving
+ state "Active (Ready=True)" as Active
+ state "Rejected (Ready=False)" as Rejected
+
+ [*] --> Planned : state=planned
+ [*] --> Reserving : state=pending / guaranteed / confirmed
+ Planned --> Reserving : state changes to pending/guaranteed/confirmed
+ Reserving --> Active : placement succeeded
+ Reserving --> Rejected : placement failed — state=pending, or AllowRejection=true
+ Reserving --> Reserving : placement failed — retrying (AllowRejection=false)
+ Active --> Reserving : spec changed (e.g. resize)
+ Active --> [*] : state=superseded / expired
+ Rejected --> [*] : deleted
+ Planned --> [*] : deleted
```
-Reservations are managed through the Change API, Syncer Task, and Controller reconciliation.
+#### CommittedResource Controller
+
+The controller's job is to keep child `Reservation` CRDs in sync with the desired state expressed in `Spec.Amount`. The key rules:
+
+- **`pending`**: Cortex is being asked for a yes/no decision. If placement fails for any reason, child Reservations are removed and the CR is marked Rejected. The caller (e.g. the change-commitments API) reads the outcome and reports back to Limes. No retry.
+
+- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer (e.g. the change-commitments API on a resize request) set `Spec.AllowRejection=true`; the controller then rejects on failure instead of retrying.
+
+- **On rejection**: rolls back child Reservations to the last successfully placed quantity (`Status.AcceptedAmount`). For a CR that was never accepted, this means removing all child Reservations.
+
+The controller communicates with the Reservation controller only through CRDs — no direct calls.
+
+### Reservation Lifecycle
| Component | Event | Timing | Action |
|-----------|-------|--------|--------|
-| **Change API / Syncer** | CR Create, Resize, Delete | Immediate/Hourly | Create/update/delete Reservation CRDs |
-| **Controller** | Placement | On creation | Find host via scheduler API, set `TargetHost` |
-| **Controller** | Optimize unused slots | >> minutes | Assign PAYG VMs or re-place reservations |
+| **Reservation Controller** | `Reservation` created | Immediate (watch) | Find host via scheduler API, set `TargetHost` |
+| **Scheduling Pipeline** | VM Create, Migrate, Resize | Immediate | Add VM to `Spec.Allocations` |
+| **Reservation Controller** | Reservation CRD updated | `committedResourceRequeueIntervalGracePeriod` (default: 1 min) | Defer verification for new VMs still spawning; update `Status.Allocations` |
+| **Reservation Controller** | Hypervisor CRD updated (VM appeared/disappeared) | Immediate (event-driven) | Verify allocations via Hypervisor CRD; remove gone VMs from `Spec.Allocations` |
+| **Reservation Controller** | Periodic safety-net | `committedResourceRequeueIntervalActive` (default: 5 min) | Same as above; catches any missed events |
+| **Reservation Controller** | Optimize unused slots | >> minutes | Assign PAYG VMs or re-place reservations |
-### VM Lifecycle
+#### VM Lifecycle
VM allocations are tracked within reservations:
@@ -87,19 +146,12 @@ flowchart LR
end
A[Nova Scheduler] -->|VM Create/Migrate/Resize| B[Scheduling Pipeline]
B -->|update Spec.Allocations| Res
- Res -->|watch| C[Controller]
+ Res -->|watch| C[Reservation Controller]
HV -->|watch - instance changes| C
Res -->|periodic safety-net requeue| C
C -->|update Spec/Status.Allocations| Res
```
-| Component | Event | Timing | Action |
-|-----------|-------|--------|--------|
-| **Scheduling Pipeline** | VM Create, Migrate, Resize | Immediate | Add VM to `Spec.Allocations` |
-| **Controller** | Reservation CRD updated | `committedResourceRequeueIntervalGracePeriod` (default: 1 min) | Defer verification for new VMs still spawning; update `Status.Allocations` |
-| **Controller** | Hypervisor CRD updated (VM appeared/disappeared) | Immediate (event-driven) | Verify allocations via Hypervisor CRD; remove gone VMs from `Spec.Allocations` |
-| **Controller** | Periodic safety-net | `committedResourceRequeueIntervalActive` (default: 5 min) | Same as above; catches any missed events |
-
**Allocation fields**:
- `Spec.Allocations` — Expected VMs (written by the scheduling pipeline on placement)
- `Status.Allocations` — Confirmed VMs (written by the controller after verifying the VM is on the expected host)
@@ -124,7 +176,7 @@ stateDiagram-v2
**Note**: VM allocations may not consume all resources of a reservation slot. A reservation with 128 GB may have VMs totaling only 96 GB if that fits the project's needs. Allocations may exceed reservation capacity (e.g., after VM resize).
-### Capacity Blocking
+#### Capacity Blocking
**Blocking rules by allocation state:**
@@ -161,6 +213,19 @@ When a reservation is being migrated to a new host, block the full `max(Spec.Res
- **VM live migration within a reservation** (VM moves away from the reservation's host): handled implicitly by `hv.Status.Allocation`. Libvirt reports resource consumption on both source and target during live migration, so both hosts' `hv.Status.Allocation` already reflects the in-flight state. No special filter logic needed. The reservation controller will eventually remove the VM from the reservation once it's confirmed on the wrong host past the grace period.
+#### Reservation Controller
+
+The `Reservation` controller (`CommitmentReservationController`) watches `Reservation` CRDs and `Hypervisor` CRDs. `MaxConcurrentReconciles=1` prevents overbooking during concurrent placements.
+
+**Placement** — finds hosts for new reservations (calls scheduler API)
+
+**Allocation Verification** — tracks VM lifecycle on reservations. The controller uses the Hypervisor CRD as the sole source of truth, with two triggers:
+- New VMs (within `committedResourceAllocationGracePeriod`, default: 15 min): verification deferred — VM may still be spawning; requeued every `committedResourceRequeueIntervalGracePeriod` (default: 1 min)
+- Established VMs: verified reactively when the Hypervisor CRD changes (VM appeared or disappeared in `Status.Instances`), with `committedResourceRequeueIntervalActive` (default: 5 min) as a safety-net fallback
+- Missing VMs: removed from `Spec.Allocations` when not found on the Hypervisor CRD after the grace period
+
+**Reservation migration is not supported yet.**
+
### Change-Commitments API
The change-commitments API receives batched commitment changes from Limes and manages reservations accordingly.
@@ -176,19 +241,6 @@ The change-commitments API receives batched commitment changes from Limes and ma
The syncer task runs periodically and syncs local Reservation CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts.
-### Controller (Reconciliation)
-
-The controller watches Reservation CRDs and performs two types of reconciliation:
-
-**Placement** - Finds hosts for new reservations (calls scheduler API)
-
-**Allocation Verification** - Tracks VM lifecycle on reservations. The controller uses the Hypervisor CRD as the sole source of truth, with two triggers:
-- New VMs (within `committedResourceAllocationGracePeriod`, default: 15 min): verification deferred — VM may still be spawning; requeued every `committedResourceRequeueIntervalGracePeriod` (default: 1 min)
-- Established VMs: verified reactively when the Hypervisor CRD changes (VM appeared or disappeared in `Status.Instances`), with `committedResourceRequeueIntervalActive` (default: 5 min) as a safety-net fallback
-- Missing VMs: removed from `Spec.Allocations` when not found on the Hypervisor CRD after the grace period
-
-**Reservation migration is not supported yet.**
-
### Usage API
For each flavor group `X` that accepts commitments, Cortex exposes three resource types:
@@ -196,4 +248,4 @@ For each flavor group `X` that accepts commitments, Cortex exposes three resourc
- `hw_version_X_cores` — CPU cores derived from RAM via fixed ratio (`HandlesCommitments=false`)
- `hw_version_X_instances` — instance count (`HandlesCommitments=false`)
-For each VM, the API reports whether it accounts to a specific commitment or PAYG. This assignment is deterministic and may differ from the actual Cortex internal assignment used for scheduling.
\ No newline at end of file
+For each VM, the API reports whether it accounts to a specific commitment or PAYG. This assignment is deterministic and may differ from the actual Cortex internal assignment used for scheduling.
diff --git a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
index 73cc8f9a2..092827edd 100644
--- a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
+++ b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml
@@ -75,6 +75,18 @@ spec:
spec:
description: CommittedResourceSpec defines the desired state of CommittedResource,
properties:
+ allowRejection:
+ description: |-
+ AllowRejection controls what the CommittedResource controller does when placement fails
+ for a guaranteed or confirmed commitment.
+ true — controller may reject: on failure, child Reservations are rolled back and the CR
+ is marked Rejected. Use this when the caller is making a first-time placement
+ decision and a "no" answer is acceptable (e.g. the change-commitments API).
+ false — controller must retry: on failure, existing child Reservations are kept and the
+ CR is set to Reserving so the controller retries later. Use this when the caller
+ is restoring already-committed state that Cortex must honour (e.g. the syncer).
+ Only meaningful for state=guaranteed or state=confirmed; ignored for all other states.
+ type: boolean
amount:
anyOf:
- type: integer
diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go
new file mode 100644
index 000000000..a25d63e3a
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go
@@ -0,0 +1,279 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/go-logr/logr"
+ "k8s.io/apimachinery/pkg/api/meta"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/types"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/controller"
+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+ "sigs.k8s.io/controller-runtime/pkg/handler"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
+ "github.com/cobaltcore-dev/cortex/pkg/multicluster"
+)
+
+const crFinalizer = "committed-resource.reservations.cortex.cloud/cleanup" // blocks CR deletion until child Reservations are cleaned up (see reconcileDeletion)
+
+// CommittedResourceController reconciles CommittedResource CRDs and owns all child Reservation CRUD.
+type CommittedResourceController struct {
+ client.Client
+ Scheme *runtime.Scheme
+ Conf Config // carries RequeueIntervalRetry used when committed placement must be retried
+}
+
+func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+ var cr v1alpha1.CommittedResource
+ if err := r.Get(ctx, req.NamespacedName, &cr); err != nil {
+ return ctrl.Result{}, client.IgnoreNotFound(err) // CR deleted between enqueue and fetch — nothing to reconcile
+ }
+
+ ctx = WithNewGlobalRequestID(ctx) // tag this reconcile pass with a fresh request ID for log correlation
+ logger := LoggerFromContext(ctx).WithValues(
+ "component", "committed-resource-controller",
+ "committedResource", req.Name,
+ )
+
+ if !cr.DeletionTimestamp.IsZero() {
+ return r.reconcileDeletion(ctx, logger, &cr)
+ }
+
+ if !controllerutil.ContainsFinalizer(&cr, crFinalizer) {
+ controllerutil.AddFinalizer(&cr, crFinalizer)
+ if err := r.Update(ctx, &cr); err != nil {
+ return ctrl.Result{}, fmt.Errorf("failed to add finalizer: %w", err)
+ }
+ return ctrl.Result{}, nil // the Update event re-queues this CR; reconcile resumes with the finalizer in place
+ }
+
+ switch cr.Spec.State { // dispatch on the Limes-reported commitment state
+ case v1alpha1.CommitmentStatusPlanned:
+ return ctrl.Result{}, r.setNotReady(ctx, &cr, "Planned", "commitment is not yet active")
+ case v1alpha1.CommitmentStatusPending:
+ return r.reconcilePending(ctx, logger, &cr)
+ case v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommitmentStatusConfirmed:
+ return r.reconcileCommitted(ctx, logger, &cr)
+ case v1alpha1.CommitmentStatusSuperseded, v1alpha1.CommitmentStatusExpired:
+ return r.reconcileInactive(ctx, logger, &cr)
+ default:
+ logger.Info("unknown commitment state, skipping", "state", cr.Spec.State)
+ return ctrl.Result{}, nil
+ }
+}
+
+// reconcilePending handles a one-shot confirmation attempt (Limes state: pending).
+// If placement fails for any reason, all partial reservations are removed and the
+// CR is marked Rejected so the HTTP API can report the outcome back to Limes.
+func (r *CommittedResourceController) reconcilePending(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) {
+ if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil {
+ logger.Error(applyErr, "pending commitment placement failed, rejecting")
+ if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil {
+ return ctrl.Result{}, rollbackErr // rollback itself failed — surface and let the next reconcile retry the cleanup
+ }
+ return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error())
+ }
+ return ctrl.Result{}, r.setAccepted(ctx, cr) // all slots placed — record acceptance in status
+}
+
+func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) {
+ // Spec errors are permanent regardless of AllowRejection — a bad spec won't fix itself.
+ if _, err := FromCommittedResource(*cr); err != nil {
+ logger.Error(err, "invalid commitment spec, rejecting")
+ return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", err.Error())
+ }
+ if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil {
+ if cr.Spec.AllowRejection { // caller opted in to rejection instead of indefinite retry
+ logger.Error(applyErr, "committed placement failed, rolling back to accepted amount")
+ if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil {
+ return ctrl.Result{}, rollbackErr
+ }
+ return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error())
+ }
+ logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry)
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, r.setNotReady(ctx, cr, "Reserving", applyErr.Error())
+ }
+ return ctrl.Result{}, r.setAccepted(ctx, cr)
+}
+
+func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) error {
+ knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client}
+ flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
+ if err != nil {
+ return fmt.Errorf("flavor knowledge not ready: %w", err)
+ }
+
+ state, err := FromCommittedResource(*cr)
+ if err != nil {
+ return fmt.Errorf("invalid commitment spec: %w", err)
+ }
+ state.NamePrefix = cr.Name + "-" // child Reservation names are prefixed with the CR name
+ state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx) // propagated from Reconcile's WithNewGlobalRequestID
+
+ result, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller")
+ if err != nil {
+ return err
+ }
+ logger.Info("commitment state applied", "created", result.Created, "deleted", result.Deleted, "repaired", result.Repaired)
+ return nil
+}
+
+func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alpha1.CommittedResource) error {
+ now := metav1.Now()
+ old := cr.DeepCopy() // snapshot used as the merge-patch base below
+ acceptedAmount := cr.Spec.Amount.DeepCopy()
+ cr.Status.AcceptedAmount = &acceptedAmount // records the successfully placed quantity (rollback target for rollbackToAccepted)
+ cr.Status.AcceptedAt = &now
+ meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
+ Type: v1alpha1.CommittedResourceConditionReady,
+ Status: metav1.ConditionTrue,
+ Reason: "Accepted",
+ Message: "commitment successfully reserved",
+ LastTransitionTime: now,
+ })
+ if err := r.Status().Patch(ctx, cr, client.MergeFrom(old)); err != nil {
+ return client.IgnoreNotFound(err) // CR gone in the meantime — nothing left to report
+ }
+ return nil
+}
+
+func (r *CommittedResourceController) reconcileInactive(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) {
+ if err := r.deleteChildReservations(ctx, cr); err != nil {
+ return ctrl.Result{}, err
+ }
+ logger.Info("commitment inactive, child reservations removed", "state", cr.Spec.State)
+ return ctrl.Result{}, r.setNotReady(ctx, cr, string(cr.Spec.State), "commitment is no longer active") // Ready=False with the inactive Limes state as reason
+}
+
+func (r *CommittedResourceController) reconcileDeletion(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) {
+ if err := r.deleteChildReservations(ctx, cr); err != nil {
+ return ctrl.Result{}, err // keep the finalizer until cleanup succeeds
+ }
+ controllerutil.RemoveFinalizer(cr, crFinalizer) // children are gone — unblock deletion of the CR object
+ if err := r.Update(ctx, cr); err != nil {
+ return ctrl.Result{}, client.IgnoreNotFound(err)
+ }
+ logger.Info("committed resource deleted, child reservations cleaned up")
+ return ctrl.Result{}, nil
+}
+
+// deleteChildReservations deletes all Reservation CRDs owned by this CommittedResource,
+// identified by matching CommitmentUUID in the reservation spec.
+func (r *CommittedResourceController) deleteChildReservations(ctx context.Context, cr *v1alpha1.CommittedResource) error {
+ var list v1alpha1.ReservationList
+ if err := r.List(ctx, &list, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ }); err != nil {
+ return fmt.Errorf("failed to list reservations: %w", err)
+ }
+ for i := range list.Items { // label pre-filter above; exact UUID match is done client-side below
+ res := &list.Items[i]
+ if res.Spec.CommittedResourceReservation == nil ||
+ res.Spec.CommittedResourceReservation.CommitmentUUID != cr.Spec.CommitmentUUID {
+ continue
+ }
+ if err := r.Delete(ctx, res); client.IgnoreNotFound(err) != nil {
+ return fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
+ }
+ }
+ return nil
+}
+
+// rollbackToAccepted restores child Reservations to match Status.AcceptedAmount.
+// If AcceptedAmount is nil (new CR that was never accepted), all child Reservations are deleted.
+func (r *CommittedResourceController) rollbackToAccepted(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) error {
+ if cr.Status.AcceptedAmount == nil {
+ return r.deleteChildReservations(ctx, cr)
+ }
+ knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client}
+ flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
+ if err != nil {
+ // Can't compute the rollback target — fall back to full delete rather than leaving
+ // a partial state that's inconsistent with the unknown AcceptedAmount.
+ logger.Error(err, "flavor knowledge unavailable during rollback, deleting all child reservations")
+ return r.deleteChildReservations(ctx, cr)
+ }
+ state, err := FromCommittedResource(*cr)
+ if err != nil {
+ logger.Error(err, "invalid spec during rollback, deleting all child reservations")
+ return r.deleteChildReservations(ctx, cr)
+ }
+ state.TotalMemoryBytes = cr.Status.AcceptedAmount.Value() // override the spec amount with the last accepted quantity (bytes)
+ state.NamePrefix = cr.Name + "-"
+ state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx)
+ if _, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller-rollback"); err != nil {
+ return fmt.Errorf("rollback apply failed: %w", err)
+ }
+ return nil
+}
+
+// setNotReady patches Ready=False on CommittedResource status.
+func (r *CommittedResourceController) setNotReady(ctx context.Context, cr *v1alpha1.CommittedResource, reason, message string) error {
+ old := cr.DeepCopy() // merge-patch base
+ meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
+ Type: v1alpha1.CommittedResourceConditionReady,
+ Status: metav1.ConditionFalse,
+ Reason: reason,
+ Message: message,
+ LastTransitionTime: metav1.Now(),
+ })
+ if err := r.Status().Patch(ctx, cr, client.MergeFrom(old)); err != nil {
+ return client.IgnoreNotFound(err) // CR gone in the meantime — nothing left to report
+ }
+ return nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error {
+ ctx := context.Background() // setup-time only; not tied to any reconcile
+ if err := IndexFields(ctx, mcl); err != nil {
+ return fmt.Errorf("failed to set up field indexes: %w", err)
+ }
+
+ bldr := multicluster.BuildController(mcl, mgr)
+ var err error
+ bldr, err = bldr.WatchesMulticluster(
+ &v1alpha1.CommittedResource{},
+ &handler.EnqueueRequestForObject{},
+ )
+ if err != nil {
+ return err
+ }
+ // Re-enqueue the parent CommittedResource when a child Reservation changes (e.g. external deletion).
+ bldr, err = bldr.WatchesMulticluster(
+ &v1alpha1.Reservation{},
+ handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request {
+ res, ok := obj.(*v1alpha1.Reservation)
+ if !ok || res.Spec.CommittedResourceReservation == nil {
+ return nil
+ }
+ uuid := res.Spec.CommittedResourceReservation.CommitmentUUID
+ var crList v1alpha1.CommittedResourceList
+ if err := r.List(ctx, &crList, client.MatchingFields{idxCommittedResourceByUUID: uuid}); err != nil {
+ LoggerFromContext(ctx).Error(err, "failed to list CommittedResources by UUID", "uuid", uuid)
+ return nil
+ }
+ if len(crList.Items) == 0 {
+ return nil
+ }
+ return []ctrl.Request{{NamespacedName: types.NamespacedName{Name: crList.Items[0].Name}}} // NOTE(review): assumes at most one CR per CommitmentUUID — confirm
+ }),
+ )
+ if err != nil {
+ return err
+ }
+ return bldr.Named("committed-resource").
+ WithOptions(controller.Options{
+ MaxConcurrentReconciles: 1, // NOTE(review): serialized — presumably to avoid concurrent placement races; confirm
+ }).
+ Complete(r)
+}
diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go
new file mode 100644
index 000000000..6e6103972
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go
@@ -0,0 +1,441 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+ "context"
+ "encoding/json"
+ "testing"
+ "time"
+
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "k8s.io/apimachinery/pkg/api/meta"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/types"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+)
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+// newTestCommittedResource returns a CommittedResource with sensible defaults.
+// The finalizer is pre-populated so tests can call Reconcile once without a
+// separate finalizer-add round-trip.
+func newTestCommittedResource(name string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource {
+ return &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Finalizers: []string{crFinalizer},
+ },
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: "test-uuid-1234",
+ FlavorGroupName: "test-group",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: resource.MustParse("4Gi"), // matches the 4 GiB test flavor — yields exactly one slot
+ AvailabilityZone: "test-az",
+ ProjectID: "test-project",
+ DomainID: "test-domain",
+ State: state,
+ },
+ }
+}
+
+// newTestFlavorKnowledge returns a Knowledge CRD with a single 4 GiB flavor so
+// a 4 GiB commitment produces exactly one slot.
+func newTestFlavorKnowledge() *v1alpha1.Knowledge {
+ raw, err := json.Marshal(map[string]any{
+ "features": []map[string]any{
+ {
+ "name": "test-group",
+ "flavors": []map[string]any{
+ {
+ "name": "test-flavor",
+ "memoryMB": 4096, // 4 GiB
+ "vcpus": 2,
+ "extraSpecs": map[string]string{},
+ },
+ },
+ },
+ },
+ })
+ if err != nil {
+ panic(err) // static input — marshalling cannot fail in practice; panic is fine in a test helper
+ }
+ return &v1alpha1.Knowledge{
+ ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"},
+ Spec: v1alpha1.KnowledgeSpec{
+ SchedulingDomain: v1alpha1.SchedulingDomainNova,
+ Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"},
+ },
+ Status: v1alpha1.KnowledgeStatus{
+ Raw: runtime.RawExtension{Raw: raw},
+ RawLength: 1,
+ Conditions: []metav1.Condition{
+ {
+ Type: v1alpha1.KnowledgeConditionReady,
+ Status: metav1.ConditionTrue,
+ Reason: "Ready",
+ },
+ },
+ },
+ }
+}
+
+func newCRTestScheme(t *testing.T) *runtime.Scheme { // scheme with the cortex and hypervisor APIs registered
+ t.Helper()
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("failed to add v1alpha1 scheme: %v", err)
+ }
+ if err := hv1.AddToScheme(scheme); err != nil {
+ t.Fatalf("failed to add hv1 scheme: %v", err)
+ }
+ return scheme
+}
+
+func newCRTestClient(scheme *runtime.Scheme, objects ...client.Object) client.Client { // fake client pre-loaded with the given objects
+ return fake.NewClientBuilder().
+ WithScheme(scheme).
+ WithObjects(objects...).
+ WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Reservation{}). // required for Status().Patch to work against the fake client
+ Build()
+}
+
+func reconcileReq(name string) ctrl.Request { // cluster-scoped CRD: name only, no namespace
+ return ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}
+}
+
+// assertCondition checks the Ready condition status and reason on a CommittedResource.
+func assertCondition(t *testing.T, k8sClient client.Client, crName string, expectedStatus metav1.ConditionStatus, expectedReason string) {
+ t.Helper()
+ var cr v1alpha1.CommittedResource
+ if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: crName}, &cr); err != nil {
+ t.Fatalf("failed to get CommittedResource %s: %v", crName, err)
+ }
+ cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ t.Errorf("Ready condition not set on %s", crName)
+ return
+ }
+ if cond.Status != expectedStatus {
+ t.Errorf("%s: expected Ready=%s, got %s", crName, expectedStatus, cond.Status)
+ }
+ if cond.Reason != expectedReason {
+ t.Errorf("%s: expected Reason=%s, got %s", crName, expectedReason, cond.Reason)
+ }
+}
+
+// countChildReservations counts Reservation CRDs owned by the given CommitmentUUID,
+// using the same identity predicate as the controller.
+func countChildReservations(t *testing.T, k8sClient client.Client, commitmentUUID string) int {
+ t.Helper()
+ var list v1alpha1.ReservationList
+ if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, // same label pre-filter the controller uses
+ }); err != nil {
+ t.Fatalf("failed to list reservations: %v", err)
+ }
+ count := 0
+ for _, r := range list.Items {
+ if r.Spec.CommittedResourceReservation != nil &&
+ r.Spec.CommittedResourceReservation.CommitmentUUID == commitmentUUID {
+ count++
+ }
+ }
+ return count
+}
+
+// ============================================================================
+// Tests: per-state reconcile paths
+// ============================================================================
+
+func TestCommittedResourceController_Reconcile(t *testing.T) {
+ tests := []struct {
+ name string
+ state v1alpha1.CommitmentStatus
+ expectedStatus metav1.ConditionStatus
+ expectedReason string
+ expectedSlots int
+ needsKnowledge bool
+ }{
+ {
+ name: "planned: no Reservations created, Ready=False/Planned",
+ state: v1alpha1.CommitmentStatusPlanned,
+ expectedStatus: metav1.ConditionFalse,
+ expectedReason: "Planned",
+ expectedSlots: 0,
+ },
+ {
+ name: "pending: Reservations created, Ready=True",
+ state: v1alpha1.CommitmentStatusPending,
+ expectedStatus: metav1.ConditionTrue,
+ expectedReason: "Accepted",
+ expectedSlots: 1,
+ needsKnowledge: true,
+ },
+ {
+ name: "guaranteed: Reservations created, Ready=True",
+ state: v1alpha1.CommitmentStatusGuaranteed,
+ expectedStatus: metav1.ConditionTrue,
+ expectedReason: "Accepted",
+ expectedSlots: 1,
+ needsKnowledge: true,
+ },
+ {
+ name: "confirmed: Reservations created, Ready=True",
+ state: v1alpha1.CommitmentStatusConfirmed,
+ expectedStatus: metav1.ConditionTrue,
+ expectedReason: "Accepted",
+ expectedSlots: 1,
+ needsKnowledge: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ scheme := newCRTestScheme(t)
+ cr := newTestCommittedResource("test-cr", tt.state)
+ objects := []client.Object{cr}
+ if tt.needsKnowledge { // placement paths need flavor knowledge to size slots
+ objects = append(objects, newTestFlavorKnowledge())
+ }
+ k8sClient := newCRTestClient(scheme, objects...)
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} // zero Config: requeue intervals are not exercised here
+
+ if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+ t.Fatalf("reconcile: %v", err)
+ }
+
+ assertCondition(t, k8sClient, cr.Name, tt.expectedStatus, tt.expectedReason)
+ if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != tt.expectedSlots {
+ t.Errorf("expected %d child reservations, got %d", tt.expectedSlots, got)
+ }
+
+ if tt.expectedSlots > 0 { // acceptance must also record the placed quantity
+ var updated v1alpha1.CommittedResource
+ if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil {
+ t.Fatalf("get CR: %v", err)
+ }
+ if updated.Status.AcceptedAmount == nil {
+ t.Errorf("expected AcceptedAmount to be set on acceptance")
+ }
+ }
+ })
+ }
+}
+
+// TestCommittedResourceController_InactiveStates verifies that superseded and
+// expired CRs have their pre-existing child Reservations deleted and that the
+// Ready condition is set to False with the state name as reason.
+func TestCommittedResourceController_InactiveStates(t *testing.T) {
+	tests := []struct {
+		name  string
+		state v1alpha1.CommitmentStatus
+	}{
+		{name: "superseded: child Reservations deleted, Ready=False", state: v1alpha1.CommitmentStatusSuperseded},
+		{name: "expired: child Reservations deleted, Ready=False", state: v1alpha1.CommitmentStatusExpired},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			scheme := newCRTestScheme(t)
+			cr := newTestCommittedResource("test-cr", tt.state)
+			// Seed a child Reservation that the reconcile must clean up. Its
+			// CommitmentUUID matches the one used by newTestCommittedResource.
+			existing := &v1alpha1.Reservation{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "test-cr-0",
+					Labels: map[string]string{
+						v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+					},
+				},
+				Spec: v1alpha1.ReservationSpec{
+					Type: v1alpha1.ReservationTypeCommittedResource,
+					CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+						CommitmentUUID: "test-uuid-1234",
+					},
+				},
+			}
+			k8sClient := newCRTestClient(scheme, cr, existing)
+			controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+
+			if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+				t.Fatalf("reconcile: %v", err)
+			}
+
+			// The reason is expected to be the state string itself.
+			assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, string(tt.state))
+			if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 {
+				t.Errorf("expected 0 child reservations after %s, got %d", tt.state, got)
+			}
+		})
+	}
+}
+
+// ============================================================================
+// Tests: placement failure paths
+// ============================================================================
+
+// TestCommittedResourceController_PlacementFailure verifies the behavior when
+// placement fails (no flavor Knowledge object exists): pending CRs are always
+// rejected; guaranteed/confirmed CRs are rejected only when AllowRejection is
+// set, and otherwise keep retrying via RequeueAfter.
+func TestCommittedResourceController_PlacementFailure(t *testing.T) {
+	// Knowledge absent → placement fails. Tests diverging behavior by state and AllowRejection.
+	tests := []struct {
+		name           string
+		state          v1alpha1.CommitmentStatus
+		allowRejection bool
+		expectedReason string
+		expectRequeue  bool
+	}{
+		{
+			name:           "pending: always rejects on failure, no retry",
+			state:          v1alpha1.CommitmentStatusPending,
+			expectedReason: "Rejected",
+			expectRequeue:  false,
+		},
+		{
+			name:           "guaranteed AllowRejection=true: rejects on failure, no retry",
+			state:          v1alpha1.CommitmentStatusGuaranteed,
+			allowRejection: true,
+			expectedReason: "Rejected",
+			expectRequeue:  false,
+		},
+		{
+			name:           "confirmed AllowRejection=true: rejects on failure, no retry",
+			state:          v1alpha1.CommitmentStatusConfirmed,
+			allowRejection: true,
+			expectedReason: "Rejected",
+			expectRequeue:  false,
+		},
+		{
+			name:           "guaranteed AllowRejection=false: retries on failure",
+			state:          v1alpha1.CommitmentStatusGuaranteed,
+			allowRejection: false,
+			expectedReason: "Reserving",
+			expectRequeue:  true,
+		},
+		{
+			name:           "confirmed AllowRejection=false: retries on failure",
+			state:          v1alpha1.CommitmentStatusConfirmed,
+			allowRejection: false,
+			expectedReason: "Reserving",
+			expectRequeue:  true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			scheme := newCRTestScheme(t)
+			cr := newTestCommittedResource("test-cr", tt.state)
+			cr.Spec.AllowRejection = tt.allowRejection
+			k8sClient := newCRTestClient(scheme, cr) // no Knowledge → placement fails
+			controller := &CommittedResourceController{
+				Client: k8sClient,
+				Scheme: scheme,
+				Conf:   Config{RequeueIntervalRetry: 1 * time.Minute},
+			}
+
+			result, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name))
+			if err != nil {
+				t.Fatalf("reconcile: %v", err)
+			}
+
+			// Ready is False in both outcomes; only the reason and the
+			// requeue behavior differ.
+			assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, tt.expectedReason)
+			if tt.expectRequeue && result.RequeueAfter == 0 {
+				t.Errorf("expected requeue after failure, got none")
+			}
+			if !tt.expectRequeue && result.RequeueAfter != 0 {
+				t.Errorf("expected no requeue after rejection, got RequeueAfter=%v", result.RequeueAfter)
+			}
+			if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 {
+				t.Errorf("expected 0 child reservations after failure, got %d", got)
+			}
+		})
+	}
+}
+
+// TestCommittedResourceController_BadSpec verifies that a CR whose
+// CommitmentUUID fails validation is rejected (Ready=False/Rejected) and
+// produces no child Reservations, even though Knowledge is available.
+func TestCommittedResourceController_BadSpec(t *testing.T) {
+	// Invalid UUID fails commitmentUUIDPattern — permanently broken regardless of AllowRejection.
+	scheme := newCRTestScheme(t)
+	cr := &v1alpha1.CommittedResource{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:       "test-cr",
+			Finalizers: []string{crFinalizer},
+		},
+		Spec: v1alpha1.CommittedResourceSpec{
+			CommitmentUUID:   "x", // too short, fails commitmentUUIDPattern
+			FlavorGroupName:  "test-group",
+			ResourceType:     v1alpha1.CommittedResourceTypeMemory,
+			Amount:           resource.MustParse("4Gi"),
+			AvailabilityZone: "test-az",
+			ProjectID:        "test-project",
+			DomainID:         "test-domain",
+			State:            v1alpha1.CommitmentStatusConfirmed,
+		},
+	}
+	k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge())
+	controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+
+	if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+		t.Fatalf("reconcile: %v", err)
+	}
+
+	assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, "Rejected")
+	if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 {
+		t.Errorf("expected 0 child reservations after bad-spec rejection, got %d", got)
+	}
+}
+
+// TestCommittedResourceController_Idempotent verifies that reconciling the
+// same confirmed CR multiple times creates exactly one child Reservation and
+// keeps Ready=True/Accepted — i.e. reconciliation is idempotent.
+func TestCommittedResourceController_Idempotent(t *testing.T) {
+	scheme := newCRTestScheme(t)
+	cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
+	k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge())
+	controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+
+	// Reconcile three times; the outcome must be the same as once.
+	for i := range 3 {
+		if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+			t.Fatalf("reconcile %d: %v", i+1, err)
+		}
+	}
+
+	if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 1 {
+		t.Errorf("expected 1 child reservation after 3 reconciles (idempotency), got %d", got)
+	}
+	assertCondition(t, k8sClient, cr.Name, metav1.ConditionTrue, "Accepted")
+}
+
+// TestCommittedResourceController_Deletion verifies that reconciling a deleted
+// CR removes its child Reservations and releases the finalizer so the object
+// is actually removed from the cluster.
+func TestCommittedResourceController_Deletion(t *testing.T) {
+	scheme := newCRTestScheme(t)
+	cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
+	// Pre-existing child Reservation that must be cleaned up on deletion.
+	child := &v1alpha1.Reservation{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "test-cr-0",
+			Labels: map[string]string{
+				v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+			},
+		},
+		Spec: v1alpha1.ReservationSpec{
+			Type: v1alpha1.ReservationTypeCommittedResource,
+			CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+				CommitmentUUID: "test-uuid-1234",
+			},
+		},
+	}
+	k8sClient := newCRTestClient(scheme, cr, child)
+	controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+
+	// Delete sets the DeletionTimestamp; the finalizer keeps the object
+	// around until the reconcile below processes the deletion.
+	if err := k8sClient.Delete(context.Background(), cr); err != nil {
+		t.Fatalf("delete CR: %v", err)
+	}
+	if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+		t.Fatalf("reconcile: %v", err)
+	}
+
+	if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 {
+		t.Errorf("expected 0 child reservations after deletion, got %d", got)
+	}
+	// With the finalizer removed, the fake client must report NotFound.
+	var deleted v1alpha1.CommittedResource
+	if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &deleted); err == nil {
+		t.Errorf("expected CR to be gone after deletion, but it still exists with finalizers=%v", deleted.Finalizers)
+	}
+}
diff --git a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go
new file mode 100644
index 000000000..01a0b4199
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go
@@ -0,0 +1,331 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+// Integration tests for the CR lifecycle spanning CommittedResourceController and
+// CommitmentReservationController. These tests drive both controllers against a shared
+// fake client and verify the end-to-end state transitions without mocking internal logic.
+//
+// Scope:
+// - State transition: planned → confirmed produces child Reservations
+// - State transition: confirmed → expired cleans up child Reservations
+// - Reservation controller places a child Reservation created by the CR controller
+// - CR deletion removes all child Reservations
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "k8s.io/apimachinery/pkg/api/meta"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/types"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+ schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova"
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+)
+
+// crIntegrationEnv holds shared state for integration tests: a fake client
+// used by both controllers plus a fake external scheduler HTTP server.
+type crIntegrationEnv struct {
+	k8sClient       client.Client                    // fake client shared by both controllers
+	crController    *CommittedResourceController     // reconciles CommittedResources
+	resController   *CommitmentReservationController // reconciles child Reservations
+	schedulerServer *httptest.Server                 // fake scheduler; always answers "host-1"
+}
+
+// newCRIntegrationEnv builds the shared integration environment: a fake client
+// pre-seeded with flavor Knowledge and one Hypervisor, an HTTP test server
+// standing in for the external scheduler, and both controllers wired to them.
+// Callers must invoke close() to shut down the scheduler server.
+func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv {
+	t.Helper()
+	scheme := newCRTestScheme(t)
+
+	// One hypervisor so the scheduler's "host-1" answer resolves to a real object.
+	hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}}
+	k8sClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(newTestFlavorKnowledge(), hypervisor).
+		WithStatusSubresource(
+			&v1alpha1.CommittedResource{},
+			&v1alpha1.Reservation{},
+			&v1alpha1.Knowledge{},
+		).
+		Build()
+
+	// Fake external scheduler: always returns host-1 as the placement result.
+	schedulerServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}}
+		if err := json.NewEncoder(w).Encode(resp); err != nil {
+			t.Errorf("scheduler encode: %v", err)
+		}
+	}))
+
+	crCtrl := &CommittedResourceController{
+		Client: k8sClient,
+		Scheme: scheme,
+		Conf:   Config{RequeueIntervalRetry: 5 * time.Minute},
+	}
+
+	resCtrl := &CommitmentReservationController{
+		Client: k8sClient,
+		Scheme: scheme,
+		Conf: Config{
+			SchedulerURL:          schedulerServer.URL,
+			AllocationGracePeriod: 15 * time.Minute,
+			RequeueIntervalActive: 5 * time.Minute,
+		},
+	}
+	// The reservation controller needs explicit initialization before use.
+	if err := resCtrl.Init(context.Background(), k8sClient, resCtrl.Conf); err != nil {
+		t.Fatalf("resCtrl.Init: %v", err)
+	}
+
+	return &crIntegrationEnv{
+		k8sClient:       k8sClient,
+		crController:    crCtrl,
+		resController:   resCtrl,
+		schedulerServer: schedulerServer,
+	}
+}
+
+func (e *crIntegrationEnv) close() { e.schedulerServer.Close() }
+
+// reconcileCR runs one reconcile of the CommittedResource controller for the
+// named CR and fails the test immediately on error.
+func (e *crIntegrationEnv) reconcileCR(t *testing.T, crName string) {
+	t.Helper()
+	request := ctrl.Request{NamespacedName: types.NamespacedName{Name: crName}}
+	_, err := e.crController.Reconcile(context.Background(), request)
+	if err != nil {
+		t.Fatalf("CR reconcile: %v", err)
+	}
+}
+
+// reconcileReservation runs one reconcile of the reservation controller for
+// the named Reservation and fails the test immediately on error.
+func (e *crIntegrationEnv) reconcileReservation(t *testing.T, resName string) {
+	t.Helper()
+	request := ctrl.Request{NamespacedName: types.NamespacedName{Name: resName}}
+	_, err := e.resController.Reconcile(context.Background(), request)
+	if err != nil {
+		t.Fatalf("reservation reconcile %s: %v", resName, err)
+	}
+}
+
+// listChildReservations returns all committed-resource Reservations whose name
+// starts with "<crName>-", i.e. the children created for the given CR.
+func (e *crIntegrationEnv) listChildReservations(t *testing.T, crName string) []v1alpha1.Reservation {
+	t.Helper()
+	var reservations v1alpha1.ReservationList
+	err := e.k8sClient.List(context.Background(), &reservations, client.MatchingLabels{
+		v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+	})
+	if err != nil {
+		t.Fatalf("list reservations: %v", err)
+	}
+	namePrefix := crName + "-"
+	var matched []v1alpha1.Reservation
+	for _, item := range reservations.Items {
+		if strings.HasPrefix(item.Name, namePrefix) {
+			matched = append(matched, item)
+		}
+	}
+	return matched
+}
+
+// getCR fetches the named CommittedResource, failing the test if it is absent.
+func (e *crIntegrationEnv) getCR(t *testing.T, name string) v1alpha1.CommittedResource {
+	t.Helper()
+	var out v1alpha1.CommittedResource
+	key := types.NamespacedName{Name: name}
+	if err := e.k8sClient.Get(context.Background(), key, &out); err != nil {
+		t.Fatalf("get CR %s: %v", name, err)
+	}
+	return out
+}
+
+// ============================================================================
+// Integration tests
+// ============================================================================
+
+// TestCRLifecycle_PlannedToConfirmed verifies that transitioning a CR from planned
+// to confirmed causes the CR controller to create child Reservation CRDs.
+func TestCRLifecycle_PlannedToConfirmed(t *testing.T) {
+	env := newCRIntegrationEnv(t)
+	defer env.close()
+
+	cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned)
+	if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+		t.Fatalf("create CR: %v", err)
+	}
+
+	// Reconcile as planned: finalizer added, no Reservations.
+	env.reconcileCR(t, cr.Name)
+	env.reconcileCR(t, cr.Name)
+	if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+		t.Fatalf("planned: expected 0 reservations, got %d", len(got))
+	}
+	crState := env.getCR(t, cr.Name)
+	cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	if cond == nil || cond.Reason != "Planned" {
+		t.Errorf("planned: expected Reason=Planned, got %v", cond)
+	}
+
+	// Transition to confirmed via a merge patch against the pre-change state.
+	patch := client.MergeFrom(crState.DeepCopy())
+	crState.Spec.State = v1alpha1.CommitmentStatusConfirmed
+	if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+		t.Fatalf("patch state to confirmed: %v", err)
+	}
+
+	env.reconcileCR(t, cr.Name)
+
+	// The confirmed CR must now own exactly one child Reservation and be Ready.
+	children := env.listChildReservations(t, cr.Name)
+	if len(children) != 1 {
+		t.Fatalf("confirmed: expected 1 reservation, got %d", len(children))
+	}
+	crState = env.getCR(t, cr.Name)
+	if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+		t.Errorf("confirmed: expected Ready=True")
+	}
+}
+
+// TestCRLifecycle_ConfirmedToExpired verifies that transitioning a CR to expired
+// deletes all child Reservation CRDs and marks Ready=False.
+func TestCRLifecycle_ConfirmedToExpired(t *testing.T) {
+	env := newCRIntegrationEnv(t)
+	defer env.close()
+
+	cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+	if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+		t.Fatalf("create CR: %v", err)
+	}
+
+	// Bring to confirmed+Ready=True.
+	env.reconcileCR(t, cr.Name) // adds finalizer
+	env.reconcileCR(t, cr.Name) // creates Reservations
+
+	if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+		t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got))
+	}
+
+	// Transition to expired via a merge patch against the pre-change state.
+	crState := env.getCR(t, cr.Name)
+	patch := client.MergeFrom(crState.DeepCopy())
+	crState.Spec.State = v1alpha1.CommitmentStatusExpired
+	if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+		t.Fatalf("patch state to expired: %v", err)
+	}
+
+	env.reconcileCR(t, cr.Name)
+
+	// Expiry must remove the child Reservation and flip Ready to False with
+	// the state string as the reason.
+	if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+		t.Errorf("expired: expected 0 reservations, got %d", len(got))
+	}
+	crState = env.getCR(t, cr.Name)
+	cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	if cond == nil || cond.Status != metav1.ConditionFalse {
+		t.Errorf("expired: expected Ready=False, got %v", cond)
+	}
+	if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) {
+		t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason)
+	}
+}
+
+// TestCRLifecycle_ReservationControllerPlacesChild verifies that after the CR controller
+// creates a child Reservation, the ReservationController can place it (scheduler call →
+// TargetHost set → Ready=True on the Reservation).
+func TestCRLifecycle_ReservationControllerPlacesChild(t *testing.T) {
+	env := newCRIntegrationEnv(t)
+	defer env.close()
+
+	cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+	if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+		t.Fatalf("create CR: %v", err)
+	}
+
+	// CR controller creates child Reservation (first reconcile adds the
+	// finalizer, second creates the Reservation).
+	env.reconcileCR(t, cr.Name)
+	env.reconcileCR(t, cr.Name)
+
+	children := env.listChildReservations(t, cr.Name)
+	if len(children) != 1 {
+		t.Fatalf("expected 1 child reservation, got %d", len(children))
+	}
+	child := children[0]
+
+	// Reservation controller places it (first reconcile: calls scheduler → sets TargetHost).
+	env.reconcileReservation(t, child.Name)
+
+	var afterFirst v1alpha1.Reservation
+	if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil {
+		t.Fatalf("get reservation after first reconcile: %v", err)
+	}
+	if afterFirst.Spec.TargetHost == "" {
+		t.Fatalf("expected TargetHost set after first reservation reconcile")
+	}
+
+	// Second reconcile: syncs TargetHost to Status, sets Ready=True.
+	env.reconcileReservation(t, child.Name)
+
+	var afterSecond v1alpha1.Reservation
+	if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil {
+		t.Fatalf("get reservation after second reconcile: %v", err)
+	}
+	if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) {
+		t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions)
+	}
+	// host-1 is the only host the fake scheduler ever returns.
+	if afterSecond.Status.Host != "host-1" {
+		t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host)
+	}
+}
+
+// TestCRLifecycle_Deletion verifies that deleting a CR cleans up all child Reservations
+// and that the finalizer is released so the object can actually be removed.
+func TestCRLifecycle_Deletion(t *testing.T) {
+	env := newCRIntegrationEnv(t)
+	defer env.close()
+
+	cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+	if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+		t.Fatalf("create CR: %v", err)
+	}
+
+	// newTestCommittedResource pre-populates the finalizer, so Delete() will set
+	// DeletionTimestamp without needing a prior reconcile.
+
+	// Pre-create a child Reservation to verify it gets cleaned up on deletion.
+	child := &v1alpha1.Reservation{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "my-cr-0",
+			Labels: map[string]string{
+				v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+			},
+		},
+		Spec: v1alpha1.ReservationSpec{
+			Type: v1alpha1.ReservationTypeCommittedResource,
+			CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+				CommitmentUUID: "test-uuid-1234",
+			},
+		},
+	}
+	if err := env.k8sClient.Create(context.Background(), child); err != nil {
+		t.Fatalf("create child reservation: %v", err)
+	}
+
+	// Delete sets DeletionTimestamp (object has finalizer, so it is not removed yet).
+	crState := env.getCR(t, cr.Name)
+	if err := env.k8sClient.Delete(context.Background(), &crState); err != nil {
+		t.Fatalf("delete CR: %v", err)
+	}
+
+	env.reconcileCR(t, cr.Name)
+
+	if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+		t.Errorf("post-deletion: expected 0 reservations, got %d", len(got))
+	}
+	// Finalizer removed — object either gone or has no finalizer.
+	var final v1alpha1.CommittedResource
+	err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final)
+	if client.IgnoreNotFound(err) != nil {
+		t.Fatalf("unexpected error after deletion: %v", err)
+	}
+	if err == nil {
+		for _, f := range final.Finalizers {
+			if f == crFinalizer {
+				t.Errorf("finalizer not removed after deletion reconcile")
+			}
+		}
+	}
+}
diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go
new file mode 100644
index 000000000..9e3fde378
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/field_index.go
@@ -0,0 +1,43 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"context"
+	"errors"
+	"fmt"
+
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+	"github.com/cobaltcore-dev/cortex/pkg/multicluster"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+const idxCommittedResourceByUUID = "spec.commitmentUUID"
+
+// IndexFields registers field indexes required by the CommittedResource controller.
+// It indexes CommittedResource objects by spec.commitmentUUID so they can be
+// looked up by commitment UUID without listing every object. Returns an error
+// (wrapped with context) if the index cannot be registered.
+func IndexFields(ctx context.Context, mcl *multicluster.Client) error {
+	log := logf.FromContext(ctx)
+	log.Info("Setting up field indexes for the CommittedResource controller")
+	if err := mcl.IndexField(ctx,
+		&v1alpha1.CommittedResource{},
+		&v1alpha1.CommittedResourceList{},
+		idxCommittedResourceByUUID,
+		func(obj client.Object) []string {
+			cr, ok := obj.(*v1alpha1.CommittedResource)
+			if !ok {
+				// Defensive: the indexer should only ever pass CommittedResources.
+				log.Error(errors.New("unexpected type"), "expected CommittedResource", "object", obj)
+				return nil
+			}
+			// Objects without a commitment UUID are left out of the index.
+			if cr.Spec.CommitmentUUID == "" {
+				return nil
+			}
+			return []string{cr.Spec.CommitmentUUID}
+		},
+	); err != nil {
+		// Wrap instead of log-and-return: the caller owns handling (and
+		// logging) the error exactly once.
+		return fmt.Errorf("setting up field index %q for CommittedResource: %w", idxCommittedResourceByUUID, err)
+	}
+	log.Info("Successfully set up index for commitmentUUID")
+	return nil
+}
diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go
similarity index 100%
rename from internal/scheduling/reservations/commitments/controller.go
rename to internal/scheduling/reservations/commitments/reservation_controller.go
diff --git a/internal/scheduling/reservations/commitments/controller_test.go b/internal/scheduling/reservations/commitments/reservation_controller_test.go
similarity index 75%
rename from internal/scheduling/reservations/commitments/controller_test.go
rename to internal/scheduling/reservations/commitments/reservation_controller_test.go
index afb8ebcfc..7c0d63ee7 100644
--- a/internal/scheduling/reservations/commitments/controller_test.go
+++ b/internal/scheduling/reservations/commitments/reservation_controller_test.go
@@ -15,24 +15,16 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
- "sigs.k8s.io/controller-runtime/pkg/client/fake"
schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
)
func TestCommitmentReservationController_Reconcile(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add scheme: %v", err)
- }
- if err := hv1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add hypervisor scheme: %v", err)
- }
+ scheme := newCRTestScheme(t)
tests := []struct {
name string
@@ -83,14 +75,10 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(tt.reservation).
- WithStatusSubresource(&v1alpha1.Reservation{}).
- Build()
+ k8sClient := newCRTestClient(scheme, tt.reservation)
reconciler := &CommitmentReservationController{
- Client: client,
+ Client: k8sClient,
Scheme: scheme,
Conf: Config{
RequeueIntervalActive: 5 * time.Minute,
@@ -118,9 +106,8 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) {
t.Errorf("Expected no requeue but got %v", result.RequeueAfter)
}
- // Verify the reservation status
var updated v1alpha1.Reservation
- err = client.Get(context.Background(), req.NamespacedName, &updated)
+ err = k8sClient.Get(context.Background(), req.NamespacedName, &updated)
if err != nil {
t.Errorf("Failed to get updated reservation: %v", err)
return
@@ -146,23 +133,18 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) {
// ============================================================================
func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add scheme: %v", err)
- }
- if err := hv1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add hypervisor scheme: %v", err)
- }
+ scheme := newCRTestScheme(t)
now := time.Now()
recentTime := metav1.NewTime(now.Add(-5 * time.Minute)) // 5 minutes ago (within grace period)
oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) // 30 minutes ago (past grace period)
+ config := Config{AllocationGracePeriod: 15 * time.Minute}
+
tests := []struct {
name string
reservation *v1alpha1.Reservation
hypervisor *hv1.Hypervisor
- config Config
expectedStatusAllocations map[string]string
expectedSpecAllocations []string // VM UUIDs expected to remain in spec; nil means no check
expectedHasGracePeriodAllocs bool
@@ -175,7 +157,6 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{
{ID: "vm-1", Name: "vm-1", Active: true},
}),
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{"vm-1": "host-1"},
expectedSpecAllocations: []string{"vm-1"},
expectedHasGracePeriodAllocs: false,
@@ -186,9 +167,8 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
"vm-stopped": oldTime,
}),
hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{
- {ID: "vm-stopped", Name: "vm-stopped", Active: false}, // Inactive VM should still be found
+ {ID: "vm-stopped", Name: "vm-stopped", Active: false},
}),
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{"vm-stopped": "host-1"},
expectedSpecAllocations: []string{"vm-stopped"},
expectedHasGracePeriodAllocs: false,
@@ -198,10 +178,9 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
reservation: newTestCRReservation(map[string]metav1.Time{
"vm-1": oldTime,
}),
- hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{}), // Empty
- config: Config{AllocationGracePeriod: 15 * time.Minute},
+ hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{}),
expectedStatusAllocations: map[string]string{},
- expectedSpecAllocations: []string{}, // Removed from spec
+ expectedSpecAllocations: []string{},
expectedHasGracePeriodAllocs: false,
},
{
@@ -209,31 +188,26 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
reservation: newTestCRReservation(map[string]metav1.Time{
"vm-1": recentTime,
}),
- hypervisor: nil,
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{},
- expectedSpecAllocations: []string{"vm-1"}, // Kept in spec during grace period
+ expectedSpecAllocations: []string{"vm-1"},
expectedHasGracePeriodAllocs: true,
},
{
name: "mixed allocations - old verified via CRD, new in grace period",
reservation: newTestCRReservation(map[string]metav1.Time{
- "vm-new": recentTime, // In grace period
- "vm-old": oldTime, // Past grace period
+ "vm-new": recentTime,
+ "vm-old": oldTime,
}),
hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{
{ID: "vm-old", Name: "vm-old", Active: true},
}),
- config: Config{AllocationGracePeriod: 15 * time.Minute},
- expectedStatusAllocations: map[string]string{"vm-old": "host-1"}, // Only old one confirmed via CRD
+ expectedStatusAllocations: map[string]string{"vm-old": "host-1"},
expectedSpecAllocations: []string{"vm-new", "vm-old"},
expectedHasGracePeriodAllocs: true,
},
{
name: "empty allocations - no work to do",
reservation: newTestCRReservation(map[string]metav1.Time{}),
- hypervisor: nil,
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{},
expectedHasGracePeriodAllocs: false,
},
@@ -242,10 +216,8 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
reservation: newTestCRReservation(map[string]metav1.Time{
"vm-1": oldTime,
}),
- hypervisor: nil, // HV CRD does not exist (e.g. host deleted)
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{},
- expectedSpecAllocations: []string{}, // Removed from spec
+ expectedSpecAllocations: []string{},
expectedHasGracePeriodAllocs: false,
},
{
@@ -253,32 +225,25 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
reservation: newTestCRReservation(map[string]metav1.Time{
"vm-1": recentTime,
}),
- hypervisor: nil, // HV CRD does not exist
- config: Config{AllocationGracePeriod: 15 * time.Minute},
expectedStatusAllocations: map[string]string{},
- expectedSpecAllocations: []string{"vm-1"}, // Kept during grace period
+ expectedSpecAllocations: []string{"vm-1"},
expectedHasGracePeriodAllocs: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- // Build fake client with objects
objects := []client.Object{tt.reservation}
if tt.hypervisor != nil {
objects = append(objects, tt.hypervisor)
}
- k8sClient := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(objects...).
- WithStatusSubresource(&v1alpha1.Reservation{}).
- Build()
+ k8sClient := newCRTestClient(scheme, objects...)
controller := &CommitmentReservationController{
Client: k8sClient,
Scheme: scheme,
- Conf: tt.config,
+ Conf: config,
}
ctx := WithNewGlobalRequestID(context.Background())
@@ -406,10 +371,7 @@ func newTestHypervisorCRD(name string, instances []hv1.Instance) *hv1.Hypervisor
// This covers the mapper logic; the watch wiring itself (informer → mapper → enqueue)
// is controller-runtime's responsibility and is not unit-testable without envtest.
func TestHypervisorToReservations(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatalf("failed to add scheme: %v", err)
- }
+ scheme := newCRTestScheme(t)
res1 := &v1alpha1.Reservation{
ObjectMeta: metav1.ObjectMeta{Name: "res-host-1"},
@@ -436,11 +398,7 @@ func TestHypervisorToReservations(t *testing.T) {
Status: v1alpha1.ReservationStatus{Host: "host-1"},
}
- k8sClient := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(res1, res2, resOtherHost, resNoHost, resFailover).
- WithStatusSubresource(&v1alpha1.Reservation{}).
- Build()
+ k8sClient := newCRTestClient(scheme, res1, res2, resOtherHost, resNoHost, resFailover)
controller := &CommitmentReservationController{Client: k8sClient}
@@ -467,13 +425,7 @@ func TestHypervisorToReservations(t *testing.T) {
// ============================================================================
func TestCommitmentReservationController_reconcileInstanceReservation_Success(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add scheme: %v", err)
- }
- if err := hv1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add hypervisor scheme: %v", err)
- }
+ scheme := newCRTestScheme(t)
reservation := &v1alpha1.Reservation{
ObjectMeta: ctrl.ObjectMeta{
@@ -486,91 +438,16 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
ResourceName: "test-flavor",
},
Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: resource.MustParse("1Gi"),
+ hv1.ResourceMemory: resource.MustParse("4Gi"),
hv1.ResourceCPU: resource.MustParse("2"),
},
},
}
- // Create flavor group knowledge CRD for the test
- flavorGroups := []struct {
- Name string `json:"name"`
- Flavors []struct {
- Name string `json:"name"`
- MemoryMB uint64 `json:"memoryMB"`
- VCPUs uint64 `json:"vcpus"`
- ExtraSpecs map[string]string `json:"extraSpecs"`
- } `json:"flavors"`
- }{
- {
- Name: "test-group",
- Flavors: []struct {
- Name string `json:"name"`
- MemoryMB uint64 `json:"memoryMB"`
- VCPUs uint64 `json:"vcpus"`
- ExtraSpecs map[string]string `json:"extraSpecs"`
- }{
- {
- Name: "test-flavor",
- MemoryMB: 1024,
- VCPUs: 2,
- ExtraSpecs: map[string]string{},
- },
- },
- },
- }
-
- // Marshal flavor groups into runtime.RawExtension
- flavorGroupsJSON, err := json.Marshal(map[string]interface{}{
- "features": flavorGroups,
- })
- if err != nil {
- t.Fatalf("Failed to marshal flavor groups: %v", err)
- }
-
- flavorGroupKnowledge := &v1alpha1.Knowledge{
- ObjectMeta: metav1.ObjectMeta{
- Name: "flavor-groups",
- },
- Spec: v1alpha1.KnowledgeSpec{
- SchedulingDomain: v1alpha1.SchedulingDomainNova,
- Extractor: v1alpha1.KnowledgeExtractorSpec{
- Name: "flavor_groups",
- },
- Recency: metav1.Duration{Duration: 0},
- },
- Status: v1alpha1.KnowledgeStatus{
- Raw: runtime.RawExtension{Raw: flavorGroupsJSON},
- RawLength: 1,
- Conditions: []metav1.Condition{
- {
- Type: v1alpha1.KnowledgeConditionReady,
- Status: metav1.ConditionTrue,
- Reason: "TestReady",
- },
- },
- },
- }
-
- // Create mock hypervisors
- hypervisor1 := &hv1.Hypervisor{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-host-1",
- },
- Spec: hv1.HypervisorSpec{},
- }
- hypervisor2 := &hv1.Hypervisor{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-host-2",
- },
- Spec: hv1.HypervisorSpec{},
- }
+ hypervisor1 := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "test-host-1"}}
+ hypervisor2 := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "test-host-2"}}
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(reservation, flavorGroupKnowledge, hypervisor1, hypervisor2).
- WithStatusSubresource(&v1alpha1.Reservation{}, &v1alpha1.Knowledge{}).
- Build()
+ k8sClient := newCRTestClient(scheme, reservation, newTestFlavorKnowledge(), hypervisor1, hypervisor2)
// Create a mock server that returns a successful response
mockResponse := &schedulerdelegationapi.ExternalSchedulerResponse{
@@ -602,13 +479,13 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
}
reconciler := &CommitmentReservationController{
- Client: client,
+ Client: k8sClient,
Scheme: scheme,
Conf: config,
}
// Initialize the reconciler (this sets up SchedulerClient)
- if err := reconciler.Init(context.Background(), client, config); err != nil {
+ if err := reconciler.Init(context.Background(), k8sClient, config); err != nil {
t.Fatalf("Failed to initialize reconciler: %v", err)
}
@@ -630,7 +507,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
// Verify Spec.TargetHost is set after first reconcile
var afterFirstReconcile v1alpha1.Reservation
- if err = client.Get(context.Background(), req.NamespacedName, &afterFirstReconcile); err != nil {
+ if err = k8sClient.Get(context.Background(), req.NamespacedName, &afterFirstReconcile); err != nil {
t.Errorf("Failed to get reservation after first reconcile: %v", err)
return
}
@@ -650,7 +527,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
// Verify the reservation status after second reconcile
var updated v1alpha1.Reservation
- if err = client.Get(context.Background(), req.NamespacedName, &updated); err != nil {
+ if err = k8sClient.Get(context.Background(), req.NamespacedName, &updated); err != nil {
t.Errorf("Failed to get updated reservation: %v", err)
return
}
diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go
index 0cdbc9f12..d7a75cc7a 100644
--- a/internal/scheduling/reservations/commitments/reservation_manager.go
+++ b/internal/scheduling/reservations/commitments/reservation_manager.go
@@ -77,11 +77,11 @@ func (m *ReservationManager) ApplyCommitmentState(
return nil, fmt.Errorf("failed to list reservations: %w", err)
}
- // Filter by name prefix to find reservations for this commitment
- namePrefix := fmt.Sprintf("commitment-%s-", desiredState.CommitmentUUID)
+ // Filter by CommitmentUUID to find reservations for this commitment
var existing []v1alpha1.Reservation
for _, res := range allReservations.Items {
- if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix {
+ if res.Spec.CommittedResourceReservation != nil &&
+ res.Spec.CommittedResourceReservation.CommitmentUUID == desiredState.CommitmentUUID {
existing = append(existing, res)
}
}
@@ -266,7 +266,11 @@ func (m *ReservationManager) newReservation(
creator string,
) *v1alpha1.Reservation {
- name := fmt.Sprintf("commitment-%s-%d", state.CommitmentUUID, slotIndex)
+ namePrefix := state.NamePrefix
+ if namePrefix == "" {
+ namePrefix = fmt.Sprintf("commitment-%s-", state.CommitmentUUID)
+ }
+ name := fmt.Sprintf("%s%d", namePrefix, slotIndex)
// Select first flavor that fits remaining memory (flavors sorted descending by size)
flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest
diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go
index 7733cb6c2..b512fc9b5 100644
--- a/internal/scheduling/reservations/commitments/reservation_manager_test.go
+++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go
@@ -13,691 +13,276 @@ import (
"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
- "sigs.k8s.io/controller-runtime/pkg/client/fake"
)
-func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
+// newTestCRSlot creates a Reservation slot for commitment "abc123" / project "project-1".
+// Pass nil allocs for an empty allocation map.
+func newTestCRSlot(name string, memGiB int64, targetHost, resourceGroup string, allocs map[string]v1alpha1.CommittedResourceAllocation) v1alpha1.Reservation {
+ if allocs == nil {
+ allocs = map[string]v1alpha1.CommittedResourceAllocation{}
}
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroup := testFlavorGroup()
- flavorGroups := map[string]compute.FlavorGroupFeature{
- "test-group": flavorGroup,
- }
-
- // Desired state: 3 multiples of smallest flavor (24 GiB)
- desiredState := &CommitmentState{
- CommitmentUUID: "abc123",
- ProjectID: "project-1",
- FlavorGroupName: "test-group",
- TotalMemoryBytes: 3 * 8192 * 1024 * 1024,
- }
-
- applyResult, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
- )
-
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- if len(applyResult.RemovedReservations) != 0 {
- t.Errorf("expected 0 applyResult.RemovedReservations reservations, got %d", len(applyResult.RemovedReservations))
- }
-
- // Should create reservations to fulfill the commitment
- if len(applyResult.TouchedReservations) == 0 {
- t.Fatal("expected at least one reservation to be created")
- }
-
- // Verify created reservations sum to desired state
- totalMemory := int64(0)
- for _, res := range applyResult.TouchedReservations {
- memQuantity := res.Spec.Resources[hv1.ResourceMemory]
- totalMemory += memQuantity.Value()
+ return v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Labels: map[string]string{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ },
+ },
+ Spec: v1alpha1.ReservationSpec{
+ TargetHost: targetHost,
+ Resources: map[hv1.ResourceName]resource.Quantity{
+ hv1.ResourceMemory: *resource.NewQuantity(memGiB*1024*1024*1024, resource.BinarySI),
+ },
+ CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+ CommitmentUUID: "abc123",
+ ProjectID: "project-1",
+ ResourceGroup: resourceGroup,
+ Creator: "syncer",
+ Allocations: allocs,
+ },
+ },
}
+}
- if totalMemory != desiredState.TotalMemoryBytes {
- t.Errorf("expected total memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory)
- }
+// testFlavorGroups returns the default flavor groups map used across tests.
+func testFlavorGroups() map[string]compute.FlavorGroupFeature {
+ return map[string]compute.FlavorGroupFeature{"test-group": testFlavorGroup()}
}
-func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
- }
+// ============================================================================
+// Tests: ApplyCommitmentState
+// ============================================================================
- // Create existing reservations (32 GiB total)
- existingReservations := []v1alpha1.Reservation{
+func TestApplyCommitmentState(t *testing.T) {
+ tests := []struct {
+ name string
+ existingSlots []v1alpha1.Reservation
+ desiredMemoryGiB int64
+ flavorGroupOverride map[string]compute.FlavorGroupFeature // nil = testFlavorGroups()
+ wantError bool
+ wantRemovedCount int // exact count; -1 = at least one
+ validateRemoved func(t *testing.T, removed []v1alpha1.Reservation)
+ validateTouched func(t *testing.T, touched []v1alpha1.Reservation)
+ validateRemaining func(t *testing.T, remaining []v1alpha1.Reservation)
+ }{
{
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
+ name: "creates reservations to match desired memory",
+ desiredMemoryGiB: 24, // 3 × 8 GiB slots
+ validateTouched: func(t *testing.T, touched []v1alpha1.Reservation) {
+ if len(touched) == 0 {
+ t.Fatal("expected at least one reservation created")
+ }
+ var total int64
+ for _, r := range touched {
+ q := r.Spec.Resources[hv1.ResourceMemory]
+ total += q.Value()
+ }
+ if want := int64(24 * 1024 * 1024 * 1024); total != want {
+ t.Errorf("expected total memory %d, got %d", want, total)
+ }
},
- Spec: v1alpha1.ReservationSpec{
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
+ },
+ {
+ // Algorithm removes both 16 GiB slots and creates a new 8 GiB one.
+ name: "removes excess reservations, remaining memory matches desired",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 16, "", "test-group", nil),
+ newTestCRSlot("commitment-abc123-1", 16, "", "test-group", nil),
+ },
+ desiredMemoryGiB: 8,
+ wantRemovedCount: -1,
+ validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) {
+ var total int64
+ for _, r := range remaining {
+ q := r.Spec.Resources[hv1.ResourceMemory]
+ total += q.Value()
+ }
+ if want := int64(8 * 1024 * 1024 * 1024); total != want {
+ t.Errorf("expected remaining memory %d, got %d", want, total)
+ }
},
},
{
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-1",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
+ name: "zero desired memory removes all reservations",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 8, "", "test-group", nil),
},
- Spec: v1alpha1.ReservationSpec{
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
+ desiredMemoryGiB: 0,
+ wantRemovedCount: 1,
+ validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) {
+ if len(remaining) != 0 {
+ t.Errorf("expected 0 remaining, got %d", len(remaining))
+ }
},
},
- }
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(&existingReservations[0], &existingReservations[1]).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroup := testFlavorGroup()
- flavorGroups := map[string]compute.FlavorGroupFeature{
- "test-group": flavorGroup,
- }
-
- // Desired state: only 8 GiB (need to reduce)
- desiredState := &CommitmentState{
- CommitmentUUID: "abc123",
- ProjectID: "project-1",
- FlavorGroupName: "test-group",
- TotalMemoryBytes: 8 * 1024 * 1024 * 1024,
- }
-
- applyResult, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
- )
-
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- // Note: May create a new 8GiB reservation while removing the two 16GiB ones
- // This is expected behavior based on the slot sizing algorithm
-
- // Should remove excess reservations
- if len(applyResult.RemovedReservations) == 0 {
- t.Fatal("expected reservations to be removed")
- }
-
- // Verify remaining capacity matches desired state
- var remainingList v1alpha1.ReservationList
- if err := client.List(context.Background(), &remainingList); err != nil {
- t.Fatal(err)
- }
-
- totalMemory := int64(0)
- for _, res := range remainingList.Items {
- memQuantity := res.Spec.Resources[hv1.ResourceMemory]
- totalMemory += memQuantity.Value()
- }
-
- if totalMemory != desiredState.TotalMemoryBytes {
- t.Errorf("expected remaining memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory)
- }
-}
-
-func TestApplyCommitmentState_DeletionPriority(t *testing.T) {
- tests := []struct {
- name string
- existingReservations []v1alpha1.Reservation
- desiredMemoryBytes int64
- expectedRemovedCount int
- validateRemoved func(t *testing.T, removed []v1alpha1.Reservation)
- validateRemaining func(t *testing.T, remaining []v1alpha1.Reservation)
- }{
{
- name: "Priority 1: Unscheduled reservations (no TargetHost) deleted first",
- existingReservations: []v1alpha1.Reservation{
- // Reservation 0: Has TargetHost and allocations - lowest priority (should remain)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "host-1",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{
- "vm-123": {},
- },
- },
- },
- },
- // Reservation 1: No TargetHost and no allocations - highest priority (should be deleted)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-1",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- },
+ name: "replaces reservation with wrong flavor group",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 8, "", "wrong-group", nil),
+ },
+ desiredMemoryGiB: 8,
+ wantRemovedCount: 1,
+ validateTouched: func(t *testing.T, touched []v1alpha1.Reservation) {
+ if len(touched) != 1 {
+ t.Fatalf("expected 1 new reservation, got %d", len(touched))
+ }
+ if got := touched[0].Spec.CommittedResourceReservation.ResourceGroup; got != "test-group" {
+ t.Errorf("expected flavor group test-group, got %s", got)
+ }
},
- desiredMemoryBytes: 8 * 1024 * 1024 * 1024, // Need to delete one
- expectedRemovedCount: 1,
+ },
+ {
+ name: "unknown flavor group returns error",
+ desiredMemoryGiB: 8,
+ flavorGroupOverride: map[string]compute.FlavorGroupFeature{},
+ wantError: true,
+ },
+ {
+ name: "deletion priority: unscheduled (no TargetHost) deleted before scheduled",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", map[string]v1alpha1.CommittedResourceAllocation{"vm-123": {}}),
+ newTestCRSlot("commitment-abc123-1", 8, "", "test-group", nil),
+ },
+ desiredMemoryGiB: 8,
+ wantRemovedCount: 1,
validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) {
- // Should have removed the unscheduled one (no TargetHost)
if removed[0].Spec.TargetHost != "" {
- t.Errorf("expected unscheduled reservation to be removed, but removed %s with TargetHost %s",
- removed[0].Name, removed[0].Spec.TargetHost)
+ t.Errorf("expected unscheduled reservation removed, got TargetHost=%q", removed[0].Spec.TargetHost)
}
},
validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) {
if len(remaining) != 1 {
- t.Fatalf("expected 1 remaining reservation, got %d", len(remaining))
+ t.Fatalf("expected 1 remaining, got %d", len(remaining))
}
- // Should have kept the scheduled one with allocations
- if remaining[0].Spec.TargetHost == "" {
- t.Error("expected scheduled reservation to remain")
- }
- if len(remaining[0].Spec.CommittedResourceReservation.Allocations) == 0 {
- t.Error("expected reservation with allocations to remain")
+ if remaining[0].Spec.TargetHost == "" || len(remaining[0].Spec.CommittedResourceReservation.Allocations) == 0 {
+ t.Error("expected scheduled reservation with allocations to remain")
}
},
},
{
- name: "Priority 2: Unused scheduled reservations (no allocations) deleted next",
- existingReservations: []v1alpha1.Reservation{
- // Has TargetHost AND allocations - lowest priority for deletion
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "host-1",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{
- "vm-123": {},
- },
- },
- },
- },
- // Has TargetHost but NO allocations - medium priority
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-1",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "host-2",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- },
+ name: "deletion priority: unused scheduled (no allocations) deleted before allocated",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", map[string]v1alpha1.CommittedResourceAllocation{"vm-123": {}}),
+ newTestCRSlot("commitment-abc123-1", 8, "host-2", "test-group", nil),
},
- desiredMemoryBytes: 8 * 1024 * 1024 * 1024,
- expectedRemovedCount: 1,
+ desiredMemoryGiB: 8,
+ wantRemovedCount: 1,
validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) {
- // Should have removed the one without allocations
if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 {
t.Error("expected reservation without allocations to be removed")
}
},
validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) {
if len(remaining) != 1 {
- t.Fatalf("expected 1 remaining reservation, got %d", len(remaining))
+ t.Fatalf("expected 1 remaining, got %d", len(remaining))
}
- // Should have kept the one with allocations
if len(remaining[0].Spec.CommittedResourceReservation.Allocations) == 0 {
t.Error("expected reservation with allocations to remain")
}
},
},
{
- name: "Mixed scenario: comprehensive deletion priority test",
- existingReservations: []v1alpha1.Reservation{
- // Reservation 0: Has TargetHost + has allocations (lowest priority - should remain)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "host-1",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{
- "vm-allocated": {},
- },
- },
- },
- },
- // Reservation 1: Has TargetHost + no allocations (medium priority - should remain)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-1",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "host-2",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- },
- // Reservation 2: No TargetHost + no allocations (highest priority - should be deleted)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-2",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- },
- // Reservation 3: No TargetHost + no allocations (highest priority - should be deleted)
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-3",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- TargetHost: "",
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- },
+ name: "deletion priority: unscheduled removed first across mixed set",
+ existingSlots: []v1alpha1.Reservation{
+ newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", map[string]v1alpha1.CommittedResourceAllocation{"vm-allocated": {}}),
+ newTestCRSlot("commitment-abc123-1", 8, "host-2", "test-group", nil),
+ newTestCRSlot("commitment-abc123-2", 8, "", "test-group", nil),
+ newTestCRSlot("commitment-abc123-3", 8, "", "test-group", nil),
},
- desiredMemoryBytes: 16 * 1024 * 1024 * 1024, // Need to delete 2 out of 4
- expectedRemovedCount: 2,
+ desiredMemoryGiB: 16,
+ wantRemovedCount: 2,
validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) {
- // Both removed should have no TargetHost (highest priority for deletion)
- for _, res := range removed {
- if res.Spec.TargetHost != "" {
- t.Errorf("expected unscheduled reservations to be removed first, but removed %s with TargetHost %s",
- res.Name, res.Spec.TargetHost)
+ for _, r := range removed {
+ if r.Spec.TargetHost != "" {
+ t.Errorf("expected unscheduled reservations removed first, got TargetHost=%q on %s", r.Spec.TargetHost, r.Name)
}
}
},
validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) {
if len(remaining) != 2 {
- t.Fatalf("expected 2 remaining reservations, got %d", len(remaining))
- }
- // Both remaining should have TargetHost
- for _, res := range remaining {
- if res.Spec.TargetHost == "" {
- t.Errorf("expected scheduled reservations to remain, but %s has no TargetHost", res.Name)
- }
+ t.Fatalf("expected 2 remaining, got %d", len(remaining))
}
- // At least one should have allocations (the one with lowest deletion priority)
- hasAllocations := false
- for _, res := range remaining {
- if len(res.Spec.CommittedResourceReservation.Allocations) > 0 {
- hasAllocations = true
- break
+ for _, r := range remaining {
+ if r.Spec.TargetHost == "" {
+ t.Errorf("expected scheduled reservations to remain, got empty TargetHost on %s", r.Name)
}
}
- if !hasAllocations {
- t.Error("expected at least one remaining reservation to have allocations")
- }
},
},
}
+ scheme := newCRTestScheme(t)
+
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
+ objects := make([]client.Object, len(tt.existingSlots))
+ for i := range tt.existingSlots {
+ objects[i] = &tt.existingSlots[i]
}
+ k8sClient := newCRTestClient(scheme, objects...)
+ manager := NewReservationManager(k8sClient)
- // Convert slice to individual objects for WithObjects
- objects := make([]client.Object, len(tt.existingReservations))
- for i := range tt.existingReservations {
- objects[i] = &tt.existingReservations[i]
+ flavorGroups := testFlavorGroups()
+ if tt.flavorGroupOverride != nil {
+ flavorGroups = tt.flavorGroupOverride
}
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(objects...).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroup := testFlavorGroup()
- flavorGroups := map[string]compute.FlavorGroupFeature{
- "test-group": flavorGroup,
- }
-
desiredState := &CommitmentState{
CommitmentUUID: "abc123",
ProjectID: "project-1",
FlavorGroupName: "test-group",
- TotalMemoryBytes: tt.desiredMemoryBytes,
+ TotalMemoryBytes: tt.desiredMemoryGiB * 1024 * 1024 * 1024,
}
applyResult, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
+ context.Background(), logr.Discard(), desiredState, flavorGroups, "syncer",
)
+ if tt.wantError {
+ if err == nil {
+ t.Fatal("expected error, got nil")
+ }
+ return
+ }
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
- if len(applyResult.RemovedReservations) != tt.expectedRemovedCount {
- t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(applyResult.RemovedReservations))
+ switch {
+ case tt.wantRemovedCount > 0:
+ if len(applyResult.RemovedReservations) != tt.wantRemovedCount {
+ t.Fatalf("expected %d removed, got %d", tt.wantRemovedCount, len(applyResult.RemovedReservations))
+ }
+ case tt.wantRemovedCount == 0:
+ if len(applyResult.RemovedReservations) != 0 {
+ t.Errorf("expected 0 removed, got %d", len(applyResult.RemovedReservations))
+ }
+ case tt.wantRemovedCount == -1:
+ if len(applyResult.RemovedReservations) == 0 {
+ t.Fatal("expected at least one removed reservation")
+ }
}
if tt.validateRemoved != nil {
tt.validateRemoved(t, applyResult.RemovedReservations)
}
-
- // Get remaining reservations
- var remainingList v1alpha1.ReservationList
- if err := client.List(context.Background(), &remainingList); err != nil {
- t.Fatal(err)
+ if tt.validateTouched != nil {
+ tt.validateTouched(t, applyResult.TouchedReservations)
}
-
if tt.validateRemaining != nil {
- tt.validateRemaining(t, remainingList.Items)
+ var remaining v1alpha1.ReservationList
+ if err := k8sClient.List(context.Background(), &remaining); err != nil {
+ t.Fatal(err)
+ }
+ tt.validateRemaining(t, remaining.Items)
}
})
}
}
-func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
- }
-
- // Create existing reservation
- existingReservation := v1alpha1.Reservation{
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "test-group",
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- }
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(&existingReservation).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroup := testFlavorGroup()
- flavorGroups := map[string]compute.FlavorGroupFeature{
- "test-group": flavorGroup,
- }
-
- // Desired state: zero capacity (commitment expired or canceled)
- desiredState := &CommitmentState{
- CommitmentUUID: "abc123",
- ProjectID: "project-1",
- FlavorGroupName: "test-group",
- TotalMemoryBytes: 0,
- }
-
- applyResult, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
- )
-
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- if len(applyResult.TouchedReservations) != 0 {
- t.Errorf("expected 0 new reservations, got %d", len(applyResult.TouchedReservations))
- }
-
- // Should remove all reservations
- if len(applyResult.RemovedReservations) != 1 {
- t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations))
- }
-
- // Verify no reservations remain
- var remainingList v1alpha1.ReservationList
- if err := client.List(context.Background(), &remainingList); err != nil {
- t.Fatal(err)
- }
-
- if len(remainingList.Items) != 0 {
- t.Errorf("expected 0 remaining reservations, got %d", len(remainingList.Items))
- }
-}
-
-func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
- }
-
- // Create reservation with wrong flavor group
- existingReservation := v1alpha1.Reservation{
- ObjectMeta: metav1.ObjectMeta{
- Name: "commitment-abc123-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "project-1",
- ResourceGroup: "wrong-group", // Wrong flavor group
- Creator: "syncer",
- Allocations: map[string]v1alpha1.CommittedResourceAllocation{},
- },
- },
- }
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithObjects(&existingReservation).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroup := testFlavorGroup()
- flavorGroups := map[string]compute.FlavorGroupFeature{
- "test-group": flavorGroup,
- }
-
- // Desired state with correct flavor group
- desiredState := &CommitmentState{
- CommitmentUUID: "abc123",
- ProjectID: "project-1",
- FlavorGroupName: "test-group",
- TotalMemoryBytes: 8 * 1024 * 1024 * 1024,
- }
-
- applyResult, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
- )
-
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- // Should remove wrong reservation and create new one
- if len(applyResult.RemovedReservations) != 1 {
- t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations))
- }
-
- if len(applyResult.TouchedReservations) != 1 {
- t.Fatalf("expected 1 new reservation, got %d", len(applyResult.TouchedReservations))
- }
-
- // Verify new reservation has correct flavor group
- if applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" {
- t.Errorf("expected flavor group test-group, got %s",
- applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup)
- }
-}
-
-func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) {
- scheme := runtime.NewScheme()
- if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatal(err)
- }
-
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- Build()
-
- manager := NewReservationManager(client)
- flavorGroups := map[string]compute.FlavorGroupFeature{} // Empty
-
- desiredState := &CommitmentState{
- CommitmentUUID: "abc123",
- ProjectID: "project-1",
- FlavorGroupName: "unknown-group",
- TotalMemoryBytes: 8 * 1024 * 1024 * 1024,
- }
-
- _, err := manager.ApplyCommitmentState(
- context.Background(),
- logr.Discard(),
- desiredState,
- flavorGroups,
- "syncer",
- )
-
- if err == nil {
- t.Fatal("expected error for unknown flavor group, got nil")
- }
-}
+// ============================================================================
+// Tests: newReservation flavor selection
+// ============================================================================
func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) {
manager := &ReservationManager{}
@@ -729,8 +314,8 @@ func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) {
},
{
name: "oversized uses largest available flavor",
- deltaMemory: 100 * 1024 * 1024 * 1024, // 100 GiB (larger than any flavor)
- expectedName: "large", // Will use largest available
+ deltaMemory: 100 * 1024 * 1024 * 1024, // 100 GiB
+ expectedName: "large",
expectedCores: 16,
},
}
@@ -744,26 +329,15 @@ func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) {
TotalMemoryBytes: tt.deltaMemory,
}
- reservation := manager.newReservation(
- state,
- 0,
- tt.deltaMemory,
- flavorGroup,
- "syncer",
- )
+ reservation := manager.newReservation(state, 0, tt.deltaMemory, flavorGroup, "syncer")
- // Verify flavor selection
if reservation.Spec.CommittedResourceReservation.ResourceName != tt.expectedName {
t.Errorf("expected flavor %s, got %s",
- tt.expectedName,
- reservation.Spec.CommittedResourceReservation.ResourceName)
+ tt.expectedName, reservation.Spec.CommittedResourceReservation.ResourceName)
}
-
- // Verify CPU allocation
cpuQuantity := reservation.Spec.Resources[hv1.ResourceCPU]
if cpuQuantity.Value() != tt.expectedCores {
- t.Errorf("expected %d cores, got %d",
- tt.expectedCores, cpuQuantity.Value())
+ t.Errorf("expected %d cores, got %d", tt.expectedCores, cpuQuantity.Value())
}
})
}
diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go
index 698aea428..96ede88ac 100644
--- a/internal/scheduling/reservations/commitments/state.go
+++ b/internal/scheduling/reservations/commitments/state.go
@@ -93,6 +93,10 @@ type CommitmentState struct {
EndTime *time.Time
// CreatorRequestID is the request ID that triggered this state change (for traceability)
CreatorRequestID string
+ // NamePrefix overrides the default "commitment-<uuid>-<index>" reservation naming convention.
+ // When set (e.g. "<prefix>-"), Reservation CRDs are named "<prefix><uuid>-<index>".
+ // Used by the CommittedResource controller; leave empty for the legacy syncer path.
+ NamePrefix string
}
// FromCommitment converts Limes commitment to CommitmentState.
diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go
index e4bf6e841..e30f286c7 100644
--- a/internal/scheduling/reservations/commitments/syncer_test.go
+++ b/internal/scheduling/reservations/commitments/syncer_test.go
@@ -344,10 +344,11 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) {
Spec: v1alpha1.ReservationSpec{
Type: v1alpha1.ReservationTypeCommittedResource,
CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: "old-project",
- ResourceName: "old-flavor",
- ResourceGroup: "old_group",
- Creator: CreatorValue,
+ CommitmentUUID: "12345-67890-abcdef",
+ ProjectID: "old-project",
+ ResourceName: "old-flavor",
+ ResourceGroup: "old_group",
+ Creator: CreatorValue,
},
Resources: map[hv1.ResourceName]resource.Quantity{
hv1.ResourceMemory: resource.MustParse("512Mi"),
From a449a97cb1ebbd0dbf30b79c16b2055f0934297c Mon Sep 17 00:00:00 2001
From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com>
Date: Wed, 29 Apr 2026 10:12:52 +0200
Subject: [PATCH 06/54] feat: add vmware project utilization kpi (#768)
## Changes
- Add new project utilization KPI that tracks the amount of instances
per project and flavor and the capacity used by these flavors per
compute host
- Created an infrastructure sub directory in
`internal/knowledge/kpis/plugins` to collect all infrastructure
dashboard related metrics since they share some common features
- In follow up PRs I will refactor and move the other infrastructure
kpis into this directory as well
---------
Co-authored-by: Copilot
---
helm/bundles/cortex-nova/templates/kpis.yaml | 19 +-
.../plugins/infrastructure/vmware_metrics.go | 80 ++
.../vmware_project_utilization.go | 209 +++++
.../vmware_project_utilization_test.go | 713 ++++++++++++++++++
internal/knowledge/kpis/supported_kpis.go | 3 +
5 files changed, 1023 insertions(+), 1 deletion(-)
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index a84989b7b..3234fcc4a 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -198,4 +198,21 @@ spec:
- name: nova-flavors
- name: nova-servers
description: |
- This KPI tracks unused VMware commitments based on project commitments and usage.
\ No newline at end of file
+ This KPI tracks unused VMware commitments based on project commitments and usage.
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: KPI
+metadata:
+ name: vmware-project-utilization
+spec:
+ schedulingDomain: nova
+ impl: vmware_project_utilization_kpi
+ dependencies:
+ datasources:
+ - name: nova-servers
+ - name: nova-flavors
+ - name: identity-projects
+ knowledges:
+ - name: host-details
+ description: |
+ This KPI tracks the resource utilization of projects running VMs on VMware hosts.
\ No newline at end of file
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go
new file mode 100644
index 000000000..d92e8c3c2
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go
@@ -0,0 +1,80 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "regexp"
+ "strconv"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+)
+
+const (
+ hostDetailsKnowledgeName = "host-details"
+ vmwareIronicHypervisorType = "ironic"
+ hypervisorFamilyVMware = "vmware"
+ vmwareComputeHostPattern = "nova-compute-%"
+ vmwareIronicComputeHostPattern = "nova-compute-ironic-%"
+)
+
+// vmwareHost wraps HostDetails with Prometheus metric helpers.
+type vmwareHost struct {
+ compute.HostDetails
+}
+
+func (h vmwareHost) getHostLabels() []string {
+ pinnedProjectIds := ""
+ pinnedProjects := false
+ if h.PinnedProjects != nil {
+ pinnedProjectIds = *h.PinnedProjects
+ pinnedProjects = true
+ }
+ disabledReason := "-"
+ if h.DisabledReason != nil {
+ disabledReason = *h.DisabledReason
+ }
+ return []string{
+ h.AvailabilityZone,
+ h.ComputeHost,
+ h.CPUArchitecture,
+ h.WorkloadType,
+ h.HypervisorFamily,
+ strconv.FormatBool(h.Enabled),
+ strconv.FormatBool(h.Decommissioned),
+ strconv.FormatBool(h.ExternalCustomer),
+ disabledReason,
+ strconv.FormatBool(pinnedProjects),
+ pinnedProjectIds,
+ }
+}
+
+var vmwareHostLabels = []string{
+ "availability_zone",
+ "compute_host",
+ "cpu_architecture",
+ "workload_type",
+ "hypervisor_family",
+ "enabled",
+ "decommissioned",
+ "external_customer",
+ "disabled_reason",
+ "pinned_projects",
+ "pinned_project_ids",
+}
+
+var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`)
+
+func getMetricName(desc string) string {
+ match := fqNameRe.FindStringSubmatch(desc)
+ if len(match) > 1 {
+ return match[1]
+ }
+ return ""
+}
+
+type collectedVMwareMetric struct {
+ Name string
+ Labels map[string]string
+ Value float64
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
new file mode 100644
index 000000000..2d48b9737
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
@@ -0,0 +1,209 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "context"
+ "log/slog"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type vmwareProjectInstanceCount struct {
+ ProjectID string `db:"project_id"`
+ ProjectName string `db:"project_name"`
+ ComputeHost string `db:"compute_host"`
+ FlavorName string `db:"flavor_name"`
+ AvailabilityZone string `db:"availability_zone"`
+ InstanceCount float64 `db:"instance_count"`
+}
+
+type vmwareProjectCapacityUsage struct {
+ ProjectID string `db:"project_id"`
+ ProjectName string `db:"project_name"`
+ ComputeHost string `db:"compute_host"`
+ AvailabilityZone string `db:"availability_zone"`
+ TotalVCPUs float64 `db:"total_vcpus"`
+ TotalRAMMB float64 `db:"total_ram_mb"`
+ TotalDiskGB float64 `db:"total_disk_gb"`
+}
+
+type VMwareProjectUtilizationKPI struct {
+ // BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client.
+ plugins.BaseKPI[struct{}]
+
+ // instanceCountPerProjectAndHostAndFlavor is a Prometheus descriptor for the number of running instances per project, hypervisor, and flavor on VMware.
+ instanceCountPerProjectAndHostAndFlavor *prometheus.Desc
+
+ // capacityUsagePerProjectAndHost is a Prometheus descriptor for the resource capacity used by a project per VMware hypervisor, split by resource (vcpu/memory/disk). CPU in vCPUs, memory and disk in bytes.
+ capacityUsagePerProjectAndHost *prometheus.Desc
+}
+
+func (k *VMwareProjectUtilizationKPI) GetName() string {
+ return "vmware_project_utilization_kpi"
+}
+
+func (k *VMwareProjectUtilizationKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
+ if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
+ return err
+ }
+
+ k.instanceCountPerProjectAndHostAndFlavor = prometheus.NewDesc(
+ "cortex_vmware_project_instances",
+ "Number of running instances per project, hypervisor, and flavor on VMware.",
+ append(vmwareHostLabels, "project_id", "project_name", "flavor_name"), nil,
+ )
+ k.capacityUsagePerProjectAndHost = prometheus.NewDesc(
+ "cortex_vmware_project_capacity_usage",
+ "Resource capacity used by a project per VMware hypervisor and flavor. CPU in vCPUs, memory and disk in bytes.",
+ append(vmwareHostLabels, "project_id", "project_name", "resource"), nil,
+ )
+ return nil
+}
+
+func (k *VMwareProjectUtilizationKPI) Describe(ch chan<- *prometheus.Desc) {
+ ch <- k.instanceCountPerProjectAndHostAndFlavor
+ ch <- k.capacityUsagePerProjectAndHost
+}
+
+func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) {
+ hosts, err := k.getVMwareHosts()
+ if err != nil {
+ // Log the error and return early to avoid panicking. The KPI will be retried on the next scrape.
+ slog.Error("vmware_project_utilization: Failed to get VMware hosts for project utilization KPI", "error", err)
+ return
+ }
+
+ // Export project x flavor x compute_host instance count metric
+ projectInstanceCounts, err := k.queryProjectInstanceCount()
+ if err != nil {
+ slog.Error("vmware_project_utilization: Failed to query project instance count for project utilization KPI", "error", err)
+ return
+ }
+ for _, projectInstanceCount := range projectInstanceCounts {
+ host, ok := hosts[projectInstanceCount.ComputeHost]
+ if !ok {
+ slog.Warn("vmware_project_utilization: Compute host not found for project instance count", "compute_host", projectInstanceCount.ComputeHost)
+ continue
+ }
+ hostLabels := host.getHostLabels()
+ hostLabels = append(hostLabels, projectInstanceCount.ProjectID, projectInstanceCount.ProjectName, projectInstanceCount.FlavorName)
+ ch <- prometheus.MustNewConstMetric(k.instanceCountPerProjectAndHostAndFlavor, prometheus.GaugeValue, projectInstanceCount.InstanceCount, hostLabels...)
+ }
+
+ // Export project x compute_host x resource capacity usage metric
+ projectCapacityUsages, err := k.queryProjectCapacityUsage()
+ if err != nil {
+ slog.Error("vmware_project_utilization: Failed to query project capacity usage for project utilization KPI", "error", err)
+ return
+ }
+ for _, projectCapacityUsage := range projectCapacityUsages {
+ host, ok := hosts[projectCapacityUsage.ComputeHost]
+ if !ok {
+ slog.Warn("vmware_project_utilization: Compute host not found for project capacity usage", "compute_host", projectCapacityUsage.ComputeHost)
+ continue
+ }
+ hostLabels := host.getHostLabels()
+ hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName)
+
+ memoryUsageBytes := projectCapacityUsage.TotalRAMMB * 1024 * 1024
+ diskUsageBytes := projectCapacityUsage.TotalDiskGB * 1024 * 1024 * 1024
+
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, memoryUsageBytes, append(hostLabels, "memory")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, diskUsageBytes, append(hostLabels, "disk")...)
+ }
+}
+
+// getVMwareHosts retrieves the mapping of VMware hypervisors to their corresponding host information
+func (k *VMwareProjectUtilizationKPI) getVMwareHosts() (map[string]vmwareHost, error) {
+ knowledge := &v1alpha1.Knowledge{}
+ if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostDetailsKnowledgeName}, knowledge); err != nil {
+ return nil, err
+ }
+
+ hostDetails, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw)
+ if err != nil {
+ return nil, err
+ }
+
+ hostMapping := make(map[string]vmwareHost)
+ for _, host := range hostDetails {
+ if host.HypervisorType == vmwareIronicHypervisorType || host.HypervisorFamily != hypervisorFamilyVMware {
+ continue
+ }
+ hostMapping[host.ComputeHost] = vmwareHost{HostDetails: host}
+ }
+
+ return hostMapping, nil
+}
+
+// queryProjectCapacityUsage retrieves the total vCPU, RAM, and disk capacity used per project, hypervisor, and availability zone on VMware from the database.
+func (k *VMwareProjectUtilizationKPI) queryProjectCapacityUsage() ([]vmwareProjectCapacityUsage, error) {
+ // This query will fetch all active instances. It will perform a join with the openstack projects to get the project name.
+ // It will also join with the flavors table to get the flavor information, which is needed for the capacity usage metrics.
+ // The results will be grouped by project, compute host, and availability zone to get the total capacity usage per project and hypervisor.
+ // We will filter the results to only include instances that are running on VMware hypervisors by checking the compute host name pattern.
+ // This assumes that all VMware hypervisors have a compute host name that starts with "nova-compute-",
+ // which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
+ query := `
+ SELECT
+ s.tenant_id AS project_id,
+ COALESCE(p.name, '') AS project_name,
+ s.os_ext_srv_attr_host AS compute_host,
+ s.os_ext_az_availability_zone AS availability_zone,
+ COALESCE(SUM(f.vcpus), 0) AS total_vcpus,
+ COALESCE(SUM(f.ram), 0) AS total_ram_mb,
+ COALESCE(SUM(f.disk), 0) AS total_disk_gb
+ FROM ` + nova.Server{}.TableName() + ` s
+ LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON s.flavor_name = f.name
+ LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id
+ WHERE s.status NOT IN ('DELETED', 'ERROR')
+ AND s.os_ext_srv_attr_host LIKE '` + vmwareComputeHostPattern + `'
+ AND s.os_ext_srv_attr_host NOT LIKE '` + vmwareIronicComputeHostPattern + `'
+ GROUP BY s.tenant_id, p.name, s.os_ext_srv_attr_host, s.os_ext_az_availability_zone
+ `
+ var usages []vmwareProjectCapacityUsage
+ if _, err := k.DB.Select(&usages, query); err != nil {
+ return nil, err
+ }
+ return usages, nil
+}
+
+// queryProjectInstanceCount retrieves the number of running instances per project, hypervisor, and flavor on VMware.
+func (k *VMwareProjectUtilizationKPI) queryProjectInstanceCount() ([]vmwareProjectInstanceCount, error) {
+ // This query will fetch all active instances. It will perform a join with the openstack projects to get the project name.
+ // The results will be grouped by project, hypervisor, flavor, and availability zone to get the instance count.
+ // We will filter the results to only include instances that are running on VMware hypervisors by checking the compute host name pattern.
+ // This assumes that all VMware hypervisors have a compute host name that starts with "nova-compute-",
+ // which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
+ query := `
+ SELECT
+ s.tenant_id AS project_id,
+ COALESCE(p.name, '') AS project_name,
+ s.os_ext_srv_attr_host AS compute_host,
+ s.os_ext_az_availability_zone AS availability_zone,
+ s.flavor_name,
+ COUNT(*) AS instance_count
+ FROM ` + nova.Server{}.TableName() + ` s
+ LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id
+ WHERE s.status NOT IN ('DELETED', 'ERROR')
+ AND s.os_ext_srv_attr_host LIKE '` + vmwareComputeHostPattern + `'
+ AND s.os_ext_srv_attr_host NOT LIKE '` + vmwareIronicComputeHostPattern + `'
+ GROUP BY s.tenant_id, p.name, s.os_ext_srv_attr_host, s.flavor_name, s.os_ext_az_availability_zone
+ `
+ var usages []vmwareProjectInstanceCount
+ if _, err := k.DB.Select(&usages, query); err != nil {
+ return nil, err
+ }
+ return usages, nil
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
new file mode 100644
index 000000000..9f6d84786
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
@@ -0,0 +1,713 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "reflect"
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ prometheusgo "github.com/prometheus/client_model/go"
+ v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+func buildMetricKey(name string, labels map[string]string) string {
+ switch name {
+ case "cortex_vmware_project_instances":
+ return name + "|" + labels["compute_host"] + "|" + labels["project_id"] +
+ "|" + labels["flavor_name"] + "|" + labels["availability_zone"]
+ case "cortex_vmware_project_capacity_usage":
+ return name + "|" + labels["compute_host"] + "|" + labels["project_id"] +
+ "|" + labels["availability_zone"] + "|" + labels["resource"]
+ default:
+ return name
+ }
+}
+
+func hostLabels(computeHost, az string) map[string]string {
+ return map[string]string{
+ "availability_zone": az,
+ "compute_host": computeHost,
+ "cpu_architecture": "",
+ "workload_type": "",
+ "hypervisor_family": "vmware",
+ "enabled": "false",
+ "decommissioned": "false",
+ "external_customer": "false",
+ "disabled_reason": "-",
+ "pinned_projects": "false",
+ "pinned_project_ids": "",
+ }
+}
+
+func instanceMetric(computeHost, az, projectID, projectName, flavorName string, value float64) collectedVMwareMetric {
+ labels := hostLabels(computeHost, az)
+ labels["project_id"] = projectID
+ labels["project_name"] = projectName
+ labels["flavor_name"] = flavorName
+ return collectedVMwareMetric{Name: "cortex_vmware_project_instances", Labels: labels, Value: value}
+}
+
+func capacityMetric(computeHost, az, projectID, projectName, resource string, value float64) collectedVMwareMetric {
+ labels := hostLabels(computeHost, az)
+ labels["project_id"] = projectID
+ labels["project_name"] = projectName
+ labels["resource"] = resource
+ return collectedVMwareMetric{Name: "cortex_vmware_project_capacity_usage", Labels: labels, Value: value}
+}
+
+func buildVMwareHostDetailsClient(t *testing.T, hostDetails []compute.HostDetails) *fake.ClientBuilder {
+ t.Helper()
+ scheme, err := v1alpha1.SchemeBuilder.Build()
+ if err != nil {
+ t.Fatalf("failed to build scheme: %v", err)
+ }
+ raw, err := v1alpha1.BoxFeatureList(hostDetails)
+ if err != nil {
+ t.Fatalf("failed to box host details: %v", err)
+ }
+ return fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
+ &v1alpha1.Knowledge{
+ ObjectMeta: v1.ObjectMeta{Name: "host-details"},
+ Status: v1alpha1.KnowledgeStatus{Raw: raw},
+ },
+ )
+}
+
+func TestVMwareProjectUtilizationKPI_Init(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ kpi := &VMwareProjectUtilizationKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+}
+
+func TestVMwareProjectUtilizationKPI_getVMwareHosts(t *testing.T) {
+ hostDetails := []compute.HostDetails{
+ {
+ ComputeHost: "nova-compute-1",
+ HypervisorFamily: hypervisorFamilyVMware,
+ },
+ {
+ ComputeHost: "nova-compute-2",
+ HypervisorFamily: hypervisorFamilyVMware,
+ },
+ {
+ ComputeHost: "nova-compute-ironic-1",
+ HypervisorType: vmwareIronicHypervisorType,
+ HypervisorFamily: hypervisorFamilyVMware,
+ },
+ {
+ ComputeHost: "nova-compute-3",
+ HypervisorFamily: "other",
+ },
+ }
+
+ clientBuilder := buildVMwareHostDetailsClient(t, hostDetails)
+ kpi := &VMwareProjectUtilizationKPI{}
+ kpi.Client = clientBuilder.Build()
+
+ hostMapping, err := kpi.getVMwareHosts()
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ expectedHosts := map[string]vmwareHost{
+ "nova-compute-1": {HostDetails: hostDetails[0]},
+ "nova-compute-2": {HostDetails: hostDetails[1]},
+ }
+
+ if len(hostMapping) != len(expectedHosts) {
+ t.Fatalf("expected %d hosts, got %d", len(expectedHosts), len(hostMapping))
+ }
+
+ for computeHost, expectedHost := range expectedHosts {
+ host, ok := hostMapping[computeHost]
+ if !ok {
+ t.Fatalf("expected host %s not found in mapping", computeHost)
+ }
+ if host.ComputeHost != expectedHost.ComputeHost || host.HypervisorFamily != expectedHost.HypervisorFamily {
+ t.Errorf("host details mismatch for %s: expected %+v, got %+v", computeHost, expectedHost, host)
+ }
+ }
+}
+
+func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) {
+ tests := []struct {
+ name string
+ servers []nova.Server
+ projects []identity.Project
+ expectedCounts map[string]vmwareProjectInstanceCount
+ }{
+ {
+ name: "single instance in one project",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1},
+ },
+ },
+ {
+ name: "multiple instances across projects and hosts",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-3", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ {ID: "server-4", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ },
+ projects: []identity.Project{
+ {ID: "project-1", Name: "Project One"},
+ {ID: "project-2", Name: "Project Two"},
+ },
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1},
+ "project-1|nova-compute-1|flavor-2|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-2", AvailabilityZone: "az1", InstanceCount: 1},
+ "project-2|nova-compute-2|flavor-1|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 1},
+ "project-2|nova-compute-2|flavor-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", FlavorName: "flavor-2", AvailabilityZone: "az2", InstanceCount: 1},
+ },
+ },
+ {
+ name: "instances on non-VMware hosts are excluded",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1},
+ },
+ },
+ {
+ name: "instances with non-ACTIVE status are excluded",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-3|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-3", AvailabilityZone: "az1", InstanceCount: 1},
+ },
+ },
+ {
+ name: "multiple instances with same key are counted correctly",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ {ID: "server-4", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 2},
+ "project-1|nova-compute-2|flavor-1|az2": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 2},
+ },
+ },
+ {
+ name: "missing project entry results in empty project_name",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{},
+ expectedCounts: map[string]vmwareProjectInstanceCount{
+ "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1},
+ },
+ },
+ {
+ name: "no instances returns empty result",
+ servers: []nova.Server{},
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ expectedCounts: map[string]vmwareProjectInstanceCount{},
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(identity.Project{}),
+ ); err != nil {
+ t.Fatalf("failed to create tables: %v", err)
+ }
+
+ var mockData []any
+ for i := range tt.servers {
+ mockData = append(mockData, &tt.servers[i])
+ }
+ for i := range tt.projects {
+ mockData = append(mockData, &tt.projects[i])
+ }
+ if len(mockData) > 0 {
+ if err := testDB.Insert(mockData...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ }
+
+ client := buildVMwareHostDetailsClient(t, []compute.HostDetails{})
+ kpi := &VMwareProjectUtilizationKPI{}
+ if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error on Init, got %v", err)
+ }
+ counts, err := kpi.queryProjectInstanceCount()
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ if len(counts) != len(tt.expectedCounts) {
+ t.Fatalf("expected %d counts, got %d", len(tt.expectedCounts), len(counts))
+ }
+ for _, got := range counts {
+ key := got.ProjectID + "|" + got.ComputeHost + "|" + got.FlavorName + "|" + got.AvailabilityZone
+ exp, ok := tt.expectedCounts[key]
+ if !ok {
+ t.Errorf("unexpected count for key %q: %+v", key, got)
+ continue
+ }
+ if got != exp {
+ t.Errorf("count mismatch for key %q: expected %+v, got %+v", key, exp, got)
+ }
+ }
+ })
+ }
+}
+
+func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) {
+ tests := []struct {
+ name string
+ servers []nova.Server
+ projects []identity.Project
+ flavors []nova.Flavor
+ expectedUsages map[string]vmwareProjectCapacityUsage
+ }{
+ {
+ name: "single instance with flavor details",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{
+ "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1},
+ },
+ },
+ {
+ name: "multiple instances with different flavors and projects",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-3", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ },
+ projects: []identity.Project{
+ {ID: "project-1", Name: "Project One"},
+ {ID: "project-2", Name: "Project Two"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1},
+ {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2},
+ },
+ expectedUsages: map[string]vmwareProjectCapacityUsage{
+ "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 6, TotalRAMMB: 12288, TotalDiskGB: 3},
+ "project-2|nova-compute-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", AvailabilityZone: "az2", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1},
+ },
+ },
+ {
+ name: "missing flavor entry results in zero capacity",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{
+ "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 0, TotalRAMMB: 0, TotalDiskGB: 0},
+ },
+ },
+ {
+ name: "instances on non-VMware hosts are excluded",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{},
+ },
+ {
+ name: "instances with non-ACTIVE status are excluded",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{},
+ },
+ {
+ name: "no instances returns empty capacity usage",
+ servers: []nova.Server{},
+ projects: []identity.Project{
+ {ID: "project-1", Name: "Project One"},
+ },
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{},
+ },
+ {
+ name: "multiple instances with same flavor aggregate capacity correctly",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{
+ "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 4, TotalRAMMB: 8192, TotalDiskGB: 2},
+ },
+ },
+ {
+ name: "ironic host instances are excluded",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{},
+ },
+ {
+ name: "missing project entry results in empty project_name",
+ servers: []nova.Server{
+ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ expectedUsages: map[string]vmwareProjectCapacityUsage{
+ "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1},
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(identity.Project{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("failed to create tables: %v", err)
+ }
+
+ var mockData []any
+ for i := range tt.servers {
+ mockData = append(mockData, &tt.servers[i])
+ }
+ for i := range tt.projects {
+ mockData = append(mockData, &tt.projects[i])
+ }
+ for i := range tt.flavors {
+ mockData = append(mockData, &tt.flavors[i])
+ }
+ if len(mockData) > 0 {
+ if err := testDB.Insert(mockData...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ }
+
+ client := buildVMwareHostDetailsClient(t, []compute.HostDetails{})
+ kpi := &VMwareProjectUtilizationKPI{}
+ if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error on Init, got %v", err)
+ }
+ usages, err := kpi.queryProjectCapacityUsage()
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ if len(usages) != len(tt.expectedUsages) {
+ t.Fatalf("expected %d usages, got %d", len(tt.expectedUsages), len(usages))
+ }
+ for _, got := range usages {
+ key := got.ProjectID + "|" + got.ComputeHost + "|" + got.AvailabilityZone
+ exp, ok := tt.expectedUsages[key]
+ if !ok {
+ t.Errorf("unexpected usage for key %q: %+v", key, got)
+ continue
+ }
+ if got != exp {
+ t.Errorf("usage mismatch for key %q: expected %+v, got %+v", key, exp, got)
+ }
+ }
+ })
+ }
+}
+
+func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) {
+ tests := []struct {
+ name string
+ servers []nova.Server
+ projects []identity.Project
+ flavors []nova.Flavor
+ hostDetails []compute.HostDetails
+ expectedMetrics []collectedVMwareMetric
+ }{
+ {
+ name: "single instance in one project",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 2),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 4096*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 1*1024*1024*1024),
+ },
+ },
+ {
+ name: "multiple instances across hosts, projects, and flavors",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s3", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ },
+ projects: []identity.Project{
+ {ID: "project-1", Name: "Project One"},
+ {ID: "project-2", Name: "Project Two"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1},
+ {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2},
+ },
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1),
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-2", 1),
+ instanceMetric("nova-compute-2", "az2", "project-2", "Project Two", "flavor-1", 1),
+ // nova-compute-1/project-1: 1*flavor-1 + 1*flavor-2
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 6),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 12288*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 3*1024*1024*1024),
+ // nova-compute-2/project-2: 1*flavor-1
+ capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "vcpu", 2),
+ capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "memory", 4096*1024*1024),
+ capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "disk", 1*1024*1024*1024),
+ },
+ },
+ {
+ name: "non-VMware and ironic hosts are excluded",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 2),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 4096*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 1*1024*1024*1024),
+ },
+ },
+ {
+ name: "DELETED and ERROR instances are excluded",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"},
+ {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"},
+ {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1},
+ {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2},
+ {ID: "f3", Name: "flavor-3", VCPUs: 8, RAM: 16384, Disk: 4},
+ },
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-3", 1),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 8),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 16384*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 4*1024*1024*1024),
+ },
+ },
+ {
+ name: "multiple instances with same flavor are aggregated correctly",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ {ID: "s4", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 2),
+ instanceMetric("nova-compute-2", "az2", "project-1", "Project One", "flavor-1", 2),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 4),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 2*4096*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 2*1024*1024*1024),
+ capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "vcpu", 4),
+ capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "memory", 2*4096*1024*1024),
+ capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "disk", 2*1024*1024*1024),
+ },
+ },
+ {
+ name: "missing project entry results in empty project_name label",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{},
+ flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}},
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "", "flavor-1", 1),
+ capacityMetric("nova-compute-1", "az1", "project-1", "", "vcpu", 2),
+ capacityMetric("nova-compute-1", "az1", "project-1", "", "memory", 4096*1024*1024),
+ capacityMetric("nova-compute-1", "az1", "project-1", "", "disk", 1*1024*1024*1024),
+ },
+ },
+ {
+ name: "missing flavor entry results in zero capacity",
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"},
+ },
+ projects: []identity.Project{{ID: "project-1", Name: "Project One"}},
+ flavors: []nova.Flavor{},
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-missing", 1),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 0),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 0),
+ capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 0),
+ },
+ },
+ {
+ name: "no instances produces no metrics",
+ servers: []nova.Server{},
+ projects: []identity.Project{
+ {ID: "project-1", Name: "Project One"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1},
+ },
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ expectedMetrics: []collectedVMwareMetric{},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(identity.Project{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("failed to create tables: %v", err)
+ }
+
+ var mockData []any
+ for i := range tt.servers {
+ mockData = append(mockData, &tt.servers[i])
+ }
+ for i := range tt.projects {
+ mockData = append(mockData, &tt.projects[i])
+ }
+ for i := range tt.flavors {
+ mockData = append(mockData, &tt.flavors[i])
+ }
+ if len(mockData) > 0 {
+ if err := testDB.Insert(mockData...); err != nil {
+ t.Fatalf("expected no error inserting data, got %v", err)
+ }
+ }
+
+ client := buildVMwareHostDetailsClient(t, tt.hostDetails)
+ kpi := &VMwareProjectUtilizationKPI{}
+ if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error on Init, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ actual := make(map[string]collectedVMwareMetric)
+ for m := range ch {
+ var pm prometheusgo.Metric
+ if err := m.Write(&pm); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+ labels := make(map[string]string)
+ for _, lbl := range pm.Label {
+ labels[lbl.GetName()] = lbl.GetValue()
+ }
+ name := getMetricName(m.Desc().String())
+ key := buildMetricKey(name, labels)
+ if _, exists := actual[key]; exists {
+ t.Fatalf("duplicate metric key %q", key)
+ }
+ actual[key] = collectedVMwareMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()}
+ }
+
+ if len(actual) != len(tt.expectedMetrics) {
+ t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual)
+ }
+ for _, exp := range tt.expectedMetrics {
+ key := buildMetricKey(exp.Name, exp.Labels)
+ got, ok := actual[key]
+ if !ok {
+ t.Errorf("missing metric %q", key)
+ continue
+ }
+ if got.Value != exp.Value {
+ t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value)
+ }
+ if !reflect.DeepEqual(exp.Labels, got.Labels) {
+ t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels)
+ }
+ }
+ })
+ }
+}
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index a812943e1..c1a2b336c 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -7,6 +7,7 @@ import (
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins/compute"
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins/deployment"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins/infrastructure"
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins/storage"
)
@@ -24,6 +25,8 @@ var supportedKPIs = map[string]plugins.KPI{
"vm_faults_kpi": &compute.VMFaultsKPI{},
"vmware_commitments_kpi": &compute.VMwareResourceCommitmentsKPI{},
+ "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{},
+
"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
"datasource_state_kpi": &deployment.DatasourceStateKPI{},
From 95866017ec2b293dae4f647fe459ff1fbff511fb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 08:22:47 +0000
Subject: [PATCH 07/54] Bump cortex chart appVersions to sha-a449a97c [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index d00071154..2dff5955c 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-1ee94125"
+appVersion: "sha-a449a97c"
icon: "https://example.com/icon.png"
dependencies: []
From fd90261469e14448e177a37257f62c2d4a9198e5 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Wed, 29 Apr 2026 11:26:03 +0200
Subject: [PATCH 08/54] Add pull-request-creator agent
Adds a Claude Code agent that creates clean pull requests with
plain-text descriptions suitable for commit messages, following
the kernel Assisted-by convention for AI-assisted contributions.
---
.claude/agents/pull-request-creator.md | 53 ++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
create mode 100644 .claude/agents/pull-request-creator.md
diff --git a/.claude/agents/pull-request-creator.md b/.claude/agents/pull-request-creator.md
new file mode 100644
index 000000000..1586719e2
--- /dev/null
+++ b/.claude/agents/pull-request-creator.md
@@ -0,0 +1,53 @@
+---
+name: pull-request-creator
+description: Use this agent to create clean pull requests. It reviews the diff, takes an optional motivation or summary, and opens a PR with a concise description suitable for a commit message. No markdown, no file change summaries, no artificial linebreaks.
+tools: Bash, Read
+model: inherit
+---
+
+You are a pull request creator. Your job is to review the current branch's diff against the base branch, accept an optional motivation or summary from the caller, and open a clean pull request.
+
+## Workflow
+
+1. Determine the base branch (usually `main`).
+2. Run `git log main..HEAD` and `git diff main...HEAD --stat` to understand what changed.
+3. Read the diff carefully to understand the substance of the changes.
+4. Write a PR title (imperative, under 70 characters).
+5. Write a PR description following the rules below.
+6. Push the branch if needed and create the PR using `gh pr create`.
+
+## PR Description Rules
+
+The description will be used directly as a commit message body. Follow these rules strictly:
+
+- No markdown formatting (no headers, no bold, no bullet points, no code blocks).
+- No artificial linebreaks within paragraphs. Let text flow naturally.
+- No file change summaries or lists of modified files.
+- Concise: explain what changed and why in a few sentences. Focus on motivation and effect, not mechanics.
+- End the description with a blank line followed by an Assisted-by trailer.
+
+## Assisted-by Trailer
+
+Add the following trailer at the end of the PR description, separated by a blank line. This follows the Linux kernel convention for AI-assisted contributions:
+
+```
+Assisted-by: AGENT_NAME:MODEL_VERSION [TOOL1] [TOOL2] ...
+```
+
+Use your own agent name and model version, and list the tools you actually used.
+
+## Example Description
+
+```
+Refactor traits API from two-ConfigMap model to a single shim-owned ConfigMap with a Syncer interface. The Helm-managed static ConfigMap is removed; the shim now creates and owns the ConfigMap on startup and syncs from upstream placement periodically. This simplifies the deployment model and removes the merge logic that combined two sources at query time.
+
+Assisted-by: Claude Code:claude-opus-4-20250514 [Bash] [Read]
+```
+
+## Important
+
+- If the caller provides a motivation or summary, incorporate it into the description naturally.
+- If no motivation is given, derive it from the diff.
+- Never invent changes that aren't in the diff.
+- Always push the branch before creating the PR.
+- Use `gh pr create` with `--body` for the description.
From b7d4a10a5c4c2b341773ede4e79b09f885de3c07 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 09:36:24 +0000
Subject: [PATCH 09/54] Bump cortex chart appVersions to sha-fd902614 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 2dff5955c..9f37b23c8 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-a449a97c"
+appVersion: "sha-fd902614"
icon: "https://example.com/icon.png"
dependencies: []
From ebbf9d44df8b792a679aa25804d58809e98ba2f2 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:48:15 +0200
Subject: [PATCH 10/54] Refactor /traits API to single-ConfigMap model with
Syncer interface (#771)
Refactors the placement shim's /traits API from a two-ConfigMap model
(Helm-managed static + shim-managed dynamic) to a single shim-owned
ConfigMap with a reusable Syncer interface pattern.
Changes:
- Remove the Helm-managed static traits ConfigMap template and
values.yaml static field.
- The shim now creates and owns the traits ConfigMap on startup via
TraitSyncer.Init.
- Periodic upstream sync logic moved from inline startTraitSyncLoop into
the TraitSyncer struct implementing the new Syncer interface (Init +
Run).
- Eliminate the merge of static/custom traits at query time; getTraits
reads from the single ConfigMap.
- Hybrid mode now uses forwardWithHook to forward then update the local
ConfigMap on success, instead of best-effort syncing to upstream after
local writes.
- Remove the separate -custom ConfigMap and the two-ConfigMap lock
naming; locking is now on the single ConfigMap name.
- E2E tests no longer skip CRUD in passthrough-configured mode since the
ConfigMap always exists.
- Unit tests simplified to use a single ConfigMap fixture;
upstream-contact tests replaced.
This establishes the Syncer pattern for future resource types (e.g.
/resource_classes).
Assisted-by: Claude (claude-code)
---
.../templates/configmap-traits.yaml | 11 -
.../bundles/cortex-placement-shim/values.yaml | 3 -
internal/shim/placement/handle_traits.go | 482 +++++++-----------
internal/shim/placement/handle_traits_e2e.go | 21 +-
internal/shim/placement/handle_traits_test.go | 42 +-
internal/shim/placement/shim.go | 42 +-
internal/shim/placement/syncer.go | 19 +
internal/shim/placement/syncer_traits.go | 173 +++++++
internal/shim/placement/syncer_traits_test.go | 145 ++++++
9 files changed, 564 insertions(+), 374 deletions(-)
delete mode 100644 helm/bundles/cortex-placement-shim/templates/configmap-traits.yaml
create mode 100644 internal/shim/placement/syncer.go
create mode 100644 internal/shim/placement/syncer_traits.go
create mode 100644 internal/shim/placement/syncer_traits_test.go
diff --git a/helm/bundles/cortex-placement-shim/templates/configmap-traits.yaml b/helm/bundles/cortex-placement-shim/templates/configmap-traits.yaml
deleted file mode 100644
index b6969aaa7..000000000
--- a/helm/bundles/cortex-placement-shim/templates/configmap-traits.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-{{- if ne ((index .Values "cortex-shim").conf.features.traits | default "passthrough") "passthrough" }}
-{{- $cmName := (index .Values "cortex-shim").conf.traits.configMapName }}
-apiVersion: v1
-kind: ConfigMap
-metadata:
- name: {{ $cmName }}
- labels:
- {{- include "chart.labels" . | nindent 4 }}
-data:
- traits: {{ (index .Values "cortex-shim").conf.traits.static | toJson | quote }}
-{{- end }}
diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml
index 7e1818e9e..54805cee3 100644
--- a/helm/bundles/cortex-placement-shim/values.yaml
+++ b/helm/bundles/cortex-placement-shim/values.yaml
@@ -62,9 +62,6 @@ cortex-shim:
status: "CURRENT"
traits:
configMapName: "cortex-placement-shim-traits"
- # Static traits included in every Helm install/upgrade. The shim
- # merges them with dynamic CUSTOM_* traits at request time.
- static: []
auth:
tokenCacheTTL: "5m"
policies:
diff --git a/internal/shim/placement/handle_traits.go b/internal/shim/placement/handle_traits.go
index 429b87cfc..d815b9d55 100644
--- a/internal/shim/placement/handle_traits.go
+++ b/internal/shim/placement/handle_traits.go
@@ -7,44 +7,22 @@ import (
"context"
"encoding/json"
"fmt"
- "math/rand"
+ "io"
"net/http"
- "net/url"
"os"
"sort"
"strings"
"time"
- "github.com/go-logr/logr"
- "github.com/gophercloud/gophercloud/v2"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
)
const configMapKeyTraits = "traits"
-func (s *Shim) staticTraitsConfigMapKey() client.ObjectKey {
- return client.ObjectKey{
- Namespace: os.Getenv("POD_NAMESPACE"),
- Name: s.config.Traits.ConfigMapName,
- }
-}
-
-func (s *Shim) customTraitsConfigMapKey() client.ObjectKey {
- return client.ObjectKey{
- Namespace: os.Getenv("POD_NAMESPACE"),
- Name: s.config.Traits.ConfigMapName + "-custom",
- }
-}
-
-func (s *Shim) traitsLockName() string {
- return s.config.Traits.ConfigMapName + "-custom-lock"
-}
-
// traitsListResponse matches the OpenStack Placement GET /traits response.
type traitsListResponse struct {
Traits []string `json:"traits"`
@@ -52,10 +30,10 @@ type traitsListResponse struct {
// HandleListTraits handles GET /traits requests.
//
-// Returns a sorted list of trait strings merged from the static (Helm-managed)
-// and dynamic (CUSTOM_*) ConfigMaps. Supports optional query parameter "name"
-// for filtering: "in:TRAIT_A,TRAIT_B" returns only named traits,
-// "startswith:CUSTOM_" returns prefix matches.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream placement.
+// - crd: serves the trait list from the local ConfigMap.
//
// See: https://docs.openstack.org/api-ref/placement/#list-traits
func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) {
@@ -67,15 +45,15 @@ func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) {
s.forward(w, r)
return
case FeatureModeCRD:
- // Serve from local ConfigMaps.
+ // Serve from local ConfigMap.
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
return
}
- traitSet, err := s.getAllTraits(ctx)
+ traitSet, err := s.getTraits(ctx)
if err != nil {
- log.Error(err, "failed to list traits from configmaps")
+ log.Error(err, "failed to list traits from configmap")
http.Error(w, "failed to list traits", http.StatusInternalServerError)
return
}
@@ -124,8 +102,10 @@ func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) {
// HandleShowTrait handles GET /traits/{name} requests.
//
-// Checks whether a trait with the given name exists in either the static
-// or dynamic ConfigMap. Returns 204 No Content if found, 404 Not Found otherwise.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream placement.
+// - crd: checks the local ConfigMap for the trait.
//
// See: https://docs.openstack.org/api-ref/placement/#show-traits
func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) {
@@ -137,7 +117,7 @@ func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) {
s.forward(w, r)
return
case FeatureModeCRD:
- // Serve from local ConfigMaps.
+ // Serve from local ConfigMap.
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
return
@@ -164,22 +144,26 @@ func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) {
// HandleUpdateTrait handles PUT /traits/{name} requests.
//
-// Creates a new custom trait in the dynamic ConfigMap. Only traits prefixed
-// with CUSTOM_ may be created. Returns 201 Created if the trait is newly
-// inserted, or 204 No Content if it already exists (in either ConfigMap).
-// Returns 400 Bad Request if the name does not carry the CUSTOM_ prefix.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream; on success, adds the trait to the local ConfigMap.
+// - crd: writes the trait to the local ConfigMap (CUSTOM_ prefix required).
//
// See: https://docs.openstack.org/api-ref/placement/#update-trait
func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
- case FeatureModePassthrough, FeatureModeHybrid:
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits)
+ switch mode {
+ case FeatureModePassthrough:
s.forward(w, r)
return
+ case FeatureModeHybrid:
+ s.handleUpdateTraitHybrid(w, r)
+ return
case FeatureModeCRD:
- // Serve from local ConfigMaps.
+ // Handle locally.
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
return
@@ -195,115 +179,71 @@ func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) {
return
}
- // Fast path: trait already exists in either ConfigMap (no lock needed).
- allTraits, err := s.getAllTraits(ctx)
+ created, err := s.addTraitToConfigMap(ctx, name)
if err != nil {
- log.Error(err, "failed to read traits for existence check", "trait", name)
+ log.Error(err, "failed to create trait", "trait", name)
http.Error(w, "failed to create trait", http.StatusInternalServerError)
return
}
- if _, exists := allTraits[name]; exists {
- log.Info("trait already exists, nothing to do", "trait", name)
+ if created {
+ w.WriteHeader(http.StatusCreated)
+ } else {
w.WriteHeader(http.StatusNoContent)
- return
}
+}
- // Slow path: acquire lock, read/create dynamic ConfigMap, add trait.
- host, err := os.Hostname()
- if err != nil {
- host = "unknown"
- }
- lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
- if err := s.resourceLocker.AcquireLock(ctx, s.traitsLockName(), lockerID); err != nil {
- log.Error(err, "failed to acquire traits lock", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
+// handleUpdateTraitHybrid forwards PUT /traits/{name} to upstream, then
+// updates the local ConfigMap on success.
+func (s *Shim) handleUpdateTraitHybrid(w http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
return
}
- defer func() {
- releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
- if err := s.resourceLocker.ReleaseLock(releaseCtx, s.traitsLockName(), lockerID); err != nil {
- log.Error(err, "failed to release traits lock")
- }
- }()
- cm := &corev1.ConfigMap{}
- err = s.Get(ctx, s.customTraitsConfigMapKey(), cm)
- if apierrors.IsNotFound(err) {
- // Dynamic ConfigMap does not exist yet — create it with the new trait.
- cm = &corev1.ConfigMap{
- ObjectMeta: metav1.ObjectMeta{
- Name: s.customTraitsConfigMapKey().Name,
- Namespace: s.customTraitsConfigMapKey().Namespace,
- },
- Data: map[string]string{configMapKeyTraits: "[]"},
- }
- current := map[string]struct{}{name: {}}
- if err := s.writeTraits(cm, current); err != nil {
- log.Error(err, "failed to serialize traits", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
+ s.forwardWithHook(w, r, func(w http.ResponseWriter, resp *http.Response) {
+ for k, vs := range resp.Header {
+ for _, v := range vs {
+ w.Header().Add(k, v)
+ }
}
- if err := s.Create(ctx, cm); err != nil {
- log.Error(err, "failed to create custom traits configmap", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
+ w.WriteHeader(resp.StatusCode)
+ if resp.Body != nil {
+ io.Copy(w, resp.Body) //nolint:errcheck
}
- log.Info("created custom traits configmap with new trait", "trait", name)
- s.syncTraitToUpstream(ctx, name, r.Header)
- w.WriteHeader(http.StatusCreated)
- return
- }
- if err != nil {
- log.Error(err, "failed to get custom traits configmap", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
- }
- current, err := parseTraits(cm)
- if err != nil {
- log.Error(err, "failed to parse custom traits configmap", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
- }
- if _, exists := current[name]; exists {
- log.Info("trait already exists in custom configmap after lock acquisition", "trait", name)
- w.WriteHeader(http.StatusNoContent)
- return
- }
- current[name] = struct{}{}
- if err := s.writeTraits(cm, current); err != nil {
- log.Error(err, "failed to serialize traits", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
- }
- if err := s.Update(ctx, cm); err != nil {
- log.Error(err, "failed to update custom traits configmap", "trait", name)
- http.Error(w, "failed to create trait", http.StatusInternalServerError)
- return
- }
- log.Info("added custom trait to configmap", "trait", name)
- s.syncTraitToUpstream(ctx, name, r.Header)
- w.WriteHeader(http.StatusCreated)
+ if resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusNoContent {
+ if _, err := s.addTraitToConfigMap(ctx, name); err != nil {
+ log.Error(err, "hybrid: failed to add trait to local configmap", "trait", name)
+ }
+ }
+ })
}
// HandleDeleteTrait handles DELETE /traits/{name} requests.
//
-// Deletes a custom trait from the dynamic ConfigMap. Standard traits (those
-// without the CUSTOM_ prefix) cannot be deleted and return 400 Bad Request.
-// Returns 404 if the trait does not exist. Returns 204 No Content on success.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream; on success, removes the trait from the local ConfigMap.
+// - crd: removes the trait from the local ConfigMap (CUSTOM_ prefix required).
//
// See: https://docs.openstack.org/api-ref/placement/#delete-traits
func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
- case FeatureModePassthrough, FeatureModeHybrid:
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits)
+ switch mode {
+ case FeatureModePassthrough:
s.forward(w, r)
return
+ case FeatureModeHybrid:
+ s.handleDeleteTraitHybrid(w, r)
+ return
case FeatureModeCRD:
- // Serve from local ConfigMaps.
+ // Handle locally.
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
return
@@ -319,101 +259,60 @@ func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) {
return
}
- host, err := os.Hostname()
+ removed, err := s.removeTraitFromConfigMap(ctx, name)
if err != nil {
- host = "unknown"
- }
- lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
- if err := s.resourceLocker.AcquireLock(ctx, s.traitsLockName(), lockerID); err != nil {
- log.Error(err, "failed to acquire traits lock", "trait", name)
+ log.Error(err, "failed to delete trait", "trait", name)
http.Error(w, "failed to delete trait", http.StatusInternalServerError)
return
}
- defer func() {
- releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
- if err := s.resourceLocker.ReleaseLock(releaseCtx, s.traitsLockName(), lockerID); err != nil {
- log.Error(err, "failed to release traits lock")
- }
- }()
-
- cm := &corev1.ConfigMap{}
- err = s.Get(ctx, s.customTraitsConfigMapKey(), cm)
- if apierrors.IsNotFound(err) {
- log.Info("custom traits configmap not found, trait does not exist", "trait", name)
- http.Error(w, "trait not found", http.StatusNotFound)
- return
- }
- if err != nil {
- log.Error(err, "failed to get custom traits configmap", "trait", name)
- http.Error(w, "failed to delete trait", http.StatusInternalServerError)
- return
- }
- current, err := parseTraits(cm)
- if err != nil {
- log.Error(err, "failed to parse custom traits configmap", "trait", name)
- http.Error(w, "failed to delete trait", http.StatusInternalServerError)
- return
- }
- if _, exists := current[name]; !exists {
- log.Info("trait not found in custom configmap", "trait", name)
+ if !removed {
+ log.Info("trait not found in configmap", "trait", name)
http.Error(w, "trait not found", http.StatusNotFound)
return
}
- delete(current, name)
- if err := s.writeTraits(cm, current); err != nil {
- log.Error(err, "failed to serialize traits", "trait", name)
- http.Error(w, "failed to delete trait", http.StatusInternalServerError)
- return
- }
- if err := s.Update(ctx, cm); err != nil {
- log.Error(err, "failed to update custom traits configmap", "trait", name)
- http.Error(w, "failed to delete trait", http.StatusInternalServerError)
- return
- }
- log.Info("deleted custom trait from configmap", "trait", name)
+ log.Info("deleted trait from configmap", "trait", name)
w.WriteHeader(http.StatusNoContent)
}
-// getStaticTraits reads traits from the Helm-managed static ConfigMap.
-func (s *Shim) getStaticTraits(ctx context.Context) (map[string]struct{}, error) {
- cm := &corev1.ConfigMap{}
- if err := s.Get(ctx, s.staticTraitsConfigMapKey(), cm); err != nil {
- return nil, fmt.Errorf("get static configmap %s: %w", s.config.Traits.ConfigMapName, err)
+// handleDeleteTraitHybrid forwards DELETE /traits/{name} to upstream, then
+// updates the local ConfigMap on success.
+func (s *Shim) handleDeleteTraitHybrid(w http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
+ return
}
- return parseTraits(cm)
+
+ s.forwardWithHook(w, r, func(w http.ResponseWriter, resp *http.Response) {
+ for k, vs := range resp.Header {
+ for _, v := range vs {
+ w.Header().Add(k, v)
+ }
+ }
+ w.WriteHeader(resp.StatusCode)
+ if resp.Body != nil {
+ io.Copy(w, resp.Body) //nolint:errcheck
+ }
+
+ if resp.StatusCode == http.StatusNoContent {
+ if _, err := s.removeTraitFromConfigMap(ctx, name); err != nil {
+ log.Error(err, "hybrid: failed to remove trait from local configmap", "trait", name)
+ }
+ }
+ })
}
-// getCustomTraits reads traits from the dynamic ConfigMap created by the shim.
-// Returns an empty set if the ConfigMap does not exist yet.
-func (s *Shim) getCustomTraits(ctx context.Context) (map[string]struct{}, error) {
+// getTraits reads traits from the single ConfigMap.
+func (s *Shim) getTraits(ctx context.Context) (map[string]struct{}, error) {
cm := &corev1.ConfigMap{}
- err := s.Get(ctx, s.customTraitsConfigMapKey(), cm)
- if apierrors.IsNotFound(err) {
- return make(map[string]struct{}), nil
- }
- if err != nil {
- return nil, fmt.Errorf("get custom configmap %s-custom: %w", s.config.Traits.ConfigMapName, err)
+ if err := s.Get(ctx, client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.Traits.ConfigMapName}, cm); err != nil {
+ return nil, fmt.Errorf("get traits configmap %s: %w", s.config.Traits.ConfigMapName, err)
}
return parseTraits(cm)
}
-// getAllTraits merges static and custom traits into a single set.
-func (s *Shim) getAllTraits(ctx context.Context) (map[string]struct{}, error) {
- static, err := s.getStaticTraits(ctx)
- if err != nil {
- return nil, err
- }
- custom, err := s.getCustomTraits(ctx)
- if err != nil {
- return nil, err
- }
- for t := range custom {
- static[t] = struct{}{}
- }
- return static, nil
-}
-
// parseTraits extracts the trait set from a ConfigMap.
func parseTraits(cm *corev1.ConfigMap) (map[string]struct{}, error) {
raw, ok := cm.Data[configMapKeyTraits]
@@ -432,7 +331,7 @@ func parseTraits(cm *corev1.ConfigMap) (map[string]struct{}, error) {
}
func (s *Shim) hasTrait(ctx context.Context, name string) (bool, error) {
- traits, err := s.getAllTraits(ctx)
+ traits, err := s.getTraits(ctx)
if err != nil {
return false, err
}
@@ -440,8 +339,8 @@ func (s *Shim) hasTrait(ctx context.Context, name string) (bool, error) {
return ok, nil
}
-// writeTraits serializes the trait set into the ConfigMap's data field.
-func (s *Shim) writeTraits(cm *corev1.ConfigMap, traitSet map[string]struct{}) error {
+// writeTraitsToConfigMap serializes the trait set into the ConfigMap's data field.
+func writeTraitsToConfigMap(cm *corev1.ConfigMap, traitSet map[string]struct{}) error {
traits := make([]string, 0, len(traitSet))
for t := range traitSet {
traits = append(traits, t)
@@ -459,121 +358,110 @@ func (s *Shim) writeTraits(cm *corev1.ConfigMap, traitSet map[string]struct{}) e
return nil
}
-// syncTraitToUpstream best-effort creates the trait in upstream placement so
-// that endpoints forwarded to upstream (e.g. PUT /resource_providers/{uuid}/traits)
-// can reference locally-created custom traits. Errors are logged but never
-// propagated — upstream may be unreachable and that is acceptable.
-func (s *Shim) syncTraitToUpstream(ctx context.Context, name string, incomingHeader http.Header) {
- log := logf.FromContext(ctx)
- if s.httpClient == nil {
- log.V(1).Info("skipping upstream trait sync, no http client configured", "trait", name)
- return
- }
- u, err := url.Parse(s.config.PlacementURL)
+// addTraitToConfigMap adds a trait to the ConfigMap under the resource lock.
+// Returns true if the trait was newly created, false if it already existed.
+func (s *Shim) addTraitToConfigMap(ctx context.Context, name string) (bool, error) {
+ // Fast path: trait already exists (no lock needed).
+ traits, err := s.getTraits(ctx)
if err != nil {
- log.Error(err, "failed to parse placement URL for trait sync", "trait", name)
- return
+ return false, err
}
- u.Path, err = url.JoinPath(u.Path, "/traits/"+name)
- if err != nil {
- log.Error(err, "failed to build upstream trait URL", "trait", name)
- return
+ if _, exists := traits[name]; exists {
+ return false, nil
}
- req, err := http.NewRequestWithContext(ctx, http.MethodPut, u.String(), http.NoBody)
+
+ // Slow path: acquire lock, re-read, add trait.
+ host, err := os.Hostname()
if err != nil {
- log.Error(err, "failed to create upstream trait request", "trait", name)
- return
+ return false, fmt.Errorf("get hostname: %w", err)
}
- // Forward authentication headers so upstream placement accepts the request.
- req.Header = incomingHeader.Clone()
- resp, err := s.httpClient.Do(req)
- if err != nil {
- log.Info("best-effort upstream trait sync failed, upstream may be down", "trait", name, "error", err.Error())
- return
+ lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
+ if err := s.resourceLocker.AcquireLock(ctx, s.config.Traits.ConfigMapName+"-lock", lockerID); err != nil {
+ return false, fmt.Errorf("acquire traits lock: %w", err)
}
- defer resp.Body.Close()
- log.Info("synced custom trait to upstream placement", "trait", name, "status", resp.StatusCode)
-}
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID) //nolint:errcheck
+ }()
-// startTraitSyncLoop runs a periodic goroutine that fetches traits from
-// upstream placement and writes them into the static ConfigMap. Only active
-// when features.traits is hybrid. The loop exits when ctx is cancelled.
-func (s *Shim) startTraitSyncLoop(ctx context.Context) {
- if s.config.Features.Traits.orDefault() != FeatureModeHybrid {
- return
+ cm := &corev1.ConfigMap{}
+ key := client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.Traits.ConfigMapName}
+ if err := s.Get(ctx, key, cm); err != nil {
+ if apierrors.IsNotFound(err) {
+ cm = &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: key.Name,
+ Namespace: key.Namespace,
+ },
+ Data: map[string]string{configMapKeyTraits: "[]"},
+ }
+ current := map[string]struct{}{name: {}}
+ if err := writeTraitsToConfigMap(cm, current); err != nil {
+ return false, err
+ }
+ if err := s.Create(ctx, cm); err != nil {
+ return false, fmt.Errorf("create traits configmap: %w", err)
+ }
+ return true, nil
+ }
+ return false, fmt.Errorf("get traits configmap: %w", err)
}
- log := ctrl.Log.WithName("placement-shim").WithName("trait-sync")
- jitter := time.Duration(rand.Int63n(int64(30 * time.Second))) //nolint:gosec
- log.Info("starting trait sync loop", "jitter", jitter)
- select {
- case <-ctx.Done():
- return
- case <-time.After(jitter):
+ current, err := parseTraits(cm)
+ if err != nil {
+ return false, err
}
-
- s.syncTraitsFromUpstream(ctx, log)
-
- ticker := time.NewTicker(60 * time.Second)
- defer ticker.Stop()
- for {
- select {
- case <-ctx.Done():
- return
- case <-ticker.C:
- s.syncTraitsFromUpstream(ctx, log)
- }
+ if _, exists := current[name]; exists {
+ return false, nil
}
-}
-
-// syncTraitsFromUpstream fetches GET /traits from upstream placement and
-// writes the result into the static ConfigMap so that the shim's local
-// view stays in sync with upstream. Uses the gophercloud ServiceClient
-// for automatic token management (including reauth on 401).
-func (s *Shim) syncTraitsFromUpstream(ctx context.Context, log logr.Logger) {
- if s.placementServiceClient == nil {
- log.V(1).Info("skipping upstream trait sync, no placement service client configured")
- return
+ current[name] = struct{}{}
+ if err := writeTraitsToConfigMap(cm, current); err != nil {
+ return false, err
}
- u, err := url.JoinPath(s.placementServiceClient.Endpoint, "/traits")
- if err != nil {
- log.Error(err, "failed to build upstream traits URL")
- return
+ if err := s.Update(ctx, cm); err != nil {
+ return false, fmt.Errorf("update traits configmap: %w", err)
}
- resp, err := s.placementServiceClient.Request(ctx, http.MethodGet, u, &gophercloud.RequestOpts{
- OkCodes: []int{http.StatusOK},
- MoreHeaders: map[string]string{
- "OpenStack-API-Version": "placement 1.6",
- },
- KeepResponseBody: true,
- })
+ return true, nil
+}
+
+// removeTraitFromConfigMap removes a trait from the ConfigMap under the
+// resource lock. Returns true if the trait was found and removed.
+func (s *Shim) removeTraitFromConfigMap(ctx context.Context, name string) (bool, error) {
+ host, err := os.Hostname()
if err != nil {
- log.Info("upstream trait sync failed", "error", err.Error())
- return
+ return false, fmt.Errorf("get hostname: %w", err)
}
- defer resp.Body.Close()
- var body traitsListResponse
- if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
- log.Error(err, "failed to decode upstream trait list")
- return
+ lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
+ if err := s.resourceLocker.AcquireLock(ctx, s.config.Traits.ConfigMapName+"-lock", lockerID); err != nil {
+ return false, fmt.Errorf("acquire traits lock: %w", err)
}
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID) //nolint:errcheck
+ }()
cm := &corev1.ConfigMap{}
- if err := s.Get(ctx, s.staticTraitsConfigMapKey(), cm); err != nil {
- log.Error(err, "failed to get static traits configmap for sync")
- return
+ if err := s.Get(ctx, client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.Traits.ConfigMapName}, cm); err != nil {
+ if apierrors.IsNotFound(err) {
+ return false, nil
+ }
+ return false, fmt.Errorf("get traits configmap: %w", err)
}
- traitSet := make(map[string]struct{}, len(body.Traits))
- for _, t := range body.Traits {
- traitSet[t] = struct{}{}
+ current, err := parseTraits(cm)
+ if err != nil {
+ return false, err
}
- if err := s.writeTraits(cm, traitSet); err != nil {
- log.Error(err, "failed to serialize synced traits")
- return
+ if _, exists := current[name]; !exists {
+ return false, nil
+ }
+ delete(current, name)
+ if err := writeTraitsToConfigMap(cm, current); err != nil {
+ return false, err
}
if err := s.Update(ctx, cm); err != nil {
- log.Error(err, "failed to update static traits configmap with upstream data")
- return
+ return false, fmt.Errorf("update traits configmap: %w", err)
}
- log.Info("synced traits from upstream placement", "count", len(body.Traits))
+ return true, nil
}
diff --git a/internal/shim/placement/handle_traits_e2e.go b/internal/shim/placement/handle_traits_e2e.go
index 4a5831f72..9c204e0f8 100644
--- a/internal/shim/placement/handle_traits_e2e.go
+++ b/internal/shim/placement/handle_traits_e2e.go
@@ -80,13 +80,10 @@ func e2eTestTraits(ctx context.Context, _ client.Client) error {
if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
return fmt.Errorf("failed to decode GET /traits response: %w", err)
}
- // When traits are served locally (hybrid or crd mode) the static list may
- // be empty. Only require at least one trait when forwarding to upstream
+ // When traits are served locally (hybrid or crd mode) the list may be
+ // empty. Only require at least one trait when forwarding to upstream
// placement, which always has standard traits.
traitsMode := e2eCurrentMode(ctx)
- if traitsMode == "" {
- traitsMode = config.Features.Traits.orDefault()
- }
if traitsMode == FeatureModePassthrough && len(listResp.Traits) == 0 {
return errors.New("GET /traits: expected at least one trait, got 0")
}
@@ -135,19 +132,9 @@ func e2eTestTraits(ctx context.Context, _ client.Client) error {
}
log.Info("Correctly received 404 for nonexistent trait")
- // ==================== Phase 2: CRUD tests (feature-gated) ====================
-
- // CRUD tests require traits ConfigMaps which are only created when the
- // configured traits mode is hybrid or crd. The override header changes
- // handler routing but cannot create ConfigMaps that don't exist.
- configuredTraitsMode := config.Features.Traits.orDefault()
- if traitsMode == FeatureModePassthrough || configuredTraitsMode == FeatureModePassthrough {
- log.Info("Skipping trait CRUD e2e tests",
- "overrideMode", traitsMode, "configuredMode", configuredTraitsMode)
- return nil
- }
+ // ==================== Phase 2: CRUD tests ====================
- log.Info("=== Phase 2: CRUD trait tests (traits mode non-passthrough) ===")
+ log.Info("=== Phase 2: CRUD trait tests ===")
const testTrait = "CUSTOM_CORTEX_E2E_TRAIT"
diff --git a/internal/shim/placement/handle_traits_test.go b/internal/shim/placement/handle_traits_test.go
index bf692fd41..c7531796a 100644
--- a/internal/shim/placement/handle_traits_test.go
+++ b/internal/shim/placement/handle_traits_test.go
@@ -7,7 +7,6 @@ import (
"context"
"encoding/json"
"net/http"
- "net/http/httptest"
"testing"
"github.com/cobaltcore-dev/cortex/pkg/resourcelock"
@@ -42,13 +41,12 @@ func newTestConfigMap(namespace, name string, traits []string) *corev1.ConfigMap
}
}
-func newTraitShim(t *testing.T, staticTraits []string, customTraits ...string) *Shim {
+func newTraitShim(t *testing.T, traits []string, extraTraits ...string) *Shim {
t.Helper()
t.Setenv("POD_NAMESPACE", "default")
- objs := []client.Object{newTestConfigMap("default", "test-cm", staticTraits)}
- if len(customTraits) > 0 {
- objs = append(objs, newTestConfigMap("default", "test-cm-custom", customTraits))
- }
+ all := append([]string{}, traits...)
+ all = append(all, extraTraits...)
+ objs := []client.Object{newTestConfigMap("default", "test-cm", all)}
cl := newFakeClientWithScheme(t, objs...)
down, up := newTestTimers()
return &Shim{
@@ -264,39 +262,11 @@ func TestHandleUpdateTraitLocalBadPrefix(t *testing.T) {
}
}
-func TestHandleUpdateTraitLocalSyncsToUpstream(t *testing.T) {
- var gotMethod, gotPath string
- upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- gotMethod = r.Method
- gotPath = r.URL.Path
- w.WriteHeader(http.StatusNoContent)
- }))
- t.Cleanup(upstream.Close)
+func TestHandleUpdateTraitLocalNoUpstreamContact(t *testing.T) {
s := newTraitShim(t, nil)
- s.config.PlacementURL = upstream.URL
- s.httpClient = upstream.Client()
-
- w := serveHandler(t, "PUT", "/traits/{name}", s.HandleUpdateTrait, "/traits/CUSTOM_NEW")
- if w.Code != http.StatusCreated {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated)
- }
- if gotMethod != "PUT" || gotPath != "/traits/CUSTOM_NEW" {
- t.Fatalf("upstream got %s %s, want PUT /traits/CUSTOM_NEW", gotMethod, gotPath)
- }
-}
-
-func TestHandleUpdateTraitLocalUpstreamDown(t *testing.T) {
- upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
- w.WriteHeader(http.StatusInternalServerError)
- }))
- t.Cleanup(upstream.Close)
- s := newTraitShim(t, nil)
- s.config.PlacementURL = upstream.URL
- s.httpClient = upstream.Client()
-
w := serveHandler(t, "PUT", "/traits/{name}", s.HandleUpdateTrait, "/traits/CUSTOM_NEW")
if w.Code != http.StatusCreated {
- t.Fatalf("status = %d, want %d; upstream failure should not block local creation", w.Code, http.StatusCreated)
+ t.Fatalf("status = %d, want %d; CRD mode should not contact upstream", w.Code, http.StatusCreated)
}
}
diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go
index b46546b63..f601788a4 100644
--- a/internal/shim/placement/shim.go
+++ b/internal/shim/placement/shim.go
@@ -247,15 +247,15 @@ func (c *config) validate() error {
}
}
traitsMode := c.Features.Traits.orDefault()
- if traitsMode == FeatureModeHybrid || traitsMode == FeatureModeCRD {
- if c.Traits == nil {
- return fmt.Errorf("traits config is required when features.traits is %s", traitsMode)
- }
+ if traitsMode != FeatureModePassthrough && c.Traits == nil {
+ return fmt.Errorf("traits config is required when features.traits is %s", traitsMode)
+ }
+ if c.Traits != nil {
if c.Traits.ConfigMapName == "" {
- return fmt.Errorf("traits.configMapName is required when features.traits is %s", traitsMode)
+ return errors.New("traits.configMapName is required when traits config is present")
}
- if traitsMode == FeatureModeCRD && os.Getenv("POD_NAMESPACE") == "" {
- return errors.New("pod namespace (POD_NAMESPACE) is required when features.traits is crd")
+ if os.Getenv("POD_NAMESPACE") == "" {
+ return errors.New("pod namespace (POD_NAMESPACE) is required when traits config is present")
}
}
if c.Auth != nil && c.KeystoneURL == "" {
@@ -303,14 +303,17 @@ type Shim struct {
tokenCache *tokenCache
// tokenIntrospector validates tokens against Keystone.
tokenIntrospector tokenIntrospector
- // resourceLocker serializes writes to the custom traits ConfigMap
- // across replicas using a Kubernetes Lease.
+ // resourceLocker serializes writes to ConfigMaps across replicas
+ // using a Kubernetes Lease.
resourceLocker *resourcelock.ResourceLocker
// placementServiceClient is an authenticated gophercloud service client
// used by background tasks (trait sync) to make requests to upstream
// placement with automatic token management (including reauth on 401).
// Nil when Keystone credentials are not configured.
placementServiceClient *gophercloud.ServiceClient
+ // syncers are background workers that manage ConfigMap-backed local
+ // stores (e.g. traits, resource classes). Started uniformly in Start.
+ syncers []Syncer
}
// Describe implements prometheus.Collector.
@@ -433,7 +436,26 @@ func (s *Shim) Start(ctx context.Context) error {
if err := s.initPlacementServiceClient(ctx); err != nil {
return err
}
- go s.startTraitSyncLoop(ctx)
+ if s.config.Traits != nil {
+ s.syncers = append(s.syncers, NewTraitSyncer(
+ s.Client,
+ s.config.Traits.ConfigMapName,
+ os.Getenv("POD_NAMESPACE"),
+ s.placementServiceClient,
+ s.resourceLocker,
+ ))
+ }
+ for _, syncer := range s.syncers {
+ if err := syncer.Init(ctx); err != nil {
+ return err
+ }
+ }
+ traitsMode := s.config.Features.Traits.orDefault()
+ for _, syncer := range s.syncers {
+ if traitsMode == FeatureModeHybrid || traitsMode == FeatureModePassthrough {
+ go syncer.Run(ctx)
+ }
+ }
return nil
}
diff --git a/internal/shim/placement/syncer.go b/internal/shim/placement/syncer.go
new file mode 100644
index 000000000..f00487fe9
--- /dev/null
+++ b/internal/shim/placement/syncer.go
@@ -0,0 +1,19 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package placement
+
+import "context"
+
+// Syncer manages the lifecycle of a ConfigMap-backed local store:
+// creating the ConfigMap on startup, and running a periodic background
+// sync from upstream placement.
+type Syncer interface {
+ // Init creates the ConfigMap if it does not exist. Called once during
+ // Shim.Start before any requests are served.
+ Init(ctx context.Context) error
+
+ // Run starts the periodic background sync from upstream. Blocks until
+ // ctx is cancelled. Called as a goroutine from Shim.Start.
+ Run(ctx context.Context)
+}
diff --git a/internal/shim/placement/syncer_traits.go b/internal/shim/placement/syncer_traits.go
new file mode 100644
index 000000000..d8067e1f3
--- /dev/null
+++ b/internal/shim/placement/syncer_traits.go
@@ -0,0 +1,173 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package placement
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "math/rand"
+ "net/http"
+ "net/url"
+ "os"
+ "time"
+
+ "github.com/cobaltcore-dev/cortex/pkg/resourcelock"
+ "github.com/gophercloud/gophercloud/v2"
+ corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// TraitSyncer manages the lifecycle of the single traits ConfigMap.
+// It creates the ConfigMap on startup and periodically syncs from upstream.
+type TraitSyncer struct {
+ client client.Client
+ configMapName string
+ namespace string
+ placementClient *gophercloud.ServiceClient
+ resourceLocker *resourcelock.ResourceLocker
+}
+
+func NewTraitSyncer(
+ cl client.Client,
+ configMapName string,
+ namespace string,
+ placementClient *gophercloud.ServiceClient,
+ resourceLocker *resourcelock.ResourceLocker,
+) *TraitSyncer {
+
+ return &TraitSyncer{
+ client: cl,
+ configMapName: configMapName,
+ namespace: namespace,
+ placementClient: placementClient,
+ resourceLocker: resourceLocker,
+ }
+}
+
+// Init creates the traits ConfigMap if it does not already exist.
+func (ts *TraitSyncer) Init(ctx context.Context) error {
+ log := ctrl.Log.WithName("placement-shim").WithName("trait-syncer")
+ cm := &corev1.ConfigMap{}
+ key := client.ObjectKey{Namespace: ts.namespace, Name: ts.configMapName}
+ err := ts.client.Get(ctx, key, cm)
+ if err == nil {
+ log.Info("Traits ConfigMap already exists", "name", ts.configMapName)
+ return nil
+ }
+ if !apierrors.IsNotFound(err) {
+ return fmt.Errorf("checking traits configmap: %w", err)
+ }
+ cm = &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: ts.configMapName,
+ Namespace: ts.namespace,
+ },
+ Data: map[string]string{configMapKeyTraits: "[]"},
+ }
+ if err := ts.client.Create(ctx, cm); err != nil {
+ if apierrors.IsAlreadyExists(err) {
+ log.Info("Traits ConfigMap was created concurrently", "name", ts.configMapName)
+ return nil
+ }
+ return fmt.Errorf("creating traits configmap: %w", err)
+ }
+ log.Info("Created traits ConfigMap", "name", ts.configMapName)
+ return nil
+}
+
+// Run starts the periodic background sync from upstream placement.
+// Blocks until ctx is cancelled.
+func (ts *TraitSyncer) Run(ctx context.Context) {
+ log := ctrl.Log.WithName("placement-shim").WithName("trait-syncer")
+ if ts.placementClient == nil {
+ log.Info("No placement service client configured, trait sync loop will not run")
+ return
+ }
+
+ jitter := time.Duration(rand.Int63n(int64(30 * time.Second))) //nolint:gosec
+ log.Info("Starting trait sync loop", "jitter", jitter)
+
+ select {
+ case <-ctx.Done():
+ return
+ case <-time.After(jitter):
+ }
+
+ ts.sync(ctx)
+
+ ticker := time.NewTicker(60 * time.Second)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-ticker.C:
+ ts.sync(ctx)
+ }
+ }
+}
+
+// sync fetches GET /traits from upstream placement and writes the result
+// into the ConfigMap under the resource lock.
+func (ts *TraitSyncer) sync(ctx context.Context) {
+ log := ctrl.Log.WithName("placement-shim").WithName("trait-syncer")
+ u, err := url.JoinPath(ts.placementClient.Endpoint, "/traits")
+ if err != nil {
+ log.Error(err, "Failed to build upstream traits URL")
+ return
+ }
+ resp, err := ts.placementClient.Request(ctx, http.MethodGet, u, &gophercloud.RequestOpts{
+ OkCodes: []int{http.StatusOK},
+ MoreHeaders: map[string]string{
+ "OpenStack-API-Version": "placement 1.6",
+ },
+ KeepResponseBody: true,
+ })
+ if err != nil {
+ log.Info("Upstream trait sync failed", "error", err.Error())
+ return
+ }
+ defer resp.Body.Close()
+ var body traitsListResponse
+ if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
+ log.Error(err, "Failed to decode upstream trait list")
+ return
+ }
+
+ host, _ := os.Hostname() //nolint:errcheck
+ lockerID := fmt.Sprintf("syncer-%s-%d", host, time.Now().UnixNano())
+ lockName := ts.configMapName + "-lock"
+ if err := ts.resourceLocker.AcquireLock(ctx, lockName, lockerID); err != nil {
+ log.Error(err, "Failed to acquire lock for trait sync")
+ return
+ }
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = ts.resourceLocker.ReleaseLock(releaseCtx, lockName, lockerID) //nolint:errcheck
+ }()
+
+ cm := &corev1.ConfigMap{}
+ if err := ts.client.Get(ctx, client.ObjectKey{Namespace: ts.namespace, Name: ts.configMapName}, cm); err != nil {
+ log.Error(err, "Failed to get traits ConfigMap for sync")
+ return
+ }
+ traitSet := make(map[string]struct{}, len(body.Traits))
+ for _, t := range body.Traits {
+ traitSet[t] = struct{}{}
+ }
+ if err := writeTraitsToConfigMap(cm, traitSet); err != nil {
+ log.Error(err, "Failed to serialize synced traits")
+ return
+ }
+ if err := ts.client.Update(ctx, cm); err != nil {
+ log.Error(err, "Failed to update traits ConfigMap with upstream data")
+ return
+ }
+ log.Info("Synced traits from upstream placement", "count", len(body.Traits))
+}
diff --git a/internal/shim/placement/syncer_traits_test.go b/internal/shim/placement/syncer_traits_test.go
new file mode 100644
index 000000000..23cada155
--- /dev/null
+++ b/internal/shim/placement/syncer_traits_test.go
@@ -0,0 +1,145 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package placement
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+
+ "github.com/cobaltcore-dev/cortex/pkg/resourcelock"
+ "github.com/gophercloud/gophercloud/v2"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+func TestTraitSyncerInitCreatesConfigMap(t *testing.T) {
+ cl := newFakeClientWithScheme(t)
+ ts := NewTraitSyncer(cl, "test-traits", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ if err := ts.Init(context.Background()); err != nil {
+ t.Fatalf("Init: %v", err)
+ }
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-traits"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyTraits] != "[]" {
+ t.Fatalf("expected empty traits array, got %q", cm.Data[configMapKeyTraits])
+ }
+}
+
+func TestTraitSyncerInitIdempotent(t *testing.T) {
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-traits", Namespace: "default"},
+ Data: map[string]string{configMapKeyTraits: `["CUSTOM_EXISTING"]`},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+ ts := NewTraitSyncer(cl, "test-traits", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ if err := ts.Init(context.Background()); err != nil {
+ t.Fatalf("Init: %v", err)
+ }
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-traits"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyTraits] != `["CUSTOM_EXISTING"]` {
+ t.Fatalf("Init overwrote existing data: got %q", cm.Data[configMapKeyTraits])
+ }
+}
+
+func TestTraitSyncerRunNoClient(t *testing.T) {
+ cl := newFakeClientWithScheme(t)
+ ts := NewTraitSyncer(cl, "test-traits", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ defer cancel()
+ ts.Run(ctx)
+}
+
+func TestTraitSyncerSyncWritesUpstreamTraits(t *testing.T) {
+ upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/traits" {
+ t.Errorf("unexpected path: %s", r.URL.Path)
+ }
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(traitsListResponse{
+ Traits: []string{"HW_CPU_X86_AVX2", "CUSTOM_SYNCED"},
+ }); err != nil {
+ t.Errorf("encode response: %v", err)
+ }
+ }))
+ t.Cleanup(upstream.Close)
+
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-traits", Namespace: "default"},
+ Data: map[string]string{configMapKeyTraits: "[]"},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+
+ sc := &gophercloud.ServiceClient{
+ ProviderClient: &gophercloud.ProviderClient{},
+ Endpoint: upstream.URL,
+ }
+ sc.HTTPClient = *upstream.Client()
+
+ ts := NewTraitSyncer(cl, "test-traits", "default", sc, resourcelock.NewResourceLocker(cl, "default"))
+ ts.sync(context.Background())
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-traits"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+
+ var traits []string
+ if err := json.Unmarshal([]byte(cm.Data[configMapKeyTraits]), &traits); err != nil {
+ t.Fatalf("unmarshal: %v", err)
+ }
+ if len(traits) != 2 {
+ t.Fatalf("expected 2 traits, got %d: %v", len(traits), traits)
+ }
+ want := map[string]bool{"CUSTOM_SYNCED": true, "HW_CPU_X86_AVX2": true}
+ for _, tr := range traits {
+ if !want[tr] {
+ t.Errorf("unexpected trait: %s", tr)
+ }
+ }
+}
+
+func TestTraitSyncerSyncUpstreamError(t *testing.T) {
+ upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+ w.WriteHeader(http.StatusServiceUnavailable)
+ }))
+ t.Cleanup(upstream.Close)
+
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-traits", Namespace: "default"},
+ Data: map[string]string{configMapKeyTraits: `["CUSTOM_ORIGINAL"]`},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+
+ sc := &gophercloud.ServiceClient{
+ ProviderClient: &gophercloud.ProviderClient{},
+ Endpoint: upstream.URL,
+ }
+ sc.HTTPClient = *upstream.Client()
+
+ ts := NewTraitSyncer(cl, "test-traits", "default", sc, resourcelock.NewResourceLocker(cl, "default"))
+ ts.sync(context.Background())
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-traits"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyTraits] != `["CUSTOM_ORIGINAL"]` {
+ t.Fatalf("sync should not have modified ConfigMap on error, got %q", cm.Data[configMapKeyTraits])
+ }
+}
From 72186d46074ecb8510155222fd2d4d54a8dfbd0d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 10:05:25 +0000
Subject: [PATCH 11/54] Bump cortex-shim chart appVersions to sha-ebbf9d44
[skip ci]
---
helm/library/cortex-shim/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index b95c5c282..d26f4fe7b 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
version: 0.0.3
-appVersion: "sha-166b515f"
+appVersion: "sha-ebbf9d44"
icon: "https://example.com/icon.png"
dependencies: []
From 9f2c0bbd9d45aa175ebbe6bf185a6131c013eb66 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 10:05:27 +0000
Subject: [PATCH 12/54] Bump cortex chart appVersions to sha-ebbf9d44 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 9f37b23c8..3cc0004cd 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-fd902614"
+appVersion: "sha-ebbf9d44"
icon: "https://example.com/icon.png"
dependencies: []
From 805531c0e46c9a841ad3baf7d3023c7fae0be27e Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:05:38 +0200
Subject: [PATCH 13/54] fix: bump artifact handles concurrent changes on main
(#770)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The workflow had two bugs when multiple PRs merged in quick succession:
Push races: separate sed and commit/push steps meant concurrent runs
could push simultaneously and one would fail, losing that chart bump
permanently with no retry.
Commit-age races: serializing pushes doesn't help if an older build
finishes after a newer one — the stale run would overwrite the chart
with an older SHA.
Fixed by adding a concurrency group to serialize runs, and a freshness
check in each bump step that skips if a newer code commit already covers
that component. The check uses git log scoped to the relevant paths for
path-gated charts, and excludes [skip ci] bump commits so earlier steps
in the same run don't cause a false skip.
---
.github/scripts/bump-chart.sh | 37 ++++++++++++++++++
.github/workflows/update-appversion.yml | 52 ++++++++-----------------
2 files changed, 53 insertions(+), 36 deletions(-)
create mode 100644 .github/scripts/bump-chart.sh
diff --git a/.github/scripts/bump-chart.sh b/.github/scripts/bump-chart.sh
new file mode 100644
index 000000000..b57f7965b
--- /dev/null
+++ b/.github/scripts/bump-chart.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Bumps appVersion in a Helm Chart.yaml, skipping if a newer code commit already
+# covers this component.
+#
+# Usage: bump-chart.sh <chart-yaml-path> <short-sha> <trigger-sha> [path-filter ...]
+#
+# path-filter args scope the freshness check to specific paths (e.g. postgres/).
+# Omit them for an unconditional bump (the main cortex chart).
+set -euo pipefail
+
+CHART=$1; SHORT_SHA=$2; TRIGGER_SHA=$3; shift 3
+PATHS=("$@")
+
+git config user.name "github-actions[bot]"
+git config user.email "github-actions[bot]@users.noreply.github.com"
+git fetch origin main
+git reset --hard origin/main
+
+# Exclude bump commits ([skip ci]) so earlier steps in this same run don't
+# falsely count as "newer code". Only real code commits trigger a skip.
+if [ ${#PATHS[@]} -gt 0 ]; then
+ NEWER=$(git log --oneline --invert-grep --grep='\[skip ci\]' "$TRIGGER_SHA..HEAD" -- "${PATHS[@]}")
+else
+ NEWER=$(git log --oneline --invert-grep --grep='\[skip ci\]' "$TRIGGER_SHA..HEAD")
+fi
+
+if [ -n "$NEWER" ]; then
+ echo "Skipping $CHART: newer code commits exist on main for this component"
+ exit 0
+fi
+
+CHART_NAME=$(basename "$(dirname "$CHART")")
+sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"'"$SHORT_SHA"'"/' "$CHART"
+git add "$CHART"
+git diff --cached --quiet && { echo "No changes to commit for $CHART_NAME"; exit 0; }
+git commit -m "Bump $CHART_NAME chart appVersions to $SHORT_SHA [skip ci]"
+git push origin HEAD:main
diff --git a/.github/workflows/update-appversion.yml b/.github/workflows/update-appversion.yml
index 20087fa80..89aba9f88 100644
--- a/.github/workflows/update-appversion.yml
+++ b/.github/workflows/update-appversion.yml
@@ -11,6 +11,13 @@ jobs:
if: >-
${{ github.event.workflow_run.conclusion == 'success' && !contains(github.event.workflow_run.head_commit.message, '[skip ci]') }}
runs-on: ubuntu-latest
+ # Serialize runs so concurrent merges don't race on git push.
+ # Example: PR A changes shim, PR B changes cortex — both trigger this
+ # workflow. Without serialization, one push fails and that chart bump is
+ # lost permanently (no future run will retry it).
+ concurrency:
+ group: update-appversion
+ cancel-in-progress: false
steps:
- name: Checkout repository
uses: actions/checkout@v6
@@ -30,21 +37,6 @@ jobs:
files: |
postgres/**
- # Only bumped if there are changes in the postgres directory.
- - name: Update appVersion in cortex-postgres Chart.yaml
- if: steps.changed_postgres_files.outputs.all_changed_files != ''
- run: |
- sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-postgres/Chart.yaml
- - name: Commit and push changes for cortex-postgres
- if: steps.changed_postgres_files.outputs.all_changed_files != ''
- run: |
- git config user.name "github-actions[bot]"
- git config user.email "github-actions[bot]@users.noreply.github.com"
- git add helm/library/cortex-postgres/Chart.yaml
- git commit -m "Bump cortex-postgres chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit"
- git push origin HEAD:main
-
- # Only bumped if there are changes in shim-related directories
- name: Get all changed shim files
id: changed_shim_files
uses: tj-actions/changed-files@v47
@@ -52,26 +44,14 @@ jobs:
files: |
internal/shim/**
cmd/shim/**
- - name: Update appVersion in cortex-shim Chart.yaml
- if: steps.changed_shim_files.outputs.all_changed_files != ''
- run: |
- sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-shim/Chart.yaml
- - name: Commit and push changes for cortex-shim
+
+ - name: Bump and push cortex-postgres appVersion
+ if: steps.changed_postgres_files.outputs.all_changed_files != ''
+ run: bash .github/scripts/bump-chart.sh helm/library/cortex-postgres/Chart.yaml "${{ steps.vars.outputs.sha }}" "${{ github.event.workflow_run.head_sha }}" postgres/
+
+ - name: Bump and push cortex-shim appVersion
if: steps.changed_shim_files.outputs.all_changed_files != ''
- run: |
- git config user.name "github-actions[bot]"
- git config user.email "github-actions[bot]@users.noreply.github.com"
- git add helm/library/cortex-shim/Chart.yaml
- git commit -m "Bump cortex-shim chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit"
- git push origin HEAD:main
+ run: bash .github/scripts/bump-chart.sh helm/library/cortex-shim/Chart.yaml "${{ steps.vars.outputs.sha }}" "${{ github.event.workflow_run.head_sha }}" internal/shim/ cmd/shim/
- - name: Update appVersion in helm/library/cortex/Chart.yaml
- run: |
- sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex/Chart.yaml
- - name: Commit and push changes for cortex
- run: |
- git config user.name "github-actions[bot]"
- git config user.email "github-actions[bot]@users.noreply.github.com"
- git add helm/library/cortex/Chart.yaml
- git commit -m "Bump cortex chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit"
- git push origin HEAD:main
+ - name: Bump and push cortex appVersion
+ run: bash .github/scripts/bump-chart.sh helm/library/cortex/Chart.yaml "${{ steps.vars.outputs.sha }}" "${{ github.event.workflow_run.head_sha }}"
From 17050b2fb06620783b28b615938d22108260d92c Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Wed, 29 Apr 2026 13:01:24 +0200
Subject: [PATCH 14/54] Feature-gated /resource_classes API with ConfigMap
storage (#772)
Implements three-mode support for the /resource_classes placement API
endpoints, following the same pattern established in the traits refactor
(PR #771). Previously these endpoints only forwarded to upstream
placement (returning 501 for hybrid/crd modes). Now passthrough forwards
to upstream, hybrid forwards and mirrors locally, and crd serves
entirely from a shim-owned ConfigMap. A ResourceClassSyncer periodically
fetches upstream state into the ConfigMap so that crd mode can serve
without depending on live upstream availability. This is part of the
phased placement API migration.
Assisted-by: claude-code:claude-opus-latest [Bash] [Read]
---
.../bundles/cortex-placement-shim/values.yaml | 2 +
.../shim/placement/handle_resource_classes.go | 528 ++++++++++++++++--
.../placement/handle_resource_classes_e2e.go | 271 +++++----
.../placement/handle_resource_classes_test.go | 344 +++++++++---
internal/shim/placement/handle_traits.go | 17 +-
internal/shim/placement/shim.go | 56 +-
internal/shim/placement/shim_test.go | 36 ++
.../shim/placement/syncer_resource_classes.go | 196 +++++++
.../placement/syncer_resource_classes_test.go | 149 +++++
internal/shim/placement/syncer_traits.go | 10 +-
10 files changed, 1368 insertions(+), 241 deletions(-)
create mode 100644 internal/shim/placement/syncer_resource_classes.go
create mode 100644 internal/shim/placement/syncer_resource_classes_test.go
diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml
index 54805cee3..eaf725d07 100644
--- a/helm/bundles/cortex-placement-shim/values.yaml
+++ b/helm/bundles/cortex-placement-shim/values.yaml
@@ -62,6 +62,8 @@ cortex-shim:
status: "CURRENT"
traits:
configMapName: "cortex-placement-shim-traits"
+ resourceClasses:
+ configMapName: "cortex-placement-shim-resource-classes"
auth:
tokenCacheTTL: "5m"
policies:
diff --git a/internal/shim/placement/handle_resource_classes.go b/internal/shim/placement/handle_resource_classes.go
index 9067079fd..9a3a9d4c2 100644
--- a/internal/shim/placement/handle_resource_classes.go
+++ b/internal/shim/placement/handle_resource_classes.go
@@ -4,68 +4,532 @@
package placement
import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
"net/http"
+ "os"
+ "sort"
+ "strings"
+ "time"
+
+ corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ logf "sigs.k8s.io/controller-runtime/pkg/log"
)
// HandleListResourceClasses handles GET /resource_classes requests.
//
-// Returns the complete list of all resource classes, including both standard
-// classes (e.g. VCPU, MEMORY_MB, DISK_GB, PCI_DEVICE, SRIOV_NET_VF) and
-// deployer-defined custom classes prefixed with CUSTOM_. Resource classes
-// categorize the types of resources that resource providers can offer as
-// inventory. Available since microversion 1.2.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream placement.
+// - crd: serves the resource class list from the local ConfigMap.
+//
+// See: https://docs.openstack.org/api-ref/placement/#list-resource-classes
func (s *Shim) HandleListResourceClasses(w http.ResponseWriter, r *http.Request) {
- s.dispatchPassthroughOnly(w, r, s.config.Features.ResourceClasses)
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses) {
+ case FeatureModePassthrough, FeatureModeHybrid:
+ s.forward(w, r)
+ return
+ case FeatureModeCRD:
+ // Serve from local ConfigMap.
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ return
+ }
+
+ rcSet, err := s.getResourceClasses(ctx)
+ if err != nil {
+ log.Error(err, "failed to list resource classes from configmap")
+ http.Error(w, "failed to list resource classes", http.StatusInternalServerError)
+ return
+ }
+
+ entries := make([]resourceClassEntry, 0, len(rcSet))
+ for name := range rcSet {
+ entries = append(entries, resourceClassEntry{
+ Name: name,
+ Links: []resourceClassLink{{Rel: "self", Href: "/resource_classes/" + name}},
+ })
+ }
+ sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name })
+
+ log.Info("listing all resource classes", "count", len(entries))
+ s.writeJSON(w, http.StatusOK, resourceClassesListResponse{ResourceClasses: entries})
}
// HandleCreateResourceClass handles POST /resource_classes requests.
//
-// Creates a new custom resource class. The name must be prefixed with CUSTOM_
-// to distinguish it from standard resource classes. Returns 201 Created with
-// a Location header on success. Returns 400 Bad Request if the CUSTOM_ prefix
-// is missing, and 409 Conflict if a class with the same name already exists.
-// Available since microversion 1.2.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream; on success, adds the class to the local ConfigMap.
+// - crd: writes the class to the local ConfigMap (CUSTOM_ prefix required).
+//
+// See: https://docs.openstack.org/api-ref/placement/#create-resource-class
func (s *Shim) HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) {
- s.dispatchPassthroughOnly(w, r, s.config.Features.ResourceClasses)
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ switch mode {
+ case FeatureModePassthrough:
+ s.forward(w, r)
+ return
+ case FeatureModeHybrid:
+ s.handleCreateResourceClassHybrid(w, r)
+ return
+ case FeatureModeCRD:
+ // Handle locally.
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ return
+ }
+
+ var body struct {
+ Name string `json:"name"`
+ }
+ if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Name == "" {
+ http.Error(w, "request body must contain a valid 'name' field", http.StatusBadRequest)
+ return
+ }
+ if !strings.HasPrefix(body.Name, "CUSTOM_") {
+ log.Info("rejected resource class without CUSTOM_ prefix", "class", body.Name)
+ http.Error(w, "resource class name must start with CUSTOM_", http.StatusBadRequest)
+ return
+ }
+
+ exists, err := s.hasResourceClass(ctx, body.Name)
+ if err != nil {
+ log.Error(err, "failed to check resource class", "class", body.Name)
+ http.Error(w, "failed to check resource class", http.StatusInternalServerError)
+ return
+ }
+ if exists {
+ http.Error(w, "resource class already exists", http.StatusConflict)
+ return
+ }
+
+ if _, err := s.addResourceClassToConfigMap(ctx, body.Name); err != nil {
+ log.Error(err, "failed to create resource class", "class", body.Name)
+ http.Error(w, "failed to create resource class", http.StatusInternalServerError)
+ return
+ }
+ w.WriteHeader(http.StatusCreated)
+}
+
+// handleCreateResourceClassHybrid forwards POST /resource_classes to upstream,
+// then updates the local ConfigMap on success.
+func (s *Shim) handleCreateResourceClassHybrid(w http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ s.forwardWithHook(w, r, func(w http.ResponseWriter, resp *http.Response) {
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ log.Error(err, "hybrid: failed to read upstream response body")
+ }
+ for k, vs := range resp.Header {
+ for _, v := range vs {
+ w.Header().Add(k, v)
+ }
+ }
+ w.WriteHeader(resp.StatusCode)
+ if _, err := w.Write(body); err != nil {
+ log.Error(err, "hybrid: failed to write response body")
+ }
+
+ if resp.StatusCode == http.StatusCreated {
+ var created struct {
+ Name string `json:"name"`
+ }
+ if err := json.Unmarshal(body, &created); err == nil && created.Name != "" {
+ if _, err := s.addResourceClassToConfigMap(ctx, created.Name); err != nil {
+ log.Error(err, "hybrid: failed to add resource class to local configmap", "class", created.Name)
+ }
+ }
+ }
+ })
}
// HandleShowResourceClass handles GET /resource_classes/{name} requests.
//
-// Returns a representation of a single resource class identified by name.
-// This can be used to verify the existence of a resource class. Returns 404
-// if the class does not exist. Available since microversion 1.2.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream placement.
+// - crd: checks the local ConfigMap for the resource class.
+//
+// See: https://docs.openstack.org/api-ref/placement/#show-resource-class
func (s *Shim) HandleShowResourceClass(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredPathParam(w, r, "name"); !ok {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses) {
+ case FeatureModePassthrough, FeatureModeHybrid:
+ s.forward(w, r)
+ return
+ case FeatureModeCRD:
+ // Serve from local ConfigMap.
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ return
+ }
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
+ return
+ }
+ found, err := s.hasResourceClass(ctx, name)
+ if err != nil {
+ log.Error(err, "failed to check resource class", "class", name)
+ http.Error(w, "failed to check resource class", http.StatusInternalServerError)
return
}
- s.dispatchPassthroughOnly(w, r, s.config.Features.ResourceClasses)
+ if !found {
+ log.Info("resource class not found", "class", name)
+ http.Error(w, "resource class not found", http.StatusNotFound)
+ return
+ }
+ log.Info("resource class found", "class", name)
+ s.writeJSON(w, http.StatusOK, resourceClassEntry{
+ Name: name,
+ Links: []resourceClassLink{{Rel: "self", Href: "/resource_classes/" + name}},
+ })
}
// HandleUpdateResourceClass handles PUT /resource_classes/{name} requests.
//
-// Behavior differs by microversion. Since microversion 1.7, this endpoint
-// creates or validates the existence of a single resource class: it returns
-// 201 Created for a new class or 204 No Content if the class already exists.
-// The name must carry the CUSTOM_ prefix. In earlier versions (1.2-1.6), the
-// endpoint allowed renaming a class via a request body, but this usage is
-// discouraged. Returns 400 Bad Request if the CUSTOM_ prefix is missing.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream; on success, adds the class to the local ConfigMap.
+// - crd: writes the class to the local ConfigMap (CUSTOM_ prefix required).
+//
+// See: https://docs.openstack.org/api-ref/placement/#update-resource-class
func (s *Shim) HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredPathParam(w, r, "name"); !ok {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ switch mode {
+ case FeatureModePassthrough:
+ s.forward(w, r)
+ return
+ case FeatureModeHybrid:
+ s.handleUpdateResourceClassHybrid(w, r)
+ return
+ case FeatureModeCRD:
+ // Handle locally.
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ return
+ }
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
+ return
+ }
+ if !strings.HasPrefix(name, "CUSTOM_") {
+ log.Info("rejected resource class without CUSTOM_ prefix", "class", name)
+ http.Error(w, "resource class name must start with CUSTOM_", http.StatusBadRequest)
+ return
+ }
+
+ created, err := s.addResourceClassToConfigMap(ctx, name)
+ if err != nil {
+ log.Error(err, "failed to create resource class", "class", name)
+ http.Error(w, "failed to create resource class", http.StatusInternalServerError)
+ return
+ }
+ if created {
+ w.WriteHeader(http.StatusCreated)
+ } else {
+ w.WriteHeader(http.StatusNoContent)
+ }
+}
+
+// handleUpdateResourceClassHybrid forwards PUT /resource_classes/{name} to
+// upstream, then updates the local ConfigMap on success.
+func (s *Shim) handleUpdateResourceClassHybrid(w http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
return
}
- s.dispatchPassthroughOnly(w, r, s.config.Features.ResourceClasses)
+
+ s.forwardWithHook(w, r, func(w http.ResponseWriter, resp *http.Response) {
+ for k, vs := range resp.Header {
+ for _, v := range vs {
+ w.Header().Add(k, v)
+ }
+ }
+ w.WriteHeader(resp.StatusCode)
+ if resp.Body != nil {
+ if _, err := io.Copy(w, resp.Body); err != nil {
+ log.Error(err, "hybrid: failed to copy upstream response body")
+ }
+ }
+
+ if resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusNoContent {
+ if _, err := s.addResourceClassToConfigMap(ctx, name); err != nil {
+ log.Error(err, "hybrid: failed to add resource class to local configmap", "class", name)
+ }
+ }
+ })
}
// HandleDeleteResourceClass handles DELETE /resource_classes/{name} requests.
//
-// Deletes a custom resource class. Only custom classes (prefixed with CUSTOM_)
-// may be deleted; attempting to delete a standard class returns 400 Bad
-// Request. Returns 409 Conflict if any resource provider has inventory of this
-// class, and 404 if the class does not exist. Returns 204 No Content on
-// success. Available since microversion 1.2.
+// Feature modes:
+// - passthrough: forwards to upstream placement.
+// - hybrid: forwards to upstream; on success, removes the class from the local ConfigMap.
+// - crd: removes the class from the local ConfigMap (CUSTOM_ prefix required).
+//
+// See: https://docs.openstack.org/api-ref/placement/#delete-resource-class
func (s *Shim) HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredPathParam(w, r, "name"); !ok {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ switch mode {
+ case FeatureModePassthrough:
+ s.forward(w, r)
+ return
+ case FeatureModeHybrid:
+ s.handleDeleteResourceClassHybrid(w, r)
+ return
+ case FeatureModeCRD:
+ // Handle locally.
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ return
+ }
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
+ return
+ }
+ if !strings.HasPrefix(name, "CUSTOM_") {
+ log.Info("rejected deletion of standard resource class", "class", name)
+ http.Error(w, "cannot delete standard resource classes", http.StatusBadRequest)
+ return
+ }
+
+ removed, err := s.removeResourceClassFromConfigMap(ctx, name)
+ if err != nil {
+ log.Error(err, "failed to delete resource class", "class", name)
+ http.Error(w, "failed to delete resource class", http.StatusInternalServerError)
+ return
+ }
+ if !removed {
+ log.Info("resource class not found in configmap", "class", name)
+ http.Error(w, "resource class not found", http.StatusNotFound)
return
}
- s.dispatchPassthroughOnly(w, r, s.config.Features.ResourceClasses)
+ log.Info("deleted resource class from configmap", "class", name)
+ w.WriteHeader(http.StatusNoContent)
+}
+
+// handleDeleteResourceClassHybrid forwards DELETE /resource_classes/{name} to
+// upstream, then updates the local ConfigMap on success.
+func (s *Shim) handleDeleteResourceClassHybrid(w http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ name, ok := requiredPathParam(w, r, "name")
+ if !ok {
+ return
+ }
+
+ s.forwardWithHook(w, r, func(w http.ResponseWriter, resp *http.Response) {
+ for k, vs := range resp.Header {
+ for _, v := range vs {
+ w.Header().Add(k, v)
+ }
+ }
+ w.WriteHeader(resp.StatusCode)
+ if resp.Body != nil {
+ if _, err := io.Copy(w, resp.Body); err != nil {
+ log.Error(err, "hybrid: failed to copy upstream response body")
+ }
+ }
+
+ if resp.StatusCode == http.StatusNoContent {
+ if _, err := s.removeResourceClassFromConfigMap(ctx, name); err != nil {
+ log.Error(err, "hybrid: failed to remove resource class from local configmap", "class", name)
+ }
+ }
+ })
+}
+
+// getResourceClasses reads resource classes from the single ConfigMap.
+func (s *Shim) getResourceClasses(ctx context.Context) (map[string]struct{}, error) {
+ cm := &corev1.ConfigMap{}
+ if err := s.Get(ctx, client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.ResourceClasses.ConfigMapName}, cm); err != nil {
+ return nil, fmt.Errorf("get resource classes configmap %s: %w", s.config.ResourceClasses.ConfigMapName, err)
+ }
+ return parseResourceClasses(cm)
+}
+
+// parseResourceClasses extracts the resource class set from a ConfigMap.
+func parseResourceClasses(cm *corev1.ConfigMap) (map[string]struct{}, error) {
+ raw, ok := cm.Data[configMapKeyResourceClasses]
+ if !ok || raw == "" {
+ return make(map[string]struct{}), nil
+ }
+ var classes []string
+ if err := json.Unmarshal([]byte(raw), &classes); err != nil {
+ return nil, fmt.Errorf("unmarshal resource classes from configmap: %w", err)
+ }
+ m := make(map[string]struct{}, len(classes))
+ for _, c := range classes {
+ m[c] = struct{}{}
+ }
+ return m, nil
+}
+
+func (s *Shim) hasResourceClass(ctx context.Context, name string) (bool, error) {
+ classes, err := s.getResourceClasses(ctx)
+ if err != nil {
+ return false, err
+ }
+ _, ok := classes[name]
+ return ok, nil
+}
+
+// writeResourceClassesToConfigMap serializes the resource class set into the ConfigMap.
+func writeResourceClassesToConfigMap(cm *corev1.ConfigMap, rcSet map[string]struct{}) error {
+ classes := make([]string, 0, len(rcSet))
+ for c := range rcSet {
+ classes = append(classes, c)
+ }
+ sort.Strings(classes)
+
+ data, err := json.Marshal(classes)
+ if err != nil {
+ return fmt.Errorf("marshal resource classes: %w", err)
+ }
+ if cm.Data == nil {
+ cm.Data = make(map[string]string)
+ }
+ cm.Data[configMapKeyResourceClasses] = string(data)
+ return nil
+}
+
+// addResourceClassToConfigMap adds a resource class to the ConfigMap under the
+// resource lock. Returns true if the class was newly created, false if it
+// already existed.
+func (s *Shim) addResourceClassToConfigMap(ctx context.Context, name string) (bool, error) {
+ classes, err := s.getResourceClasses(ctx)
+ if err != nil {
+ return false, err
+ }
+ if _, exists := classes[name]; exists {
+ return false, nil
+ }
+
+ host, err := os.Hostname()
+ if err != nil {
+ return false, fmt.Errorf("get hostname: %w", err)
+ }
+ lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
+ if err := s.resourceLocker.AcquireLock(ctx, s.config.ResourceClasses.ConfigMapName+"-lock", lockerID); err != nil {
+ return false, fmt.Errorf("acquire resource classes lock: %w", err)
+ }
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ if err := s.resourceLocker.ReleaseLock(releaseCtx, s.config.ResourceClasses.ConfigMapName+"-lock", lockerID); err != nil {
+ ctrl.Log.WithName("placement-shim").Error(err, "failed to release resource classes lock")
+ }
+ }()
+
+ cm := &corev1.ConfigMap{}
+ key := client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.ResourceClasses.ConfigMapName}
+ if err := s.Get(ctx, key, cm); err != nil {
+ if apierrors.IsNotFound(err) {
+ cm = &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: key.Name,
+ Namespace: key.Namespace,
+ },
+ Data: map[string]string{configMapKeyResourceClasses: "[]"},
+ }
+ current := map[string]struct{}{name: {}}
+ if err := writeResourceClassesToConfigMap(cm, current); err != nil {
+ return false, err
+ }
+ if err := s.Create(ctx, cm); err != nil {
+ return false, fmt.Errorf("create resource classes configmap: %w", err)
+ }
+ return true, nil
+ }
+ return false, fmt.Errorf("get resource classes configmap: %w", err)
+ }
+
+ current, err := parseResourceClasses(cm)
+ if err != nil {
+ return false, err
+ }
+ if _, exists := current[name]; exists {
+ return false, nil
+ }
+ current[name] = struct{}{}
+ if err := writeResourceClassesToConfigMap(cm, current); err != nil {
+ return false, err
+ }
+ if err := s.Update(ctx, cm); err != nil {
+ return false, fmt.Errorf("update resource classes configmap: %w", err)
+ }
+ return true, nil
+}
+
+// removeResourceClassFromConfigMap removes a resource class from the ConfigMap
+// under the resource lock. Returns true if the class was found and removed.
+func (s *Shim) removeResourceClassFromConfigMap(ctx context.Context, name string) (bool, error) {
+ host, err := os.Hostname()
+ if err != nil {
+ return false, fmt.Errorf("get hostname: %w", err)
+ }
+ lockerID := fmt.Sprintf("shim-%s-%d", host, time.Now().UnixNano())
+ if err := s.resourceLocker.AcquireLock(ctx, s.config.ResourceClasses.ConfigMapName+"-lock", lockerID); err != nil {
+ return false, fmt.Errorf("acquire resource classes lock: %w", err)
+ }
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ if err := s.resourceLocker.ReleaseLock(releaseCtx, s.config.ResourceClasses.ConfigMapName+"-lock", lockerID); err != nil {
+ ctrl.Log.WithName("placement-shim").Error(err, "failed to release resource classes lock")
+ }
+ }()
+
+ cm := &corev1.ConfigMap{}
+ if err := s.Get(ctx, client.ObjectKey{Namespace: os.Getenv("POD_NAMESPACE"), Name: s.config.ResourceClasses.ConfigMapName}, cm); err != nil {
+ if apierrors.IsNotFound(err) {
+ return false, nil
+ }
+ return false, fmt.Errorf("get resource classes configmap: %w", err)
+ }
+ current, err := parseResourceClasses(cm)
+ if err != nil {
+ return false, err
+ }
+ if _, exists := current[name]; !exists {
+ return false, nil
+ }
+ delete(current, name)
+ if err := writeResourceClassesToConfigMap(cm, current); err != nil {
+ return false, err
+ }
+ if err := s.Update(ctx, cm); err != nil {
+ return false, fmt.Errorf("update resource classes configmap: %w", err)
+ }
+ return true, nil
}
diff --git a/internal/shim/placement/handle_resource_classes_e2e.go b/internal/shim/placement/handle_resource_classes_e2e.go
index e848ee034..6bc302cfe 100644
--- a/internal/shim/placement/handle_resource_classes_e2e.go
+++ b/internal/shim/placement/handle_resource_classes_e2e.go
@@ -6,6 +6,7 @@ package placement
import (
"context"
"encoding/json"
+ "errors"
"fmt"
"net/http"
@@ -17,13 +18,24 @@ import (
// e2eTestResourceClasses tests the /resource_classes and
// /resource_classes/{name} endpoints.
//
-// 1. Pre-cleanup: DELETE any leftover custom resource class (ignore 404).
-// 2. GET /resource_classes — list all classes and verify the response.
-// 3. GET /resource_classes/VCPU — confirm a standard class is retrievable.
-// 4. PUT /resource_classes/{name} — create a custom test class.
-// 5. GET /resource_classes/{name} — verify the custom class now exists.
-// 6. DELETE /resource_classes/{name} — remove the custom class.
-// 7. GET /resource_classes/{name} — confirm deletion returns 404.
+// Phase 1 — read-only (always runs):
+//
+// 1. GET /resource_classes — list all resource classes; when mode is
+// passthrough (forwarding to upstream) verify at least one class exists.
+// 2. GET /resource_classes/VCPU — verify a standard class is retrievable
+// (skipped when the list is empty).
+// 3. GET /resource_classes/{name} — show a nonexistent class and verify 404.
+//
+// Phase 2 — CRUD (only when mode is non-passthrough):
+//
+// 1. Pre-cleanup: DELETE any leftover test class (ignore 404).
+// 2. PUT /resource_classes/{name} — create a custom test class → 201.
+// 3. PUT /resource_classes/{name} — idempotent create → 204.
+// 4. GET /resource_classes/{name} — verify the custom class exists → 200.
+// 5. DELETE /resource_classes/{name} — remove the custom class → 204.
+// 6. GET /resource_classes/{name} — confirm deletion → 404.
+// 7. PUT /resource_classes/{name} — bad prefix → 400.
+// 8. DELETE /resource_classes/{name} — bad prefix → 400.
func e2eTestResourceClasses(ctx context.Context, _ client.Client) error {
log := logf.FromContext(ctx)
log.Info("Running resource classes endpoint e2e test")
@@ -40,197 +52,250 @@ func e2eTestResourceClasses(ctx context.Context, _ client.Client) error {
}
log.Info("Successfully created openstack client for resource classes e2e test")
- const testRC = "CUSTOM_CORTEX_E2E_RC"
+ // ==================== Phase 1: read-only tests ====================
- // Probe: for non-passthrough modes, verify endpoint returns 501.
- unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_classes")
- if err != nil {
- return fmt.Errorf("probe: %w", err)
- }
- if unimplemented {
- return nil
- }
+ log.Info("=== Phase 1: read-only resource class tests ===")
- // Pre-cleanup: delete any leftover test resource class from a prior run.
- log.Info("Pre-cleanup: deleting leftover test resource class", "class", testRC)
+ rcMode := e2eCurrentMode(ctx)
+
+ // Test GET /resource_classes
+ log.Info("Testing GET /resource_classes endpoint")
req, err := http.NewRequestWithContext(ctx,
- http.MethodDelete, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
+ http.MethodGet, sc.Endpoint+"/resource_classes", http.NoBody)
if err != nil {
- log.Error(err, "failed to create pre-cleanup request")
- return err
+ return fmt.Errorf("failed to create GET /resource_classes request: %w", err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
+ req.Header.Set("Accept", "application/json")
resp, err := sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send pre-cleanup request")
- return err
+ return fmt.Errorf("failed to send GET /resource_classes request: %w", err)
}
defer resp.Body.Close()
- // Ignore 404 (not found) — that's expected if no leftover exists.
- if resp.StatusCode != http.StatusNotFound &&
- (resp.StatusCode < 200 || resp.StatusCode >= 300) {
- err := fmt.Errorf("unexpected status code during pre-cleanup: %d", resp.StatusCode)
- log.Error(err, "pre-cleanup failed")
- return err
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("GET /resource_classes: expected 200, got %d", resp.StatusCode)
}
- log.Info("Pre-cleanup completed", "status", resp.StatusCode)
+ var listResp resourceClassesListResponse
+ if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
+ return fmt.Errorf("failed to decode GET /resource_classes response: %w", err)
+ }
+ if rcMode == FeatureModePassthrough && len(listResp.ResourceClasses) == 0 {
+ return errors.New("GET /resource_classes: expected at least one class when forwarding to upstream, got 0")
+ }
+ log.Info("Successfully retrieved resource classes", "count", len(listResp.ResourceClasses))
- // Test GET /resource_classes
- log.Info("Testing GET /resource_classes endpoint of placement shim")
+ // Test GET /resource_classes/{name} for a known class (skip when list is empty).
+ if len(listResp.ResourceClasses) > 0 {
+ knownClass := listResp.ResourceClasses[0].Name
+ log.Info("Testing GET /resource_classes/{name} for known class", "class", knownClass)
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodGet, sc.Endpoint+"/resource_classes/"+knownClass, http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create GET request for class %s: %w", knownClass, err)
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.7")
+ req.Header.Set("Accept", "application/json")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return fmt.Errorf("failed to send GET request for class %s: %w", knownClass, err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("GET /resource_classes/%s: expected 200, got %d", knownClass, resp.StatusCode)
+ }
+ log.Info("Successfully verified known class exists", "class", knownClass)
+ } else {
+ log.Info("Skipping GET /resource_classes/{name} for known class, list is empty")
+ }
+
+ // Test GET /resource_classes/{name} for a nonexistent class.
+ log.Info("Testing GET /resource_classes/{name} for nonexistent class")
req, err = http.NewRequestWithContext(ctx,
- http.MethodGet, sc.Endpoint+"/resource_classes", http.NoBody)
+ http.MethodGet, sc.Endpoint+"/resource_classes/CUSTOM_CORTEX_E2E_NONEXISTENT", http.NoBody)
if err != nil {
- log.Error(err, "failed to create request for resource_classes endpoint")
- return err
+ return fmt.Errorf("failed to create GET request for nonexistent class: %w", err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send request to /resource_classes endpoint")
- return err
+ return fmt.Errorf("failed to send GET request for nonexistent class: %w", err)
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "/resource_classes endpoint returned an error")
- return err
+ if resp.StatusCode != http.StatusNotFound {
+ return fmt.Errorf("GET /resource_classes/CUSTOM_CORTEX_E2E_NONEXISTENT: expected 404, got %d", resp.StatusCode)
}
- var list struct {
- ResourceClasses []struct {
- Name string `json:"name"`
- } `json:"resource_classes"`
+ log.Info("Correctly received 404 for nonexistent resource class")
+
+ // ==================== Phase 2: CRUD tests ====================
+
+ log.Info("=== Phase 2: CRUD resource class tests ===")
+
+ const testRC = "CUSTOM_CORTEX_E2E_RC"
+
+ // Pre-cleanup: delete any leftover test class from a prior run.
+ log.Info("Pre-cleanup: deleting leftover test resource class", "class", testRC)
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodDelete, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create pre-cleanup request: %w", err)
}
- err = json.NewDecoder(resp.Body).Decode(&list)
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.7")
+ resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to decode response from /resource_classes endpoint")
- return err
+ return fmt.Errorf("failed to send pre-cleanup request: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusNotFound && resp.StatusCode != http.StatusNoContent {
+ return fmt.Errorf("pre-cleanup DELETE /resource_classes/%s: unexpected status %d", testRC, resp.StatusCode)
}
- log.Info("Successfully retrieved resource classes from placement shim",
- "count", len(list.ResourceClasses))
+ log.Info("Pre-cleanup completed", "status", resp.StatusCode)
- // Test GET /resource_classes/{name} for a standard class
- log.Info("Testing GET /resource_classes/VCPU endpoint of placement shim")
+ // Test PUT /resource_classes/{name} — create → 201.
+ log.Info("Testing PUT /resource_classes/{name} to create custom class", "class", testRC)
req, err = http.NewRequestWithContext(ctx,
- http.MethodGet, sc.Endpoint+"/resource_classes/VCPU", http.NoBody)
+ http.MethodPut, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
if err != nil {
- log.Error(err, "failed to create request for resource_classes/VCPU endpoint")
- return err
+ return fmt.Errorf("failed to create PUT request for class %s: %w", testRC, err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
- req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send request to /resource_classes/VCPU endpoint")
- return err
+ return fmt.Errorf("failed to send PUT request for class %s: %w", testRC, err)
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "/resource_classes/VCPU endpoint returned an error")
- return err
+ if resp.StatusCode != http.StatusCreated {
+ return fmt.Errorf("PUT /resource_classes/%s (create): expected 201, got %d", testRC, resp.StatusCode)
}
- log.Info("Successfully retrieved standard resource class VCPU from placement shim")
+ log.Info("Successfully created custom resource class", "class", testRC)
- // Test PUT /resource_classes/{name} (create custom class)
- log.Info("Testing PUT /resource_classes/{name} to create custom class", "class", testRC)
+ // Test PUT /resource_classes/{name} — idempotent → 204.
+ log.Info("Testing PUT /resource_classes/{name} idempotent create", "class", testRC)
req, err = http.NewRequestWithContext(ctx,
http.MethodPut, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
if err != nil {
- log.Error(err, "failed to create PUT request for resource_classes", "class", testRC)
- return err
+ return fmt.Errorf("failed to create idempotent PUT request: %w", err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
- req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send PUT request to /resource_classes", "class", testRC)
- return err
+ return fmt.Errorf("failed to send idempotent PUT request: %w", err)
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "PUT /resource_classes returned an error", "class", testRC)
- return err
+ if resp.StatusCode != http.StatusNoContent {
+ return fmt.Errorf("PUT /resource_classes/%s (idempotent): expected 204, got %d", testRC, resp.StatusCode)
}
- log.Info("Successfully created custom resource class", "class", testRC,
- "status", resp.StatusCode)
+ log.Info("Successfully verified idempotent PUT", "class", testRC)
- // Test GET /resource_classes/{name} for the custom class
+ // Test GET /resource_classes/{name} — verify exists → 200.
log.Info("Testing GET /resource_classes/{name} for custom class", "class", testRC)
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for custom resource class", "class", testRC)
- return err
+ return fmt.Errorf("failed to create GET request for class %s: %w", testRC, err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for custom resource class", "class", testRC)
- return err
+ return fmt.Errorf("failed to send GET request for class %s: %w", testRC, err)
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET custom resource class returned an error", "class", testRC)
- return err
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("GET /resource_classes/%s: expected 200, got %d", testRC, resp.StatusCode)
}
log.Info("Successfully verified custom resource class exists", "class", testRC)
- // Cleanup: Test DELETE /resource_classes/{name}
- log.Info("Cleaning up test resource class from placement shim", "class", testRC)
+ // Cleanup: DELETE /resource_classes/{name} → 204.
+ log.Info("Cleaning up test resource class", "class", testRC)
req, err = http.NewRequestWithContext(ctx,
http.MethodDelete, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
if err != nil {
- log.Error(err, "failed to create DELETE request for resource class", "class", testRC)
- return err
+ return fmt.Errorf("failed to create DELETE request for class %s: %w", testRC, err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send DELETE request for resource class", "class", testRC)
- return err
+ return fmt.Errorf("failed to send DELETE request for class %s: %w", testRC, err)
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "DELETE resource class returned an error", "class", testRC)
- return err
+ if resp.StatusCode != http.StatusNoContent {
+ return fmt.Errorf("DELETE /resource_classes/%s: expected 204, got %d", testRC, resp.StatusCode)
}
log.Info("Successfully deleted test resource class", "class", testRC)
- // Verify deletion: GET should return 404
+ // Verify deletion: GET → 404.
log.Info("Verifying test resource class was deleted", "class", testRC)
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_classes/"+testRC, http.NoBody)
if err != nil {
- log.Error(err, "failed to create verification GET request", "class", testRC)
- return err
+ return fmt.Errorf("failed to create verification GET request: %w", err)
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.7")
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send verification GET request", "class", testRC)
- return err
+ return fmt.Errorf("failed to send verification GET request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusNotFound {
- err := fmt.Errorf("expected 404 after deletion, got: %d", resp.StatusCode)
- log.Error(err, "resource class still exists after deletion", "class", testRC)
- return err
+ return fmt.Errorf("GET /resource_classes/%s after deletion: expected 404, got %d",
+ testRC, resp.StatusCode)
}
log.Info("Verified test resource class was deleted", "class", testRC)
+ // Bad-prefix validation is only enforced by the shim in crd mode.
+ if rcMode == FeatureModeCRD {
+ // Test PUT /resource_classes/{name} with bad prefix → 400.
+ log.Info("Testing PUT /resource_classes/{name} with non-CUSTOM_ prefix")
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodPut, sc.Endpoint+"/resource_classes/VCPU_CORTEX_E2E_BAD", http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create bad-prefix PUT request: %w", err)
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.7")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return fmt.Errorf("failed to send bad-prefix PUT request: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusBadRequest {
+ return fmt.Errorf("PUT /resource_classes/VCPU_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
+ }
+ log.Info("Correctly received 400 for PUT with non-CUSTOM_ prefix")
+
+ // Test DELETE /resource_classes/{name} with bad prefix → 400.
+ log.Info("Testing DELETE /resource_classes/{name} with non-CUSTOM_ prefix")
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodDelete, sc.Endpoint+"/resource_classes/VCPU_CORTEX_E2E_BAD", http.NoBody)
+ if err != nil {
+ return fmt.Errorf("failed to create bad-prefix DELETE request: %w", err)
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.7")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return fmt.Errorf("failed to send bad-prefix DELETE request: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusBadRequest {
+ return fmt.Errorf("DELETE /resource_classes/VCPU_CORTEX_E2E_BAD: expected 400, got %d", resp.StatusCode)
+ }
+ log.Info("Correctly received 400 for DELETE with non-CUSTOM_ prefix")
+ } else {
+ log.Info("Skipping bad-prefix validation tests (only enforced in crd mode)")
+ }
+
return nil
}
diff --git a/internal/shim/placement/handle_resource_classes_test.go b/internal/shim/placement/handle_resource_classes_test.go
index 330cae9be..d3f1c4ee1 100644
--- a/internal/shim/placement/handle_resource_classes_test.go
+++ b/internal/shim/placement/handle_resource_classes_test.go
@@ -4,11 +4,53 @@
package placement
import (
+ "bytes"
+ "context"
+ "encoding/json"
"net/http"
+ "net/http/httptest"
"testing"
+
+ "github.com/cobaltcore-dev/cortex/pkg/resourcelock"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
)
-func TestHandleListResourceClasses(t *testing.T) {
+func newTestResourceClassConfigMap(namespace, name string, classes []string) *corev1.ConfigMap {
+ b, err := json.Marshal(classes)
+ if err != nil {
+ panic("marshal resource classes: " + err.Error())
+ }
+ return &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
+ Data: map[string]string{configMapKeyResourceClasses: string(b)},
+ }
+}
+
+func newResourceClassShim(t *testing.T, classes []string) *Shim {
+ t.Helper()
+ t.Setenv("POD_NAMESPACE", "default")
+ objs := []client.Object{newTestResourceClassConfigMap("default", "test-rc-cm", classes)}
+ cl := newFakeClientWithScheme(t, objs...)
+ down, up := newTestTimers()
+ return &Shim{
+ Client: cl,
+ config: config{
+ PlacementURL: "http://should-not-be-called:1234",
+ Features: featuresConfig{ResourceClasses: FeatureModeCRD},
+ ResourceClasses: &resourceClassesConfig{ConfigMapName: "test-rc-cm"},
+ },
+ maxBodyLogSize: 4096,
+ downstreamRequestTimer: down,
+ upstreamRequestTimer: up,
+ resourceLocker: resourcelock.NewResourceLocker(cl, "default"),
+ }
+}
+
+// --- Passthrough mode tests ---
+
+func TestHandleListResourceClassesPassthrough(t *testing.T) {
var gotPath string
s := newTestShim(t, http.StatusOK, `{"resource_classes":[]}`, &gotPath)
w := serveHandler(t, "GET", "/resource_classes", s.HandleListResourceClasses, "/resource_classes")
@@ -20,7 +62,7 @@ func TestHandleListResourceClasses(t *testing.T) {
}
}
-func TestHandleCreateResourceClass(t *testing.T) {
+func TestHandleCreateResourceClassPassthrough(t *testing.T) {
s := newTestShim(t, http.StatusCreated, "{}", nil)
w := serveHandler(t, "POST", "/resource_classes", s.HandleCreateResourceClass, "/resource_classes")
if w.Code != http.StatusCreated {
@@ -28,7 +70,7 @@ func TestHandleCreateResourceClass(t *testing.T) {
}
}
-func TestHandleShowResourceClass(t *testing.T) {
+func TestHandleShowResourceClassPassthrough(t *testing.T) {
var gotPath string
s := newTestShim(t, http.StatusOK, "{}", &gotPath)
w := serveHandler(t, "GET", "/resource_classes/{name}", s.HandleShowResourceClass, "/resource_classes/VCPU")
@@ -40,7 +82,7 @@ func TestHandleShowResourceClass(t *testing.T) {
}
}
-func TestHandleUpdateResourceClass(t *testing.T) {
+func TestHandleUpdateResourceClassPassthrough(t *testing.T) {
s := newTestShim(t, http.StatusNoContent, "", nil)
w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_FOO")
if w.Code != http.StatusNoContent {
@@ -48,7 +90,7 @@ func TestHandleUpdateResourceClass(t *testing.T) {
}
}
-func TestHandleDeleteResourceClass(t *testing.T) {
+func TestHandleDeleteResourceClassPassthrough(t *testing.T) {
s := newTestShim(t, http.StatusNoContent, "", nil)
w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_BAR")
if w.Code != http.StatusNoContent {
@@ -56,98 +98,226 @@ func TestHandleDeleteResourceClass(t *testing.T) {
}
}
-func TestHandleResourceClasses_HybridMode(t *testing.T) {
- down, up := newTestTimers()
- s := &Shim{
- config: config{
- PlacementURL: "http://should-not-be-called:1234",
- Features: featuresConfig{ResourceClasses: FeatureModeHybrid},
- },
- maxBodyLogSize: 4096,
- downstreamRequestTimer: down,
- upstreamRequestTimer: up,
+// --- CRD mode handler tests ---
+
+func TestHandleListResourceClassesLocal(t *testing.T) {
+ s := newResourceClassShim(t, []string{"CUSTOM_FOO", "MEMORY_MB", "VCPU"})
+
+ w := serveHandler(t, "GET", "/resource_classes", s.HandleListResourceClasses, "/resource_classes")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
}
- t.Run("GET list returns 501", func(t *testing.T) {
- w := serveHandler(t, "GET", "/resource_classes",
- s.HandleListResourceClasses, "/resource_classes")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("POST returns 501", func(t *testing.T) {
- w := serveHandler(t, "POST", "/resource_classes",
- s.HandleCreateResourceClass, "/resource_classes")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("GET show returns 501", func(t *testing.T) {
- w := serveHandler(t, "GET", "/resource_classes/{name}",
- s.HandleShowResourceClass, "/resource_classes/VCPU")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("PUT returns 501", func(t *testing.T) {
- w := serveHandler(t, "PUT", "/resource_classes/{name}",
- s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_FOO")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ var resp resourceClassesListResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("decode: %v", err)
+ }
+ if len(resp.ResourceClasses) != 3 {
+ t.Fatalf("got %d classes, want 3: %v", len(resp.ResourceClasses), resp.ResourceClasses)
+ }
+ want := []string{"CUSTOM_FOO", "MEMORY_MB", "VCPU"}
+ for i, rc := range resp.ResourceClasses {
+ if rc.Name != want[i] {
+ t.Errorf("class[%d] = %q, want %q", i, rc.Name, want[i])
}
- })
- t.Run("DELETE returns 501", func(t *testing.T) {
- w := serveHandler(t, "DELETE", "/resource_classes/{name}",
- s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_BAR")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ if len(rc.Links) != 1 || rc.Links[0].Rel != "self" || rc.Links[0].Href != "/resource_classes/"+rc.Name {
+ t.Errorf("class[%d] links = %v, want self link", i, rc.Links)
}
- })
+ }
+}
+
+func TestHandleShowResourceClassLocalFound(t *testing.T) {
+ s := newResourceClassShim(t, []string{"VCPU", "MEMORY_MB"})
+ w := serveHandler(t, "GET", "/resource_classes/{name}", s.HandleShowResourceClass, "/resource_classes/VCPU")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+}
+
+func TestHandleShowResourceClassLocalNotFound(t *testing.T) {
+ s := newResourceClassShim(t, []string{"VCPU"})
+ w := serveHandler(t, "GET", "/resource_classes/{name}", s.HandleShowResourceClass, "/resource_classes/NONEXISTENT")
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
+ }
+}
+
+func TestHandleUpdateResourceClassLocalCreated(t *testing.T) {
+ s := newResourceClassShim(t, nil)
+ w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_NEW")
+ if w.Code != http.StatusCreated {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_NEW")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if !found {
+ t.Error("expected resource class to be in store")
+ }
+}
+
+func TestHandleUpdateResourceClassLocalAlreadyExists(t *testing.T) {
+ s := newResourceClassShim(t, []string{"CUSTOM_EXISTING"})
+ w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_EXISTING")
+ if w.Code != http.StatusNoContent {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent)
+ }
+}
+
+func TestHandleUpdateResourceClassLocalBadPrefix(t *testing.T) {
+ s := newResourceClassShim(t, nil)
+ w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/VCPU")
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
+ }
+}
+
+func TestHandleCreateResourceClassLocalCreated(t *testing.T) {
+ s := newResourceClassShim(t, nil)
+ body := bytes.NewBufferString(`{"name":"CUSTOM_NEW"}`)
+ w := serveHandlerWithBody(t, "POST", "/resource_classes", s.HandleCreateResourceClass, "/resource_classes", body)
+ if w.Code != http.StatusCreated {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_NEW")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if !found {
+ t.Error("expected resource class to be in store")
+ }
}
-func TestHandleResourceClasses_CRDMode(t *testing.T) {
+func TestHandleCreateResourceClassLocalConflict(t *testing.T) {
+ s := newResourceClassShim(t, []string{"CUSTOM_EXISTING"})
+ body := bytes.NewBufferString(`{"name":"CUSTOM_EXISTING"}`)
+ w := serveHandlerWithBody(t, "POST", "/resource_classes", s.HandleCreateResourceClass, "/resource_classes", body)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusConflict)
+ }
+}
+
+func TestHandleCreateResourceClassLocalBadPrefix(t *testing.T) {
+ s := newResourceClassShim(t, nil)
+ body := bytes.NewBufferString(`{"name":"VCPU"}`)
+ w := serveHandlerWithBody(t, "POST", "/resource_classes", s.HandleCreateResourceClass, "/resource_classes", body)
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
+ }
+}
+
+func TestHandleDeleteResourceClassLocal(t *testing.T) {
+ s := newResourceClassShim(t, []string{"CUSTOM_DEL"})
+ w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_DEL")
+ if w.Code != http.StatusNoContent {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_DEL")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if found {
+ t.Error("expected resource class to be deleted")
+ }
+}
+
+func TestHandleDeleteResourceClassLocalNotFound(t *testing.T) {
+ s := newResourceClassShim(t, nil)
+ w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_GONE")
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
+ }
+}
+
+func TestHandleDeleteResourceClassLocalBadPrefix(t *testing.T) {
+ s := newResourceClassShim(t, []string{"VCPU"})
+ w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/VCPU")
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
+ }
+}
+
+// --- Hybrid mode tests ---
+
+func newHybridResourceClassShim(t *testing.T, upstreamStatus int, upstreamBody string, classes []string) *Shim {
+ t.Helper()
+ t.Setenv("POD_NAMESPACE", "default")
+ upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+ w.WriteHeader(upstreamStatus)
+ if upstreamBody != "" {
+ if _, err := w.Write([]byte(upstreamBody)); err != nil {
+ t.Errorf("failed to write upstream body: %v", err)
+ }
+ }
+ }))
+ t.Cleanup(upstream.Close)
+ objs := []client.Object{newTestResourceClassConfigMap("default", "test-rc-cm", classes)}
+ cl := newFakeClientWithScheme(t, objs...)
down, up := newTestTimers()
- s := &Shim{
+ return &Shim{
+ Client: cl,
config: config{
- PlacementURL: "http://should-not-be-called:1234",
- Features: featuresConfig{ResourceClasses: FeatureModeCRD},
+ PlacementURL: upstream.URL,
+ Features: featuresConfig{ResourceClasses: FeatureModeHybrid},
+ ResourceClasses: &resourceClassesConfig{ConfigMapName: "test-rc-cm"},
},
+ httpClient: upstream.Client(),
maxBodyLogSize: 4096,
downstreamRequestTimer: down,
upstreamRequestTimer: up,
+ resourceLocker: resourcelock.NewResourceLocker(cl, "default"),
+ }
+}
+
+func TestHandleListResourceClassesHybridForwards(t *testing.T) {
+ s := newHybridResourceClassShim(t, http.StatusOK, `{"resource_classes":[{"name":"VCPU"}]}`, nil)
+ w := serveHandler(t, "GET", "/resource_classes", s.HandleListResourceClasses, "/resource_classes")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+}
+
+func TestHandleUpdateResourceClassHybridUpdatesLocal(t *testing.T) {
+ s := newHybridResourceClassShim(t, http.StatusCreated, "", nil)
+ w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_HYB")
+ if w.Code != http.StatusCreated {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_HYB")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if !found {
+ t.Error("expected resource class to be added to local configmap in hybrid mode")
+ }
+}
+
+func TestHandleDeleteResourceClassHybridUpdatesLocal(t *testing.T) {
+ s := newHybridResourceClassShim(t, http.StatusNoContent, "", []string{"CUSTOM_DEL"})
+ w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_DEL")
+ if w.Code != http.StatusNoContent {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_DEL")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if found {
+ t.Error("expected resource class to be removed from local configmap in hybrid mode")
+ }
+}
+
+func TestHandleUpdateResourceClassHybridUpstreamFailure(t *testing.T) {
+ s := newHybridResourceClassShim(t, http.StatusInternalServerError, "upstream error", nil)
+ w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_FAIL")
+ if w.Code != http.StatusInternalServerError {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusInternalServerError)
+ }
+ found, err := s.hasResourceClass(context.Background(), "CUSTOM_FAIL")
+ if err != nil {
+ t.Fatalf("hasResourceClass: %v", err)
+ }
+ if found {
+ t.Error("expected resource class NOT to be added when upstream fails")
}
- t.Run("GET list returns 501", func(t *testing.T) {
- w := serveHandler(t, "GET", "/resource_classes",
- s.HandleListResourceClasses, "/resource_classes")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("POST returns 501", func(t *testing.T) {
- w := serveHandler(t, "POST", "/resource_classes",
- s.HandleCreateResourceClass, "/resource_classes")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("GET show returns 501", func(t *testing.T) {
- w := serveHandler(t, "GET", "/resource_classes/{name}",
- s.HandleShowResourceClass, "/resource_classes/VCPU")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("PUT returns 501", func(t *testing.T) {
- w := serveHandler(t, "PUT", "/resource_classes/{name}",
- s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_FOO")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
- t.Run("DELETE returns 501", func(t *testing.T) {
- w := serveHandler(t, "DELETE", "/resource_classes/{name}",
- s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_BAR")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
- }
- })
}
diff --git a/internal/shim/placement/handle_traits.go b/internal/shim/placement/handle_traits.go
index d815b9d55..0cd4e659e 100644
--- a/internal/shim/placement/handle_traits.go
+++ b/internal/shim/placement/handle_traits.go
@@ -17,6 +17,7 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
)
@@ -211,7 +212,9 @@ func (s *Shim) handleUpdateTraitHybrid(w http.ResponseWriter, r *http.Request) {
}
w.WriteHeader(resp.StatusCode)
if resp.Body != nil {
- io.Copy(w, resp.Body) //nolint:errcheck
+ if _, err := io.Copy(w, resp.Body); err != nil {
+ log.Error(err, "hybrid: failed to copy upstream response body")
+ }
}
if resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusNoContent {
@@ -293,7 +296,9 @@ func (s *Shim) handleDeleteTraitHybrid(w http.ResponseWriter, r *http.Request) {
}
w.WriteHeader(resp.StatusCode)
if resp.Body != nil {
- io.Copy(w, resp.Body) //nolint:errcheck
+ if _, err := io.Copy(w, resp.Body); err != nil {
+ log.Error(err, "hybrid: failed to copy upstream response body")
+ }
}
if resp.StatusCode == http.StatusNoContent {
@@ -382,7 +387,9 @@ func (s *Shim) addTraitToConfigMap(ctx context.Context, name string) (bool, erro
defer func() {
releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
- _ = s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID) //nolint:errcheck
+ if err := s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID); err != nil {
+ ctrl.Log.WithName("placement-shim").Error(err, "failed to release traits lock")
+ }
}()
cm := &corev1.ConfigMap{}
@@ -439,7 +446,9 @@ func (s *Shim) removeTraitFromConfigMap(ctx context.Context, name string) (bool,
defer func() {
releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
- _ = s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID) //nolint:errcheck
+ if err := s.resourceLocker.ReleaseLock(releaseCtx, s.config.Traits.ConfigMapName+"-lock", lockerID); err != nil {
+ ctrl.Log.WithName("placement-shim").Error(err, "failed to release traits lock")
+ }
}()
cm := &corev1.ConfigMap{}
diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go
index f601788a4..53de85056 100644
--- a/internal/shim/placement/shim.go
+++ b/internal/shim/placement/shim.go
@@ -127,7 +127,7 @@ func (s *Shim) featureModeFromConfOrHeader(r *http.Request, configured FeatureMo
}
resolved := override.orDefault()
if resolved == FeatureModeHybrid || resolved == FeatureModeCRD {
- if s.config.Versioning == nil && s.config.Traits == nil {
+ if s.config.Versioning == nil && s.config.Traits == nil && s.config.ResourceClasses == nil {
return configured.orDefault()
}
}
@@ -167,6 +167,14 @@ type traitsConfig struct {
ConfigMapName string `json:"configMapName"`
}
+// resourceClassesConfig configures the local resource class store used when
+// features.resourceClasses is hybrid or crd.
+type resourceClassesConfig struct {
+ // ConfigMapName is the name of the ConfigMap used to persist resource classes.
+ // Must exist in the same namespace as the shim pod.
+ ConfigMapName string `json:"configMapName"`
+}
+
// config holds configuration for the placement shim.
type config struct {
// SSO is an optional configuration for the certificates the http client
@@ -212,6 +220,9 @@ type config struct {
// Traits configures the local trait store used when
// features.traits is hybrid or crd.
Traits *traitsConfig `json:"traits,omitempty"`
+ // ResourceClasses configures the local resource class store used when
+ // features.resourceClasses is hybrid or crd.
+ ResourceClasses *resourceClassesConfig `json:"resourceClasses,omitempty"`
}
// validate checks the config for required fields and returns an error if the
@@ -258,6 +269,18 @@ func (c *config) validate() error {
return errors.New("pod namespace (POD_NAMESPACE) is required when traits config is present")
}
}
+ rcMode := c.Features.ResourceClasses.orDefault()
+ if rcMode != FeatureModePassthrough && c.ResourceClasses == nil {
+ return fmt.Errorf("resourceClasses config is required when features.resourceClasses is %s", rcMode)
+ }
+ if c.ResourceClasses != nil {
+ if c.ResourceClasses.ConfigMapName == "" {
+ return errors.New("resourceClasses.configMapName is required when resourceClasses config is present")
+ }
+ if os.Getenv("POD_NAMESPACE") == "" {
+ return errors.New("pod namespace (POD_NAMESPACE) is required when resourceClasses config is present")
+ }
+ }
if c.Auth != nil && c.KeystoneURL == "" {
return errors.New("keystoneURL is required when auth is configured")
}
@@ -311,9 +334,6 @@ type Shim struct {
// placement with automatic token management (including reauth on 401).
// Nil when Keystone credentials are not configured.
placementServiceClient *gophercloud.ServiceClient
- // syncers are background workers that manage ConfigMap-backed local
- // stores (e.g. traits, resource classes). Started uniformly in Start.
- syncers []Syncer
}
// Describe implements prometheus.Collector.
@@ -437,23 +457,33 @@ func (s *Shim) Start(ctx context.Context) error {
return err
}
if s.config.Traits != nil {
- s.syncers = append(s.syncers, NewTraitSyncer(
+ ts := NewTraitSyncer(
s.Client,
s.config.Traits.ConfigMapName,
os.Getenv("POD_NAMESPACE"),
s.placementServiceClient,
s.resourceLocker,
- ))
- }
- for _, syncer := range s.syncers {
- if err := syncer.Init(ctx); err != nil {
+ )
+ if err := ts.Init(ctx); err != nil {
return err
}
+ if s.config.Features.Traits.orDefault() != FeatureModeCRD {
+ go ts.Run(ctx)
+ }
}
- traitsMode := s.config.Features.Traits.orDefault()
- for _, syncer := range s.syncers {
- if traitsMode == FeatureModeHybrid || traitsMode == FeatureModePassthrough {
- go syncer.Run(ctx)
+ if s.config.ResourceClasses != nil {
+ rs := NewResourceClassSyncer(
+ s.Client,
+ s.config.ResourceClasses.ConfigMapName,
+ os.Getenv("POD_NAMESPACE"),
+ s.placementServiceClient,
+ s.resourceLocker,
+ )
+ if err := rs.Init(ctx); err != nil {
+ return err
+ }
+ if s.config.Features.ResourceClasses.orDefault() != FeatureModeCRD {
+ go rs.Run(ctx)
}
}
return nil
diff --git a/internal/shim/placement/shim_test.go b/internal/shim/placement/shim_test.go
index ffc31e954..46805676a 100644
--- a/internal/shim/placement/shim_test.go
+++ b/internal/shim/placement/shim_test.go
@@ -83,6 +83,18 @@ func serveHandler(t *testing.T, method, pattern string, handler http.HandlerFunc
return w
}
+// serveHandlerWithBody is like serveHandler but allows providing a request body.
+func serveHandlerWithBody(t *testing.T, method, pattern string, handler http.HandlerFunc, reqPath string, body io.Reader) *httptest.ResponseRecorder {
+ t.Helper()
+ mux := http.NewServeMux()
+ mux.HandleFunc(method+" "+pattern, handler)
+ req := httptest.NewRequest(method, reqPath, body)
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+ mux.ServeHTTP(w, req)
+ return w
+}
+
func TestForward(t *testing.T) {
tests := []struct {
name string
@@ -482,6 +494,30 @@ func TestConfigValidateTraitsCRDRequiresConfig(t *testing.T) {
}
}
+func TestConfigValidateResourceClassesCRDRequiresConfig(t *testing.T) {
+ t.Setenv("POD_NAMESPACE", "")
+
+ c := config{
+ PlacementURL: "http://placement:8778",
+ Features: featuresConfig{ResourceClasses: FeatureModeCRD},
+ }
+ if err := c.validate(); err == nil {
+ t.Fatal("expected error when resourceClasses mode is crd without config")
+ }
+ c.ResourceClasses = &resourceClassesConfig{}
+ if err := c.validate(); err == nil {
+ t.Fatal("expected error when resourceClasses.configMapName is empty")
+ }
+ c.ResourceClasses.ConfigMapName = "cortex-placement-shim-resource-classes"
+ if err := c.validate(); err == nil {
+ t.Fatal("expected error when POD_NAMESPACE is not set")
+ }
+ t.Setenv("POD_NAMESPACE", "default")
+ if err := c.validate(); err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+}
+
func TestWrapHandlerWithAuth(t *testing.T) {
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
diff --git a/internal/shim/placement/syncer_resource_classes.go b/internal/shim/placement/syncer_resource_classes.go
new file mode 100644
index 000000000..a6aa854b5
--- /dev/null
+++ b/internal/shim/placement/syncer_resource_classes.go
@@ -0,0 +1,196 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package placement
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "math/rand"
+ "net/http"
+ "net/url"
+ "os"
+ "time"
+
+ "github.com/cobaltcore-dev/cortex/pkg/resourcelock"
+ "github.com/gophercloud/gophercloud/v2"
+ corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const configMapKeyResourceClasses = "resource_classes"
+
+// resourceClassesListResponse matches the OpenStack Placement GET /resource_classes response.
+type resourceClassesListResponse struct {
+ ResourceClasses []resourceClassEntry `json:"resource_classes"`
+}
+
+type resourceClassEntry struct {
+ Name string `json:"name"`
+ Links []resourceClassLink `json:"links,omitempty"`
+}
+
+type resourceClassLink struct {
+ Rel string `json:"rel"`
+ Href string `json:"href"`
+}
+
+// ResourceClassSyncer manages the lifecycle of the resource classes ConfigMap.
+// It creates the ConfigMap on startup and periodically syncs from upstream.
+type ResourceClassSyncer struct {
+ client client.Client
+ configMapName string
+ namespace string
+ placementClient *gophercloud.ServiceClient
+ resourceLocker *resourcelock.ResourceLocker
+}
+
+func NewResourceClassSyncer(
+ cl client.Client,
+ configMapName string,
+ namespace string,
+ placementClient *gophercloud.ServiceClient,
+ resourceLocker *resourcelock.ResourceLocker,
+) *ResourceClassSyncer {
+
+ return &ResourceClassSyncer{
+ client: cl,
+ configMapName: configMapName,
+ namespace: namespace,
+ placementClient: placementClient,
+ resourceLocker: resourceLocker,
+ }
+}
+
+// Init creates the resource classes ConfigMap if it does not already exist.
+func (rs *ResourceClassSyncer) Init(ctx context.Context) error {
+ log := ctrl.Log.WithName("placement-shim").WithName("resource-class-syncer")
+ cm := &corev1.ConfigMap{}
+ key := client.ObjectKey{Namespace: rs.namespace, Name: rs.configMapName}
+ err := rs.client.Get(ctx, key, cm)
+ if err == nil {
+ log.Info("Resource classes ConfigMap already exists", "name", rs.configMapName)
+ return nil
+ }
+ if !apierrors.IsNotFound(err) {
+ return fmt.Errorf("checking resource classes configmap: %w", err)
+ }
+ cm = &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: rs.configMapName,
+ Namespace: rs.namespace,
+ },
+ Data: map[string]string{configMapKeyResourceClasses: "[]"},
+ }
+ if err := rs.client.Create(ctx, cm); err != nil {
+ if apierrors.IsAlreadyExists(err) {
+ log.Info("Resource classes ConfigMap was created concurrently", "name", rs.configMapName)
+ return nil
+ }
+ return fmt.Errorf("creating resource classes configmap: %w", err)
+ }
+ log.Info("Created resource classes ConfigMap", "name", rs.configMapName)
+ return nil
+}
+
+// Run starts the periodic background sync from upstream placement.
+// Blocks until ctx is cancelled.
+func (rs *ResourceClassSyncer) Run(ctx context.Context) {
+ log := ctrl.Log.WithName("placement-shim").WithName("resource-class-syncer")
+ if rs.placementClient == nil {
+ log.Info("No placement service client configured, resource class sync loop will not run")
+ return
+ }
+
+ jitter := time.Duration(rand.Int63n(int64(30 * time.Second))) //nolint:gosec
+ log.Info("Starting resource class sync loop", "jitter", jitter)
+
+ select {
+ case <-ctx.Done():
+ return
+ case <-time.After(jitter):
+ }
+
+ rs.sync(ctx)
+
+ ticker := time.NewTicker(60 * time.Second)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-ticker.C:
+ rs.sync(ctx)
+ }
+ }
+}
+
+// sync fetches GET /resource_classes from upstream placement and writes the
+// result into the ConfigMap under the resource lock.
+func (rs *ResourceClassSyncer) sync(ctx context.Context) {
+ log := ctrl.Log.WithName("placement-shim").WithName("resource-class-syncer")
+ u, err := url.JoinPath(rs.placementClient.Endpoint, "/resource_classes")
+ if err != nil {
+ log.Error(err, "Failed to build upstream resource classes URL")
+ return
+ }
+ resp, err := rs.placementClient.Request(ctx, http.MethodGet, u, &gophercloud.RequestOpts{
+ OkCodes: []int{http.StatusOK},
+ MoreHeaders: map[string]string{
+ "OpenStack-API-Version": "placement 1.7",
+ },
+ KeepResponseBody: true,
+ })
+ if err != nil {
+ log.Info("Upstream resource class sync failed", "error", err.Error())
+ return
+ }
+ defer resp.Body.Close()
+ var body resourceClassesListResponse
+ if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
+ log.Error(err, "Failed to decode upstream resource class list")
+ return
+ }
+
+ host, err := os.Hostname()
+ if err != nil {
+ log.Error(err, "Failed to get hostname for resource class sync lock")
+ return
+ }
+ lockerID := fmt.Sprintf("syncer-%s-%d", host, time.Now().UnixNano())
+ lockName := rs.configMapName + "-lock"
+ if err := rs.resourceLocker.AcquireLock(ctx, lockName, lockerID); err != nil {
+ log.Error(err, "Failed to acquire lock for resource class sync")
+ return
+ }
+ defer func() {
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ if err := rs.resourceLocker.ReleaseLock(releaseCtx, lockName, lockerID); err != nil {
+ log.Error(err, "Failed to release lock after resource class sync")
+ }
+ }()
+
+ cm := &corev1.ConfigMap{}
+ if err := rs.client.Get(ctx, client.ObjectKey{Namespace: rs.namespace, Name: rs.configMapName}, cm); err != nil {
+ log.Error(err, "Failed to get resource classes ConfigMap for sync")
+ return
+ }
+ rcSet := make(map[string]struct{}, len(body.ResourceClasses))
+ for _, rc := range body.ResourceClasses {
+ rcSet[rc.Name] = struct{}{}
+ }
+ if err := writeResourceClassesToConfigMap(cm, rcSet); err != nil {
+ log.Error(err, "Failed to serialize synced resource classes")
+ return
+ }
+ if err := rs.client.Update(ctx, cm); err != nil {
+ log.Error(err, "Failed to update resource classes ConfigMap with upstream data")
+ return
+ }
+ log.Info("Synced resource classes from upstream placement", "count", len(body.ResourceClasses))
+}
diff --git a/internal/shim/placement/syncer_resource_classes_test.go b/internal/shim/placement/syncer_resource_classes_test.go
new file mode 100644
index 000000000..83d4d3ad0
--- /dev/null
+++ b/internal/shim/placement/syncer_resource_classes_test.go
@@ -0,0 +1,149 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package placement
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+
+ "github.com/cobaltcore-dev/cortex/pkg/resourcelock"
+ "github.com/gophercloud/gophercloud/v2"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+func TestResourceClassSyncerInitCreatesConfigMap(t *testing.T) {
+ cl := newFakeClientWithScheme(t)
+ rs := NewResourceClassSyncer(cl, "test-rc", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ if err := rs.Init(context.Background()); err != nil {
+ t.Fatalf("Init: %v", err)
+ }
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-rc"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyResourceClasses] != "[]" {
+ t.Fatalf("expected empty resource classes array, got %q", cm.Data[configMapKeyResourceClasses])
+ }
+}
+
+func TestResourceClassSyncerInitIdempotent(t *testing.T) {
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rc", Namespace: "default"},
+ Data: map[string]string{configMapKeyResourceClasses: `["CUSTOM_EXISTING"]`},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+ rs := NewResourceClassSyncer(cl, "test-rc", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ if err := rs.Init(context.Background()); err != nil {
+ t.Fatalf("Init: %v", err)
+ }
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-rc"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyResourceClasses] != `["CUSTOM_EXISTING"]` {
+ t.Fatalf("Init overwrote existing data: got %q", cm.Data[configMapKeyResourceClasses])
+ }
+}
+
+func TestResourceClassSyncerRunNoClient(t *testing.T) {
+ cl := newFakeClientWithScheme(t)
+ rs := NewResourceClassSyncer(cl, "test-rc", "default", nil, resourcelock.NewResourceLocker(cl, "default"))
+
+ ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ defer cancel()
+ rs.Run(ctx)
+}
+
+func TestResourceClassSyncerSyncWritesUpstreamClasses(t *testing.T) {
+ upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/resource_classes" {
+ t.Errorf("unexpected path: %s", r.URL.Path)
+ }
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(resourceClassesListResponse{
+ ResourceClasses: []resourceClassEntry{
+ {Name: "VCPU"},
+ {Name: "MEMORY_MB"},
+ {Name: "CUSTOM_SYNCED"},
+ },
+ }); err != nil {
+ t.Errorf("encode response: %v", err)
+ }
+ }))
+ t.Cleanup(upstream.Close)
+
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rc", Namespace: "default"},
+ Data: map[string]string{configMapKeyResourceClasses: "[]"},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+
+ sc := &gophercloud.ServiceClient{
+ ProviderClient: &gophercloud.ProviderClient{},
+ Endpoint: upstream.URL,
+ }
+ sc.HTTPClient = *upstream.Client()
+
+ rs := NewResourceClassSyncer(cl, "test-rc", "default", sc, resourcelock.NewResourceLocker(cl, "default"))
+ rs.sync(context.Background())
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-rc"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+
+ var classes []string
+ if err := json.Unmarshal([]byte(cm.Data[configMapKeyResourceClasses]), &classes); err != nil {
+ t.Fatalf("unmarshal: %v", err)
+ }
+ if len(classes) != 3 {
+ t.Fatalf("expected 3 classes, got %d: %v", len(classes), classes)
+ }
+ want := map[string]bool{"VCPU": true, "MEMORY_MB": true, "CUSTOM_SYNCED": true}
+ for _, c := range classes {
+ if !want[c] {
+ t.Errorf("unexpected class: %s", c)
+ }
+ }
+}
+
+func TestResourceClassSyncerSyncUpstreamError(t *testing.T) {
+ upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+ w.WriteHeader(http.StatusServiceUnavailable)
+ }))
+ t.Cleanup(upstream.Close)
+
+ existing := &corev1.ConfigMap{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rc", Namespace: "default"},
+ Data: map[string]string{configMapKeyResourceClasses: `["CUSTOM_ORIGINAL"]`},
+ }
+ cl := newFakeClientWithScheme(t, existing)
+
+ sc := &gophercloud.ServiceClient{
+ ProviderClient: &gophercloud.ProviderClient{},
+ Endpoint: upstream.URL,
+ }
+ sc.HTTPClient = *upstream.Client()
+
+ rs := NewResourceClassSyncer(cl, "test-rc", "default", sc, resourcelock.NewResourceLocker(cl, "default"))
+ rs.sync(context.Background())
+
+ cm := &corev1.ConfigMap{}
+ if err := cl.Get(context.Background(), client.ObjectKey{Namespace: "default", Name: "test-rc"}, cm); err != nil {
+ t.Fatalf("get ConfigMap: %v", err)
+ }
+ if cm.Data[configMapKeyResourceClasses] != `["CUSTOM_ORIGINAL"]` {
+ t.Fatalf("sync should not have modified ConfigMap on error, got %q", cm.Data[configMapKeyResourceClasses])
+ }
+}
diff --git a/internal/shim/placement/syncer_traits.go b/internal/shim/placement/syncer_traits.go
index d8067e1f3..f9748d66d 100644
--- a/internal/shim/placement/syncer_traits.go
+++ b/internal/shim/placement/syncer_traits.go
@@ -139,7 +139,11 @@ func (ts *TraitSyncer) sync(ctx context.Context) {
return
}
- host, _ := os.Hostname() //nolint:errcheck
+ host, err := os.Hostname()
+ if err != nil {
+ log.Error(err, "Failed to get hostname for trait sync lock")
+ return
+ }
lockerID := fmt.Sprintf("syncer-%s-%d", host, time.Now().UnixNano())
lockName := ts.configMapName + "-lock"
if err := ts.resourceLocker.AcquireLock(ctx, lockName, lockerID); err != nil {
@@ -149,7 +153,9 @@ func (ts *TraitSyncer) sync(ctx context.Context) {
defer func() {
releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
- _ = ts.resourceLocker.ReleaseLock(releaseCtx, lockName, lockerID) //nolint:errcheck
+ if err := ts.resourceLocker.ReleaseLock(releaseCtx, lockName, lockerID); err != nil {
+ log.Error(err, "Failed to release lock after trait sync")
+ }
}()
cm := &corev1.ConfigMap{}
From d5900a494d47b2f7798059c9353b39fc022cf4cd Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 11:18:34 +0000
Subject: [PATCH 15/54] Bump cortex-shim chart appVersions to sha-17050b2f
[skip ci]
---
helm/library/cortex-shim/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index d26f4fe7b..b8a88feb7 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
version: 0.0.3
-appVersion: "sha-ebbf9d44"
+appVersion: "sha-17050b2f"
icon: "https://example.com/icon.png"
dependencies: []
From 98a0d0802dbc3c660d9c937130dc0f074a242d68 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 11:18:35 +0000
Subject: [PATCH 16/54] Bump cortex chart appVersions to sha-17050b2f [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 3cc0004cd..4ab0c4e21 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-ebbf9d44"
+appVersion: "sha-17050b2f"
icon: "https://example.com/icon.png"
dependencies: []
From 839845cfb41bd615d212e550bcc419c1ea75d6ec Mon Sep 17 00:00:00 2001
From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com>
Date: Wed, 29 Apr 2026 13:25:18 +0200
Subject: [PATCH 17/54] refactor: moved unused commitments kpi for vmware
(#774)
## Changes
- KPI that tracks unused vmware general purpose & hana instance
commitments
- Moved to infrastructure directory
---------
Co-authored-by: Copilot
---
helm/bundles/cortex-nova/templates/kpis.yaml | 22 +-
.../compute/resource_commitments_vmware.go | 197 -------
.../resource_commitments_vmware_test.go | 301 ----------
.../{vmware_metrics.go => shared.go} | 55 ++
.../plugins/infrastructure/shared_test.go | 78 +++
.../vmware_project_utilization.go | 12 +-
.../vmware_resource_commitments.go | 271 +++++++++
.../vmware_resource_commitments_test.go | 523 ++++++++++++++++++
internal/knowledge/kpis/supported_kpis.go | 4 +-
9 files changed, 950 insertions(+), 513 deletions(-)
delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go
delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go
rename internal/knowledge/kpis/plugins/infrastructure/{vmware_metrics.go => shared.go} (50%)
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/shared_test.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go
diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index 3234fcc4a..22774c62a 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -188,31 +188,31 @@ spec:
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
- name: vmware-commitments
+ name: vmware-project-utilization
spec:
schedulingDomain: nova
- impl: vmware_commitments_kpi
+ impl: vmware_project_utilization_kpi
dependencies:
datasources:
- - name: limes-project-commitments
- - name: nova-flavors
- name: nova-servers
+ - name: nova-flavors
+ - name: identity-projects
+ knowledges:
+ - name: host-details
description: |
- This KPI tracks unused VMware commitments based on project commitments and usage.
+ This KPI tracks the resource utilization of projects running VMs on VMware hosts.
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
- name: vmware-project-utilization
+ name: vmware-resource-commitments
spec:
schedulingDomain: nova
- impl: vmware_project_utilization_kpi
+ impl: vmware_resource_commitments_kpi
dependencies:
datasources:
- name: nova-servers
- name: nova-flavors
- - name: identity-projects
- knowledges:
- - name: host-details
+ - name: limes-project-commitments
description: |
- This KPI tracks the resource utilization of projects running VMs on VMware hosts.
\ No newline at end of file
+ This KPI tracks the resource commitments of projects running VMs on VMware hosts.
\ No newline at end of file
diff --git a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go b/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go
deleted file mode 100644
index 74cde06a0..000000000
--- a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
- "log/slog"
- "strings"
-
- "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
- "github.com/cobaltcore-dev/cortex/pkg/conf"
- "github.com/prometheus/client_golang/prometheus"
- "sigs.k8s.io/controller-runtime/pkg/client"
-)
-
-type VMwareResourceCommitmentsKPI struct {
- // Common base for all KPIs that provides standard functionality.
- plugins.BaseKPI[struct{}] // No options passed through yaml config
-
- unusedInstanceCommitments *prometheus.Desc
-}
-
-func (VMwareResourceCommitmentsKPI) GetName() string {
- return "vmware_commitments_kpi"
-}
-
-func (k *VMwareResourceCommitmentsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
- if err := k.BaseKPI.Init(db, client, opts); err != nil {
- return err
- }
- k.unusedInstanceCommitments = prometheus.NewDesc(
- "cortex_vmware_hana_unused_instance_commitments",
- "Unused instance commitment capacity summed across all projects (vcpus / ram_mb / disk_gb).",
- []string{
- "resource", // "cpu", "ram", "disk"
- "availability_zone",
- "cpu_architecture", // "sapphire-rapids" (_v2 suffix) or "cascade-lake"
- },
- nil,
- )
- return nil
-}
-
-func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) {
- ch <- k.unusedInstanceCommitments
-}
-
-func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) {
- k.collectUnusedCommitments(ch)
-}
-
-// getRunningHANAServers loads all running HANA servers from the database. We consider a server "running" if its status is not DELETED or ERROR.
-func (k *VMwareResourceCommitmentsKPI) getRunningHANAServers() ([]nova.Server, error) {
- // Load running HANA servers (non-deleted, non-error).
- var servers []nova.Server
- if _, err := k.DB.Select(&servers, `
- SELECT * FROM `+nova.Server{}.TableName()+`
- WHERE flavor_name LIKE 'hana_%'
- AND status NOT IN ('DELETED', 'ERROR')
- `); err != nil {
- return nil, err
- }
- return servers, nil
-}
-
-// getFlavorsByName loads all flavors from the database and returns a map of flavor name to flavor struct for easy lookup.
-func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) {
- var flavors []nova.Flavor
- if _, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()); err != nil {
- return nil, err
- }
- flavorsByName := make(map[string]nova.Flavor, len(flavors))
- for _, flavor := range flavors {
- flavorsByName[flavor.Name] = flavor
- }
- return flavorsByName, nil
-}
-
-// getInstanceCommitments loads all confirmed or guaranteed instance commitments from the database.
-func (k *VMwareResourceCommitmentsKPI) getInstanceCommitments() ([]limes.Commitment, error) {
- var commitments []limes.Commitment
- if _, err := k.DB.Select(&commitments, `
- SELECT * FROM `+limes.Commitment{}.TableName()+`
- WHERE service_type = 'compute'
- AND resource_name LIKE 'instances_%'
- AND status IN ('confirmed', 'guaranteed')
- `); err != nil {
- return nil, err
- }
- return commitments, nil
-}
-
-// cpuArchitectureForFlavor returns the CPU architecture label for a HANA flavor name.
-// Flavors with a "_v2" suffix run on sapphire-rapids; all others are cascade-lake.
-func cpuArchitectureForFlavor(flavorName string) string {
- if strings.HasSuffix(flavorName, "_v2") {
- return "sapphire-rapids"
- }
- return "cascade-lake"
-}
-
-// resourceKey identifies an aggregated capacity bucket by (resource, az, architecture).
-type resourceKey struct{ resource, az, architecture string }
-
-// calculateUnusedInstanceCapacity computes per-(resource, az, architecture) capacity sums for unused
-// HANA VMware commitments. It filters out non-HANA and KVM (hana_k_) commitments, then for each
-// (project, flavor, az, architecture) bucket subtracts running servers from committed amount; over-used
-// buckets are clamped to zero and omitted from the result.
-func calculateUnusedInstanceCapacity(
- commitments []limes.Commitment,
- servers []nova.Server,
- flavorsByName map[string]nova.Flavor,
-) map[resourceKey]float64 {
- // running: (project_id, flavor_name, az) -> count of non-deleted/non-error servers.
- type serverCountKey struct{ projectID, flavorName, az string }
- running := make(map[serverCountKey]uint64, len(servers))
- for _, s := range servers {
- running[serverCountKey{s.TenantID, s.FlavorName, s.OSEXTAvailabilityZone}]++
- }
-
- // committed: (project_id, flavor_name, az, cpuArchitecture) -> total committed amount.
- type commitmentKey struct{ projectID, flavorName, az, cpuArchitecture string }
- committed := make(map[commitmentKey]uint64)
- for _, c := range commitments {
- flavorName := strings.TrimPrefix(c.ResourceName, "instances_")
- if !strings.HasPrefix(flavorName, "hana_") {
- continue
- }
- if strings.HasPrefix(flavorName, "hana_k_") {
- slog.Debug("unused_commitments: skipping hana kvm commitment", "flavor", flavorName, "project_id", c.ProjectID)
- continue
- }
- key := commitmentKey{c.ProjectID, flavorName, c.AvailabilityZone, cpuArchitectureForFlavor(flavorName)}
- committed[key] += c.Amount
- }
-
- sum := make(map[resourceKey]float64)
- for ck, total := range committed {
- run := running[serverCountKey{ck.projectID, ck.flavorName, ck.az}]
- if run >= total {
- continue
- }
- unused := total - run
- flavor, ok := flavorsByName[ck.flavorName]
- if !ok {
- slog.Warn("unused_commitments: flavor not found in flavor table", "flavor", ck.flavorName)
- continue
- }
- sum[resourceKey{"cpu", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.VCPUs)
- sum[resourceKey{"ram", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.RAM)
- sum[resourceKey{"disk", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.Disk)
- }
- return sum
-}
-
-func (k *VMwareResourceCommitmentsKPI) collectUnusedCommitments(ch chan<- prometheus.Metric) {
- if k.DB == nil {
- return
- }
-
- // Load confirmed/guaranteed instance commitments.
- commitments, err := k.getInstanceCommitments()
- if err != nil {
- slog.Error("unused_commitments: failed to load commitments", "err", err)
- return
- }
-
- // Load flavors for capacity lookup.
- flavorsByName, err := k.getFlavorsByName()
- if err != nil {
- slog.Error("unused_commitments: failed to load flavors", "err", err)
- return
- }
-
- // Load running HANA servers.
- servers, err := k.getRunningHANAServers()
- if err != nil {
- slog.Error("unused_commitments: failed to get running HANA servers", "err", err)
- return
- }
-
- sumByResource := calculateUnusedInstanceCapacity(commitments, servers, flavorsByName)
-
- for rk, value := range sumByResource {
- ch <- prometheus.MustNewConstMetric(
- k.unusedInstanceCommitments,
- prometheus.GaugeValue,
- value,
- rk.resource,
- rk.az,
- rk.architecture,
- )
- }
-}
diff --git a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go b/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go
deleted file mode 100644
index 90a1abd3b..000000000
--- a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go
+++ /dev/null
@@ -1,301 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
- "reflect"
- "testing"
-
- "github.com/cobaltcore-dev/cortex/api/v1alpha1"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
- testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
- "github.com/cobaltcore-dev/cortex/pkg/conf"
- "github.com/prometheus/client_golang/prometheus"
- prometheusgo "github.com/prometheus/client_model/go"
- v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "sigs.k8s.io/controller-runtime/pkg/client/fake"
-)
-
-func TestCPUArchitectureForFlavor(t *testing.T) {
- tests := []struct {
- flavorName string
- want string
- }{
- {"hana_small", "cascade-lake"},
- {"hana_large", "cascade-lake"},
- {"hana_small_v2", "sapphire-rapids"},
- {"hana_large_v2", "sapphire-rapids"},
- {"hana_v2_extra", "cascade-lake"}, // _v2 must be a suffix
- {"hana_x_v2", "sapphire-rapids"},
- }
- for _, tt := range tests {
- t.Run(tt.flavorName, func(t *testing.T) {
- got := cpuArchitectureForFlavor(tt.flavorName)
- if got != tt.want {
- t.Errorf("cpuArchitectureForFlavor(%q) = %q, want %q", tt.flavorName, got, tt.want)
- }
- })
- }
-}
-
-func TestCalculateUnusedInstanceCapacity(t *testing.T) {
- flavors := map[string]nova.Flavor{
- "hana_small": {VCPUs: 4, RAM: 16384, Disk: 100},
- "hana_large_v2": {VCPUs: 16, RAM: 65536, Disk: 400},
- }
-
- t.Run("basic unused", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3},
- }
- servers := []nova.Server{
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, // 1 running -> 2 unused
- }
- got := calculateUnusedInstanceCapacity(commitments, servers, flavors)
-
- if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 8 { // 2 * 4
- t.Errorf("expected cpu=8, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}])
- }
- if got[resourceKey{"ram", "az1", "cascade-lake"}] != 32768 { // 2 * 16384
- t.Errorf("expected ram=32768, got %v", got[resourceKey{"ram", "az1", "cascade-lake"}])
- }
- if got[resourceKey{"disk", "az1", "cascade-lake"}] != 200 { // 2 * 100
- t.Errorf("expected disk=200, got %v", got[resourceKey{"disk", "az1", "cascade-lake"}])
- }
- })
-
- t.Run("non-hana and kvm commitments are skipped", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_k_foo", AvailabilityZone: "az1", Amount: 5},
- {ProjectID: "p2", ResourceName: "instances_general_medium", AvailabilityZone: "az1", Amount: 3},
- }
- got := calculateUnusedInstanceCapacity(commitments, nil, flavors)
- if len(got) != 0 {
- t.Errorf("expected no metrics for kvm/non-hana commitments, got %v", got)
- }
- })
-
- t.Run("amounts for the same key are summed", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3},
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2},
- }
- got := calculateUnusedInstanceCapacity(commitments, nil, flavors) // nil servers -> all unused
- if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 20 { // 5 * 4
- t.Errorf("expected cpu=20 for summed commitments, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}])
- }
- })
-
- t.Run("over-used bucket emits no metric", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2},
- }
- servers := []nova.Server{ // 5 running > 2 committed
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- }
- got := calculateUnusedInstanceCapacity(commitments, servers, flavors)
- if len(got) != 0 {
- t.Errorf("expected no metrics for over-used bucket, got %v", got)
- }
- })
-
- t.Run("exactly-used bucket emits no metric", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3},
- }
- servers := []nova.Server{ // 3 running == 3 committed
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"},
- }
- got := calculateUnusedInstanceCapacity(commitments, servers, flavors)
- if len(got) != 0 {
- t.Errorf("expected no metrics for fully-used bucket, got %v", got)
- }
- })
-
- t.Run("unknown flavor is skipped", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_unknown", AvailabilityZone: "az1", Amount: 3},
- }
- got := calculateUnusedInstanceCapacity(commitments, nil, flavors)
- if len(got) != 0 {
- t.Errorf("expected no metrics for unknown flavor, got %v", got)
- }
- })
-
- t.Run("multiple keys aggregated correctly", func(t *testing.T) {
- commitments := []limes.Commitment{
- {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2},
- {ProjectID: "p2", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az1", Amount: 1},
- }
- got := calculateUnusedInstanceCapacity(commitments, nil, flavors) // nil running -> all unused
-
- if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 8 { // 2 * 4
- t.Errorf("expected cpu cascade-lake=8, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}])
- }
- if got[resourceKey{"cpu", "az1", "sapphire-rapids"}] != 16 { // 1 * 16
- t.Errorf("expected cpu sapphire-rapids=16, got %v", got[resourceKey{"cpu", "az1", "sapphire-rapids"}])
- }
- })
-}
-
-func TestVMwareResourceCommitmentsKPI_CollectHanaUnusedCommitments(t *testing.T) {
- scheme, err := v1alpha1.SchemeBuilder.Build()
- if err != nil {
- t.Fatalf("expected no error building scheme, got %v", err)
- }
-
- dbEnv := testlibDB.SetupDBEnv(t)
- testDB := db.DB{DbMap: dbEnv.DbMap}
- defer dbEnv.Close()
-
- if err := testDB.CreateTable(
- testDB.AddTable(limes.Commitment{}),
- testDB.AddTable(nova.Flavor{}),
- testDB.AddTable(nova.Server{}),
- ); err != nil {
- t.Fatalf("expected no error creating tables, got %v", err)
- }
-
- // Flavors: hana_small (4 vcpu, 16384 MB ram, 100 GB disk)
- // hana_large_v2 (16 vcpu, 65536 MB ram, 400 GB disk)
- if err := testDB.Insert(
- &nova.Flavor{ID: "f1", Name: "hana_small", VCPUs: 4, RAM: 16384, Disk: 100},
- &nova.Flavor{ID: "f2", Name: "hana_large_v2", VCPUs: 16, RAM: 65536, Disk: 400},
- &nova.Flavor{ID: "f3", Name: "general_medium", VCPUs: 8, RAM: 32768, Disk: 200},
- ); err != nil {
- t.Fatalf("expected no error inserting flavors, got %v", err)
- }
-
- // Commitments across two AZs to verify per-AZ aggregation:
- // project-A: 3 x hana_small in az1 (cascade-lake)
- // project-B: 2 x hana_large_v2 in az1 (sapphire-rapids)
- // project-A: 4 x hana_small in az2 (cascade-lake) — separate AZ bucket
- // project-C: 1 x hana_k_foo in az1 — hana_k_ prefix, should be skipped
- // project-D: 1 x general_medium — not hana_, should be skipped
- // project-A: 10 x hana_small pending — should be excluded (wrong status)
- // project-E: 2 x hana_small in az1 — running will exceed this (over-used, no metric)
- // project-F: 3 x hana_large_v2 in az2 — running exactly equals this (fully used, no metric)
- if err := testDB.Insert(
- &limes.Commitment{ID: 1, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "project-A"},
- &limes.Commitment{ID: 2, ServiceType: "compute", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "project-B"},
- &limes.Commitment{ID: 3, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az2", Amount: 4, Status: "guaranteed", ProjectID: "project-A"},
- &limes.Commitment{ID: 4, ServiceType: "compute", ResourceName: "instances_hana_k_foo", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "project-C"},
- &limes.Commitment{ID: 5, ServiceType: "compute", ResourceName: "instances_general_medium", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "project-D"},
- &limes.Commitment{ID: 6, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 10, Status: "pending", ProjectID: "project-A"},
- &limes.Commitment{ID: 7, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "project-E"},
- &limes.Commitment{ID: 8, ServiceType: "compute", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az2", Amount: 3, Status: "confirmed", ProjectID: "project-F"},
- ); err != nil {
- t.Fatalf("expected no error inserting commitments, got %v", err)
- }
-
- // Running servers:
- // project-A/az1: 1 hana_small ACTIVE, 1 DELETED (ignored) -> 2 unused in az1
- // project-B/az1: 0 hana_large_v2 -> 2 unused in az1
- // project-A/az2: 1 hana_small ACTIVE -> 3 unused in az2
- // project-E/az1: 5 hana_small ACTIVE -> 5 > 2 committed -> 0 unused (over-used, clamped)
- // project-F/az2: 3 hana_large_v2 ACTIVE -> 3 == 3 committed -> 0 unused (fully used, clamped)
- if err := testDB.Insert(
- &nova.Server{ID: "s1", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s2", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "DELETED"},
- &nova.Server{ID: "s3", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"},
- &nova.Server{ID: "s4", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s5", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s6", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s7", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s8", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
- &nova.Server{ID: "s9", TenantID: "project-F", FlavorName: "hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"},
- &nova.Server{ID: "s10", TenantID: "project-F", FlavorName: "hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"},
- &nova.Server{ID: "s11", TenantID: "project-F", FlavorName: "hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"},
- ); err != nil {
- t.Fatalf("expected no error inserting servers, got %v", err)
- }
-
- k8sClient := fake.NewClientBuilder().
- WithScheme(scheme).
- WithRuntimeObjects(
- &v1alpha1.Knowledge{ObjectMeta: v1.ObjectMeta{Name: "host-details"}},
- ).
- Build()
-
- kpi := &VMwareResourceCommitmentsKPI{}
- if err := kpi.Init(&testDB, k8sClient, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch)
- close(ch)
-
- type UnusedMetric struct {
- Resource string
- AZ string
- Arch string
- Value float64
- }
-
- actual := make(map[string]UnusedMetric)
- for metric := range ch {
- if getMetricName(metric.Desc().String()) != "cortex_vmware_hana_unused_instance_commitments" {
- continue
- }
- var m prometheusgo.Metric
- if err := metric.Write(&m); err != nil {
- t.Fatalf("failed to write metric: %v", err)
- }
- labels := make(map[string]string)
- for _, lbl := range m.Label {
- labels[lbl.GetName()] = lbl.GetValue()
- }
- key := labels["resource"] + "/" + labels["availability_zone"] + "/" + labels["cpu_architecture"]
- if _, exists := actual[key]; exists {
- t.Fatalf("duplicate metric key %q (resource=%q, availability_zone=%q, cpu_architecture=%q)",
- key, labels["resource"], labels["availability_zone"], labels["cpu_architecture"])
- }
- actual[key] = UnusedMetric{
- Resource: labels["resource"],
- AZ: labels["availability_zone"],
- Arch: labels["cpu_architecture"],
- Value: m.GetGauge().GetValue(),
- }
- }
-
- // project-A/az1: 2 unused hana_small (cascade-lake) -> cpu=2*4=8, ram=2*16384=32768, disk=2*100=200
- // project-B/az1: 2 unused hana_large_v2 (sapphire-rapids) -> cpu=2*16=32, ram=2*65536=131072, disk=2*400=800
- // project-A/az2: 3 unused hana_small (cascade-lake) -> cpu=3*4=12, ram=3*16384=49152, disk=3*100=300
- // project-E/az1: 5 running > 2 committed hana_small -> clamped to 0, no metric emitted
- // project-F/az2: 3 running == 3 committed hana_large_v2 -> clamped to 0, no metric emitted
- expected := map[string]UnusedMetric{
- "cpu/az1/cascade-lake": {Resource: "cpu", AZ: "az1", Arch: "cascade-lake", Value: 8},
- "ram/az1/cascade-lake": {Resource: "ram", AZ: "az1", Arch: "cascade-lake", Value: 32768},
- "disk/az1/cascade-lake": {Resource: "disk", AZ: "az1", Arch: "cascade-lake", Value: 200},
- "cpu/az1/sapphire-rapids": {Resource: "cpu", AZ: "az1", Arch: "sapphire-rapids", Value: 32},
- "ram/az1/sapphire-rapids": {Resource: "ram", AZ: "az1", Arch: "sapphire-rapids", Value: 131072},
- "disk/az1/sapphire-rapids": {Resource: "disk", AZ: "az1", Arch: "sapphire-rapids", Value: 800},
- "cpu/az2/cascade-lake": {Resource: "cpu", AZ: "az2", Arch: "cascade-lake", Value: 12},
- "ram/az2/cascade-lake": {Resource: "ram", AZ: "az2", Arch: "cascade-lake", Value: 49152},
- "disk/az2/cascade-lake": {Resource: "disk", AZ: "az2", Arch: "cascade-lake", Value: 300},
- }
-
- if len(actual) != len(expected) {
- t.Errorf("expected %d metrics, got %d: %v", len(expected), len(actual), actual)
- }
- for key, exp := range expected {
- got, ok := actual[key]
- if !ok {
- t.Errorf("missing metric %q", key)
- continue
- }
- if !reflect.DeepEqual(exp, got) {
- t.Errorf("metric %q: expected %+v, got %+v", key, exp, got)
- }
- }
-}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go
similarity index 50%
rename from internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go
rename to internal/knowledge/kpis/plugins/infrastructure/shared.go
index d92e8c3c2..4c011492c 100644
--- a/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go
@@ -4,6 +4,7 @@
package infrastructure
import (
+ "fmt"
"regexp"
"strconv"
@@ -78,3 +79,57 @@ type collectedVMwareMetric struct {
Labels map[string]string
Value float64
}
+
+// kvmFlavorPattern matches KVM flavors where the second underscore-delimited
+// segment is "k" (e.g. "m1_k_small", "hana_k_large").
+var kvmFlavorPattern = regexp.MustCompile(`^[^_]+_k_`)
+
+// isKVMFlavor reports whether flavorName belongs to a KVM hypervisor.
+func isKVMFlavor(name string) bool {
+ return kvmFlavorPattern.MatchString(name)
+}
+
+// cpuArchitectureRule maps a flavor name regex to a CPU architecture label.
+type cpuArchitectureRule struct {
+ pattern *regexp.Regexp
+ arch string
+}
+
+// flavorCPUArchitectureRules maps flavor name patterns to CPU architecture labels in priority order.
+// The first matching rule wins; defaultCPUArch is used when none match.
+var flavorCPUArchitectureRules = []cpuArchitectureRule{
+ {regexp.MustCompile(`_v2$`), "sapphire-rapids"},
+}
+
+const defaultCPUArchitecture = "cascade-lake"
+
+// flavorCPUArchitecture derives the CPU architecture label from a flavor name.
+func flavorCPUArchitecture(flavorName string) string {
+ for _, rule := range flavorCPUArchitectureRules {
+ if rule.pattern.MatchString(flavorName) {
+ return rule.arch
+ }
+ }
+ return defaultCPUArchitecture
+}
+
+// bytesPerUnit maps memory unit strings to their byte multipliers.
+var bytesPerUnit = map[string]float64{
+ "": 1,
+ "B": 1,
+ "KiB": 1024,
+ "MB": 1024 * 1024,
+ "MiB": 1024 * 1024,
+ "GB": 1024 * 1024 * 1024,
+ "GiB": 1024 * 1024 * 1024,
+ "TiB": 1024 * 1024 * 1024 * 1024,
+}
+
+// bytesFromUnit converts an amount in the given unit to bytes.
+func bytesFromUnit(amount float64, unit string) (float64, error) {
+ multiplier, ok := bytesPerUnit[unit]
+ if !ok {
+ return 0, fmt.Errorf("unknown memory unit: %s", unit)
+ }
+ return amount * multiplier, nil
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go
new file mode 100644
index 000000000..dc720d159
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go
@@ -0,0 +1,78 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import "testing"
+
+func TestIsKVMFlavor(t *testing.T) {
+ tests := []struct {
+ flavor string
+ want bool
+ }{
+ {"m1_k_small", true},
+ {"hana_k_large", true},
+ {"hana_small", false},
+ {"hana_c128_m1600", false},
+ {"hana_c128_m1600_v2", false},
+ {"small", false},
+ {"m1_large", false},
+ }
+ for _, tt := range tests {
+ if got := isKVMFlavor(tt.flavor); got != tt.want {
+ t.Errorf("isKVMFlavor(%q) = %v, want %v", tt.flavor, got, tt.want)
+ }
+ }
+}
+
+func TestFlavorCPUArchitecture(t *testing.T) {
+ tests := []struct {
+ flavor string
+ want string
+ }{
+ {"hana_c128_m1600_v2", "sapphire-rapids"},
+ {"hana_c256_m3200_v2", "sapphire-rapids"},
+ {"hana_c128_m1600", "cascade-lake"},
+ {"hana_small", "cascade-lake"},
+ }
+ for _, tt := range tests {
+ if got := flavorCPUArchitecture(tt.flavor); got != tt.want {
+ t.Errorf("flavorCPUArchitecture(%q) = %q, want %q", tt.flavor, got, tt.want)
+ }
+ }
+}
+
+func TestVmwareBytesFromUnit(t *testing.T) {
+ tests := []struct {
+ amount float64
+ unit string
+ want float64
+ errMsg string
+ }{
+ {1024, "MiB", 1024 * 1024 * 1024, ""},
+ {1024, "MB", 1024 * 1024 * 1024, ""},
+ {2, "GiB", 2 * 1024 * 1024 * 1024, ""},
+ {2, "GB", 2 * 1024 * 1024 * 1024, ""},
+ {1, "TiB", 1024 * 1024 * 1024 * 1024, ""},
+ {512, "KiB", 512 * 1024, ""},
+ {100, "B", 100, ""},
+ {100, "", 100, ""},
+ {1, "TB", 0, "unknown memory unit: TB"},
+ }
+ for _, tt := range tests {
+ got, err := bytesFromUnit(tt.amount, tt.unit)
+ if tt.errMsg != "" {
+ if err == nil || err.Error() != tt.errMsg {
+ t.Errorf("vmwareBytesFromUnit(%v, %q): expected error %q, got %v", tt.amount, tt.unit, tt.errMsg, err)
+ }
+ continue
+ }
+ if err != nil {
+ t.Errorf("vmwareBytesFromUnit(%v, %q): unexpected error: %v", tt.amount, tt.unit, err)
+ continue
+ }
+ if got != tt.want {
+ t.Errorf("vmwareBytesFromUnit(%v, %q) = %f, want %f", tt.amount, tt.unit, got, tt.want)
+ }
+ }
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
index 2d48b9737..16fcac857 100644
--- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go
@@ -115,8 +115,16 @@ func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) {
hostLabels := host.getHostLabels()
hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName)
- memoryUsageBytes := projectCapacityUsage.TotalRAMMB * 1024 * 1024
- diskUsageBytes := projectCapacityUsage.TotalDiskGB * 1024 * 1024 * 1024
+ memoryUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalRAMMB, "MB")
+ if err != nil {
+ slog.Error("vmware_project_utilization: failed to convert memory to bytes", "err", err)
+ continue
+ }
+ diskUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalDiskGB, "GB")
+ if err != nil {
+ slog.Error("vmware_project_utilization: failed to convert disk to bytes", "err", err)
+ continue
+ }
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...)
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, memoryUsageBytes, append(hostLabels, "memory")...)
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go
new file mode 100644
index 000000000..0d3d5d3ed
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go
@@ -0,0 +1,271 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "log/slog"
+ "strings"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// This KPI tracks committed resources in the VMware environment, based on commitments provided by Limes.
+// For KVM we can map a commitment to a reservation on a specific host. In VMware this is not possible.
+// For general purpose workloads, customers can commit specific amounts of resources.
+// For HANA workloads customers commit a certain number of HANA instances (based on flavor).
+// Like this it is possible to determine the workload type of a commitment.
+// For general purpose workloads it's not possible to differentiate the cpu architecture. To avoid weird behavior in a dashboard we don't export this label for the metric.
+// For HANA flavors the cpu architecture is part of the flavor name (_v2 suffix for sapphire rapids, without suffix for cascade lake).
+// For both types of workload, however, we cannot determine on which host the commitment is fulfilled.
+type VMwareResourceCommitmentsKPI struct {
+ // BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client.
+ plugins.BaseKPI[struct{}]
+
+ unusedGeneralPurposeCommitmentsPerProject *prometheus.Desc
+ unusedHanaCommittedResourcesPerProject *prometheus.Desc
+}
+
+func (k *VMwareResourceCommitmentsKPI) GetName() string {
+ return "vmware_resource_commitments_kpi"
+}
+
+func (k *VMwareResourceCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
+ if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
+ return err
+ }
+
+ k.unusedGeneralPurposeCommitmentsPerProject = prometheus.NewDesc(
+ "cortex_vmware_commitments_general_purpose",
+ "Committed general purpose resources that are currently unused. CPU (resource=cpu) in vCPUs, memory (resource=ram) in bytes.",
+ []string{"availability_zone", "resource", "project_id"}, nil,
+ )
+ k.unusedHanaCommittedResourcesPerProject = prometheus.NewDesc(
+ "cortex_vmware_commitments_hana_resources",
+ "Total committed HANA instances capacity that is currently unused, translated to resources. CPU in vCPUs, memory and disk in bytes.",
+ []string{"availability_zone", "cpu_architecture", "resource", "project_id"}, nil,
+ )
+ return nil
+}
+
+func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) {
+ ch <- k.unusedGeneralPurposeCommitmentsPerProject
+ ch <- k.unusedHanaCommittedResourcesPerProject
+}
+
+func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) {
+ if k.DB == nil {
+ return
+ }
+
+ flavorsByName, err := k.getFlavorsByName()
+ if err != nil {
+ slog.Error("vmware_resource_commitments: failed to load flavors", "err", err)
+ return
+ }
+
+ k.collectGeneralPurpose(ch, flavorsByName)
+ k.collectHana(ch, flavorsByName)
+}
+
+// getFlavorsByName loads all flavors and returns them keyed by name.
+func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) {
+ var flavors []nova.Flavor
+ if _, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()); err != nil {
+ return nil, err
+ }
+ byName := make(map[string]nova.Flavor, len(flavors))
+ for _, f := range flavors {
+ byName[f.Name] = f
+ }
+ return byName, nil
+}
+
+// getGeneralPurposeCommitments loads confirmed/guaranteed cores and ram commitments.
+func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) {
+ var commitments []limes.Commitment
+ if _, err := k.DB.Select(&commitments, `
+ SELECT * FROM `+limes.Commitment{}.TableName()+`
+ WHERE service_type = 'compute'
+ AND resource_name IN ('cores', 'ram')
+ AND status IN ('confirmed', 'guaranteed')
+ `); err != nil {
+ return nil, err
+ }
+ return commitments, nil
+}
+
+// getGeneralPurposeServers loads running non-HANA servers for general purpose usage accounting.
+// KVM-specific flavors are filtered out in Go since SQL LIKE cannot express the segment-exact pattern.
+func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) {
+ var servers []nova.Server
+ if _, err := k.DB.Select(&servers, `
+ SELECT * FROM `+nova.Server{}.TableName()+`
+ WHERE status NOT IN ('DELETED', 'ERROR')
+ AND flavor_name NOT LIKE 'hana_%'
+ `); err != nil {
+ return nil, err
+ }
+ result := make([]nova.Server, 0, len(servers))
+ for _, s := range servers {
+ if !isKVMFlavor(s.FlavorName) {
+ result = append(result, s)
+ }
+ }
+ return result, nil
+}
+
+// getHanaInstanceCommitments loads confirmed/guaranteed HANA instance commitments.
+func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) {
+ var commitments []limes.Commitment
+ if _, err := k.DB.Select(&commitments, `
+ SELECT * FROM `+limes.Commitment{}.TableName()+`
+ WHERE service_type = 'compute'
+ AND resource_name LIKE 'instances_hana_%'
+ AND status IN ('confirmed', 'guaranteed')
+ `); err != nil {
+ return nil, err
+ }
+ return commitments, nil
+}
+
+// getRunningHanaServers loads all running HANA VMware servers (KVM HANA flavors excluded in Go).
+func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) {
+ var servers []nova.Server
+ if _, err := k.DB.Select(&servers, `
+ SELECT * FROM `+nova.Server{}.TableName()+`
+ WHERE status NOT IN ('DELETED', 'ERROR')
+ AND flavor_name LIKE 'hana_%'
+ `); err != nil {
+ return nil, err
+ }
+ result := make([]nova.Server, 0, len(servers))
+ for _, s := range servers {
+ if !isKVMFlavor(s.FlavorName) {
+ result = append(result, s)
+ }
+ }
+ return result, nil
+}
+
+// collectGeneralPurpose computes and emits unused general purpose committed resources per project.
+// Unused = committed - in-use (clamped to zero; zero values are not emitted).
+func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
+ commitments, err := k.getGeneralPurposeCommitments()
+ if err != nil {
+ slog.Error("vmware_resource_commitments: failed to load gp commitments", "err", err)
+ return
+ }
+ servers, err := k.getGeneralPurposeServers()
+ if err != nil {
+ slog.Error("vmware_resource_commitments: failed to load gp servers", "err", err)
+ return
+ }
+
+ type gpKey struct{ projectID, az, resource string }
+
+ committed := make(map[gpKey]float64)
+ for _, c := range commitments {
+ switch c.ResourceName {
+ case "cores":
+ committed[gpKey{c.ProjectID, c.AvailabilityZone, "cpu"}] += float64(c.Amount)
+ case "ram":
+ bytes, err := bytesFromUnit(float64(c.Amount), c.Unit)
+ if err != nil {
+ slog.Warn("vmware_resource_commitments: unknown ram unit", "unit", c.Unit, "err", err)
+ continue
+ }
+ committed[gpKey{c.ProjectID, c.AvailabilityZone, "ram"}] += bytes
+ }
+ }
+
+ used := make(map[gpKey]float64)
+ for _, s := range servers {
+ flavor, ok := flavorsByName[s.FlavorName]
+ if !ok {
+ slog.Warn("vmware_resource_commitments: gp flavor not found", "flavor", s.FlavorName)
+ continue
+ }
+ used[gpKey{s.TenantID, s.OSEXTAvailabilityZone, "cpu"}] += float64(flavor.VCPUs)
+ used[gpKey{s.TenantID, s.OSEXTAvailabilityZone, "ram"}] += float64(flavor.RAM) * 1024 * 1024
+ }
+
+ for key, committedAmt := range committed {
+ unused := committedAmt - used[key]
+ if unused <= 0 {
+ continue
+ }
+ ch <- prometheus.MustNewConstMetric(
+ k.unusedGeneralPurposeCommitmentsPerProject,
+ prometheus.GaugeValue,
+ unused,
+ key.az, key.resource, key.projectID,
+ )
+ }
+}
+
+// collectHana computes and emits unused committed HANA instance resources per project.
+// Each HANA instance commitment is compared against running servers; the remainder is
+// translated to cpu/ram/disk capacity using the flavor spec.
+func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
+ commitments, err := k.getHanaInstanceCommitments()
+ if err != nil {
+ slog.Error("vmware_resource_commitments: failed to load hana commitments", "err", err)
+ return
+ }
+ servers, err := k.getRunningHanaServers()
+ if err != nil {
+ slog.Error("vmware_resource_commitments: failed to load hana servers", "err", err)
+ return
+ }
+
+ type serverKey struct{ projectID, flavorName, az string }
+ running := make(map[serverKey]uint64, len(servers))
+ for _, s := range servers {
+ running[serverKey{s.TenantID, s.FlavorName, s.OSEXTAvailabilityZone}]++
+ }
+
+ type commitKey struct{ projectID, flavorName, az, cpuArch string }
+ committedInstances := make(map[commitKey]uint64)
+ for _, c := range commitments {
+ flavorName := strings.TrimPrefix(c.ResourceName, "instances_")
+ if isKVMFlavor(flavorName) {
+ continue
+ }
+ key := commitKey{c.ProjectID, flavorName, c.AvailabilityZone, flavorCPUArchitecture(flavorName)}
+ committedInstances[key] += c.Amount
+ }
+
+ type resourceKey struct{ projectID, az, cpuArch, resource string }
+ totals := make(map[resourceKey]float64)
+ for ck, total := range committedInstances {
+ run := running[serverKey{ck.projectID, ck.flavorName, ck.az}]
+ if run >= total {
+ continue
+ }
+ unused := total - run
+ flavor, ok := flavorsByName[ck.flavorName]
+ if !ok {
+ slog.Warn("vmware_resource_commitments: hana flavor not found", "flavor", ck.flavorName)
+ continue
+ }
+ totals[resourceKey{ck.projectID, ck.az, ck.cpuArch, "cpu"}] += float64(unused) * float64(flavor.VCPUs)
+ totals[resourceKey{ck.projectID, ck.az, ck.cpuArch, "ram"}] += float64(unused) * float64(flavor.RAM) * 1024 * 1024
+ totals[resourceKey{ck.projectID, ck.az, ck.cpuArch, "disk"}] += float64(unused) * float64(flavor.Disk) * 1024 * 1024 * 1024
+ }
+
+ for key, value := range totals {
+ ch <- prometheus.MustNewConstMetric(
+ k.unusedHanaCommittedResourcesPerProject,
+ prometheus.GaugeValue,
+ value,
+ key.az, key.cpuArch, key.resource, key.projectID,
+ )
+ }
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go
new file mode 100644
index 000000000..6616dc558
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go
@@ -0,0 +1,523 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ prometheusgo "github.com/prometheus/client_model/go"
+)
+
+func setupResourceCommitmentsDB(t *testing.T) (testDB *db.DB, cleanup func()) {
+ t.Helper()
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB = &db.DB{DbMap: dbEnv.DbMap}
+ if err := testDB.CreateTable(
+ testDB.AddTable(limes.Commitment{}),
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("failed to create tables: %v", err)
+ }
+ return testDB, dbEnv.Close
+}
+
+// collectResourceCommitmentsMetrics runs the KPI and returns all emitted metrics keyed by
+// "metricName|az|cpu_architecture|resource|project_id". GP metrics have an empty cpu_architecture
+// segment since the descriptor does not include that label.
+func collectResourceCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]float64 {
+ t.Helper()
+ kpi := &VMwareResourceCommitmentsKPI{}
+ if err := kpi.Init(testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("failed to init KPI: %v", err)
+ }
+ ch := make(chan prometheus.Metric, 200)
+ kpi.Collect(ch)
+ close(ch)
+
+ result := make(map[string]float64)
+ for m := range ch {
+ var pm prometheusgo.Metric
+ if err := m.Write(&pm); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+ lbls := make(map[string]string)
+ for _, lp := range pm.Label {
+ lbls[lp.GetName()] = lp.GetValue()
+ }
+ name := getMetricName(m.Desc().String())
+ key := name + "|" + lbls["availability_zone"] + "|" + lbls["cpu_architecture"] + "|" + lbls["resource"] + "|" + lbls["project_id"]
+ result[key] = pm.GetGauge().GetValue()
+ }
+ return result
+}
+
+// gpKey builds the expected map key for a general-purpose metric.
+// cpu_architecture is always empty because the GP metric descriptor omits that label.
+func gpKey(az, resource, projectID string) string {
+ return "cortex_vmware_commitments_general_purpose|" + az + "||" + resource + "|" + projectID
+}
+
+// hKey builds the expected map key for a HANA metric.
+func hKey(az, cpuArch, resource, projectID string) string {
+ return "cortex_vmware_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + projectID
+}
+
+func TestVMwareResourceCommitmentsKPI_Init(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ kpi := &VMwareResourceCommitmentsKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+}
+func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) {
+ tests := []struct {
+ name string
+ commitments []limes.Commitment
+ servers []nova.Server
+ flavors []nova.Flavor
+ want map[string]float64
+ }{
+ {
+ name: "no commitments produces no metrics",
+ want: map[string]float64{},
+ },
+ {
+ name: "fully unused cores commitment",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 10,
+ },
+ },
+ {
+ name: "fully unused ram commitment with MiB unit",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 1024, Unit: "MiB", Status: "confirmed", ProjectID: "p1"},
+ },
+ want: map[string]float64{
+ gpKey("az1", "ram", "p1"): 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "fully unused ram commitment with GiB unit",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 2, Unit: "GiB", Status: "confirmed", ProjectID: "p1"},
+ },
+ want: map[string]float64{
+ gpKey("az1", "ram", "p1"): 2 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "partial cpu usage reduces unused",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ {ID: "s2", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "small", VCPUs: 3, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 4, // 10 - 2×3 = 4
+ },
+ },
+ {
+ name: "fully covered cpu produces no metric",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 4, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "small", VCPUs: 4, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "over-used cpu produces no metric",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "large", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "large", VCPUs: 8, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "hana servers not counted against gp commitments",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 8, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 10,
+ },
+ },
+ {
+ name: "kvm servers not counted against gp commitments",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "m1_k_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "m1_k_small", VCPUs: 4, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 10,
+ },
+ },
+ {
+ name: "DELETED and ERROR servers excluded from usage",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "DELETED"},
+ {ID: "s2", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ERROR"},
+ {ID: "s3", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "small", VCPUs: 2, RAM: 0, Disk: 0},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 8, // only 1 ACTIVE × 2 subtracted
+ },
+ },
+ {
+ name: "guaranteed commitments counted",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 5, Status: "guaranteed", ProjectID: "p1"},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 5,
+ },
+ },
+ {
+ name: "pending commitments excluded",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 100, Status: "pending", ProjectID: "p1"},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "non-compute service type excluded",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "network", ResourceName: "cores", AvailabilityZone: "az1", Amount: 100, Status: "confirmed", ProjectID: "p1"},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "multiple commitments per project and AZ summed",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"},
+ {ID: 2, UUID: "c2", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "p1"},
+ {ID: 3, UUID: "c3", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az2", Amount: 20, Status: "confirmed", ProjectID: "p1"},
+ {ID: 4, UUID: "c4", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 8, Status: "confirmed", ProjectID: "p2"},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 15,
+ gpKey("az2", "cpu", "p1"): 20,
+ gpKey("az1", "cpu", "p2"): 8,
+ },
+ },
+ {
+ name: "cpu and ram unused reported separately",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 8, Status: "confirmed", ProjectID: "p1"},
+ {ID: 2, UUID: "c2", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 512, Unit: "MiB", Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "medium", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "medium", VCPUs: 2, RAM: 256, Disk: 0},
+ },
+ want: map[string]float64{
+ gpKey("az1", "cpu", "p1"): 6, // 8 - 1×2
+				gpKey("az1", "ram", "p1"): (512 - 256) * 1024 * 1024, // 512 MiB committed - 256 MiB used (flavor.RAM is in MiB)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ testDB, cleanup := setupResourceCommitmentsDB(t)
+ defer cleanup()
+
+ var rows []any
+ for i := range tt.commitments {
+ rows = append(rows, &tt.commitments[i])
+ }
+ for i := range tt.servers {
+ rows = append(rows, &tt.servers[i])
+ }
+ for i := range tt.flavors {
+ rows = append(rows, &tt.flavors[i])
+ }
+ if len(rows) > 0 {
+ if err := testDB.Insert(rows...); err != nil {
+ t.Fatalf("failed to insert test data: %v", err)
+ }
+ }
+
+ got := collectResourceCommitmentsMetrics(t, testDB)
+
+ if len(got) != len(tt.want) {
+ t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got)
+ }
+ for k, wantVal := range tt.want {
+ gotVal, ok := got[k]
+ if !ok {
+ t.Errorf("missing metric %q", k)
+ continue
+ }
+ if gotVal != wantVal {
+ t.Errorf("metric %q: expected %f, got %f", k, wantVal, gotVal)
+ }
+ }
+ })
+ }
+}
+
+func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) {
+ tests := []struct {
+ name string
+ commitments []limes.Commitment
+ servers []nova.Server
+ flavors []nova.Flavor
+ want map[string]float64
+ }{
+ {
+ name: "no commitments produces no metrics",
+ want: map[string]float64{},
+ },
+ {
+ name: "fully unused hana instance commitment",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128,
+ hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "partial hana usage reduces unused instances",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "hana_c128_m1600", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128,
+ hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "fully used hana produces no metric",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "over-used hana produces no metric",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "sapphire-rapids arch from _v2 suffix",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c256_m3200_v2", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_c256_m3200_v2", VCPUs: 256, RAM: 3276800, Disk: 200},
+ },
+ want: map[string]float64{
+ hKey("az1", "sapphire-rapids", "cpu", "p1"): 256,
+ hKey("az1", "sapphire-rapids", "ram", "p1"): 3276800 * 1024 * 1024,
+ hKey("az1", "sapphire-rapids", "disk", "p1"): 200 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "cascade-lake and sapphire-rapids aggregated separately",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ {ID: 2, UUID: "h2", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600_v2", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100},
+ {ID: "f2", Name: "hana_c128_m1600_v2", VCPUs: 128, RAM: 1638400, Disk: 100},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128,
+ hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024,
+ hKey("az1", "sapphire-rapids", "cpu", "p1"): 1 * 128,
+ hKey("az1", "sapphire-rapids", "ram", "p1"): 1 * 1638400 * 1024 * 1024,
+ hKey("az1", "sapphire-rapids", "disk", "p1"): 1 * 100 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "kvm hana commitments excluded",
+ commitments: []limes.Commitment{
+ // hana_k_large is a KVM HANA flavor — must be filtered out
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_k_large", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "p1"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_k_large", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "DELETED and ERROR hana servers excluded from running count",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "p1"},
+ },
+ servers: []nova.Server{
+ {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "DELETED"},
+ {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ERROR"},
+ {ID: "s3", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64, // 3 committed - 1 ACTIVE = 2 unused
+ hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "guaranteed hana commitments counted",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "guaranteed", ProjectID: "p1"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 64,
+ hKey("az1", "cascade-lake", "ram", "p1"): 819200 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 50 * 1024 * 1024 * 1024,
+ },
+ },
+ {
+ name: "unknown flavor is skipped without panic",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_nonexistent", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ },
+ want: map[string]float64{},
+ },
+ {
+ name: "multiple projects and AZs aggregated per bucket",
+ commitments: []limes.Commitment{
+ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"},
+ {ID: 2, UUID: "h2", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az2", Amount: 3, Status: "confirmed", ProjectID: "p1"},
+ {ID: 3, UUID: "h3", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p2"},
+ },
+ flavors: []nova.Flavor{
+ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50},
+ },
+ want: map[string]float64{
+ hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64,
+ hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024,
+ hKey("az2", "cascade-lake", "cpu", "p1"): 3 * 64,
+ hKey("az2", "cascade-lake", "ram", "p1"): 3 * 819200 * 1024 * 1024,
+ hKey("az2", "cascade-lake", "disk", "p1"): 3 * 50 * 1024 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "cpu", "p2"): 1 * 64,
+ hKey("az1", "cascade-lake", "ram", "p2"): 1 * 819200 * 1024 * 1024,
+ hKey("az1", "cascade-lake", "disk", "p2"): 1 * 50 * 1024 * 1024 * 1024,
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ testDB, cleanup := setupResourceCommitmentsDB(t)
+ defer cleanup()
+
+ var rows []any
+ for i := range tt.commitments {
+ rows = append(rows, &tt.commitments[i])
+ }
+ for i := range tt.servers {
+ rows = append(rows, &tt.servers[i])
+ }
+ for i := range tt.flavors {
+ rows = append(rows, &tt.flavors[i])
+ }
+ if len(rows) > 0 {
+ if err := testDB.Insert(rows...); err != nil {
+ t.Fatalf("failed to insert test data: %v", err)
+ }
+ }
+
+ got := collectResourceCommitmentsMetrics(t, testDB)
+
+ if len(got) != len(tt.want) {
+ t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got)
+ }
+ for k, wantVal := range tt.want {
+ gotVal, ok := got[k]
+ if !ok {
+ t.Errorf("missing metric %q", k)
+ continue
+ }
+ if gotVal != wantVal {
+ t.Errorf("metric %q: expected %f, got %f", k, wantVal, gotVal)
+ }
+ }
+ })
+ }
+}
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index c1a2b336c..19726a488 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -23,9 +23,9 @@ var supportedKPIs = map[string]plugins.KPI{
"vm_life_span_kpi": &compute.VMLifeSpanKPI{},
"vm_commitments_kpi": &compute.VMCommitmentsKPI{},
"vm_faults_kpi": &compute.VMFaultsKPI{},
- "vmware_commitments_kpi": &compute.VMwareResourceCommitmentsKPI{},
- "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{},
+ "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{},
+ "vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{},
"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
From 1c7250ea82b14863a0f2f79305708b6442dedf3a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 11:34:34 +0000
Subject: [PATCH 18/54] Bump cortex chart appVersions to sha-839845cf [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 4ab0c4e21..101552c3b 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-17050b2f"
+appVersion: "sha-839845cf"
icon: "https://example.com/icon.png"
dependencies: []
From dc6bbe7cf350c53059eb48c74f78013debd875f7 Mon Sep 17 00:00:00 2001
From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:24:19 +0200
Subject: [PATCH 19/54] Add changelog update command and workflow for release
PRs (#746)
Add changelog command and workflow that is triggered on PR closed on
release.
---------
Co-authored-by: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Co-authored-by: Philipp Matthes
---
.claude/commands/release.md | 164 ++++++++++++++++++++++++++
.claude/commands/update-changelog.md | 54 +++++++++
.coderabbit.yaml | 2 +-
.github/workflows/claude-release.yaml | 52 ++++++++
4 files changed, 271 insertions(+), 1 deletion(-)
create mode 100644 .claude/commands/release.md
create mode 100644 .claude/commands/update-changelog.md
create mode 100644 .github/workflows/claude-release.yaml
diff --git a/.claude/commands/release.md b/.claude/commands/release.md
new file mode 100644
index 000000000..38ff4a60f
--- /dev/null
+++ b/.claude/commands/release.md
@@ -0,0 +1,164 @@
+---
+allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch, Agent
+description: Release orchestrator — builds a digest of what changed in a release PR, opens a changelog PR, and references the bump PR. Usage: /release PR_NUMBER
+---
+
+# Release Orchestrator
+
+Your job is to orchestrate the release process for a given PR. This involves analyzing the PR's commits and changed files to build a structured digest of what changed, determining if there are any breaking changes, preparing a changelog, opening a PR to bump chart versions if needed, and updating the original PR description with the changelog and references to the new PRs.
+
+---
+
+## Phase 1: Collect — Build the release digest
+
+1. Fetch PR metadata:
+ ```
+ gh pr view $ARGUMENTS --json number,title,body,commits,files
+ ```
+
+2. For each commit SHA in the PR, inspect the changed files:
+ ```
+   git show <sha> --name-only --format="%H %s"
+ ```
+
+3. Classify each commit to a component:
+ - Cortex shim: code touching the shim layer (internal/shim and cmd/shim)
+ - Cortex postgres: code touching the postgres docker image, or its helm chart
+ - Cortex core: core code touching anything else: the manager or external scheduler logic of cortex
+ - General: CI, tooling, docs, or other non-code changes
+
+4. Finally, read through the cortex helm charts in the helm/ folder, and check which ones have updated appVersions, indicating a new Docker image is available and that the chart should be included in the release notes.
+
+Produce a structured digest in this exact format — the subagents depend on it:
+
+```
+## Release Digest — PR #NNN "{title}"
+
+### Changed Charts
+- cortex v1.2.3 (sha-xxxxxxxx)
+- cortex-postgres v1.2.3 (sha-xxxxxxxx)
+- cortex-nova v1.2.3 — includes cortex v1.2.3, cortex-postgres v1.2.3
+
+### Commits by Component
+
+#### cortex core
+-
+
+#### cortex postgres
+-
+
+#### cortex shim
+-
+
+#### General
+-
+```
+
+**Important**: Do NOT skip or shallow this phase. Read actual file diffs. The subagents depend entirely on the quality of this digest.
+
+---
+
+## Phase 2: Determine Breaking Changes and Prepare a Changelog
+
+Reason for each change by looking at the commit's diff, if it is a breaking change that requires special attention.
+
+**Important**: Do NOT skip or shallow this phase. Read actual file diffs. The PR reviewers depend entirely on the quality of this analysis to know what to focus on in their review.
+
+### When is a change "breaking"?
+
+A change should be classified as "breaking" if it meets any of the following criteria:
+
+- It changes or removes the public API of any component (e.g., CRD schemas, CLI flags, or REST API endpoints). Note: additions to the public API are not breaking.
+- It requires a config format change (e.g., renaming or removing a values.yaml key, changing the expected format of a value, etc)
+
+Once the digest is complete, read each agent file, then dispatch all three **in parallel** using the Agent tool in a single message. Each subagent operates independently — do not wait for one before starting the others.
+
+### Prepare the changelog
+
+Generate a changelog following this template:
+
+```markdown
+# Changelog
+
+## YYYY-MM-DD — [#NNN](<pr-url>)
+
+### <chart> v<version> (<sha>)
+
+Breaking changes:
+-
+
+Non-breaking changes:
+-
+
+... repeat for each changed chart ...
+
+### General
+
+Breaking changes:
+-
+
+Non-breaking changes:
+-
+```
+
+One `###` section per changed chart only. For bundle sections, list which library versions they include, then any bundle-specific changes (values.yaml keys, template/CRD changes). Omit `### General` if empty. No commit SHAs, one line per bullet.
+
+Example:
+```markdown
+# Changelog
+
+## 2026-04-24 — [#123](https://github.com/cobaltcore-dev/cortex/pull/123)
+
+### cortex v0.0.43 (sha-xxxxxxxx)
+
+Breaking changes:
+- Check hypervisor resources against reservations
+
+Non-breaking changes:
+- Commitments usage API uses postgres database instead of calling nova
+
+### cortex-postgres v0.5.14 (sha-xxxxxxxx)
+
+Non-breaking changes:
+- Add commitments table migration
+
+### cortex-nova v0.0.56 (sha-xxxxxxxx)
+
+Includes updated charts cortex v0.0.43 and cortex-postgres v0.5.14.
+
+Non-breaking changes:
+- values.yaml: added `reservations.enabled` (default: false)
+
+### General
+
+Non-breaking changes:
+- Update golangci-lint to v2.1.0
+```
+
+## Phase 3: Bump Chart Versions
+
+Prepare chart version bumps so GitHub pushes bumped charts to the registry immediately after the release PR is merged.
+
+For each changed library chart, patch-bump its `version` in `helm/library/<chart>/Chart.yaml` (e.g. `0.0.43` → `0.0.44`), if there was no breaking change, otherwise minor-bump it (e.g. `0.0.43` → `0.1.0`). Do not touch `appVersion`. Then update the matching `dependencies[].version` entry in every `helm/bundles/*/Chart.yaml` that references it.
+
+Open a single PR to `main` with all the bumps, branch `release/bump-charts-<pr-number>`, noting in the body that it should be merged before the release PR. Use the pull-request-creator agent for this subtask, and include the chart changes in the motivation so they are included in the PR description.
+
+## Phase 4: Update the PR Description
+
+Use `gh pr edit` with `--body` to update the PR description with the changelog. It is fine for release pull request descriptions to utilize markdown formatting. Reference the opened bump PR in the description as well as a dependency.
+
+## Phase 5: Create a Changelog PR
+
+If the CHANGELOG.md does not exist, create it with a `# Changelog` header. Then create a new PR to `main` with branch `release/changelog-<pr-number>`, title `Update changelog for release PR #<pr-number>`, and a body noting it should be merged after the release PR. Use the pull-request-creator agent for this subtask.
+
+## Phase 6: Summarize — Report what happened
+
+After all subagents return, produce a short summary:
+
+```
+## Release #NNN Post-Open Summary
+
+- PR description updated with changelog and bump PR reference
+- Bump PR #XXX opened to update chart versions
+- Changelog PR #YYY opened to update CHANGELOG.md
+```
diff --git a/.claude/commands/update-changelog.md b/.claude/commands/update-changelog.md
new file mode 100644
index 000000000..79a5e1f44
--- /dev/null
+++ b/.claude/commands/update-changelog.md
@@ -0,0 +1,54 @@
+---
+allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch
+description: Create a changelog entry for a merged release PR and open a PR to main. Usage: /update-changelog PR_NUMBER
+---
+
+A release PR (#$ARGUMENTS) was merged into the `release` branch. Create a changelog entry for it and open a PR to `main`.
+
+To build the entry, use the PR's commit subjects (no diffs) and the changed Helm charts as your sources. Only include charts whose Chart.yaml actually changed in this PR.
+
+Format each entry as:
+
+## {merged_at date in UTC, formatted YYYY-MM-DD} — {PR title} ([#NNN](https://github.com/cobaltcore-dev/cortex/pull/NNN))
+
+One `###` section per changed chart: `### <chart> v<version> (<sha>)`
+Under each section, bullet the commit subjects that relate to that chart.
+
+Attribution: for each commit, inspect its changed files with `git show --name-only <sha>` and map to the chart whose files were touched:
+
+- `postgres/**` → cortex-postgres
+- `cmd/shim/**` or `internal/shim/**` → cortex-shim
+- `helm/bundles/cortex-<bundle>/**` → that specific bundle chart
+- anything else → cortex (core)
+
+Commits that only touch CI, docs, or tooling go into `### General`. Skip commits containing "[skip ci]" or that are pure version-bump messages.
+
+For bundle chart sections (helm/bundles/*), add a note listing which library chart versions they now include (read the bundle's Chart.yaml dependencies). Then inspect the actual diff of the bundle's own files with `git show <sha> -- helm/bundles/<bundle>/` for any commit that touched that bundle, and surface specific changes:
+
+- **values.yaml** changes: call out new, removed, or renamed keys and changed defaults
+- **templates/** or **crds/** changes: call out added, removed, or modified resources by kind and name
+
+Prepend the new entry below the `# Changelog` header in `CHANGELOG.md` (create the file if it doesn't exist). Then open a PR to `main` referencing this release PR.
+
+## Example
+
+```markdown
+## 2026-04-24 — Release libs cortex v0.0.43 + bundles v0.0.56 ([#722](https://github.com/cobaltcore-dev/cortex/pull/722))
+
+### cortex v0.0.43 (sha-xxxxxxxx)
+- Commitments usage API uses postgres database instead of calling nova
+- Check hypervisor resources against reservations
+- Add committed resource reservations to capacity calculation
+
+### cortex-postgres v0.5.14 (sha-xxxxxxxx)
+- Add commitments table migration
+
+### cortex-nova v0.0.56 (sha-xxxxxxxx)
+- Update nova bundle for committed reservations support
+
+### cortex-manila v0.0.56 (sha-xxxxxxxx)
+- Update manila bundle for committed reservations support
+
+### General
+- Update golangci-lint to v2.1.0
+```
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index d158def60..e9c45c1bc 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -1,2 +1,2 @@
reviews:
- high_level_summary: false
+ high_level_summary: false
\ No newline at end of file
diff --git a/.github/workflows/claude-release.yaml b/.github/workflows/claude-release.yaml
new file mode 100644
index 000000000..216716809
--- /dev/null
+++ b/.github/workflows/claude-release.yaml
@@ -0,0 +1,52 @@
+name: Claude Code Release Orchestrator
+
+on:
+ pull_request:
+ types: [opened, synchronize, reopened]
+ branches:
+ - release
+
+jobs:
+ release:
+ if: false # Temporarily disabled
+ runs-on: ubuntu-latest
+ concurrency:
+ group: changelog-release
+ cancel-in-progress: false
+ permissions:
+ contents: write
+ pull-requests: write
+ id-token: write
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v6
+
+ - name: Set up Go
+ uses: actions/setup-go@v6
+ with:
+ go-version-file: 'go.mod'
+
+ - uses: ./.github/actions/setup-claude-code-action
+
+ - uses: ./.github/actions/start-litellm-proxy
+ env:
+ AICORE_RESOURCE_GROUP: ${{ secrets.AICORE_RESOURCE_GROUP }}
+ AICORE_BASE_URL: ${{ secrets.AICORE_BASE_URL }}
+ AICORE_AUTH_URL: ${{ secrets.AICORE_AUTH_URL }}
+ AICORE_CLIENT_ID: ${{ secrets.AICORE_CLIENT_ID }}
+ AICORE_CLIENT_SECRET: ${{ secrets.AICORE_CLIENT_SECRET }}
+
+ - uses: ./.claude-code-action
+ with:
+ claude_args: |
+ --max-turns 1000
+ --permission-mode auto
+ --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch,Agent"
+ use_litellm: "true"
+ litellm_model: "sap/anthropic--claude-4.6-opus"
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ show_full_output: "true"
+ prompt: "/release ${{ github.event.pull_request.number }}"
+
+ - uses: ./.github/actions/stop-litellm-proxy
+ if: always()
From 643ae36cc9f2abfdeded6571749215496a14f1c5 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 12:34:07 +0000
Subject: [PATCH 20/54] Bump cortex chart appVersions to sha-dc6bbe7c [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 101552c3b..8e981e365 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-839845cf"
+appVersion: "sha-dc6bbe7c"
icon: "https://example.com/icon.png"
dependencies: []
From c9d8943a3fa9fdbe77b2180b7972503490998788 Mon Sep 17 00:00:00 2001
From: Markus Wieland <44964229+SoWieMarkus@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:41:35 +0200
Subject: [PATCH 21/54] refactor: vmware host capacity kpi (#775)
---
helm/bundles/cortex-nova/templates/kpis.yaml | 30 +-
.../compute/resource_capacity_kvm_test.go | 11 +
.../compute/resource_capacity_vmware.go | 201 -------
.../compute/resource_capacity_vmware_test.go | 503 ------------------
.../kpis/plugins/infrastructure/shared.go | 3 +-
.../plugins/infrastructure/shared_test.go | 89 +++-
.../infrastructure/vmware_host_capacity.go | 119 +++++
.../vmware_host_capacity_test.go | 335 ++++++++++++
.../vmware_project_utilization_test.go | 20 +-
internal/knowledge/kpis/supported_kpis.go | 2 +-
10 files changed, 572 insertions(+), 741 deletions(-)
delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go
delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go
create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go
diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index 22774c62a..6979b0e29 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -29,20 +29,6 @@ spec:
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
-metadata:
- name: vmware-host-capacity
-spec:
- schedulingDomain: nova
- impl: vmware_host_capacity_kpi
- dependencies:
- knowledges:
- - name: host-details
- - name: host-utilization
- description: |
- This KPI tracks the total, utilized, reserved and failover capacity of VMware hosts.
----
-apiVersion: cortex.cloud/v1alpha1
-kind: KPI
metadata:
name: host-running-vms
spec:
@@ -215,4 +201,18 @@ spec:
- name: nova-flavors
- name: limes-project-commitments
description: |
- This KPI tracks the resource commitments of projects running VMs on VMware hosts.
\ No newline at end of file
+ This KPI tracks the resource commitments of projects running VMs on VMware hosts.
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: KPI
+metadata:
+ name: vmware-host-capacity
+spec:
+ schedulingDomain: nova
+ impl: vmware_host_capacity_kpi
+ dependencies:
+ knowledges:
+ - name: host-details
+ - name: host-utilization
+ description: |
+ This KPI tracks the capacity and utilization of VMware hosts in terms of CPU, RAM, and disk resources.
\ No newline at end of file
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
index 6e9d38c7b..c233cfd4c 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
@@ -4,6 +4,7 @@
package compute
import (
+ "regexp"
"testing"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
@@ -38,6 +39,16 @@ type kvmMetricLabels struct {
Maintenance string
}
+var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`)
+
+func getMetricName(desc string) string {
+ match := fqNameRe.FindStringSubmatch(desc)
+ if len(match) > 1 {
+ return match[1]
+ }
+ return ""
+}
+
type kvmExpectedMetric struct {
Name string // metric family name (e.g. "cortex_kvm_host_capacity_total")
Labels kvmMetricLabels
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go
deleted file mode 100644
index 8bd2d4177..000000000
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go
+++ /dev/null
@@ -1,201 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
- "context"
- "log/slog"
- "strconv"
-
- "github.com/cobaltcore-dev/cortex/api/v1alpha1"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
- "sigs.k8s.io/controller-runtime/pkg/client"
-
- "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
- "github.com/cobaltcore-dev/cortex/pkg/conf"
- "github.com/prometheus/client_golang/prometheus"
-)
-
-type VMwareResourceCapacityKPI struct {
- // Common base for all KPIs that provides standard functionality.
- plugins.BaseKPI[struct{}] // No options passed through yaml config
-
- availableCapacityPerHost *prometheus.Desc
- totalCapacityPerHost *prometheus.Desc
-}
-
-func (VMwareResourceCapacityKPI) GetName() string {
- return "vmware_host_capacity_kpi"
-}
-
-func (k *VMwareResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
- if err := k.BaseKPI.Init(db, client, opts); err != nil {
- return err
- }
- k.availableCapacityPerHost = prometheus.NewDesc(
- "cortex_vmware_host_capacity_available",
- "Available capacity per resource on the hosts currently (individually by host).",
- []string{
- "compute_host",
- "resource",
- "availability_zone",
- "cpu_architecture",
- "workload_type",
- "enabled",
- "decommissioned",
- "external_customer",
- "pinned_projects",
- "disabled_reason",
- "pinned_project_ids",
- },
- nil,
- )
- k.totalCapacityPerHost = prometheus.NewDesc(
- "cortex_vmware_host_capacity_total",
- "Total resources available on the hosts currently (individually by host).",
- []string{
- "compute_host",
- "resource",
- "availability_zone",
- "cpu_architecture",
- "workload_type",
- "enabled",
- "decommissioned",
- "external_customer",
- "pinned_projects",
- "pinned_project_ids",
- },
- nil,
- )
- return nil
-}
-
-func (k *VMwareResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) {
- ch <- k.availableCapacityPerHost
- ch <- k.totalCapacityPerHost
-}
-
-func (k *VMwareResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
- hostDetailsKnowledge := &v1alpha1.Knowledge{}
- if err := k.Client.Get(
- context.Background(),
- client.ObjectKey{Name: "host-details"},
- hostDetailsKnowledge,
- ); err != nil {
- slog.Error("failed to get knowledge host-details", "err", err)
- return
- }
- hostDetails, err := v1alpha1.
- UnboxFeatureList[compute.HostDetails](hostDetailsKnowledge.Status.Raw)
- if err != nil {
- slog.Error("failed to unbox storage pool cpu usage", "err", err)
- return
- }
- detailsByComputeHost := make(map[string]compute.HostDetails)
- for _, detail := range hostDetails {
- detailsByComputeHost[detail.ComputeHost] = detail
- }
-
- hostUtilizationKnowledge := &v1alpha1.Knowledge{}
- if err := k.Client.Get(
- context.Background(),
- client.ObjectKey{Name: "host-utilization"},
- hostUtilizationKnowledge,
- ); err != nil {
- slog.Error("failed to get knowledge host-utilization", "err", err)
- return
- }
- hostUtilizations, err := v1alpha1.
- UnboxFeatureList[compute.HostUtilization](hostUtilizationKnowledge.Status.Raw)
- if err != nil {
- slog.Error("failed to unbox host utilization", "err", err)
- return
- }
-
- for _, utilization := range hostUtilizations {
- detail, exists := detailsByComputeHost[utilization.ComputeHost]
- if !exists {
- slog.Warn("host_available_capacity: missing host details for compute host", "compute_host", utilization.ComputeHost)
- continue
- }
- if detail.HypervisorType == "ironic" {
- continue // Ironic hosts do not run VMs/instances
- }
-
- if detail.HypervisorFamily != "vmware" {
- continue
- }
-
- if utilization.TotalRAMAllocatableMB == 0 || utilization.TotalVCPUsAllocatable == 0 || utilization.TotalDiskAllocatableGB == 0 {
- slog.Info(
- "Skipping host since placement is reporting zero allocatable resources",
- "metric", "cortex_available_capacity_per_host",
- "host", utilization.ComputeHost,
- "cpu", utilization.TotalVCPUsAllocatable,
- "ram", utilization.TotalRAMAllocatableMB,
- "disk", utilization.TotalDiskAllocatableGB,
- )
- continue
- }
-
- availableCPUs := float64(utilization.TotalVCPUsAllocatable - utilization.VCPUsUsed)
- availableRAMMB := float64(utilization.TotalRAMAllocatableMB - utilization.RAMUsedMB)
- availableDiskGB := float64(utilization.TotalDiskAllocatableGB - utilization.DiskUsedGB)
-
- k.exportCapacityMetricVMware(ch, "cpu", availableCPUs, utilization.TotalVCPUsAllocatable, detail)
- k.exportCapacityMetricVMware(ch, "ram", availableRAMMB, utilization.TotalRAMAllocatableMB, detail)
- k.exportCapacityMetricVMware(ch, "disk", availableDiskGB, utilization.TotalDiskAllocatableGB, detail)
- }
-}
-
-func (k *VMwareResourceCapacityKPI) exportCapacityMetricVMware(ch chan<- prometheus.Metric, resource string, available, total float64, host compute.HostDetails) {
- enabled := strconv.FormatBool(host.Enabled)
- decommissioned := strconv.FormatBool(host.Decommissioned)
- externalCustomer := strconv.FormatBool(host.ExternalCustomer)
- pinnedProjectIds := ""
- pinnedProjects := "false"
- if host.PinnedProjects != nil {
- pinnedProjectIds = *host.PinnedProjects
- pinnedProjects = "true"
- }
-
- disabledReason := "-"
- if host.DisabledReason != nil {
- disabledReason = *host.DisabledReason
- }
-
- ch <- prometheus.MustNewConstMetric(
- k.availableCapacityPerHost,
- prometheus.GaugeValue,
- available,
- host.ComputeHost,
- resource,
- host.AvailabilityZone,
- host.CPUArchitecture,
- host.WorkloadType,
- enabled,
- decommissioned,
- externalCustomer,
- pinnedProjects,
- disabledReason,
- pinnedProjectIds,
- )
-
- ch <- prometheus.MustNewConstMetric(
- k.totalCapacityPerHost,
- prometheus.GaugeValue,
- total,
- host.ComputeHost,
- resource,
- host.AvailabilityZone,
- host.CPUArchitecture,
- host.WorkloadType,
- enabled,
- decommissioned,
- externalCustomer,
- pinnedProjects,
- pinnedProjectIds,
- )
-}
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go
deleted file mode 100644
index 875be6357..000000000
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go
+++ /dev/null
@@ -1,503 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
- "reflect"
- "regexp"
- "testing"
-
- "github.com/cobaltcore-dev/cortex/api/v1alpha1"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
- "github.com/cobaltcore-dev/cortex/pkg/conf"
- testlib "github.com/cobaltcore-dev/cortex/pkg/testing"
- "github.com/prometheus/client_golang/prometheus"
- prometheusgo "github.com/prometheus/client_model/go"
- v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "sigs.k8s.io/controller-runtime/pkg/client/fake"
-)
-
-func TestVMwareResourceCapacityKPI_Init(t *testing.T) {
- kpi := &VMwareResourceCapacityKPI{}
- if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-}
-
-var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`)
-
-func getMetricName(desc string) string {
- match := fqNameRe.FindStringSubmatch(desc)
- if len(match) > 1 {
- return match[1]
- }
- return ""
-}
-
-func TestVMwareResourceCapacityKPI_Collect_AbsoluteMetric(t *testing.T) {
- scheme, err := v1alpha1.SchemeBuilder.Build()
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- hostDetails, err := v1alpha1.BoxFeatureList([]any{
- &compute.HostDetails{
- ComputeHost: "vmware-host",
- AvailabilityZone: "az1",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "vcenter",
- HypervisorFamily: "vmware",
- WorkloadType: "general-purpose",
- Enabled: true,
- Decommissioned: true,
- ExternalCustomer: true,
- DisabledReason: nil,
- PinnedProjects: nil,
- },
- // Skip this because it's not a VMware host
- &compute.HostDetails{
- ComputeHost: "kvm-host",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "qemu",
- HypervisorFamily: "kvm",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1,project2"),
- },
- // Skip this because placement doesn't report any capacity for this host
- &compute.HostDetails{
- ComputeHost: "vmware-host-2",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "qemu",
- HypervisorFamily: "vmware",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1,project2"),
- },
- // Skip this because it's a ironic host
- &compute.HostDetails{
- ComputeHost: "ironic-host",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "ironic",
- HypervisorFamily: "vmware",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1"),
- },
- })
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- hostUtilizations, err := v1alpha1.BoxFeatureList([]any{
- &compute.HostUtilization{
- ComputeHost: "vmware-host",
- TotalVCPUsAllocatable: 100,
- TotalRAMAllocatableMB: 200,
- TotalDiskAllocatableGB: 300,
- VCPUsUsed: 40,
- RAMUsedMB: 40,
- DiskUsedGB: 40,
- },
- &compute.HostUtilization{
- ComputeHost: "kvm-host",
- TotalVCPUsAllocatable: 100,
- TotalRAMAllocatableMB: 100,
- TotalDiskAllocatableGB: 100,
- VCPUsUsed: 75,
- RAMUsedMB: 80,
- DiskUsedGB: 85,
- },
- &compute.HostUtilization{
- ComputeHost: "ironic-host",
- TotalVCPUsAllocatable: 0,
- TotalRAMAllocatableMB: 0,
- TotalDiskAllocatableGB: 0,
- VCPUsUsed: 0,
- RAMUsedMB: 0,
- DiskUsedGB: 0,
- },
- // No Capacity reported for host kvm-host-2
- })
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- kpi := &VMwareResourceCapacityKPI{}
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithRuntimeObjects(&v1alpha1.Knowledge{
- ObjectMeta: v1.ObjectMeta{Name: "host-details"},
- Status: v1alpha1.KnowledgeStatus{Raw: hostDetails},
- }, &v1alpha1.Knowledge{
- ObjectMeta: v1.ObjectMeta{Name: "host-utilization"},
- Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations},
- }).
- Build()
- if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch)
- close(ch)
-
- type HostResourceMetric struct {
- ComputeHost string
- Resource string
- AvailabilityZone string
- Enabled string
- Decommissioned string
- ExternalCustomer string
- CPUArchitecture string
- WorkloadType string
- DisabledReason string
- PinnedProjects string
- PinnedProjectIds string
- Value float64
- }
-
- actualMetrics := make(map[string]HostResourceMetric, 0)
-
- for metric := range ch {
- desc := metric.Desc().String()
- metricName := getMetricName(desc)
-
- // Only consider cortex_vmware_host_capacity_available metric in this test
- if metricName != "cortex_vmware_host_capacity_available" {
- continue
- }
-
- var m prometheusgo.Metric
- if err := metric.Write(&m); err != nil {
- t.Fatalf("failed to write metric: %v", err)
- }
-
- labels := make(map[string]string)
- for _, label := range m.Label {
- labels[label.GetName()] = label.GetValue()
- }
-
- key := labels["compute_host"] + "-" + labels["resource"]
-
- actualMetrics[key] = HostResourceMetric{
- ComputeHost: labels["compute_host"],
- Resource: labels["resource"],
- AvailabilityZone: labels["availability_zone"],
- Enabled: labels["enabled"],
- Decommissioned: labels["decommissioned"],
- ExternalCustomer: labels["external_customer"],
- CPUArchitecture: labels["cpu_architecture"],
- WorkloadType: labels["workload_type"],
- DisabledReason: labels["disabled_reason"],
- PinnedProjects: labels["pinned_projects"],
- PinnedProjectIds: labels["pinned_project_ids"],
- Value: m.GetGauge().GetValue(),
- }
- }
-
- expectedMetrics := map[string]HostResourceMetric{
- "vmware-host-cpu": {
- ComputeHost: "vmware-host",
- Resource: "cpu",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- DisabledReason: "-",
- PinnedProjects: "false",
- PinnedProjectIds: "",
- Value: 60, // 100 - 40
- },
- "vmware-host-ram": {
- ComputeHost: "vmware-host",
- Resource: "ram",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- DisabledReason: "-",
- PinnedProjects: "false",
- PinnedProjectIds: "",
- Value: 160, // 200 - 40
- },
- "vmware-host-disk": {
- ComputeHost: "vmware-host",
- Resource: "disk",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- DisabledReason: "-",
- PinnedProjects: "false",
- PinnedProjectIds: "",
- Value: 260, // 300 - 40
- },
- }
-
- if len(expectedMetrics) != len(actualMetrics) {
- t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics))
- }
-
- for key, expected := range expectedMetrics {
- actual, ok := actualMetrics[key]
- if !ok {
- t.Errorf("expected metric %q not found", key)
- continue
- }
-
- if !reflect.DeepEqual(expected, actual) {
- t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
- }
- }
-}
-
-func TestVMwareResourceCapacityKPI_Collect_TotalMetric(t *testing.T) {
- scheme, err := v1alpha1.SchemeBuilder.Build()
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- hostDetails, err := v1alpha1.BoxFeatureList([]any{
- &compute.HostDetails{
- ComputeHost: "vmware-host",
- AvailabilityZone: "az1",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "vcenter",
- HypervisorFamily: "vmware",
- WorkloadType: "general-purpose",
- Enabled: true,
- Decommissioned: true,
- ExternalCustomer: true,
- DisabledReason: nil,
- PinnedProjects: testlib.Ptr("project1,project2"),
- },
- // Skip this because it's not a VMware host
- &compute.HostDetails{
- ComputeHost: "kvm-host",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "qemu",
- HypervisorFamily: "kvm",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1,project2"),
- },
- // Skip this because placement doesn't report any capacity for this host
- &compute.HostDetails{
- ComputeHost: "vmware-host-2",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "qemu",
- HypervisorFamily: "vmware",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1,project2"),
- },
- // Skip this because it's a ironic host
- &compute.HostDetails{
- ComputeHost: "ironic-host",
- AvailabilityZone: "az2",
- CPUArchitecture: "cascade-lake",
- HypervisorType: "ironic",
- HypervisorFamily: "vmware",
- WorkloadType: "hana",
- Enabled: false,
- Decommissioned: false,
- ExternalCustomer: false,
- DisabledReason: testlib.Ptr("test"),
- PinnedProjects: testlib.Ptr("project1"),
- },
- })
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- hostUtilizations, err := v1alpha1.BoxFeatureList([]any{
- &compute.HostUtilization{
- ComputeHost: "vmware-host",
- TotalVCPUsAllocatable: 100,
- TotalRAMAllocatableMB: 200,
- TotalDiskAllocatableGB: 300,
- VCPUsUsed: 40,
- RAMUsedMB: 40,
- DiskUsedGB: 40,
- },
- &compute.HostUtilization{
- ComputeHost: "kvm-host",
- TotalVCPUsAllocatable: 100,
- TotalRAMAllocatableMB: 100,
- TotalDiskAllocatableGB: 100,
- VCPUsUsed: 75,
- RAMUsedMB: 80,
- DiskUsedGB: 85,
- },
- &compute.HostUtilization{
- ComputeHost: "ironic-host",
- TotalVCPUsAllocatable: 0,
- TotalRAMAllocatableMB: 0,
- TotalDiskAllocatableGB: 0,
- VCPUsUsed: 0,
- RAMUsedMB: 0,
- DiskUsedGB: 0,
- },
- // No Capacity reported for host kvm-host-2
- })
- if err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- kpi := &VMwareResourceCapacityKPI{}
- client := fake.NewClientBuilder().
- WithScheme(scheme).
- WithRuntimeObjects(&v1alpha1.Knowledge{
- ObjectMeta: v1.ObjectMeta{Name: "host-details"},
- Status: v1alpha1.KnowledgeStatus{Raw: hostDetails},
- }, &v1alpha1.Knowledge{
- ObjectMeta: v1.ObjectMeta{Name: "host-utilization"},
- Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations},
- }).
- Build()
- if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch)
- close(ch)
-
- type HostResourceMetric struct {
- ComputeHost string
- Resource string
- AvailabilityZone string
- Enabled string
- Decommissioned string
- ExternalCustomer string
- CPUArchitecture string
- WorkloadType string
- PinnedProjects string
- PinnedProjectIds string
- Value float64
- }
-
- actualMetrics := make(map[string]HostResourceMetric, 0)
-
- for metric := range ch {
- desc := metric.Desc().String()
- metricName := getMetricName(desc)
-
- // Only consider cortex_vmware_host_capacity_total metric in this test
- if metricName != "cortex_vmware_host_capacity_total" {
- continue
- }
-
- var m prometheusgo.Metric
- if err := metric.Write(&m); err != nil {
- t.Fatalf("failed to write metric: %v", err)
- }
-
- labels := make(map[string]string)
- for _, label := range m.Label {
- labels[label.GetName()] = label.GetValue()
- }
-
- key := labels["compute_host"] + "-" + labels["resource"]
-
- actualMetrics[key] = HostResourceMetric{
- ComputeHost: labels["compute_host"],
- Resource: labels["resource"],
- AvailabilityZone: labels["availability_zone"],
- Enabled: labels["enabled"],
- Decommissioned: labels["decommissioned"],
- ExternalCustomer: labels["external_customer"],
- CPUArchitecture: labels["cpu_architecture"],
- WorkloadType: labels["workload_type"],
- PinnedProjects: labels["pinned_projects"],
- PinnedProjectIds: labels["pinned_project_ids"],
- Value: m.GetGauge().GetValue(),
- }
- }
-
- expectedMetrics := map[string]HostResourceMetric{
- "vmware-host-cpu": {
- ComputeHost: "vmware-host",
- Resource: "cpu",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- PinnedProjects: "true",
- PinnedProjectIds: "project1,project2",
- Value: 100,
- },
- "vmware-host-ram": {
- ComputeHost: "vmware-host",
- Resource: "ram",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- PinnedProjects: "true",
- PinnedProjectIds: "project1,project2",
- Value: 200,
- },
- "vmware-host-disk": {
- ComputeHost: "vmware-host",
- Resource: "disk",
- AvailabilityZone: "az1",
- Enabled: "true",
- Decommissioned: "true",
- ExternalCustomer: "true",
- CPUArchitecture: "cascade-lake",
- WorkloadType: "general-purpose",
- PinnedProjects: "true",
- PinnedProjectIds: "project1,project2",
- Value: 300,
- },
- }
-
- if len(expectedMetrics) != len(actualMetrics) {
- t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics))
- }
-
- for key, expected := range expectedMetrics {
- actual, ok := actualMetrics[key]
- if !ok {
- t.Errorf("expected metric %q not found", key)
- continue
- }
-
- if !reflect.DeepEqual(expected, actual) {
- t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
- }
- }
-}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go
index 4c011492c..62eb44e9c 100644
--- a/internal/knowledge/kpis/plugins/infrastructure/shared.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go
@@ -13,6 +13,7 @@ import (
const (
hostDetailsKnowledgeName = "host-details"
+ hostUtilizationKnowledgeName = "host-utilization"
vmwareIronicHypervisorType = "ironic"
hypervisorFamilyVMware = "vmware"
vmwareComputeHostPattern = "nova-compute-%"
@@ -40,7 +41,6 @@ func (h vmwareHost) getHostLabels() []string {
h.ComputeHost,
h.CPUArchitecture,
h.WorkloadType,
- h.HypervisorFamily,
strconv.FormatBool(h.Enabled),
strconv.FormatBool(h.Decommissioned),
strconv.FormatBool(h.ExternalCustomer),
@@ -55,7 +55,6 @@ var vmwareHostLabels = []string{
"compute_host",
"cpu_architecture",
"workload_type",
- "hypervisor_family",
"enabled",
"decommissioned",
"external_customer",
diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go
index dc720d159..351fedc50 100644
--- a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go
@@ -3,7 +3,94 @@
package infrastructure
-import "testing"
+import (
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+)
+
+func mockVMwareHostLabels(computeHost, az string) map[string]string {
+ return map[string]string{
+ "availability_zone": az,
+ "compute_host": computeHost,
+ "cpu_architecture": "",
+ "workload_type": "",
+ "enabled": "false",
+ "decommissioned": "false",
+ "external_customer": "false",
+ "disabled_reason": "-",
+ "pinned_projects": "false",
+ "pinned_project_ids": "",
+ }
+}
+
+func TestVMwareHostGetHostLabels(t *testing.T) {
+ str := func(s string) *string { return &s }
+
+ tests := []struct {
+ name string
+ host vmwareHost
+ want []string
+ }{
+ {
+ name: "all optional fields nil",
+ host: vmwareHost{compute.HostDetails{
+ AvailabilityZone: "az1",
+ ComputeHost: "nova-compute-1",
+ CPUArchitecture: "cascade-lake",
+ WorkloadType: "general-purpose",
+ Enabled: true,
+ Decommissioned: false,
+ ExternalCustomer: false,
+ DisabledReason: nil,
+ PinnedProjects: nil,
+ }},
+ want: []string{"az1", "nova-compute-1", "cascade-lake", "general-purpose", "true", "false", "false", "-", "false", ""},
+ },
+ {
+ name: "disabled reason set",
+ host: vmwareHost{compute.HostDetails{
+ AvailabilityZone: "az2",
+ ComputeHost: "nova-compute-2",
+ DisabledReason: str("scheduled-maintenance"),
+ }},
+ want: []string{"az2", "nova-compute-2", "", "", "false", "false", "false", "scheduled-maintenance", "false", ""},
+ },
+ {
+ name: "pinned projects set",
+ host: vmwareHost{compute.HostDetails{
+ AvailabilityZone: "az1",
+ ComputeHost: "nova-compute-3",
+ PinnedProjects: str("proj-a,proj-b"),
+ }},
+ want: []string{"az1", "nova-compute-3", "", "", "false", "false", "false", "-", "true", "proj-a,proj-b"},
+ },
+ {
+ name: "decommissioned and external customer",
+ host: vmwareHost{compute.HostDetails{
+ AvailabilityZone: "az3",
+ ComputeHost: "nova-compute-4",
+ Decommissioned: true,
+ ExternalCustomer: true,
+ }},
+ want: []string{"az3", "nova-compute-4", "", "", "false", "true", "true", "-", "false", ""},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := tt.host.getHostLabels()
+ if len(got) != len(vmwareHostLabels) {
+ t.Fatalf("getHostLabels() returned %d values, want %d (matching vmwareHostLabels)", len(got), len(vmwareHostLabels))
+ }
+ for i, want := range tt.want {
+ if got[i] != want {
+ t.Errorf("label[%d] (%s) = %q, want %q", i, vmwareHostLabels[i], got[i], want)
+ }
+ }
+ })
+ }
+}
func TestIsKVMFlavor(t *testing.T) {
tests := []struct {
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go
new file mode 100644
index 000000000..c7976db3a
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go
@@ -0,0 +1,119 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "context"
+ "log/slog"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type VMwareHostCapacityKPI struct {
+ plugins.BaseKPI[struct{}]
+
+ capacityUsagePerHost *prometheus.Desc
+ capacityTotalPerHost *prometheus.Desc
+}
+
+func (k *VMwareHostCapacityKPI) GetName() string {
+ return "vmware_host_capacity_kpi"
+}
+
+func (k *VMwareHostCapacityKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
+ if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
+ return err
+ }
+ k.capacityUsagePerHost = prometheus.NewDesc(
+ "cortex_vmware_host_capacity_usage",
+ "Capacity usage per VMware host. CPU in vCPUs, memory and disk in bytes.",
+ append(vmwareHostLabels, "resource"), nil,
+ )
+ k.capacityTotalPerHost = prometheus.NewDesc(
+ "cortex_vmware_host_capacity_total",
+ "Total allocatable capacity per VMware host. CPU in vCPUs, memory and disk in bytes.",
+ append(vmwareHostLabels, "resource"), nil,
+ )
+ return nil
+}
+
+func (k *VMwareHostCapacityKPI) Describe(ch chan<- *prometheus.Desc) {
+ ch <- k.capacityUsagePerHost
+ ch <- k.capacityTotalPerHost
+}
+
+func (k *VMwareHostCapacityKPI) Collect(ch chan<- prometheus.Metric) {
+ hosts, err := k.getVMwareHosts()
+ if err != nil {
+ slog.Error("vmware_host_capacity: failed to get vmware hosts", "error", err)
+ return
+ }
+ utilizations, err := k.getHostUtilizations()
+ if err != nil {
+ slog.Error("vmware_host_capacity: failed to get host utilizations", "error", err)
+ return
+ }
+ for _, host := range hosts {
+ util, ok := utilizations[host.ComputeHost]
+ if !ok {
+ slog.Warn("vmware_host_capacity: missing utilization for host", "compute_host", host.ComputeHost)
+ continue
+ }
+
+ labels := host.getHostLabels()
+
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.VCPUsUsed, append(labels, "cpu")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.RAMUsedMB*1024*1024, append(labels, "ram")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.DiskUsedGB*1024*1024*1024, append(labels, "disk")...)
+
+ ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalVCPUsAllocatable, append(labels, "cpu")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalRAMAllocatableMB*1024*1024, append(labels, "ram")...)
+ ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalDiskAllocatableGB*1024*1024*1024, append(labels, "disk")...)
+ }
+}
+
+func (k *VMwareHostCapacityKPI) getVMwareHosts() ([]vmwareHost, error) {
+ knowledge := &v1alpha1.Knowledge{}
+ if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostDetailsKnowledgeName}, knowledge); err != nil {
+ return nil, err
+ }
+ details, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw)
+ if err != nil {
+ return nil, err
+ }
+ hosts := make([]vmwareHost, 0, len(details))
+ for _, d := range details {
+ if d.HypervisorType == vmwareIronicHypervisorType || d.HypervisorFamily != hypervisorFamilyVMware {
+ continue
+ }
+ hosts = append(hosts, vmwareHost{HostDetails: d})
+ }
+ return hosts, nil
+}
+
+func (k *VMwareHostCapacityKPI) getHostUtilizations() (map[string]compute.HostUtilization, error) {
+ knowledge := &v1alpha1.Knowledge{}
+ if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostUtilizationKnowledgeName}, knowledge); err != nil {
+ return nil, err
+ }
+ utils, err := v1alpha1.UnboxFeatureList[compute.HostUtilization](knowledge.Status.Raw)
+ if err != nil {
+ return nil, err
+ }
+ m := make(map[string]compute.HostUtilization, len(utils))
+ for _, u := range utils {
+ if u.TotalVCPUsAllocatable == 0 || u.TotalRAMAllocatableMB == 0 || u.TotalDiskAllocatableGB == 0 {
+ slog.Warn("vmware_host_capacity: skipping host with zero allocatable resources", "compute_host", u.ComputeHost)
+ continue
+ }
+ m[u.ComputeHost] = u
+ }
+ return m, nil
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go
new file mode 100644
index 000000000..f0a025db4
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go
@@ -0,0 +1,335 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+ "reflect"
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ prometheusgo "github.com/prometheus/client_model/go"
+ v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+func buildHostCapacityClient(t *testing.T, hostDetails []compute.HostDetails, utilizations []compute.HostUtilization) *fake.ClientBuilder {
+ t.Helper()
+ scheme, err := v1alpha1.SchemeBuilder.Build()
+ if err != nil {
+ t.Fatalf("failed to build scheme: %v", err)
+ }
+ rawDetails, err := v1alpha1.BoxFeatureList(hostDetails)
+ if err != nil {
+ t.Fatalf("failed to box host details: %v", err)
+ }
+ rawUtils, err := v1alpha1.BoxFeatureList(utilizations)
+ if err != nil {
+ t.Fatalf("failed to box host utilizations: %v", err)
+ }
+ return fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
+ &v1alpha1.Knowledge{
+ ObjectMeta: v1.ObjectMeta{Name: hostDetailsKnowledgeName},
+ Status: v1alpha1.KnowledgeStatus{Raw: rawDetails},
+ },
+ &v1alpha1.Knowledge{
+ ObjectMeta: v1.ObjectMeta{Name: hostUtilizationKnowledgeName},
+ Status: v1alpha1.KnowledgeStatus{Raw: rawUtils},
+ },
+ )
+}
+
+func TestVMwareHostCapacityKPI_Init(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ kpi := &VMwareHostCapacityKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+}
+
+func TestVMwareHostCapacityKPI_getVMwareHosts(t *testing.T) {
+ hostDetails := []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware},
+ {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware},
+ {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware},
+ {ComputeHost: "nova-compute-3", HypervisorFamily: "other"},
+ }
+
+ client := buildHostCapacityClient(t, hostDetails, nil)
+ kpi := &VMwareHostCapacityKPI{}
+ kpi.Client = client.Build()
+
+ hosts, err := kpi.getVMwareHosts()
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ if len(hosts) != 2 {
+ t.Fatalf("expected 2 hosts, got %d", len(hosts))
+ }
+ seen := make(map[string]bool)
+ for _, h := range hosts {
+ seen[h.ComputeHost] = true
+ }
+ for _, name := range []string{"nova-compute-1", "nova-compute-2"} {
+ if !seen[name] {
+ t.Errorf("expected host %q in result", name)
+ }
+ }
+}
+
+func TestVMwareHostCapacityKPI_getHostUtilizations(t *testing.T) {
+ tests := []struct {
+ name string
+ utilizations []compute.HostUtilization
+ expectedHosts []string
+ }{
+ {
+ name: "normal utilizations are returned",
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100},
+ {ComputeHost: "h2", TotalVCPUsAllocatable: 20, TotalRAMAllocatableMB: 2048, TotalDiskAllocatableGB: 200},
+ },
+ expectedHosts: []string{"h1", "h2"},
+ },
+ {
+ name: "zero TotalVCPUsAllocatable is skipped",
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "h1", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100},
+ },
+ expectedHosts: []string{},
+ },
+ {
+ name: "zero TotalRAMAllocatableMB is skipped",
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 0, TotalDiskAllocatableGB: 100},
+ },
+ expectedHosts: []string{},
+ },
+ {
+ name: "zero TotalDiskAllocatableGB is skipped",
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 0},
+ },
+ expectedHosts: []string{},
+ },
+ {
+ name: "mix of valid and zero-allocatable entries",
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100},
+ {ComputeHost: "h2", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100},
+ },
+ expectedHosts: []string{"h1"},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ client := buildHostCapacityClient(t, nil, tt.utilizations)
+ kpi := &VMwareHostCapacityKPI{}
+ kpi.Client = client.Build()
+
+ m, err := kpi.getHostUtilizations()
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ if len(m) != len(tt.expectedHosts) {
+ t.Fatalf("expected %d entries, got %d: %v", len(tt.expectedHosts), len(m), m)
+ }
+ for _, host := range tt.expectedHosts {
+ if _, ok := m[host]; !ok {
+ t.Errorf("expected host %q in result", host)
+ }
+ }
+ })
+ }
+}
+
+func TestVMwareHostCapacityKPI_Collect(t *testing.T) {
+ tests := []struct {
+ name string
+ hostDetails []compute.HostDetails
+ utilizations []compute.HostUtilization
+ expectedMetrics []collectedVMwareMetric
+ }{
+ {
+ name: "single host emits usage and total metrics",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ utilizations: []compute.HostUtilization{
+ {
+ ComputeHost: "nova-compute-1",
+ VCPUsUsed: 4,
+ TotalVCPUsAllocatable: 16,
+ RAMUsedMB: 2048,
+ TotalRAMAllocatableMB: 8192,
+ DiskUsedGB: 50,
+ TotalDiskAllocatableGB: 500,
+ },
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 4},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 50 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 16},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 8192 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 500 * 1024 * 1024 * 1024},
+ },
+ },
+ {
+ name: "multiple hosts each emit their own metrics",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"},
+ },
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100},
+ {ComputeHost: "nova-compute-2", VCPUsUsed: 6, TotalVCPUsAllocatable: 12, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 6},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 20 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 12},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 4096 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 200 * 1024 * 1024 * 1024},
+ },
+ },
+ {
+ name: "ironic hosts are excluded",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100},
+ {ComputeHost: "nova-compute-ironic-1", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024},
+ },
+ },
+ {
+ name: "non-vmware hosts are excluded",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ {ComputeHost: "nova-compute-2", HypervisorFamily: "kvm", AvailabilityZone: "az1"},
+ },
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100},
+ {ComputeHost: "nova-compute-2", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200},
+ },
+ expectedMetrics: []collectedVMwareMetric{
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024},
+ {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024},
+ },
+ },
+ {
+ name: "host without matching utilization produces no metrics",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ utilizations: []compute.HostUtilization{},
+ expectedMetrics: []collectedVMwareMetric{},
+ },
+ {
+ name: "utilization with zero allocatable resources is skipped",
+ hostDetails: []compute.HostDetails{
+ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"},
+ },
+ utilizations: []compute.HostUtilization{
+ {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 0, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100},
+ },
+ expectedMetrics: []collectedVMwareMetric{},
+ },
+ {
+ name: "no hosts produces no metrics",
+ hostDetails: []compute.HostDetails{},
+ utilizations: []compute.HostUtilization{},
+ expectedMetrics: []collectedVMwareMetric{},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ client := buildHostCapacityClient(t, tt.hostDetails, tt.utilizations)
+ kpi := &VMwareHostCapacityKPI{}
+ if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error on Init, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 200)
+ kpi.Collect(ch)
+ close(ch)
+
+ actual := make(map[string]collectedVMwareMetric)
+ for m := range ch {
+ var pm prometheusgo.Metric
+ if err := m.Write(&pm); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+ labels := make(map[string]string)
+ for _, lbl := range pm.Label {
+ labels[lbl.GetName()] = lbl.GetValue()
+ }
+ name := getMetricName(m.Desc().String())
+ key := name + "|" + labels["compute_host"] + "|" + labels["resource"]
+ if _, exists := actual[key]; exists {
+ t.Fatalf("duplicate metric key %q", key)
+ }
+ actual[key] = collectedVMwareMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()}
+ }
+
+ if len(actual) != len(tt.expectedMetrics) {
+ t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual)
+ }
+ for _, exp := range tt.expectedMetrics {
+ key := exp.Name + "|" + exp.Labels["compute_host"] + "|" + exp.Labels["resource"]
+ got, ok := actual[key]
+ if !ok {
+ t.Errorf("missing metric %q", key)
+ continue
+ }
+ if got.Value != exp.Value {
+ t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value)
+ }
+ if !reflect.DeepEqual(exp.Labels, got.Labels) {
+ t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels)
+ }
+ }
+ })
+ }
+}
+
+func hostCapacityLabels(computeHost, az, resource string) map[string]string {
+ labels := mockVMwareHostLabels(computeHost, az)
+ labels["resource"] = resource
+ return labels
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
index 9f6d84786..4c43c893b 100644
--- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go
@@ -33,24 +33,8 @@ func buildMetricKey(name string, labels map[string]string) string {
}
}
-func hostLabels(computeHost, az string) map[string]string {
- return map[string]string{
- "availability_zone": az,
- "compute_host": computeHost,
- "cpu_architecture": "",
- "workload_type": "",
- "hypervisor_family": "vmware",
- "enabled": "false",
- "decommissioned": "false",
- "external_customer": "false",
- "disabled_reason": "-",
- "pinned_projects": "false",
- "pinned_project_ids": "",
- }
-}
-
func instanceMetric(computeHost, az, projectID, projectName, flavorName string, value float64) collectedVMwareMetric {
- labels := hostLabels(computeHost, az)
+ labels := mockVMwareHostLabels(computeHost, az)
labels["project_id"] = projectID
labels["project_name"] = projectName
labels["flavor_name"] = flavorName
@@ -58,7 +42,7 @@ func instanceMetric(computeHost, az, projectID, projectName, flavorName string,
}
func capacityMetric(computeHost, az, projectID, projectName, resource string, value float64) collectedVMwareMetric {
- labels := hostLabels(computeHost, az)
+ labels := mockVMwareHostLabels(computeHost, az)
labels["project_id"] = projectID
labels["project_name"] = projectName
labels["resource"] = resource
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index 19726a488..63a35866b 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -16,7 +16,6 @@ var supportedKPIs = map[string]plugins.KPI{
"kvm_host_capacity_kpi": &compute.KVMResourceCapacityKPI{},
"vmware_host_contention_kpi": &compute.VMwareHostContentionKPI{},
"vmware_project_noisiness_kpi": &compute.VMwareProjectNoisinessKPI{},
- "vmware_host_capacity_kpi": &compute.VMwareResourceCapacityKPI{},
"host_running_vms_kpi": &compute.HostRunningVMsKPI{},
"flavor_running_vms_kpi": &compute.FlavorRunningVMsKPI{},
"vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{},
@@ -26,6 +25,7 @@ var supportedKPIs = map[string]plugins.KPI{
"vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{},
"vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{},
+ "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{},
"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
From a9cf68856373c9b12deaf8478b3202a8403a6365 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 29 Apr 2026 12:51:22 +0000
Subject: [PATCH 22/54] Bump cortex chart appVersions to sha-c9d8943a [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 8e981e365..a00036565 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-dc6bbe7c"
+appVersion: "sha-c9d8943a"
icon: "https://example.com/icon.png"
dependencies: []
From 5a1a8838a0dc8e09eb3a9585d35f79c7b8e6ef5d Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Thu, 30 Apr 2026 08:33:36 +0200
Subject: [PATCH 23/54] Support traits and aggregates endpoints in placement
shim (#766)
Implements GET/PUT/DELETE /resource_providers/{uuid}/traits and GET/PUT
/resource_providers/{uuid}/aggregates with three feature modes
(passthrough, hybrid, crd-only). Hybrid mode serves from the Hypervisor
CRD for KVM providers and forwards to upstream placement for non-KVM.
CRD-only mode returns 404 for non-KVM providers. Uses
metadata.generation for optimistic concurrency (409 on mismatch).
Includes unit tests and e2e tests for all modes.
---
Tiltfile | 2 +-
go.mod | 2 +-
go.sum | 4 +-
.../handle_resource_provider_aggregates.go | 261 ++++++++++++-
...handle_resource_provider_aggregates_e2e.go | 345 ++++++++++++------
...andle_resource_provider_aggregates_test.go | 218 +++++++++--
.../handle_resource_provider_traits.go | 279 +++++++++++++-
.../handle_resource_provider_traits_e2e.go | 330 +++++++++++------
.../handle_resource_provider_traits_test.go | 201 +++++++++-
.../handle_resource_providers_e2e.go | 3 -
internal/shim/placement/shim_e2e.go | 24 ++
11 files changed, 1368 insertions(+), 301 deletions(-)
diff --git a/Tiltfile b/Tiltfile
index 87d8d026d..ef1ee3b02 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -83,7 +83,7 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen
########### Dependency CRDs
# Make sure the local cluster is running if you are running into startup issues here.
-url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml'
+url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/d35f2bc2c5d4fd634b17e7a8dd77ff3025758fbb/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml'
local('curl -L ' + url + ' | kubectl apply -f -')
########### Cortex Manager & CRDs
diff --git a/go.mod b/go.mod
index a23aa4ff8..0c3b9c736 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/cobaltcore-dev/cortex
go 1.26.0
require (
- github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260423190401-f34871697a61
+ github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4
github.com/go-gorp/gorp v2.2.0+incompatible
github.com/gophercloud/gophercloud/v2 v2.12.0
github.com/ironcore-dev/ironcore v0.3.0
diff --git a/go.sum b/go.sum
index f26c7a92b..66a44dd4d 100644
--- a/go.sum
+++ b/go.sum
@@ -20,8 +20,8 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260423190401-f34871697a61 h1:I0qmFydo/Bibw0JLRypLmLnlZOx5fl4NNPaOiLKUfmU=
-github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260423190401-f34871697a61/go.mod h1:fTJ5LAHj8NJ0AuQtsEX16Z1LXtCKqJfg+UhGfEnwImA=
+github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4 h1:Umm6n7LMDnqqZ6QIMIFxzJmuBX/Bke4uvstm+KFKcaQ=
+github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4/go.mod h1:fTJ5LAHj8NJ0AuQtsEX16Z1LXtCKqJfg+UhGfEnwImA=
github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4=
github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE=
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
diff --git a/internal/shim/placement/handle_resource_provider_aggregates.go b/internal/shim/placement/handle_resource_provider_aggregates.go
index 3d7205193..8d509f732 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates.go
@@ -4,42 +4,271 @@
package placement
import (
+ "encoding/json"
"net/http"
+
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ logf "sigs.k8s.io/controller-runtime/pkg/log"
)
+// resourceProviderAggregatesResponse is the JSON body returned by
+// GET /resource_providers/{uuid}/aggregates and
+// PUT /resource_providers/{uuid}/aggregates (microversion 1.19+).
+//
+// https://docs.openstack.org/api-ref/placement/#resource-provider-aggregates
+type resourceProviderAggregatesResponse struct {
+ Aggregates []string `json:"aggregates"`
+ ResourceProviderGeneration int64 `json:"resource_provider_generation"`
+}
+
+// resourceProviderAggregatesRequest is the JSON body expected by
+// PUT /resource_providers/{uuid}/aggregates (microversion 1.19+).
+type resourceProviderAggregatesRequest struct {
+ Aggregates []string `json:"aggregates"`
+ ResourceProviderGeneration int64 `json:"resource_provider_generation"`
+}
+
// HandleListResourceProviderAggregates handles
// GET /resource_providers/{uuid}/aggregates requests.
//
// Returns the list of aggregate UUIDs associated with the resource provider.
// Aggregates model relationships among providers such as shared storage,
// affinity/anti-affinity groups, and availability zones. Returns an empty
-// list if the provider has no aggregate associations. Available since
-// microversion 1.1.
+// list if the provider has no aggregate associations.
+//
+// Routing: the uuid is used to determine if the resource provider is a KVM
+// hypervisor or vmware/ironic hypervisor. Passthrough mode forwards all
+// requests to upstream placement. Hybrid mode uses the hypervisor CRD for
+// KVM hypervisors and forwards for anything else. CRD-only mode rejects
+// any non-KVM calls with 404.
//
-// The response format changed at microversion 1.19: earlier versions return
-// only a flat array of UUIDs, while 1.19+ returns an object that also
-// includes the resource_provider_generation for concurrency tracking. Returns
-// 404 if the provider does not exist.
+// https://docs.openstack.org/api-ref/placement/#list-resource-provider-aggregates
func (s *Shim) HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
+ uuid, ok := requiredUUIDPathParam(w, r, "uuid")
+ if !ok {
return
}
- s.dispatchPassthroughOnly(w, r, s.config.Features.Aggregates)
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates) {
+ case FeatureModePassthrough:
+ s.forward(w, r)
+ case FeatureModeHybrid:
+ s.listResourceProviderAggregatesHybrid(w, r, uuid)
+ case FeatureModeCRD:
+ s.listResourceProviderAggregatesCRD(w, r, uuid)
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ }
+}
+
+// listResourceProviderAggregatesHybrid serves from the CRD if the provider is
+// a KVM hypervisor, otherwise forwards to upstream placement.
+func (s *Shim) listResourceProviderAggregatesHybrid(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if err != nil || len(hvs.Items) != 1 {
+ log.Info("resource provider not resolved from kubernetes, forwarding to upstream placement", "uuid", uuid)
+ s.forward(w, r)
+ return
+ }
+ log.Info("resolved resource provider from CRD, serving aggregates", "uuid", uuid, "hypervisor", hvs.Items[0].Name)
+ s.writeAggregatesFromCRD(w, &hvs.Items[0])
+}
+
+// listResourceProviderAggregatesCRD serves exclusively from the CRD, returning
+// 404 if the provider is not a known KVM hypervisor.
+func (s *Shim) listResourceProviderAggregatesCRD(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
+ log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
+ http.Error(w, "resource provider not found", http.StatusNotFound)
+ return
+ }
+ if err != nil {
+ log.Error(err, "failed to list hypervisors with OpenStack ID index")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ if len(hvs.Items) > 1 {
+ log.Error(nil, "multiple hypervisors found with the same OpenStack ID", "uuid", uuid)
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ log.Info("serving aggregates from CRD", "uuid", uuid, "hypervisor", hvs.Items[0].Name)
+ s.writeAggregatesFromCRD(w, &hvs.Items[0])
+}
+
+func (s *Shim) writeAggregatesFromCRD(w http.ResponseWriter, hv *hv1.Hypervisor) {
+ aggGroups := hv1.GetAggregates(hv.Spec.Groups)
+ aggregates := make([]string, 0, len(aggGroups))
+ for _, ag := range aggGroups {
+ aggregates = append(aggregates, ag.UUID)
+ }
+ s.writeJSON(w, http.StatusOK, resourceProviderAggregatesResponse{
+ Aggregates: aggregates,
+ ResourceProviderGeneration: hv.Generation,
+ })
}
// HandleUpdateResourceProviderAggregates handles
// PUT /resource_providers/{uuid}/aggregates requests.
//
// Replaces the complete set of aggregate associations for a resource provider.
-// Any aggregate UUIDs that do not yet exist are created automatically. The
-// request format changed at microversion 1.19: earlier versions accept a
-// plain array of UUIDs, while 1.19+ expects an object containing an
-// aggregates array and a resource_provider_generation for optimistic
-// concurrency control. Returns 409 Conflict if the generation does not match
-// (1.19+). Returns 200 with the updated aggregate list on success.
+// The request body must include an aggregates array and a
+// resource_provider_generation for optimistic concurrency control. Returns
+// 409 Conflict if the generation does not match. Returns 200 with the
+// updated aggregate list on success.
+//
+// Routing: same selective per-provider dispatch as GET.
+//
+// https://docs.openstack.org/api-ref/placement/#update-resource-provider-aggregates
func (s *Shim) HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
+ uuid, ok := requiredUUIDPathParam(w, r, "uuid")
+ if !ok {
+ return
+ }
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates) {
+ case FeatureModePassthrough:
+ s.forward(w, r)
+ case FeatureModeHybrid:
+ s.updateResourceProviderAggregatesHybrid(w, r, uuid)
+ case FeatureModeCRD:
+ s.updateResourceProviderAggregatesCRD(w, r, uuid)
+ default:
+ http.Error(w, "unknown feature mode", http.StatusInternalServerError)
+ }
+}
+
+// updateResourceProviderAggregatesHybrid updates aggregates via the CRD if the
+// provider is a KVM hypervisor, otherwise forwards to upstream placement.
+func (s *Shim) updateResourceProviderAggregatesHybrid(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if err != nil || len(hvs.Items) != 1 {
+ log.Info("resource provider not resolved from kubernetes, forwarding to upstream placement", "uuid", uuid)
+ s.forward(w, r)
return
}
- s.dispatchPassthroughOnly(w, r, s.config.Features.Aggregates)
+ hv := &hvs.Items[0]
+ log.Info("resolved resource provider from CRD, updating aggregates", "uuid", uuid, "hypervisor", hv.Name)
+
+ var req resourceProviderAggregatesRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "malformed request body", http.StatusBadRequest)
+ return
+ }
+ if req.ResourceProviderGeneration != hv.Generation {
+ log.Info("generation mismatch on aggregate update",
+ "expected", req.ResourceProviderGeneration, "actual", hv.Generation)
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Aggregate == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ for _, aggUUID := range req.Aggregates {
+ newGroups = append(newGroups, hv1.Group{
+ Aggregate: &hv1.AggregateGroup{Name: aggUUID, UUID: aggUUID},
+ })
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to update hypervisor aggregates")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully updated aggregates via CRD", "uuid", uuid, "aggregateCount", len(req.Aggregates))
+ s.writeJSON(w, http.StatusOK, resourceProviderAggregatesResponse{
+ Aggregates: req.Aggregates,
+ ResourceProviderGeneration: hv.Generation,
+ })
+}
+
+// updateResourceProviderAggregatesCRD updates aggregates exclusively via the
+// CRD, returning 404 if the provider is not a known KVM hypervisor.
+func (s *Shim) updateResourceProviderAggregatesCRD(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
+ log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
+ http.Error(w, "resource provider not found", http.StatusNotFound)
+ return
+ }
+ if err != nil {
+ log.Error(err, "failed to list hypervisors with OpenStack ID index")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ if len(hvs.Items) > 1 {
+ log.Error(nil, "multiple hypervisors found with the same OpenStack ID", "uuid", uuid)
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ hv := &hvs.Items[0]
+ log.Info("updating aggregates via CRD", "uuid", uuid, "hypervisor", hv.Name)
+
+ var req resourceProviderAggregatesRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "malformed request body", http.StatusBadRequest)
+ return
+ }
+ if req.ResourceProviderGeneration != hv.Generation {
+ log.Info("generation mismatch on aggregate update",
+ "expected", req.ResourceProviderGeneration, "actual", hv.Generation)
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Aggregate == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ for _, aggUUID := range req.Aggregates {
+ newGroups = append(newGroups, hv1.Group{
+ Aggregate: &hv1.AggregateGroup{Name: aggUUID, UUID: aggUUID},
+ })
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to update hypervisor aggregates")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully updated aggregates via CRD", "uuid", uuid, "aggregateCount", len(req.Aggregates))
+ s.writeJSON(w, http.StatusOK, resourceProviderAggregatesResponse{
+ Aggregates: req.Aggregates,
+ ResourceProviderGeneration: hv.Generation,
+ })
}
diff --git a/internal/shim/placement/handle_resource_provider_aggregates_e2e.go b/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
index 7eb6ba089..3f7f55424 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates_e2e.go
@@ -10,8 +10,12 @@ import (
"fmt"
"net/http"
"slices"
+ "time"
"github.com/cobaltcore-dev/cortex/pkg/conf"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/gophercloud/gophercloud/v2"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
)
@@ -19,15 +23,11 @@ import (
// e2eTestResourceProviderAggregates tests the
// /resource_providers/{uuid}/aggregates endpoints.
//
-// 1. Pre-cleanup: DELETE any leftover test RP (ignore 404).
-// 2. POST /resource_providers — create a test RP.
-// 3. GET /{uuid}/aggregates — verify aggregates are empty, store generation.
-// 4. PUT /{uuid}/aggregates — associate two aggregate UUIDs with the RP.
-// 5. GET /{uuid}/aggregates — verify both aggregate UUIDs are present.
-// 6. PUT /{uuid}/aggregates — clear aggregates by sending an empty list.
-// 7. GET /{uuid}/aggregates — verify aggregates are empty after clear.
-// 8. Cleanup: DELETE the test RP (also runs via deferred cleanup on failure).
-func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) error {
+// In passthrough mode: exercises the upstream placement path with a
+// dynamically created resource provider.
+// In hybrid/crd mode: exercises the spec.groups-backed CRD path using a
+// real KVM hypervisor discovered from the cluster.
+func e2eTestResourceProviderAggregates(ctx context.Context, cl client.Client) error {
log := logf.FromContext(ctx)
log.Info("Running resource provider aggregates endpoint e2e test")
config, err := conf.GetConfig[e2eRootConfig]()
@@ -43,21 +43,26 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
}
log.Info("Successfully created openstack client for resource provider aggregates e2e test")
+ mode := e2eCurrentMode(ctx)
+ switch mode {
+ case FeatureModePassthrough:
+ return e2ePassthroughResourceProviderAggregates(ctx, sc)
+ case FeatureModeHybrid, FeatureModeCRD:
+ return e2eCRDResourceProviderAggregates(ctx, sc, cl)
+ default:
+ return fmt.Errorf("unexpected mode %q", mode)
+ }
+}
+
+func e2ePassthroughResourceProviderAggregates(ctx context.Context, sc *gophercloud.ServiceClient) error {
+ log := logf.FromContext(ctx)
+
const testRPUUID = "e2e10000-0000-0000-0000-000000000004"
const testRPName = "cortex-e2e-test-rp-agg"
const testAggUUID1 = "e2e30000-0000-0000-0000-000000000001"
const testAggUUID2 = "e2e30000-0000-0000-0000-000000000002"
- // Probe: for non-passthrough modes, verify endpoint returns 501.
- unimplemented, err := e2eProbeUnimplemented(ctx, sc, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates")
- if err != nil {
- return fmt.Errorf("probe: %w", err)
- }
- if unimplemented {
- return nil
- }
-
- // Pre-cleanup: delete any leftover test resource provider from a prior run.
+ // Pre-cleanup: delete leftover test RP.
log.Info("Pre-cleanup: deleting leftover test resource provider", "uuid", testRPUUID)
req, err := http.NewRequestWithContext(ctx,
http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID, http.NoBody)
@@ -116,8 +121,7 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
log.Info("Successfully created test resource provider for aggregates test",
"uuid", testRPUUID)
- // Deferred cleanup: always delete the test RP on exit so a failed
- // assertion doesn't leave the fixed UUID behind.
+ // Deferred cleanup.
defer func() {
log.Info("Deferred cleanup: deleting test resource provider", "uuid", testRPUUID)
dReq, dErr := http.NewRequestWithContext(ctx,
@@ -137,13 +141,11 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
log.Info("Deferred cleanup completed", "status", dResp.StatusCode)
}()
- // Test GET /resource_providers/{uuid}/aggregates (empty).
- log.Info("Testing GET /resource_providers/{uuid}/aggregates (empty)",
- "uuid", testRPUUID)
+ // Test GET (empty).
+ log.Info("Testing GET /resource_providers/{uuid}/aggregates (empty)", "uuid", testRPUUID)
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates", http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP aggregates", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -151,44 +153,36 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP aggregates", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP aggregates returned an error", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("GET RP aggregates: unexpected status %d", resp.StatusCode)
}
var aggResp struct {
Aggregates []string `json:"aggregates"`
ResourceProviderGeneration int `json:"resource_provider_generation"`
}
- err = json.NewDecoder(resp.Body).Decode(&aggResp)
- if err != nil {
- log.Error(err, "failed to decode RP aggregates response", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&aggResp); err != nil {
return err
}
- log.Info("Successfully retrieved empty aggregates for test resource provider",
- "uuid", testRPUUID, "aggregates", len(aggResp.Aggregates),
- "generation", aggResp.ResourceProviderGeneration)
+ if len(aggResp.Aggregates) != 0 {
+ return fmt.Errorf("expected 0 initial aggregates, got %d", len(aggResp.Aggregates))
+ }
+ log.Info("Verified empty aggregates", "generation", aggResp.ResourceProviderGeneration)
- // Test PUT /resource_providers/{uuid}/aggregates (set two aggregates).
- log.Info("Testing PUT /resource_providers/{uuid}/aggregates to set aggregates",
- "uuid", testRPUUID, "agg1", testAggUUID1, "agg2", testAggUUID2)
+ // Test PUT (associate aggregates).
putBody, err := json.Marshal(map[string]any{
"resource_provider_generation": aggResp.ResourceProviderGeneration,
"aggregates": []string{testAggUUID1, testAggUUID2},
})
if err != nil {
- log.Error(err, "failed to marshal request body")
return err
}
req, err = http.NewRequestWithContext(ctx,
http.MethodPut, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates",
bytes.NewReader(putBody))
if err != nil {
- log.Error(err, "failed to create PUT request for RP aggregates", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -197,35 +191,18 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send PUT request for RP aggregates", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "PUT RP aggregates returned an error", "uuid", testRPUUID)
- return err
- }
- var putAggResp struct {
- Aggregates []string `json:"aggregates"`
- ResourceProviderGeneration int `json:"resource_provider_generation"`
- }
- err = json.NewDecoder(resp.Body).Decode(&putAggResp)
- if err != nil {
- log.Error(err, "failed to decode PUT RP aggregates response", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("PUT RP aggregates: unexpected status %d", resp.StatusCode)
}
- log.Info("Successfully set aggregates on test resource provider",
- "uuid", testRPUUID, "aggregates", len(putAggResp.Aggregates),
- "generation", putAggResp.ResourceProviderGeneration)
+ log.Info("Successfully associated aggregates")
- // Test GET /resource_providers/{uuid}/aggregates (after PUT).
- log.Info("Testing GET /resource_providers/{uuid}/aggregates (after PUT)",
- "uuid", testRPUUID)
+ // Test GET (after PUT).
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates", http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP aggregates", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -233,47 +210,29 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP aggregates", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP aggregates returned an error", "uuid", testRPUUID)
- return err
- }
- err = json.NewDecoder(resp.Body).Decode(&aggResp)
- if err != nil {
- log.Error(err, "failed to decode RP aggregates response", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&aggResp); err != nil {
return err
}
- if len(aggResp.Aggregates) != 2 ||
- !slices.Contains(aggResp.Aggregates, testAggUUID1) ||
- !slices.Contains(aggResp.Aggregates, testAggUUID2) {
- err := fmt.Errorf("expected aggregates %v, got %v",
- []string{testAggUUID1, testAggUUID2}, aggResp.Aggregates)
- log.Error(err, "aggregate mismatch", "uuid", testRPUUID)
- return err
+ if !slices.Contains(aggResp.Aggregates, testAggUUID1) || !slices.Contains(aggResp.Aggregates, testAggUUID2) {
+ return fmt.Errorf("expected aggregates %v and %v, got %v", testAggUUID1, testAggUUID2, aggResp.Aggregates)
}
- log.Info("Successfully verified aggregates on test resource provider",
- "uuid", testRPUUID, "aggregates", aggResp.Aggregates)
+ log.Info("Verified aggregates present after PUT")
// Clear aggregates by PUT with empty list.
- log.Info("Testing PUT /resource_providers/{uuid}/aggregates to clear aggregates",
- "uuid", testRPUUID)
putBody, err = json.Marshal(map[string]any{
"resource_provider_generation": aggResp.ResourceProviderGeneration,
"aggregates": []string{},
})
if err != nil {
- log.Error(err, "failed to marshal request body")
return err
}
req, err = http.NewRequestWithContext(ctx,
http.MethodPut, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates",
bytes.NewReader(putBody))
if err != nil {
- log.Error(err, "failed to create PUT request to clear RP aggregates", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -282,24 +241,18 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send PUT request to clear RP aggregates", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "PUT to clear RP aggregates returned an error", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("PUT RP aggregates (clear): unexpected status %d", resp.StatusCode)
}
- log.Info("Successfully cleared aggregates on test resource provider",
- "uuid", testRPUUID)
+ log.Info("Successfully cleared aggregates")
- // Verify aggregates are empty after clear.
- log.Info("Verifying aggregates are empty after clear", "uuid", testRPUUID)
+ // Verify empty after clear.
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/aggregates", http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP aggregates", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -307,49 +260,221 @@ func e2eTestResourceProviderAggregates(ctx context.Context, _ client.Client) err
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP aggregates", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP aggregates returned an error", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&aggResp); err != nil {
return err
}
- err = json.NewDecoder(resp.Body).Decode(&aggResp)
+ if len(aggResp.Aggregates) != 0 {
+ return fmt.Errorf("expected 0 aggregates after clear, got %d", len(aggResp.Aggregates))
+ }
+ log.Info("Verified aggregates empty after clear")
+
+ // Cleanup.
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID, http.NoBody)
if err != nil {
- log.Error(err, "failed to decode RP aggregates response", "uuid", testRPUUID)
return err
}
- if len(aggResp.Aggregates) != 0 {
- err := fmt.Errorf("expected 0 aggregates after clear, got %d", len(aggResp.Aggregates))
- log.Error(err, "aggregates not empty after clear", "uuid", testRPUUID)
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.19")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
+ return err
+ }
+ resp.Body.Close()
+
+ return nil
+}
+
+// e2eCRDResourceProviderAggregates tests the CRD/hybrid path by discovering a
+// real KVM hypervisor in the cluster, seeding spec.groups, and exercising
+// GET/PUT through the shim.
+func e2eCRDResourceProviderAggregates(ctx context.Context, sc *gophercloud.ServiceClient, cl client.Client) error {
+ log := logf.FromContext(ctx)
+
+ // Discover a KVM hypervisor with a non-empty OpenStack ID.
+ var hvs hv1.HypervisorList
+ if err := cl.List(ctx, &hvs); err != nil {
+ log.Error(err, "failed to list hypervisors for CRD aggregates path")
+ return err
+ }
+ var kvmHV *hv1.Hypervisor
+ for i := range hvs.Items {
+ if hvs.Items[i].Status.HypervisorID != "" {
+ kvmHV = &hvs.Items[i]
+ break
+ }
+ }
+ if kvmHV == nil {
+ log.Info("No KVM hypervisors with OpenStack ID found, skipping CRD aggregates tests")
+ return nil
+ }
+ kvmUUID := kvmHV.Status.HypervisorID
+ log.Info("Using KVM hypervisor for CRD aggregates e2e tests", "uuid", kvmUUID, "name", kvmHV.Name)
+
+ // Save original groups for restoration.
+ originalGroups := kvmHV.Spec.Groups
+
+ // Seed spec.groups with test aggregates (preserve non-aggregate groups).
+ const testAgg1UUID = "e2e40000-0000-0000-0000-000000000001"
+ const testAgg2UUID = "e2e40000-0000-0000-0000-000000000002"
+ var nonAggGroups []hv1.Group
+ for i := range kvmHV.Spec.Groups {
+ if kvmHV.Spec.Groups[i].Aggregate == nil {
+ nonAggGroups = append(nonAggGroups, kvmHV.Spec.Groups[i])
+ }
+ }
+ nonAggGroups = append(nonAggGroups,
+ hv1.Group{Aggregate: &hv1.AggregateGroup{Name: testAgg1UUID, UUID: testAgg1UUID}},
+ hv1.Group{Aggregate: &hv1.AggregateGroup{Name: testAgg2UUID, UUID: testAgg2UUID}},
+ )
+ kvmHV.Spec.Groups = nonAggGroups
+ if err := cl.Update(ctx, kvmHV); err != nil {
+ return fmt.Errorf("failed to seed spec.groups with test aggregates: %w", err)
+ }
+ log.Info("Seeded spec.groups with test aggregates", "uuid", kvmUUID)
+
+ // Always restore original groups on exit (retry on conflict).
+ defer func() {
+ log.Info("Restoring original spec.groups", "uuid", kvmUUID)
+ for range 5 {
+ if err := cl.Get(ctx, client.ObjectKeyFromObject(kvmHV), kvmHV); err != nil {
+ log.Error(err, "failed to refetch hypervisor for restoration")
+ return
+ }
+ kvmHV.Spec.Groups = originalGroups
+ if err := cl.Update(ctx, kvmHV); err != nil {
+ if apierrors.IsConflict(err) {
+ continue
+ }
+ log.Error(err, "failed to restore original spec.groups")
+ return
+ }
+ return
+ }
+ log.Error(nil, "exhausted retries restoring original spec.groups")
+ }()
+
+ // Refetch to get updated generation.
+ if err := cl.Get(ctx, client.ObjectKeyFromObject(kvmHV), kvmHV); err != nil {
+ return fmt.Errorf("failed to refetch hypervisor after seed: %w", err)
+ }
+
+ // Test GET — should return the seeded aggregates.
+ // Poll because the shim's informer cache may take a moment to observe the update.
+ log.Info("Testing GET /resource_providers/{uuid}/aggregates (CRD)", "uuid", kvmUUID)
+ var aggResp struct {
+ Aggregates []string `json:"aggregates"`
+ ResourceProviderGeneration int64 `json:"resource_provider_generation"`
+ }
+ if err := e2ePollUntil(ctx, 10*time.Second, func() (bool, error) {
+ req, err := http.NewRequestWithContext(ctx,
+ http.MethodGet, sc.Endpoint+"/resource_providers/"+kvmUUID+"/aggregates", http.NoBody)
+ if err != nil {
+ return false, err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.19")
+ req.Header.Set("Accept", "application/json")
+ resp, err := sc.HTTPClient.Do(req)
+ if err != nil {
+ return false, err
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ return false, fmt.Errorf("GET CRD aggregates: expected 200, got %d", resp.StatusCode)
+ }
+ if err := json.NewDecoder(resp.Body).Decode(&aggResp); err != nil {
+ return false, fmt.Errorf("failed to decode CRD aggregates response: %w", err)
+ }
+ return slices.Contains(aggResp.Aggregates, testAgg1UUID) &&
+ slices.Contains(aggResp.Aggregates, testAgg2UUID), nil
+ }); err != nil {
+ return fmt.Errorf("waiting for seeded aggregates: %w (got %v)", err, aggResp.Aggregates)
+ }
+ log.Info("Verified GET returns seeded aggregates from CRD",
+ "aggregates", aggResp.Aggregates, "generation", aggResp.ResourceProviderGeneration)
+
+ // Test PUT — replace aggregates.
+ const replacementAggUUID = "e2e40000-0000-0000-0000-000000000099"
+ putBody, err := json.Marshal(map[string]any{
+ "resource_provider_generation": aggResp.ResourceProviderGeneration,
+ "aggregates": []string{replacementAggUUID},
+ })
+ if err != nil {
+ return err
+ }
+ req, err := http.NewRequestWithContext(ctx,
+ http.MethodPut, sc.Endpoint+"/resource_providers/"+kvmUUID+"/aggregates",
+ bytes.NewReader(putBody))
+ if err != nil {
+ return err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.19")
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")
+ resp, err := sc.HTTPClient.Do(req)
+ if err != nil {
return err
}
- log.Info("Verified aggregates are empty after clear", "uuid", testRPUUID)
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("PUT CRD aggregates: expected 200, got %d", resp.StatusCode)
+ }
+ log.Info("Successfully replaced aggregates via PUT (CRD)")
- // Cleanup: delete the test resource provider.
- log.Info("Cleaning up test resource provider", "uuid", testRPUUID)
+ // Test PUT with stale generation — should return 409.
+ putBody, err = json.Marshal(map[string]any{
+ "resource_provider_generation": aggResp.ResourceProviderGeneration,
+ "aggregates": []string{"stale-uuid"},
+ })
+ if err != nil {
+ return err
+ }
req, err = http.NewRequestWithContext(ctx,
- http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID, http.NoBody)
+ http.MethodPut, sc.Endpoint+"/resource_providers/"+kvmUUID+"/aggregates",
+ bytes.NewReader(putBody))
if err != nil {
- log.Error(err, "failed to create DELETE request for resource provider", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.19")
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send DELETE request for resource provider", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "DELETE /resource_providers/{uuid} returned an error", "uuid", testRPUUID)
+ if resp.StatusCode != http.StatusConflict {
+ return fmt.Errorf("PUT CRD aggregates (stale gen): expected 409, got %d", resp.StatusCode)
+ }
+ log.Info("Verified generation conflict returns 409")
+
+ // Test GET — verify replacement persisted.
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodGet, sc.Endpoint+"/resource_providers/"+kvmUUID+"/aggregates", http.NoBody)
+ if err != nil {
+ return err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.19")
+ req.Header.Set("Accept", "application/json")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
return err
}
- log.Info("Successfully deleted test resource provider", "uuid", testRPUUID)
+ defer resp.Body.Close()
+ if err := json.NewDecoder(resp.Body).Decode(&aggResp); err != nil {
+ return err
+ }
+ if len(aggResp.Aggregates) != 1 || aggResp.Aggregates[0] != replacementAggUUID {
+ return fmt.Errorf("expected [%s], got %v", replacementAggUUID, aggResp.Aggregates)
+ }
+ log.Info("Verified replacement aggregate persisted")
return nil
}
diff --git a/internal/shim/placement/handle_resource_provider_aggregates_test.go b/internal/shim/placement/handle_resource_provider_aggregates_test.go
index eb35d665c..c4e1b1b27 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates_test.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates_test.go
@@ -4,8 +4,12 @@
package placement
import (
+ "encoding/json"
"net/http"
"testing"
+
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
)
func TestHandleListResourceProviderAggregates(t *testing.T) {
@@ -51,59 +55,207 @@ func TestHandleUpdateResourceProviderAggregates(t *testing.T) {
}
func TestHandleResourceProviderAggregates_HybridMode(t *testing.T) {
- down, up := newTestTimers()
- s := &Shim{
- config: config{
- PlacementURL: "http://should-not-be-called:1234",
- Features: featuresConfig{Aggregates: FeatureModeHybrid},
- },
- maxBodyLogSize: 4096,
- downstreamRequestTimer: down,
- upstreamRequestTimer: up,
- }
- t.Run("GET returns 501", func(t *testing.T) {
+ s := newTestShimWithHypervisors(t, http.StatusOK, `{"aggregates":["uuid-1"],"resource_provider_generation":1}`)
+ s.config.Features.Aggregates = FeatureModeHybrid
+ t.Run("GET forwards to upstream when provider not in CRD", func(t *testing.T) {
w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates",
s.HandleListResourceProviderAggregates,
"/resource_providers/"+validUUID+"/aggregates")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
}
})
- t.Run("PUT returns 501", func(t *testing.T) {
+ t.Run("PUT forwards to upstream when provider not in CRD", func(t *testing.T) {
w := serveHandler(t, "PUT", "/resource_providers/{uuid}/aggregates",
s.HandleUpdateResourceProviderAggregates,
"/resource_providers/"+validUUID+"/aggregates")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ })
+
+ t.Run("GET serves from CRD when provider is KVM", func(t *testing.T) {
+ hv := testHypervisorWithGroups("kvm-hybrid-agg", validUUID, []hv1.Group{
+ {Aggregate: &hv1.AggregateGroup{Name: "az-west", UUID: "agg-uuid-1"}},
+ })
+ sKVM := newTestShimWithHypervisors(t, http.StatusOK, "{}", hv)
+ sKVM.config.Features.Aggregates = FeatureModeHybrid
+ w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates",
+ sKVM.HandleListResourceProviderAggregates,
+ "/resource_providers/"+validUUID+"/aggregates")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ var resp resourceProviderAggregatesResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Aggregates) != 1 || resp.Aggregates[0] != "agg-uuid-1" {
+ t.Fatalf("expected [agg-uuid-1], got %v", resp.Aggregates)
}
})
}
func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
- down, up := newTestTimers()
- s := &Shim{
- config: config{
- PlacementURL: "http://should-not-be-called:1234",
- Features: featuresConfig{Aggregates: FeatureModeCRD},
- },
- maxBodyLogSize: 4096,
- downstreamRequestTimer: down,
- upstreamRequestTimer: up,
+ groups := []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "HW_CPU_X86_AVX2"}},
+ {Aggregate: &hv1.AggregateGroup{Name: "fast-storage", UUID: "agg-uuid-1"}},
+ {Aggregate: &hv1.AggregateGroup{Name: "az-west", UUID: "agg-uuid-2"}},
}
- t.Run("GET returns 501", func(t *testing.T) {
+ hv := testHypervisorWithGroups("kvm-host-1", validUUID, groups)
+ s := newTestShimWithHypervisors(t, http.StatusOK, "{}", hv)
+ s.config.Features.Aggregates = FeatureModeCRD
+
+ t.Run("GET returns aggregate UUIDs from spec.groups", func(t *testing.T) {
w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates",
s.HandleListResourceProviderAggregates,
"/resource_providers/"+validUUID+"/aggregates")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ var resp resourceProviderAggregatesResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Aggregates) != 2 {
+ t.Fatalf("aggregates count = %d, want 2", len(resp.Aggregates))
+ }
+ if resp.Aggregates[0] != "agg-uuid-1" {
+ t.Errorf("aggregates[0] = %q, want agg-uuid-1", resp.Aggregates[0])
+ }
+ if resp.Aggregates[1] != "agg-uuid-2" {
+ t.Errorf("aggregates[1] = %q, want agg-uuid-2", resp.Aggregates[1])
}
})
- t.Run("PUT returns 501", func(t *testing.T) {
- w := serveHandler(t, "PUT", "/resource_providers/{uuid}/aggregates",
+
+ t.Run("GET returns empty aggregates when spec.groups has no aggregates", func(t *testing.T) {
+ hvNoAggs := testHypervisorWithGroups("kvm-no-aggs", "b1b2b3b4-c5c6-d7d8-e9e0-f1f2f3f4f5f6", []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "CUSTOM_T"}},
+ })
+ s2 := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvNoAggs)
+ s2.config.Features.Aggregates = FeatureModeCRD
+ w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates",
+ s2.HandleListResourceProviderAggregates,
+ "/resource_providers/b1b2b3b4-c5c6-d7d8-e9e0-f1f2f3f4f5f6/aggregates")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ var resp resourceProviderAggregatesResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Aggregates) != 0 {
+ t.Fatalf("aggregates count = %d, want 0", len(resp.Aggregates))
+ }
+ })
+
+ t.Run("GET returns 404 for non-existent provider", func(t *testing.T) {
+ nonExistUUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
+ w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates",
+ s.HandleListResourceProviderAggregates,
+ "/resource_providers/"+nonExistUUID+"/aggregates")
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
+ }
+ })
+
+ t.Run("PUT replaces aggregates in spec.groups preserving traits", func(t *testing.T) {
+ hvPut := testHypervisorWithGroups("kvm-put-aggs", "c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6", []hv1.Group{
+ {Aggregate: &hv1.AggregateGroup{Name: "old-agg", UUID: "old-uuid"}},
+ {Trait: &hv1.TraitGroup{Name: "KEEP_TRAIT"}},
+ })
+ sPut := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvPut)
+ sPut.config.Features.Aggregates = FeatureModeCRD
+
+ body := `{"aggregates":["new-uuid-1","new-uuid-2"],"resource_provider_generation":0}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
+ sPut.HandleUpdateResourceProviderAggregates,
+ "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/aggregates", body)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var resp resourceProviderAggregatesResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Aggregates) != 2 {
+ t.Fatalf("aggregates count = %d, want 2", len(resp.Aggregates))
+ }
+
+ // Verify traits were preserved.
+ var updated hv1.Hypervisor
+ if err := sPut.Get(t.Context(), client.ObjectKeyFromObject(hvPut), &updated); err != nil {
+ t.Fatalf("failed to get updated hypervisor: %v", err)
+ }
+ traits := hv1.GetTraits(updated.Spec.Groups)
+ if len(traits) != 1 || traits[0].Name != "KEEP_TRAIT" {
+ t.Fatalf("traits were not preserved: got %+v", traits)
+ }
+ })
+
+ t.Run("PUT returns 409 on generation mismatch", func(t *testing.T) {
+ hvConflict := testHypervisorWithGroups("kvm-agg-conflict", "d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6", nil)
+ sConflict := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvConflict)
+ sConflict.config.Features.Aggregates = FeatureModeCRD
+
+ body := `{"aggregates":["u1"],"resource_provider_generation":999}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
+ sConflict.HandleUpdateResourceProviderAggregates,
+ "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/aggregates", body)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusConflict)
+ }
+ })
+
+ t.Run("PUT returns 404 for non-existent provider", func(t *testing.T) {
+ body := `{"aggregates":["u1"],"resource_provider_generation":0}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
s.HandleUpdateResourceProviderAggregates,
- "/resource_providers/"+validUUID+"/aggregates")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", body)
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
+ }
+ })
+
+ t.Run("PUT with empty list removes all aggregates", func(t *testing.T) {
+ hvClear := testHypervisorWithGroups("kvm-clear-aggs", "e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6", []hv1.Group{
+ {Aggregate: &hv1.AggregateGroup{Name: "remove-me", UUID: "remove-uuid"}},
+ {Trait: &hv1.TraitGroup{Name: "KEEP_TRAIT"}},
+ })
+ sClear := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvClear)
+ sClear.config.Features.Aggregates = FeatureModeCRD
+
+ body := `{"aggregates":[],"resource_provider_generation":0}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
+ sClear.HandleUpdateResourceProviderAggregates,
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", body)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var resp resourceProviderAggregatesResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Aggregates) != 0 {
+ t.Fatalf("expected 0 aggregates, got %d", len(resp.Aggregates))
+ }
+
+ var updated hv1.Hypervisor
+ if err := sClear.Get(t.Context(), client.ObjectKeyFromObject(hvClear), &updated); err != nil {
+ t.Fatalf("failed to get updated hypervisor: %v", err)
+ }
+ traits := hv1.GetTraits(updated.Spec.Groups)
+ if len(traits) != 1 || traits[0].Name != "KEEP_TRAIT" {
+ t.Fatalf("traits were not preserved: got %+v", traits)
+ }
+ })
+
+ t.Run("PUT returns 400 for malformed body", func(t *testing.T) {
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
+ s.HandleUpdateResourceProviderAggregates,
+ "/resource_providers/"+validUUID+"/aggregates", "not json")
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
}
})
}
diff --git a/internal/shim/placement/handle_resource_provider_traits.go b/internal/shim/placement/handle_resource_provider_traits.go
index b23ac8e59..463edfed7 100644
--- a/internal/shim/placement/handle_resource_provider_traits.go
+++ b/internal/shim/placement/handle_resource_provider_traits.go
@@ -4,6 +4,7 @@
package placement
import (
+ "encoding/json"
"net/http"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -15,11 +16,20 @@ import (
// resourceProviderTraitsResponse is the JSON body returned by
// GET /resource_providers/{uuid}/traits and
// PUT /resource_providers/{uuid}/traits.
+//
+// https://docs.openstack.org/api-ref/placement/#resource-provider-traits
type resourceProviderTraitsResponse struct {
Traits []string `json:"traits"`
ResourceProviderGeneration int64 `json:"resource_provider_generation"`
}
+// resourceProviderTraitsRequest is the JSON body expected by
+// PUT /resource_providers/{uuid}/traits.
+type resourceProviderTraitsRequest struct {
+ Traits []string `json:"traits"`
+ ResourceProviderGeneration int64 `json:"resource_provider_generation"`
+}
+
// HandleListResourceProviderTraits handles
// GET /resource_providers/{uuid}/traits requests.
//
@@ -27,6 +37,8 @@ type resourceProviderTraitsResponse struct {
// by {uuid}. The response includes an array of trait name strings and the
// resource_provider_generation for concurrency tracking. Returns 404 if the
// provider does not exist.
+//
+// https://docs.openstack.org/api-ref/placement/#list-resource-provider-traits
func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) {
uuid, ok := requiredUUIDPathParam(w, r, "uuid")
if !ok {
@@ -36,7 +48,7 @@ func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.R
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
- s.forward(w, r)
+ s.listResourceProviderTraitsHybrid(w, r, uuid)
case FeatureModeCRD:
s.listResourceProviderTraitsCRD(w, r, uuid)
default:
@@ -44,6 +56,25 @@ func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.R
}
}
+// listResourceProviderTraitsHybrid serves from the CRD if the provider is a
+// KVM hypervisor, otherwise forwards to upstream placement.
+func (s *Shim) listResourceProviderTraitsHybrid(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if err != nil || len(hvs.Items) != 1 {
+ log.Info("resource provider not resolved from kubernetes, forwarding to upstream placement", "uuid", uuid)
+ s.forward(w, r)
+ return
+ }
+ log.Info("resolved resource provider from CRD, serving traits", "uuid", uuid, "hypervisor", hvs.Items[0].Name)
+ s.writeTraitsFromCRD(w, &hvs.Items[0])
+}
+
+// listResourceProviderTraitsCRD serves exclusively from the CRD, returning 404
+// if the provider is not a known KVM hypervisor.
func (s *Shim) listResourceProviderTraitsCRD(w http.ResponseWriter, r *http.Request, uuid string) {
ctx := r.Context()
log := logf.FromContext(ctx)
@@ -65,11 +96,15 @@ func (s *Shim) listResourceProviderTraitsCRD(w http.ResponseWriter, r *http.Requ
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
return
}
+ log.Info("serving traits from CRD", "uuid", uuid, "hypervisor", hvs.Items[0].Name)
+ s.writeTraitsFromCRD(w, &hvs.Items[0])
+}
- hv := hvs.Items[0]
- traits := hv.Status.Traits
- if traits == nil {
- traits = []string{}
+func (s *Shim) writeTraitsFromCRD(w http.ResponseWriter, hv *hv1.Hypervisor) {
+ traitGroups := hv1.GetTraits(hv.Spec.Groups)
+ traits := make([]string, 0, len(traitGroups))
+ for _, tg := range traitGroups {
+ traits = append(traits, tg.Name)
}
s.writeJSON(w, http.StatusOK, resourceProviderTraitsResponse{
Traits: traits,
@@ -84,25 +119,152 @@ func (s *Shim) listResourceProviderTraitsCRD(w http.ResponseWriter, r *http.Requ
// The request body must include a traits array and the
// resource_provider_generation for optimistic concurrency control. All
// previously associated traits are removed and replaced by the specified set.
-// Returns 400 Bad Request if any of the specified traits are invalid (i.e.
-// not returned by GET /traits). Returns 409 Conflict if the generation does
-// not match.
+// Returns 409 Conflict if the generation does not match.
+//
+// https://docs.openstack.org/api-ref/placement/#update-resource-provider-traits
func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
+ uuid, ok := requiredUUIDPathParam(w, r, "uuid")
+ if !ok {
return
}
switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
- s.forward(w, r)
+ s.updateResourceProviderTraitsHybrid(w, r, uuid)
case FeatureModeCRD:
- http.Error(w, "crd mode is not yet implemented for resource provider trait writes", http.StatusNotImplemented)
+ s.updateResourceProviderTraitsCRD(w, r, uuid)
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
}
}
+// updateResourceProviderTraitsHybrid updates traits via the CRD if the
+// provider is a KVM hypervisor, otherwise forwards to upstream placement.
+func (s *Shim) updateResourceProviderTraitsHybrid(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if err != nil || len(hvs.Items) != 1 {
+ log.Info("resource provider not resolved from kubernetes, forwarding to upstream placement", "uuid", uuid)
+ s.forward(w, r)
+ return
+ }
+ hv := &hvs.Items[0]
+ log.Info("resolved resource provider from CRD, updating traits", "uuid", uuid, "hypervisor", hv.Name)
+
+ var req resourceProviderTraitsRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "malformed request body", http.StatusBadRequest)
+ return
+ }
+ if req.ResourceProviderGeneration != hv.Generation {
+ log.Info("generation mismatch on trait update",
+ "expected", req.ResourceProviderGeneration, "actual", hv.Generation)
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Trait == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ for _, name := range req.Traits {
+ newGroups = append(newGroups, hv1.Group{
+ Trait: &hv1.TraitGroup{Name: name},
+ })
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to update hypervisor traits")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully updated traits via CRD", "uuid", uuid, "traitCount", len(req.Traits))
+ s.writeJSON(w, http.StatusOK, resourceProviderTraitsResponse{
+ Traits: req.Traits,
+ ResourceProviderGeneration: hv.Generation,
+ })
+}
+
+// updateResourceProviderTraitsCRD updates traits exclusively via the CRD,
+// returning 404 if the provider is not a known KVM hypervisor.
+func (s *Shim) updateResourceProviderTraitsCRD(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
+ log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
+ http.Error(w, "resource provider not found", http.StatusNotFound)
+ return
+ }
+ if err != nil {
+ log.Error(err, "failed to list hypervisors with OpenStack ID index")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ if len(hvs.Items) > 1 {
+ log.Error(nil, "multiple hypervisors found with the same OpenStack ID", "uuid", uuid)
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ hv := &hvs.Items[0]
+ log.Info("updating traits via CRD", "uuid", uuid, "hypervisor", hv.Name)
+
+ var req resourceProviderTraitsRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "malformed request body", http.StatusBadRequest)
+ return
+ }
+ if req.ResourceProviderGeneration != hv.Generation {
+ log.Info("generation mismatch on trait update",
+ "expected", req.ResourceProviderGeneration, "actual", hv.Generation)
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Trait == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ for _, name := range req.Traits {
+ newGroups = append(newGroups, hv1.Group{
+ Trait: &hv1.TraitGroup{Name: name},
+ })
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to update hypervisor traits")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully updated traits via CRD", "uuid", uuid, "traitCount", len(req.Traits))
+ s.writeJSON(w, http.StatusOK, resourceProviderTraitsResponse{
+ Traits: req.Traits,
+ ResourceProviderGeneration: hv.Generation,
+ })
+}
+
// HandleDeleteResourceProviderTraits handles
// DELETE /resource_providers/{uuid}/traits requests.
//
@@ -112,18 +274,107 @@ func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http
// for the same provider, prefer PUT with an empty traits list instead.
// Returns 404 if the provider does not exist. Returns 409 Conflict on
// concurrent modification. Returns 204 No Content on success.
+//
+// https://docs.openstack.org/api-ref/placement/#delete-resource-provider-traits
func (s *Shim) HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http.Request) {
- if _, ok := requiredUUIDPathParam(w, r, "uuid"); !ok {
+ uuid, ok := requiredUUIDPathParam(w, r, "uuid")
+ if !ok {
return
}
switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
- s.forward(w, r)
+ s.deleteResourceProviderTraitsHybrid(w, r, uuid)
case FeatureModeCRD:
- http.Error(w, "crd mode is not yet implemented for resource provider trait writes", http.StatusNotImplemented)
+ s.deleteResourceProviderTraitsCRD(w, r, uuid)
default:
http.Error(w, "unknown feature mode", http.StatusInternalServerError)
}
}
+
+// deleteResourceProviderTraitsHybrid removes all traits via the CRD if the
+// provider is a KVM hypervisor, otherwise forwards to upstream placement.
+func (s *Shim) deleteResourceProviderTraitsHybrid(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if err != nil || len(hvs.Items) != 1 {
+ log.Info("resource provider not resolved from kubernetes, forwarding to upstream placement", "uuid", uuid)
+ s.forward(w, r)
+ return
+ }
+ hv := &hvs.Items[0]
+ log.Info("resolved resource provider from CRD, deleting traits", "uuid", uuid, "hypervisor", hv.Name)
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Trait == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to delete hypervisor traits")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully deleted all traits via CRD", "uuid", uuid)
+ w.WriteHeader(http.StatusNoContent)
+}
+
+// deleteResourceProviderTraitsCRD removes all traits exclusively via the CRD,
+// returning 404 if the provider is not a known KVM hypervisor.
+func (s *Shim) deleteResourceProviderTraitsCRD(w http.ResponseWriter, r *http.Request, uuid string) {
+ ctx := r.Context()
+ log := logf.FromContext(ctx)
+
+ var hvs hv1.HypervisorList
+ err := s.List(ctx, &hvs, client.MatchingFields{idxHypervisorOpenStackId: uuid})
+ if apierrors.IsNotFound(err) || len(hvs.Items) == 0 {
+ log.Info("resource provider not found in kubernetes (crd mode)", "uuid", uuid)
+ http.Error(w, "resource provider not found", http.StatusNotFound)
+ return
+ }
+ if err != nil {
+ log.Error(err, "failed to list hypervisors with OpenStack ID index")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ if len(hvs.Items) > 1 {
+ log.Error(nil, "multiple hypervisors found with the same OpenStack ID", "uuid", uuid)
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+ hv := &hvs.Items[0]
+ log.Info("deleting all traits via CRD", "uuid", uuid, "hypervisor", hv.Name)
+
+ var newGroups []hv1.Group
+ for i := range hv.Spec.Groups {
+ if hv.Spec.Groups[i].Trait == nil {
+ newGroups = append(newGroups, hv.Spec.Groups[i])
+ }
+ }
+ hv.Spec.Groups = newGroups
+
+ if err := s.Update(ctx, hv); err != nil {
+ if apierrors.IsConflict(err) {
+ http.Error(w, "resource provider generation conflict", http.StatusConflict)
+ return
+ }
+ log.Error(err, "failed to delete hypervisor traits")
+ http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+ return
+ }
+
+ log.Info("successfully deleted all traits via CRD", "uuid", uuid)
+ w.WriteHeader(http.StatusNoContent)
+}
diff --git a/internal/shim/placement/handle_resource_provider_traits_e2e.go b/internal/shim/placement/handle_resource_provider_traits_e2e.go
index 4acd665b0..7dc50b016 100644
--- a/internal/shim/placement/handle_resource_provider_traits_e2e.go
+++ b/internal/shim/placement/handle_resource_provider_traits_e2e.go
@@ -9,8 +9,13 @@ import (
"encoding/json"
"fmt"
"net/http"
+ "slices"
+ "time"
"github.com/cobaltcore-dev/cortex/pkg/conf"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/gophercloud/gophercloud/v2"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
)
@@ -18,15 +23,11 @@ import (
// e2eTestResourceProviderTraits tests the
// /resource_providers/{uuid}/traits endpoints.
//
-// 1. Pre-cleanup: DELETE leftover RP traits, RP, and custom trait (ignore 404).
-// 2. Create fixtures: PUT a custom trait, POST a test RP.
-// 3. GET /{uuid}/traits — verify the trait list is empty, store generation.
-// 4. PUT /{uuid}/traits — associate the custom trait with the RP.
-// 5. GET /{uuid}/traits — verify the custom trait is now present.
-// 6. DELETE /{uuid}/traits — disassociate all traits from the RP.
-// 7. GET /{uuid}/traits — verify the trait list is empty again.
-// 8. Cleanup: DELETE the test RP and custom trait.
-func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
+// In passthrough mode: exercises the upstream placement path with a
+// dynamically created resource provider.
+// In hybrid/crd mode: exercises the spec.groups-backed CRD path using a
+// real KVM hypervisor discovered from the cluster.
+func e2eTestResourceProviderTraits(ctx context.Context, cl client.Client) error {
log := logf.FromContext(ctx)
log.Info("Running resource provider traits endpoint e2e test")
config, err := conf.GetConfig[e2eRootConfig]()
@@ -42,14 +43,19 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
}
log.Info("Successfully created openstack client for resource provider traits e2e test")
- // Resource provider trait writes (PUT/DELETE) are not yet implemented in
- // crd mode, and the test RP created via POST won't exist as a Hypervisor
- // CRD either, so skip the entire test in crd mode.
- rpTraitsMode := e2eCurrentMode(ctx)
- if rpTraitsMode == FeatureModeCRD {
- log.Info("Skipping resource provider traits e2e test because mode is crd (writes not implemented)")
- return nil
+ mode := e2eCurrentMode(ctx)
+ switch mode {
+ case FeatureModePassthrough:
+ return e2ePassthroughResourceProviderTraits(ctx, sc)
+ case FeatureModeHybrid, FeatureModeCRD:
+ return e2eCRDResourceProviderTraits(ctx, sc, cl)
+ default:
+ return fmt.Errorf("unexpected mode %q", mode)
}
+}
+
+func e2ePassthroughResourceProviderTraits(ctx context.Context, sc *gophercloud.ServiceClient) error {
+ log := logf.FromContext(ctx)
const testRPUUID = "e2e10000-0000-0000-0000-000000000003"
const testRPName = "cortex-e2e-test-rp-traits"
@@ -145,8 +151,7 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
log.Info("Successfully created test resource provider for RP traits test",
"uuid", testRPUUID)
- // Deferred cleanup: always delete test fixtures on exit so a failed
- // assertion doesn't leave the fixed UUID/trait behind.
+	// Deferred cleanup: always delete the fixtures so a failed assertion doesn't leave the fixed UUID/trait behind.
defer func() {
log.Info("Deferred cleanup: deleting test resources")
for _, c := range []struct {
@@ -174,12 +179,11 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
}
}()
- // Test GET /resource_providers/{uuid}/traits (empty).
+ // Test GET (empty).
log.Info("Testing GET /resource_providers/{uuid}/traits (empty)", "uuid", testRPUUID)
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/traits", http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP traits", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -187,49 +191,36 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP traits", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP traits returned an error", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("GET RP traits: unexpected status %d", resp.StatusCode)
}
var traitsResp struct {
Traits []string `json:"traits"`
ResourceProviderGeneration int `json:"resource_provider_generation"`
}
- err = json.NewDecoder(resp.Body).Decode(&traitsResp)
- if err != nil {
- log.Error(err, "failed to decode RP traits response", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&traitsResp); err != nil {
return err
}
if len(traitsResp.Traits) != 0 {
- err := fmt.Errorf("expected 0 initial traits, got %d", len(traitsResp.Traits))
- log.Error(err, "initial traits not empty", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("expected 0 initial traits, got %d", len(traitsResp.Traits))
}
- log.Info("Successfully retrieved empty traits for test resource provider",
- "uuid", testRPUUID, "traits", len(traitsResp.Traits),
- "generation", traitsResp.ResourceProviderGeneration)
+ log.Info("Verified empty traits", "generation", traitsResp.ResourceProviderGeneration)
- // Test PUT /resource_providers/{uuid}/traits (associate trait).
- log.Info("Testing PUT /resource_providers/{uuid}/traits to associate trait",
- "uuid", testRPUUID, "trait", testTrait)
+ // Test PUT (associate trait).
putBody, err := json.Marshal(map[string]any{
"resource_provider_generation": traitsResp.ResourceProviderGeneration,
"traits": []string{testTrait},
})
if err != nil {
- log.Error(err, "failed to marshal request body")
return err
}
req, err = http.NewRequestWithContext(ctx,
http.MethodPut, sc.Endpoint+"/resource_providers/"+testRPUUID+"/traits",
bytes.NewReader(putBody))
if err != nil {
- log.Error(err, "failed to create PUT request for RP traits", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -238,25 +229,18 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send PUT request for RP traits", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "PUT RP traits returned an error", "uuid", testRPUUID)
- return err
+ return fmt.Errorf("PUT RP traits: unexpected status %d", resp.StatusCode)
}
- log.Info("Successfully associated trait with test resource provider",
- "uuid", testRPUUID, "trait", testTrait)
+ log.Info("Successfully associated trait")
- // Test GET /resource_providers/{uuid}/traits (after PUT).
- log.Info("Testing GET /resource_providers/{uuid}/traits (after PUT)",
- "uuid", testRPUUID)
+ // Test GET (after PUT).
req, err = http.NewRequestWithContext(ctx,
http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/traits", http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP traits", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
@@ -264,128 +248,270 @@ func e2eTestResourceProviderTraits(ctx context.Context, _ client.Client) error {
req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP traits", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP traits returned an error", "uuid", testRPUUID)
- return err
- }
- err = json.NewDecoder(resp.Body).Decode(&traitsResp)
- if err != nil {
- log.Error(err, "failed to decode RP traits response", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&traitsResp); err != nil {
return err
}
- if len(traitsResp.Traits) != 1 || traitsResp.Traits[0] != testTrait {
- err := fmt.Errorf("expected trait %s, got %v", testTrait, traitsResp.Traits)
- log.Error(err, "trait mismatch", "uuid", testRPUUID)
- return err
+ if !slices.Contains(traitsResp.Traits, testTrait) {
+ return fmt.Errorf("expected trait %s, got %v", testTrait, traitsResp.Traits)
}
- log.Info("Successfully verified trait on test resource provider",
- "uuid", testRPUUID, "traits", traitsResp.Traits)
+ log.Info("Verified trait present after PUT")
- // Test DELETE /resource_providers/{uuid}/traits.
- log.Info("Testing DELETE /resource_providers/{uuid}/traits", "uuid", testRPUUID)
+ // Test DELETE.
req, err = http.NewRequestWithContext(ctx,
http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID+"/traits", http.NoBody)
if err != nil {
- log.Error(err, "failed to create DELETE request for RP traits", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.6")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send DELETE request for RP traits", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "DELETE RP traits returned an error", "uuid", testRPUUID)
+ return fmt.Errorf("DELETE RP traits: unexpected status %d", resp.StatusCode)
+ }
+ log.Info("Successfully deleted traits")
+
+ // Cleanup.
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID, http.NoBody)
+ if err != nil {
+ return err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
return err
}
- log.Info("Successfully deleted traits from test resource provider", "uuid", testRPUUID)
+ resp.Body.Close()
- // Verify traits cleared.
- log.Info("Verifying traits cleared on test resource provider", "uuid", testRPUUID)
req, err = http.NewRequestWithContext(ctx,
- http.MethodGet, sc.Endpoint+"/resource_providers/"+testRPUUID+"/traits", http.NoBody)
+ http.MethodDelete, sc.Endpoint+"/traits/"+testTrait, http.NoBody)
if err != nil {
- log.Error(err, "failed to create GET request for RP traits", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.6")
- req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send GET request for RP traits", "uuid", testRPUUID)
+ return err
+ }
+ resp.Body.Close()
+
+ return nil
+}
+
+// e2eCRDResourceProviderTraits tests the CRD/hybrid path by discovering a
+// real KVM hypervisor in the cluster, seeding spec.groups, and exercising
+// GET/PUT/DELETE through the shim.
+func e2eCRDResourceProviderTraits(ctx context.Context, sc *gophercloud.ServiceClient, cl client.Client) error {
+ log := logf.FromContext(ctx)
+
+ // Discover a KVM hypervisor with a non-empty OpenStack ID.
+ var hvs hv1.HypervisorList
+ if err := cl.List(ctx, &hvs); err != nil {
+ log.Error(err, "failed to list hypervisors for CRD traits path")
+ return err
+ }
+ var kvmHV *hv1.Hypervisor
+ for i := range hvs.Items {
+ if hvs.Items[i].Status.HypervisorID != "" {
+ kvmHV = &hvs.Items[i]
+ break
+ }
+ }
+ if kvmHV == nil {
+ log.Info("No KVM hypervisors with OpenStack ID found, skipping CRD traits tests")
+ return nil
+ }
+ kvmUUID := kvmHV.Status.HypervisorID
+ log.Info("Using KVM hypervisor for CRD traits e2e tests", "uuid", kvmUUID, "name", kvmHV.Name)
+
+ // Save original groups for restoration.
+ originalGroups := kvmHV.Spec.Groups
+
+ // Seed spec.groups with test traits (preserve non-trait groups).
+ const testTrait1 = "CUSTOM_E2E_CRD_TRAIT_1"
+ const testTrait2 = "CUSTOM_E2E_CRD_TRAIT_2"
+ var nonTraitGroups []hv1.Group
+ for i := range kvmHV.Spec.Groups {
+ if kvmHV.Spec.Groups[i].Trait == nil {
+ nonTraitGroups = append(nonTraitGroups, kvmHV.Spec.Groups[i])
+ }
+ }
+ nonTraitGroups = append(nonTraitGroups,
+ hv1.Group{Trait: &hv1.TraitGroup{Name: testTrait1}},
+ hv1.Group{Trait: &hv1.TraitGroup{Name: testTrait2}},
+ )
+ kvmHV.Spec.Groups = nonTraitGroups
+ if err := cl.Update(ctx, kvmHV); err != nil {
+ return fmt.Errorf("failed to seed spec.groups with test traits: %w", err)
+ }
+ log.Info("Seeded spec.groups with test traits", "uuid", kvmUUID)
+
+ // Always restore original groups on exit (retry on conflict).
+ defer func() {
+ log.Info("Restoring original spec.groups", "uuid", kvmUUID)
+ for range 5 {
+ if err := cl.Get(ctx, client.ObjectKeyFromObject(kvmHV), kvmHV); err != nil {
+ log.Error(err, "failed to refetch hypervisor for restoration")
+ return
+ }
+ kvmHV.Spec.Groups = originalGroups
+ if err := cl.Update(ctx, kvmHV); err != nil {
+ if apierrors.IsConflict(err) {
+ continue
+ }
+ log.Error(err, "failed to restore original spec.groups")
+ return
+ }
+ return
+ }
+ log.Error(nil, "exhausted retries restoring original spec.groups")
+ }()
+
+ // Refetch to get updated generation.
+ if err := cl.Get(ctx, client.ObjectKeyFromObject(kvmHV), kvmHV); err != nil {
+ return fmt.Errorf("failed to refetch hypervisor after seed: %w", err)
+ }
+
+ // Test GET — should return the seeded traits.
+ // Poll because the shim's informer cache may take a moment to observe the update.
+ log.Info("Testing GET /resource_providers/{uuid}/traits (CRD)", "uuid", kvmUUID)
+ var traitsResp struct {
+ Traits []string `json:"traits"`
+ ResourceProviderGeneration int64 `json:"resource_provider_generation"`
+ }
+ if err := e2ePollUntil(ctx, 10*time.Second, func() (bool, error) {
+ req, err := http.NewRequestWithContext(ctx,
+ http.MethodGet, sc.Endpoint+"/resource_providers/"+kvmUUID+"/traits", http.NoBody)
+ if err != nil {
+ return false, err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ req.Header.Set("Accept", "application/json")
+ resp, err := sc.HTTPClient.Do(req)
+ if err != nil {
+ return false, err
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ return false, fmt.Errorf("GET CRD traits: expected 200, got %d", resp.StatusCode)
+ }
+ if err := json.NewDecoder(resp.Body).Decode(&traitsResp); err != nil {
+ return false, fmt.Errorf("failed to decode CRD traits response: %w", err)
+ }
+ return slices.Contains(traitsResp.Traits, testTrait1) &&
+ slices.Contains(traitsResp.Traits, testTrait2), nil
+ }); err != nil {
+ return fmt.Errorf("waiting for seeded traits: %w (got %v)", err, traitsResp.Traits)
+ }
+ log.Info("Verified GET returns seeded traits from CRD",
+ "traits", traitsResp.Traits, "generation", traitsResp.ResourceProviderGeneration)
+
+ // Test PUT — replace traits.
+ const replacementTrait = "CUSTOM_E2E_CRD_REPLACED"
+ putBody, err := json.Marshal(map[string]any{
+ "resource_provider_generation": traitsResp.ResourceProviderGeneration,
+ "traits": []string{replacementTrait},
+ })
+ if err != nil {
+ return err
+ }
+ req, err := http.NewRequestWithContext(ctx,
+ http.MethodPut, sc.Endpoint+"/resource_providers/"+kvmUUID+"/traits",
+ bytes.NewReader(putBody))
+ if err != nil {
+ return err
+ }
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")
+ resp, err := sc.HTTPClient.Do(req)
+ if err != nil {
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "GET RP traits returned an error", "uuid", testRPUUID)
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("PUT CRD traits: expected 200, got %d", resp.StatusCode)
+ }
+ log.Info("Successfully replaced traits via PUT (CRD)")
+
+ // Test PUT with stale generation — should return 409.
+ putBody, err = json.Marshal(map[string]any{
+ "resource_provider_generation": traitsResp.ResourceProviderGeneration,
+ "traits": []string{"STALE"},
+ })
+ if err != nil {
return err
}
- err = json.NewDecoder(resp.Body).Decode(&traitsResp)
+ req, err = http.NewRequestWithContext(ctx,
+ http.MethodPut, sc.Endpoint+"/resource_providers/"+kvmUUID+"/traits",
+ bytes.NewReader(putBody))
if err != nil {
- log.Error(err, "failed to decode RP traits response", "uuid", testRPUUID)
return err
}
- if len(traitsResp.Traits) != 0 {
- err := fmt.Errorf("expected 0 traits, got %d", len(traitsResp.Traits))
- log.Error(err, "traits not cleared", "uuid", testRPUUID)
+ req.Header.Set("X-Auth-Token", sc.TokenID)
+ req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")
+ resp, err = sc.HTTPClient.Do(req)
+ if err != nil {
return err
}
- log.Info("Verified traits cleared on test resource provider", "uuid", testRPUUID)
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusConflict {
+ return fmt.Errorf("PUT CRD traits (stale gen): expected 409, got %d", resp.StatusCode)
+ }
+ log.Info("Verified generation conflict returns 409")
- // Cleanup: delete the test resource provider and custom trait.
- log.Info("Cleaning up test resources")
+ // Test GET — verify replacement persisted.
req, err = http.NewRequestWithContext(ctx,
- http.MethodDelete, sc.Endpoint+"/resource_providers/"+testRPUUID, http.NoBody)
+ http.MethodGet, sc.Endpoint+"/resource_providers/"+kvmUUID+"/traits", http.NoBody)
if err != nil {
- log.Error(err, "failed to create DELETE request for resource provider", "uuid", testRPUUID)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.6")
+ req.Header.Set("Accept", "application/json")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send DELETE request for resource provider", "uuid", testRPUUID)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "DELETE resource provider returned an error", "uuid", testRPUUID)
+ if err := json.NewDecoder(resp.Body).Decode(&traitsResp); err != nil {
return err
}
- log.Info("Successfully deleted test resource provider", "uuid", testRPUUID)
+ if len(traitsResp.Traits) != 1 || traitsResp.Traits[0] != replacementTrait {
+ return fmt.Errorf("expected [%s], got %v", replacementTrait, traitsResp.Traits)
+ }
+ log.Info("Verified replacement trait persisted")
+ // Test DELETE — remove all traits.
req, err = http.NewRequestWithContext(ctx,
- http.MethodDelete, sc.Endpoint+"/traits/"+testTrait, http.NoBody)
+ http.MethodDelete, sc.Endpoint+"/resource_providers/"+kvmUUID+"/traits", http.NoBody)
if err != nil {
- log.Error(err, "failed to create DELETE request for trait", "trait", testTrait)
return err
}
req.Header.Set("X-Auth-Token", sc.TokenID)
req.Header.Set("OpenStack-API-Version", "placement 1.6")
resp, err = sc.HTTPClient.Do(req)
if err != nil {
- log.Error(err, "failed to send DELETE request for trait", "trait", testTrait)
return err
}
defer resp.Body.Close()
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- err := fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- log.Error(err, "DELETE trait returned an error", "trait", testTrait)
- return err
+ if resp.StatusCode != http.StatusNoContent {
+ return fmt.Errorf("DELETE CRD traits: expected 204, got %d", resp.StatusCode)
}
- log.Info("Successfully deleted custom trait", "trait", testTrait)
+ log.Info("Verified DELETE returns 204")
return nil
}
diff --git a/internal/shim/placement/handle_resource_provider_traits_test.go b/internal/shim/placement/handle_resource_provider_traits_test.go
index 69ac8cd8c..d7e94ae0e 100644
--- a/internal/shim/placement/handle_resource_provider_traits_test.go
+++ b/internal/shim/placement/handle_resource_provider_traits_test.go
@@ -6,9 +6,38 @@ package placement
import (
"encoding/json"
"net/http"
+ "net/http/httptest"
+ "strings"
"testing"
+
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
)
+func testHypervisorWithGroups(name, openstackID string, groups []hv1.Group) *hv1.Hypervisor {
+ return &hv1.Hypervisor{
+ ObjectMeta: metav1.ObjectMeta{Name: name},
+ Spec: hv1.HypervisorSpec{Groups: groups},
+ Status: hv1.HypervisorStatus{HypervisorID: openstackID},
+ }
+}
+
+func serveHandlerWithBody(t *testing.T, method, pattern string, handler http.HandlerFunc, reqPath, body string) *httptest.ResponseRecorder { //nolint:unparam
+ t.Helper()
+ mux := http.NewServeMux()
+ mux.HandleFunc(method+" "+pattern, handler)
+ var req *http.Request
+ if body != "" {
+ req = httptest.NewRequest(method, reqPath, strings.NewReader(body))
+ } else {
+ req = httptest.NewRequest(method, reqPath, http.NoBody)
+ }
+ w := httptest.NewRecorder()
+ mux.ServeHTTP(w, req)
+ return w
+}
+
func TestHandleListResourceProviderTraits(t *testing.T) {
t.Run("valid uuid", func(t *testing.T) {
s := newTestShim(t, http.StatusOK, "{}", nil)
@@ -73,9 +102,9 @@ func TestHandleDeleteResourceProviderTraits(t *testing.T) {
}
func TestHandleResourceProviderTraits_HybridMode(t *testing.T) {
- s := newTestShim(t, http.StatusOK, `{"traits":["CUSTOM_HW_FPGA"],"resource_provider_generation":1}`, nil)
+ s := newTestShimWithHypervisors(t, http.StatusOK, `{"traits":["CUSTOM_HW_FPGA"],"resource_provider_generation":1}`)
s.config.Features.ResourceProviderTraits = FeatureModeHybrid
- t.Run("GET forwards to upstream", func(t *testing.T) {
+ t.Run("GET forwards to upstream when provider not in CRD", func(t *testing.T) {
w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits",
s.HandleListResourceProviderTraits,
"/resource_providers/"+validUUID+"/traits")
@@ -83,7 +112,7 @@ func TestHandleResourceProviderTraits_HybridMode(t *testing.T) {
t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
}
})
- t.Run("PUT forwards to upstream", func(t *testing.T) {
+ t.Run("PUT forwards to upstream when provider not in CRD", func(t *testing.T) {
w := serveHandler(t, "PUT", "/resource_providers/{uuid}/traits",
s.HandleUpdateResourceProviderTraits,
"/resource_providers/"+validUUID+"/traits")
@@ -92,9 +121,9 @@ func TestHandleResourceProviderTraits_HybridMode(t *testing.T) {
}
})
- sDel := newTestShim(t, http.StatusNoContent, "", nil)
+ sDel := newTestShimWithHypervisors(t, http.StatusNoContent, "")
sDel.config.Features.ResourceProviderTraits = FeatureModeHybrid
- t.Run("DELETE forwards to upstream", func(t *testing.T) {
+ t.Run("DELETE forwards to upstream when provider not in CRD", func(t *testing.T) {
w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/traits",
sDel.HandleDeleteResourceProviderTraits,
"/resource_providers/"+validUUID+"/traits")
@@ -102,15 +131,41 @@ func TestHandleResourceProviderTraits_HybridMode(t *testing.T) {
t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent)
}
})
+
+ t.Run("GET serves from CRD when provider is KVM", func(t *testing.T) {
+ hv := testHypervisorWithGroups("kvm-hybrid", validUUID, []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "CUSTOM_KVM_TRAIT"}},
+ })
+ sKVM := newTestShimWithHypervisors(t, http.StatusOK, "{}", hv)
+ sKVM.config.Features.ResourceProviderTraits = FeatureModeHybrid
+ w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits",
+ sKVM.HandleListResourceProviderTraits,
+ "/resource_providers/"+validUUID+"/traits")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ var resp resourceProviderTraitsResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Traits) != 1 || resp.Traits[0] != "CUSTOM_KVM_TRAIT" {
+ t.Fatalf("expected [CUSTOM_KVM_TRAIT], got %v", resp.Traits)
+ }
+ })
}
func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
- hv := testHypervisorFull("kvm-host-1", validUUID, nil, []string{"CUSTOM_HW_FPGA", "HW_CPU_X86_SSE42"}, nil)
- s := newTestShimWithHypervisors(t, http.StatusOK, "{}", &hv)
+ groups := []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "CUSTOM_HW_FPGA"}},
+ {Trait: &hv1.TraitGroup{Name: "HW_CPU_X86_SSE42"}},
+ {Aggregate: &hv1.AggregateGroup{Name: "az1", UUID: "agg-uuid-1"}},
+ }
+ hv := testHypervisorWithGroups("kvm-host-1", validUUID, groups)
+ s := newTestShimWithHypervisors(t, http.StatusOK, "{}", hv)
s.config.Features.ResourceProviderTraits = FeatureModeCRD
s.config.Features.ResourceProviders = FeatureModeCRD
- t.Run("GET returns traits from CRD for KVM provider", func(t *testing.T) {
+ t.Run("GET returns traits from spec.groups", func(t *testing.T) {
w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits",
s.HandleListResourceProviderTraits,
"/resource_providers/"+validUUID+"/traits")
@@ -124,8 +179,36 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
if len(resp.Traits) != 2 {
t.Fatalf("traits count = %d, want 2", len(resp.Traits))
}
+ if resp.Traits[0] != "CUSTOM_HW_FPGA" {
+ t.Errorf("traits[0] = %q, want CUSTOM_HW_FPGA", resp.Traits[0])
+ }
+ if resp.Traits[1] != "HW_CPU_X86_SSE42" {
+ t.Errorf("traits[1] = %q, want HW_CPU_X86_SSE42", resp.Traits[1])
+ }
+ })
+
+ t.Run("GET returns empty traits when spec.groups has no traits", func(t *testing.T) {
+ hvNoTraits := testHypervisorWithGroups("kvm-no-traits", "b1b2b3b4-c5c6-d7d8-e9e0-f1f2f3f4f5f6", []hv1.Group{
+ {Aggregate: &hv1.AggregateGroup{Name: "az1", UUID: "agg-1"}},
+ })
+ s2 := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvNoTraits)
+ s2.config.Features.ResourceProviderTraits = FeatureModeCRD
+ w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits",
+ s2.HandleListResourceProviderTraits,
+ "/resource_providers/b1b2b3b4-c5c6-d7d8-e9e0-f1f2f3f4f5f6/traits")
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusOK)
+ }
+ var resp resourceProviderTraitsResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Traits) != 0 {
+ t.Fatalf("traits count = %d, want 0", len(resp.Traits))
+ }
})
- t.Run("GET returns 404 for non-KVM provider", func(t *testing.T) {
+
+ t.Run("GET returns 404 for non-existent provider", func(t *testing.T) {
nonKVMUUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits",
s.HandleListResourceProviderTraits,
@@ -134,20 +217,100 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
}
})
- t.Run("PUT returns 501", func(t *testing.T) {
- w := serveHandler(t, "PUT", "/resource_providers/{uuid}/traits",
+
+ t.Run("PUT replaces traits in spec.groups preserving aggregates", func(t *testing.T) {
+ hvPut := testHypervisorWithGroups("kvm-put-traits", "c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6", []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "OLD_TRAIT"}},
+ {Aggregate: &hv1.AggregateGroup{Name: "keep-me", UUID: "keep-uuid"}},
+ })
+ sPut := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvPut)
+ sPut.config.Features.ResourceProviderTraits = FeatureModeCRD
+
+ body := `{"traits":["NEW_TRAIT_1","NEW_TRAIT_2"],"resource_provider_generation":0}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
+ sPut.HandleUpdateResourceProviderTraits,
+ "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/traits", body)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var resp resourceProviderTraitsResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("failed to decode response: %v", err)
+ }
+ if len(resp.Traits) != 2 {
+ t.Fatalf("traits count = %d, want 2", len(resp.Traits))
+ }
+
+ // Verify aggregates were preserved by fetching the updated object.
+ var updated hv1.Hypervisor
+ if err := sPut.Get(t.Context(), client.ObjectKeyFromObject(hvPut), &updated); err != nil {
+ t.Fatalf("failed to get updated hypervisor: %v", err)
+ }
+ aggs := hv1.GetAggregates(updated.Spec.Groups)
+ if len(aggs) != 1 || aggs[0].UUID != "keep-uuid" {
+ t.Fatalf("aggregates were not preserved: got %+v", aggs)
+ }
+ })
+
+ t.Run("PUT returns 409 on generation mismatch", func(t *testing.T) {
+ hvConflict := testHypervisorWithGroups("kvm-conflict", "d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6", nil)
+ sConflict := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvConflict)
+ sConflict.config.Features.ResourceProviderTraits = FeatureModeCRD
+
+ body := `{"traits":["T1"],"resource_provider_generation":999}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
+ sConflict.HandleUpdateResourceProviderTraits,
+ "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/traits", body)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusConflict)
+ }
+ })
+
+ t.Run("PUT returns 404 for non-existent provider", func(t *testing.T) {
+ body := `{"traits":["T1"],"resource_provider_generation":0}`
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
s.HandleUpdateResourceProviderTraits,
- "/resource_providers/"+validUUID+"/traits")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/traits", body)
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
}
})
- t.Run("DELETE returns 501", func(t *testing.T) {
+
+ t.Run("PUT returns 400 for malformed body", func(t *testing.T) {
+ w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
+ s.HandleUpdateResourceProviderTraits,
+ "/resource_providers/"+validUUID+"/traits", "not json")
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
+ }
+ })
+
+ t.Run("DELETE removes all traits preserving aggregates", func(t *testing.T) {
+ hvDel := testHypervisorWithGroups("kvm-del-traits", "f1f2f3f4-a5a6-b7b8-c9c0-d1d2d3d4d5d6", []hv1.Group{
+ {Trait: &hv1.TraitGroup{Name: "REMOVE_ME"}},
+ {Aggregate: &hv1.AggregateGroup{Name: "stay", UUID: "stay-uuid"}},
+ })
+ sDel := newTestShimWithHypervisors(t, http.StatusOK, "{}", hvDel)
+ sDel.config.Features.ResourceProviderTraits = FeatureModeCRD
+
w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/traits",
- s.HandleDeleteResourceProviderTraits,
- "/resource_providers/"+validUUID+"/traits")
- if w.Code != http.StatusNotImplemented {
- t.Fatalf("status = %d, want %d", w.Code, http.StatusNotImplemented)
+ sDel.HandleDeleteResourceProviderTraits,
+ "/resource_providers/f1f2f3f4-a5a6-b7b8-c9c0-d1d2d3d4d5d6/traits")
+ if w.Code != http.StatusNoContent {
+ t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent)
+ }
+
+ var updated hv1.Hypervisor
+ if err := sDel.Get(t.Context(), client.ObjectKeyFromObject(hvDel), &updated); err != nil {
+ t.Fatalf("failed to get updated hypervisor: %v", err)
+ }
+ traits := hv1.GetTraits(updated.Spec.Groups)
+ if len(traits) != 0 {
+ t.Fatalf("expected no traits, got %+v", traits)
+ }
+ aggs := hv1.GetAggregates(updated.Spec.Groups)
+ if len(aggs) != 1 || aggs[0].UUID != "stay-uuid" {
+ t.Fatalf("aggregates were not preserved: got %+v", aggs)
}
})
}
diff --git a/internal/shim/placement/handle_resource_providers_e2e.go b/internal/shim/placement/handle_resource_providers_e2e.go
index 2faca2fc2..963f44533 100644
--- a/internal/shim/placement/handle_resource_providers_e2e.go
+++ b/internal/shim/placement/handle_resource_providers_e2e.go
@@ -61,9 +61,6 @@ func e2eTestResourceProviders(ctx context.Context, cl client.Client) error {
// The VMware path creates synthetic test RPs against upstream placement.
// In crd mode there is no upstream, so skip it.
mode := e2eCurrentMode(ctx)
- if mode == "" {
- mode = config.Features.ResourceProviders.orDefault()
- }
if mode != FeatureModeCRD {
log.Info("=== VMware path: passthrough resource provider tests ===")
if err := e2eVMwareResourceProviders(ctx, sc); err != nil {
diff --git a/internal/shim/placement/shim_e2e.go b/internal/shim/placement/shim_e2e.go
index d839751a5..b678fa6e5 100644
--- a/internal/shim/placement/shim_e2e.go
+++ b/internal/shim/placement/shim_e2e.go
@@ -212,3 +212,27 @@ func RunE2E(ctx context.Context, cl client.Client) error {
"took_ms", time.Since(totalStart).Milliseconds())
return nil
}
+
+// e2ePollUntil retries check at short intervals until it returns true or the
+// timeout expires. Used to wait for the informer cache to pick up a CRD
+// update before asserting via the HTTP API.
+func e2ePollUntil(ctx context.Context, timeout time.Duration, check func() (bool, error)) error {
+ deadline := time.Now().Add(timeout)
+ for {
+ ok, err := check()
+ if err != nil {
+ return err
+ }
+ if ok {
+ return nil
+ }
+ if time.Now().After(deadline) {
+ return fmt.Errorf("timed out after %s waiting for condition", timeout)
+ }
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-time.After(500 * time.Millisecond):
+ }
+ }
+}
From 1311fd43ec0cf5acadf86a0f0f35016d26a4c4ab Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 30 Apr 2026 06:51:07 +0000
Subject: [PATCH 24/54] Bump cortex-shim chart appVersions to sha-5a1a8838
[skip ci]
---
helm/library/cortex-shim/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index b8a88feb7..d72eb882f 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
version: 0.0.3
-appVersion: "sha-17050b2f"
+appVersion: "sha-5a1a8838"
icon: "https://example.com/icon.png"
dependencies: []
From 2b2863286d42b50ceb02503d9a3a3c7dadcec758 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 30 Apr 2026 06:51:08 +0000
Subject: [PATCH 25/54] Bump cortex chart appVersions to sha-5a1a8838 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index a00036565..4aea0d6ed 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-c9d8943a"
+appVersion: "sha-5a1a8838"
icon: "https://example.com/icon.png"
dependencies: []
From edd309cda8320117c290fcc018c5556a3c89471b Mon Sep 17 00:00:00 2001
From: Malte <140147670+umswmayj@users.noreply.github.com>
Date: Thu, 30 Apr 2026 08:56:28 +0200
Subject: [PATCH 26/54] feat: Add basic support for flavor groups for failover
reservation (#754)
---
api/external/nova/messages.go | 5 +
api/v1alpha1/reservation_types.go | 7 +
helm/bundles/cortex-nova/values.yaml | 2 +
.../files/crds/cortex.cloud_reservations.yaml | 4 +
.../plugins/compute/resource_capacity_kvm.go | 5 +-
.../filters/filter_has_enough_capacity.go | 3 +-
.../weighers/kvm_failover_evacuation.go | 4 +-
.../kvm_failover_reservation_consolidation.go | 164 ++++++++
...failover_reservation_consolidation_test.go | 368 ++++++++++++++++++
.../commitments/reservation_controller.go | 26 +-
.../reservations/failover/config.go | 9 +
.../reservations/failover/controller.go | 27 +-
.../reservations/failover/helpers.go | 109 +++++-
.../failover/reservation_scheduling.go | 40 +-
.../failover/reservation_scheduling_test.go | 180 ++++++++-
.../scheduling/reservations/flavor_groups.go | 14 +
16 files changed, 891 insertions(+), 76 deletions(-)
create mode 100644 internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation.go
create mode 100644 internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go
diff --git a/api/external/nova/messages.go b/api/external/nova/messages.go
index a401df269..e82568941 100644
--- a/api/external/nova/messages.go
+++ b/api/external/nova/messages.go
@@ -151,6 +151,11 @@ const (
ReserveForFailoverIntent v1alpha1.SchedulingIntent = "reserve_for_failover"
// ReserveForCommittedResourceIntent indicates that the request is for CR reservation scheduling.
ReserveForCommittedResourceIntent v1alpha1.SchedulingIntent = "reserve_for_committed_resource"
+
+ // HintKeyResourceGroup is the scheduler hint key used to pass the resource group
+ // (e.g., flavor group name) for failover reservation scheduling.
+ // The weigher uses this to compare against existing reservations' ResourceGroup.
+ HintKeyResourceGroup = "_cortex_resource_group"
)
// GetIntent analyzes the request spec and determines the intent of the scheduling request.
diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go
index aee4a165d..21c96efad 100644
--- a/api/v1alpha1/reservation_types.go
+++ b/api/v1alpha1/reservation_types.go
@@ -5,6 +5,7 @@ package v1alpha1
import (
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -212,6 +213,7 @@ type ReservationStatus struct {
// +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
// +kubebuilder:printcolumn:name="ResourceGroup",type="string",JSONPath=".spec.committedResourceReservation.resourceGroup"
+// +kubebuilder:printcolumn:name="HA ResourceGroup",type="string",JSONPath=".spec.failoverReservation.resourceGroup",priority=1
// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.committedResourceReservation.projectID"
// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
// +kubebuilder:printcolumn:name="StartTime",type="string",JSONPath=".spec.startTime",priority=1
@@ -248,6 +250,11 @@ type ReservationList struct {
Items []Reservation `json:"items"`
}
+// IsReady returns true if the reservation has the Ready condition set to True.
+func (r *Reservation) IsReady() bool {
+ return meta.IsStatusConditionTrue(r.Status.Conditions, ReservationConditionReady)
+}
+
func init() {
SchemeBuilder.Register(&Reservation{}, &ReservationList{})
}
diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index c40849739..65ad879dd 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -203,6 +203,8 @@ cortex-scheduling-controllers:
revalidationInterval: 30m
# Prevents creating multiple new reservations on the same hypervisor per cycle
limitOneNewReservationPerHypervisor: false
+ # Size failover reservations based on LargestFlavor in the flavor group
+ useFlavorGroupResources: false
cortex-knowledge-controllers:
<<: *cortex
diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
index 8ab0fade9..686aa60fe 100644
--- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
+++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
@@ -27,6 +27,10 @@ spec:
- jsonPath: .spec.committedResourceReservation.resourceGroup
name: ResourceGroup
type: string
+ - jsonPath: .spec.failoverReservation.resourceGroup
+ name: HA ResourceGroup
+ priority: 1
+ type: string
- jsonPath: .spec.committedResourceReservation.projectID
name: Project
type: string
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
index 7d3bb33ef..4a4040fd5 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
@@ -9,9 +9,7 @@ import (
"strconv"
"strings"
- "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
@@ -177,8 +175,7 @@ func aggregateReservationsByHost(reservations []v1alpha1.Reservation) (
continue
}
- readyCondition := meta.FindStatusCondition(reservation.Status.Conditions, v1alpha1.ReservationConditionReady)
- if readyCondition == nil || readyCondition.Status != metav1.ConditionTrue {
+ if !reservation.IsReady() {
continue
}
diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go
index e6956609a..5b471f789 100644
--- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go
+++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go
@@ -13,7 +13,6 @@ import (
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/cobaltcore-dev/cortex/internal/scheduling/lib"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
- "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
)
@@ -92,7 +91,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa
return nil, err
}
for _, reservation := range reservations.Items {
- if !meta.IsStatusConditionTrue(reservation.Status.Conditions, v1alpha1.ReservationConditionReady) {
+ if !reservation.IsReady() {
continue // Only consider active reservations (Ready=True).
}
diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation.go
index 1f404f6b5..dcbcbf8bd 100644
--- a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation.go
+++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation.go
@@ -10,7 +10,6 @@ import (
api "github.com/cobaltcore-dev/cortex/api/external/nova"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/cobaltcore-dev/cortex/internal/scheduling/lib"
- "k8s.io/apimachinery/pkg/api/meta"
)
// Options for the KVM failover evacuation weigher.
@@ -72,8 +71,7 @@ func (s *KVMFailoverEvacuationStep) Run(traceLog *slog.Logger, request api.Exter
failoverHosts := make(map[string]bool)
for _, reservation := range reservations.Items {
// Only consider active failover reservations (Ready condition is True)
- readyCondition := meta.FindStatusCondition(reservation.Status.Conditions, v1alpha1.ReservationConditionReady)
- if readyCondition == nil || readyCondition.Status != "True" {
+ if !reservation.IsReady() {
continue
}
if reservation.Spec.Type != v1alpha1.ReservationTypeFailover {
diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation.go
new file mode 100644
index 000000000..727afce33
--- /dev/null
+++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation.go
@@ -0,0 +1,164 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package weighers
+
+import (
+ "context"
+ "errors"
+ "log/slog"
+
+ api "github.com/cobaltcore-dev/cortex/api/external/nova"
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/scheduling/lib"
+)
+
+// Options for the KVM failover reservation consolidation weigher.
+type KVMFailoverReservationConsolidationOpts struct {
+ // Weight multiplier for the total failover reservation count per host (consolidation signal).
+ // Higher values more aggressively pack failover reservations onto fewer hosts.
+ // Default: 1.0
+ TotalCountWeight *float64 `json:"totalCountWeight,omitempty"`
+ // Penalty multiplier for same-spec reservation count per host (diversity signal).
+ // Higher values more aggressively avoid clustering reservations of the same size on one host.
+ // Should be less than TotalCountWeight to ensure consolidation is the primary goal.
+ // Default: 0.1
+ SameSpecPenalty *float64 `json:"sameSpecPenalty,omitempty"`
+}
+
+func (o KVMFailoverReservationConsolidationOpts) Validate() error {
+ w := o.GetTotalCountWeight()
+ p := o.GetSameSpecPenalty()
+ if w < 0 {
+ return errors.New("totalCountWeight must be non-negative")
+ }
+ if p < 0 {
+ return errors.New("sameSpecPenalty must be non-negative")
+ }
+ if w == 0 && p > 0 {
+ return errors.New("sameSpecPenalty must be zero when totalCountWeight is zero")
+ }
+ if w > 0 && p >= w {
+ return errors.New("sameSpecPenalty must be less than totalCountWeight")
+ }
+ return nil
+}
+
+func (o KVMFailoverReservationConsolidationOpts) GetTotalCountWeight() float64 {
+ if o.TotalCountWeight == nil {
+ return 1.0
+ }
+ return *o.TotalCountWeight
+}
+
+func (o KVMFailoverReservationConsolidationOpts) GetSameSpecPenalty() float64 {
+ if o.SameSpecPenalty == nil {
+ return 0.1
+ }
+ return *o.SameSpecPenalty
+}
+
+// KVMFailoverReservationConsolidationStep weighs hosts for failover reservation placement.
+// It encourages consolidating failover reservations onto as few hosts as possible (primary goal),
+// while preferring hosts with fewer reservations of the same ResourceGroup (secondary tiebreaker).
+//
+// The ResourceGroup is passed via the scheduler hint "_cortex_resource_group" and compared against
+// each existing reservation's Spec.FailoverReservation.ResourceGroup. This groups reservations
+// by flavor group (or individual flavor name when no group exists).
+//
+// Score formula (normalized by total reservation count T):
+//
+// score = (totalCountWeight / T) × hostCount - (sameSpecPenalty / T) × sameGroupCount
+//
+// This produces bounded output (~0 to 1) that plays nicely with other weighers.
+type KVMFailoverReservationConsolidationStep struct {
+ lib.BaseWeigher[api.ExternalSchedulerRequest, KVMFailoverReservationConsolidationOpts]
+}
+
+// Run the weigher step.
+// For reserve_for_failover requests, hosts are scored based on existing failover reservation density
+// and same-spec diversity. For all other request types, this weigher has no effect.
+func (s *KVMFailoverReservationConsolidationStep) Run(traceLog *slog.Logger, request api.ExternalSchedulerRequest) (*lib.FilterWeigherPipelineStepResult, error) {
+ result := s.IncludeAllHostsFromRequest(request)
+
+ intent, err := request.GetIntent()
+ if err != nil || intent != api.ReserveForFailoverIntent {
+ traceLog.Info("skipping failover reservation consolidation weigher for non-failover-reservation request")
+ return result, nil //nolint:nilerr // intentionally skip weigher on error
+ }
+
+ // Extract the resource group from the scheduler hint.
+ // This identifies which "spec group" the incoming reservation belongs to.
+ // If the hint is missing, requestResourceGroup will be empty and the same-group penalty is skipped.
+ requestResourceGroup, _ := request.Spec.Data.GetSchedulerHintStr(api.HintKeyResourceGroup) //nolint:errcheck // missing hint is fine, same-group penalty is simply skipped
+
+ // Fetch all reservations.
+ var reservations v1alpha1.ReservationList
+ if err := s.Client.List(context.Background(), &reservations); err != nil {
+ return nil, err
+ }
+
+ // Count failover reservations per host, and same-group reservations per host.
+ totalPerHost := make(map[string]float64)
+ sameGroupPerHost := make(map[string]float64)
+ totalReservations := 0
+
+ for _, reservation := range reservations.Items {
+ // Only consider active failover reservations (Ready condition is True).
+ if !reservation.IsReady() {
+ continue
+ }
+ if reservation.Spec.Type != v1alpha1.ReservationTypeFailover {
+ continue
+ }
+
+ host := reservation.Status.Host
+ if host == "" {
+ continue
+ }
+
+ totalReservations++
+ totalPerHost[host]++
+
+ // Check if this reservation belongs to the same resource group as the request.
+ if requestResourceGroup != "" && reservation.Spec.FailoverReservation != nil &&
+ reservation.Spec.FailoverReservation.ResourceGroup == requestResourceGroup {
+ sameGroupPerHost[host]++
+ }
+ }
+
+ // If there are no failover reservations, the weigher has no information to act on.
+ if totalReservations == 0 {
+ traceLog.Info("no active failover reservations found, skipping consolidation weigher")
+ return result, nil
+ }
+
+ totalCountWeight := s.Options.GetTotalCountWeight()
+ sameSpecPenalty := s.Options.GetSameSpecPenalty()
+ t := float64(totalReservations)
+
+ for _, host := range request.Hosts {
+ hostTotal := totalPerHost[host.ComputeHost]
+ hostSameGroup := sameGroupPerHost[host.ComputeHost]
+
+ // Normalized score: bounded output for compatibility with other weighers.
+ score := (totalCountWeight/t)*hostTotal - (sameSpecPenalty/t)*hostSameGroup
+
+ result.Activations[host.ComputeHost] = score
+ traceLog.Info("calculated failover consolidation score for host",
+ "host", host.ComputeHost,
+ "totalOnHost", hostTotal,
+ "sameGroupOnHost", hostSameGroup,
+ "resourceGroup", requestResourceGroup,
+ "totalReservations", totalReservations,
+ "score", score)
+ }
+
+ return result, nil
+}
+
+func init() {
+ Index["kvm_failover_reservation_consolidation"] = func() NovaWeigher {
+ return &KVMFailoverReservationConsolidationStep{}
+ }
+}
diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go
new file mode 100644
index 000000000..62d69d319
--- /dev/null
+++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go
@@ -0,0 +1,368 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package weighers
+
+import (
+ "log/slog"
+ "math"
+ "testing"
+
+ api "github.com/cobaltcore-dev/cortex/api/external/nova"
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ testlib "github.com/cobaltcore-dev/cortex/pkg/testing"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+// newFailoverReservationWithGroup creates a failover reservation with a specific resource group.
+func newFailoverReservationWithGroup(name, targetHost, resourceGroup string) *v1alpha1.Reservation {
+ return &v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ },
+ Spec: v1alpha1.ReservationSpec{
+ Type: v1alpha1.ReservationTypeFailover,
+ TargetHost: targetHost,
+ Resources: map[hv1.ResourceName]resource.Quantity{
+ hv1.ResourceCPU: *resource.NewQuantity(4, resource.DecimalSI),
+ hv1.ResourceMemory: *resource.NewQuantity(8192*1_000_000, resource.DecimalSI),
+ },
+ FailoverReservation: &v1alpha1.FailoverReservationSpec{
+ ResourceGroup: resourceGroup,
+ },
+ },
+ Status: v1alpha1.ReservationStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: v1alpha1.ReservationConditionReady,
+ Status: metav1.ConditionTrue,
+ Reason: "ReservationActive",
+ },
+ },
+ Host: targetHost,
+ FailoverReservation: &v1alpha1.FailoverReservationStatus{
+ Allocations: map[string]string{"some-vm": "some-host"},
+ },
+ },
+ }
+}
+
+func newFailoverReservationRequest(resourceGroup string, hosts []string) api.ExternalSchedulerRequest {
+ hostList := make([]api.ExternalSchedulerHost, len(hosts))
+ for i, h := range hosts {
+ hostList[i] = api.ExternalSchedulerHost{ComputeHost: h}
+ }
+
+ spec := api.NovaSpec{
+ ProjectID: "project-A",
+ InstanceUUID: "test-instance",
+ NumInstances: 1,
+ SchedulerHints: map[string]any{
+ "_nova_check_type": string(api.ReserveForFailoverIntent),
+ api.HintKeyResourceGroup: resourceGroup,
+ },
+ Flavor: api.NovaObject[api.NovaFlavor]{
+ Data: api.NovaFlavor{
+ Name: "m1.large",
+ VCPUs: 4,
+ MemoryMB: 8192,
+ ExtraSpecs: map[string]string{
+ "capabilities:hypervisor_type": "qemu",
+ },
+ },
+ },
+ }
+
+ weights := make(map[string]float64)
+ for _, h := range hosts {
+ weights[h] = 1.0
+ }
+
+ return api.ExternalSchedulerRequest{
+ Spec: api.NovaObject[api.NovaSpec]{Data: spec},
+ Hosts: hostList,
+ Weights: weights,
+ }
+}
+
// approxEqual reports whether a and b lie strictly within epsilon of each other.
func approxEqual(a, b, epsilon float64) bool {
	delta := math.Abs(b - a)
	return delta < epsilon
}
+
// TestKVMFailoverReservationConsolidationStep_Run exercises the weigher's Run
// method against a fake client pre-loaded with reservations and checks the
// per-host activation scores.
//
// Scoring model under test (with T = number of active failover reservations):
//
//	score(host) = (totalCountWeight/T)*totalOnHost - (sameSpecPenalty/T)*sameGroupOnHost
//
// Hosts that carry no reservations score 0.
func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) {
	scheme := buildTestScheme(t)

	tests := []struct {
		name            string
		reservations    []*v1alpha1.Reservation
		request         api.ExternalSchedulerRequest
		opts            KVMFailoverReservationConsolidationOpts
		expectedWeights map[string]float64
	}{
		{
			name: "consolidation: prefer host with existing failover reservations",
			reservations: []*v1alpha1.Reservation{
				// host1 has 3 reservations (different groups)
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
				newFailoverReservationWithGroup("res-2", "host1", "group-B"),
				newFailoverReservationWithGroup("res-3", "host1", "group-C"),
				// host2 has 1 reservation
				newFailoverReservationWithGroup("res-4", "host2", "group-B"),
			},
			// Request for group-D - no same-group on any host
			request: newFailoverReservationRequest("group-D", []string{"host1", "host2", "host3"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			// T=4, host1: (1/4)*3=0.75, host2: (1/4)*1=0.25, host3: 0
			expectedWeights: map[string]float64{"host1": 0.75, "host2": 0.25, "host3": 0},
		},
		{
			name: "same-group penalty: prefer host with fewer same-group reservations",
			reservations: []*v1alpha1.Reservation{
				// host1 has 5 reservations, 0 same-group (group-A)
				newFailoverReservationWithGroup("res-1", "host1", "group-B"),
				newFailoverReservationWithGroup("res-2", "host1", "group-B"),
				newFailoverReservationWithGroup("res-3", "host1", "group-C"),
				newFailoverReservationWithGroup("res-4", "host1", "group-C"),
				newFailoverReservationWithGroup("res-5", "host1", "group-D"),
				// host2 has 5 reservations, 3 same-group (group-A)
				newFailoverReservationWithGroup("res-6", "host2", "group-A"),
				newFailoverReservationWithGroup("res-7", "host2", "group-A"),
				newFailoverReservationWithGroup("res-8", "host2", "group-A"),
				newFailoverReservationWithGroup("res-9", "host2", "group-C"),
				newFailoverReservationWithGroup("res-10", "host2", "group-D"),
			},
			request: newFailoverReservationRequest("group-A", []string{"host1", "host2", "host3"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			// T=10
			// host1: (1/10)*5 - (0.1/10)*0 = 0.5
			// host2: (1/10)*5 - (0.1/10)*3 = 0.5 - 0.03 = 0.47
			// host3: 0
			expectedWeights: map[string]float64{"host1": 0.5, "host2": 0.47, "host3": 0},
		},
		{
			name: "consolidation dominates: host with reservations preferred over empty host even with same-group",
			reservations: []*v1alpha1.Reservation{
				// host2 has 5 reservations, 3 same-group (group-A)
				newFailoverReservationWithGroup("res-1", "host2", "group-A"),
				newFailoverReservationWithGroup("res-2", "host2", "group-A"),
				newFailoverReservationWithGroup("res-3", "host2", "group-A"),
				newFailoverReservationWithGroup("res-4", "host2", "group-C"),
				newFailoverReservationWithGroup("res-5", "host2", "group-D"),
			},
			request: newFailoverReservationRequest("group-A", []string{"host2", "host3"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			// T=5
			// host2: (1/5)*5 - (0.1/5)*3 = 1.0 - 0.06 = 0.94
			// host3: 0
			expectedWeights: map[string]float64{"host2": 0.94, "host3": 0},
		},
		{
			name:         "no reservations: all hosts get default weight (no effect)",
			reservations: []*v1alpha1.Reservation{},
			request:      newFailoverReservationRequest("group-A", []string{"host1", "host2"}),
			opts:         KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			expectedWeights: map[string]float64{"host1": 0, "host2": 0},
		},
		{
			name: "non-failover request: weigher has no effect",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
			},
			// Use a non-failover request (evacuation)
			request: newNovaRequest("instance-123", true, []string{"host1", "host2"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			expectedWeights: map[string]float64{"host1": 0, "host2": 0},
		},
		{
			name: "non-failover request without hints: weigher has no effect",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
			},
			// Use a non-failover request (no hints = create intent)
			request: newNovaRequest("instance-123", false, []string{"host1", "host2"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			expectedWeights: map[string]float64{"host1": 0, "host2": 0},
		},
		{
			name: "default options work correctly",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-B"),
				newFailoverReservationWithGroup("res-2", "host1", "group-A"), // same group
				newFailoverReservationWithGroup("res-3", "host2", "group-B"),
			},
			request: newFailoverReservationRequest("group-A", []string{"host1", "host2", "host3"}),
			opts:    KVMFailoverReservationConsolidationOpts{}, // nil = use defaults
			// Defaults: TotalCountWeight=1.0, SameSpecPenalty=0.1, T=3
			// host1: (1/3)*2 - (0.1/3)*1 ≈ 0.6667 - 0.0333 = 0.6333
			// host2: (1/3)*1 - (0.1/3)*0 ≈ 0.3333
			// host3: 0
			expectedWeights: map[string]float64{"host1": 2.0/3.0 - 0.1/3.0, "host2": 1.0 / 3.0, "host3": 0},
		},
		{
			name: "committed resource reservations are ignored",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
				newCommittedReservation("committed-1", "host2"),
			},
			request: newFailoverReservationRequest("group-A", []string{"host1", "host2", "host3"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			// T=1 (only 1 failover reservation), committed reservation ignored
			// host1: (1/1)*1 - (0.1/1)*1 = 0.9
			// host2: 0 (committed reservation not counted)
			// host3: 0
			expectedWeights: map[string]float64{"host1": 0.9, "host2": 0, "host3": 0},
		},
		{
			name: "failed reservations are ignored",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
				newFailoverReservation("failed-res", "host2", true, map[string]string{"vm-1": "h-1"}),
			},
			request: newFailoverReservationRequest("group-A", []string{"host1", "host2"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)},
			// T=1 (failed reservation ignored)
			// host1: (1/1)*1 - (0.1/1)*1 = 0.9
			// host2: 0
			expectedWeights: map[string]float64{"host1": 0.9, "host2": 0},
		},
		{
			name: "custom weights adjust scoring",
			reservations: []*v1alpha1.Reservation{
				newFailoverReservationWithGroup("res-1", "host1", "group-A"),
				newFailoverReservationWithGroup("res-2", "host1", "group-A"),
				newFailoverReservationWithGroup("res-3", "host2", "group-B"),
			},
			request: newFailoverReservationRequest("group-A", []string{"host1", "host2"}),
			opts:    KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(2.0), SameSpecPenalty: testlib.Ptr(0.5)},
			// T=3, W=2.0, P=0.5
			// host1: (2/3)*2 - (0.5/3)*2 = 1.3333 - 0.3333 = 1.0
			// host2: (2/3)*1 - (0.5/3)*0 = 0.6667
			expectedWeights: map[string]float64{"host1": 1.0, "host2": 2.0 / 3.0},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Seed a fake client with the scenario's reservations.
			objects := make([]client.Object, 0, len(tt.reservations))
			for _, r := range tt.reservations {
				objects = append(objects, r)
			}

			step := &KVMFailoverReservationConsolidationStep{}
			step.Client = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build()
			step.Options = tt.opts

			result, err := step.Run(slog.Default(), tt.request)
			if err != nil {
				t.Fatalf("expected no error, got %v", err)
			}

			// Compare against expectations with a small epsilon to tolerate
			// floating-point rounding in the normalized scores.
			for host, expectedWeight := range tt.expectedWeights {
				actualWeight, ok := result.Activations[host]
				if !ok {
					t.Errorf("expected host %s to be in activations", host)
					continue
				}
				if !approxEqual(actualWeight, expectedWeight, 0.0001) {
					t.Errorf("host %s: expected weight %v, got %v", host, expectedWeight, actualWeight)
				}
			}
		})
	}
}
+
+func TestKVMFailoverReservationConsolidationOpts_Defaults(t *testing.T) {
+ opts := KVMFailoverReservationConsolidationOpts{}
+ if opts.GetTotalCountWeight() != 1.0 {
+ t.Errorf("expected default TotalCountWeight 1.0, got %v", opts.GetTotalCountWeight())
+ }
+ if opts.GetSameSpecPenalty() != 0.1 {
+ t.Errorf("expected default SameSpecPenalty 0.1, got %v", opts.GetSameSpecPenalty())
+ }
+}
+
+func TestKVMFailoverReservationConsolidationOpts_Validate(t *testing.T) {
+ tests := []struct {
+ name string
+ opts KVMFailoverReservationConsolidationOpts
+ wantErr string
+ }{
+ {
+ name: "valid: both set, p < w",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(2.0),
+ SameSpecPenalty: testlib.Ptr(0.5),
+ },
+ },
+ {
+ name: "valid: defaults (nil)",
+ opts: KVMFailoverReservationConsolidationOpts{},
+ },
+ {
+ name: "valid: both zero",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(0.0),
+ SameSpecPenalty: testlib.Ptr(0.0),
+ },
+ },
+ {
+ name: "invalid: negative totalCountWeight",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(-1.0),
+ },
+ wantErr: "totalCountWeight must be non-negative",
+ },
+ {
+ name: "invalid: negative sameSpecPenalty",
+ opts: KVMFailoverReservationConsolidationOpts{
+ SameSpecPenalty: testlib.Ptr(-0.1),
+ },
+ wantErr: "sameSpecPenalty must be non-negative",
+ },
+ {
+ name: "invalid: p >= w",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(1.0),
+ SameSpecPenalty: testlib.Ptr(1.0),
+ },
+ wantErr: "sameSpecPenalty must be less than totalCountWeight",
+ },
+ {
+ name: "invalid: w=0 with p>0 (default penalty with zero weight)",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(0.0),
+ // SameSpecPenalty defaults to 0.1
+ },
+ wantErr: "sameSpecPenalty must be zero when totalCountWeight is zero",
+ },
+ {
+ name: "invalid: w=0 with explicit p>0",
+ opts: KVMFailoverReservationConsolidationOpts{
+ TotalCountWeight: testlib.Ptr(0.0),
+ SameSpecPenalty: testlib.Ptr(0.5),
+ },
+ wantErr: "sameSpecPenalty must be zero when totalCountWeight is zero",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ err := tt.opts.Validate()
+ if tt.wantErr == "" {
+ if err != nil {
+ t.Errorf("expected no error, got %v", err)
+ }
+ } else {
+ if err == nil {
+ t.Errorf("expected error %q, got nil", tt.wantErr)
+ } else if err.Error() != tt.wantErr {
+ t.Errorf("expected error %q, got %q", tt.wantErr, err.Error())
+ }
+ }
+ })
+ }
+}
diff --git a/internal/scheduling/reservations/commitments/reservation_controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go
index b6078daf1..312a65530 100644
--- a/internal/scheduling/reservations/commitments/reservation_controller.go
+++ b/internal/scheduling/reservations/commitments/reservation_controller.go
@@ -5,7 +5,6 @@ package commitments
import (
"context"
- "errors"
"fmt"
"time"
@@ -25,7 +24,6 @@ import (
schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
"github.com/cobaltcore-dev/cortex/pkg/multicluster"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -91,7 +89,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
return ctrl.Result{}, nil // Don't need to requeue.
}
- if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) {
+ if res.IsReady() {
logger.V(1).Info("reservation is active, verifying allocations")
// Verify all allocations in Spec against actual VM state
@@ -195,25 +193,9 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
}
// Search for the flavor across all flavor groups
- // Also capture the flavor group name for pipeline selection
- var flavorDetails *compute.FlavorInGroup
- var flavorGroupName string
- for groupName, fg := range flavorGroups {
- for _, flavor := range fg.Flavors {
- if flavor.Name == resourceName {
- flavorDetails = &flavor
- flavorGroupName = groupName
- break
- }
- }
- if flavorDetails != nil {
- break
- }
- }
-
- // Check if flavor was found
- if flavorDetails == nil {
- logger.Error(errors.New("flavor not found"), "flavor not found in any flavor group",
+ flavorGroupName, flavorDetails, err := reservations.FindFlavorInGroups(resourceName, flavorGroups)
+ if err != nil {
+ logger.Error(err, "flavor not found in any flavor group",
"resourceName", resourceName)
return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil
}
diff --git a/internal/scheduling/reservations/failover/config.go b/internal/scheduling/reservations/failover/config.go
index 79dc94480..b8bde49a6 100644
--- a/internal/scheduling/reservations/failover/config.go
+++ b/internal/scheduling/reservations/failover/config.go
@@ -77,6 +77,15 @@ type FailoverConfig struct {
// rotates to process different VMs. This ensures all VMs eventually get processed.
// Default: 4 (rotate every 4th reconcile cycle). Use 0 to disable rotation.
VMSelectionRotationInterval *int `json:"vmSelectionRotationInterval"`
+
+ // UseFlavorGroupResources when true, sizes failover reservation resources based on
+ // the LargestFlavor in the VM's flavor group instead of the VM's actual resources.
+ // This enables better sharing: a single reservation can accommodate any flavor in the
+ // group since it's sized for the largest one.
+ // When false (or when the flavor group lookup fails), falls back to using the VM's
+ // own reported resources (memory + vcpus).
+ // Default: false
+ UseFlavorGroupResources bool `json:"useFlavorGroupResources"`
}
// intPtr returns a pointer to the given int value.
diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go
index 297ed09cc..c66295293 100644
--- a/internal/scheduling/reservations/failover/controller.go
+++ b/internal/scheduling/reservations/failover/controller.go
@@ -13,6 +13,7 @@ import (
"time"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
"github.com/cobaltcore-dev/cortex/pkg/multicluster"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -319,8 +320,22 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
}
summary.reservationsDeleted = len(emptyReservationsToDelete)
- // 6. Create and assign reservations for VMs that need them
- assignSummary, hitMaxVMsLimit := c.reconcileCreateAndAssignReservations(ctx, vms, failoverReservations, allHypervisors)
+ // 6. Fetch flavor groups for reservation sizing (if configured)
+ var flavorGroups map[string]compute.FlavorGroupFeature
+ if c.Config.UseFlavorGroupResources {
+ knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.Client}
+ fg, err := knowledge.GetAllFlavorGroups(ctx, nil)
+ if err != nil {
+ logger.Info("flavor group knowledge not available, will fall back to VM resources for sizing",
+ "error", err)
+ // flavorGroups remains nil — resolveVMSpecForScheduling will fall back to VM resources
+ } else {
+ flavorGroups = fg
+ }
+ }
+
+ // 7. Create and assign reservations for VMs that need them
+ assignSummary, hitMaxVMsLimit := c.reconcileCreateAndAssignReservations(ctx, vms, failoverReservations, allHypervisors, flavorGroups)
summary.vmsMissingFailover = assignSummary.vmsMissingFailover
summary.vmsProcessed = assignSummary.vmsProcessed
summary.reservationsNeeded = assignSummary.reservationsNeeded
@@ -572,6 +587,7 @@ func (c *FailoverReservationController) reconcileCreateAndAssignReservations(
vms []VM,
failoverReservations []v1alpha1.Reservation,
allHypervisors []string,
+ flavorGroups map[string]compute.FlavorGroupFeature, // passed to resolveVMSpecForScheduling per-VM
) (reconcileSummary, bool) {
logger := LoggerFromContext(ctx)
@@ -604,8 +620,11 @@ func (c *FailoverReservationController) reconcileCreateAndAssignReservations(
vmLogger := LoggerFromContext(vmCtx).WithValues("vmUUID", need.VM.UUID)
vmLogger.Info("processing VM for failover reservation")
+ // Resolve VM resources once per VM (may use LargestFlavor from flavor group)
+ resSpec := resolveVMSpecForScheduling(vmCtx, need.VM, c.Config.UseFlavorGroupResources, flavorGroups)
+
for i := range need.Count {
- reusedRes := c.tryReuseExistingReservation(vmCtx, need.VM, failoverReservations, allHypervisors)
+ reusedRes := c.tryReuseExistingReservation(vmCtx, need.VM, failoverReservations, allHypervisors, resSpec)
if reusedRes != nil {
if err := c.patchReservationStatus(vmCtx, reusedRes); err != nil {
@@ -628,7 +647,7 @@ func (c *FailoverReservationController) reconcileCreateAndAssignReservations(
continue
}
- newRes, err := c.scheduleAndBuildNewFailoverReservation(vmCtx, need.VM, allHypervisors, failoverReservations, excludeHypervisors)
+ newRes, err := c.scheduleAndBuildNewFailoverReservation(vmCtx, need.VM, allHypervisors, failoverReservations, excludeHypervisors, resSpec)
if err != nil {
vmLogger.V(1).Info("failed to schedule failover reservation", "error", err, "iteration", i+1, "needed", need.Count)
vmFailed++
diff --git a/internal/scheduling/reservations/failover/helpers.go b/internal/scheduling/reservations/failover/helpers.go
index 0ec75456c..8623f117f 100644
--- a/internal/scheduling/reservations/failover/helpers.go
+++ b/internal/scheduling/reservations/failover/helpers.go
@@ -8,11 +8,91 @@ import (
"fmt"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+ "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// resolvedReservationSpec captures the resource sizing chosen for a failover
// reservation: either the LargestFlavor of the VM's flavor group (when
// UseFlavorGroupResources is enabled and the lookup succeeds) or the VM's own
// reported resources.
type resolvedReservationSpec struct {
	FlavorName      string // may be overridden to LargestFlavor.Name
	FlavorGroupName string // flavor group name if found, empty otherwise
	MemoryMB        uint64
	VCPUs           uint64
}

// ResourceGroup returns the flavor group name if available, otherwise falls back to the provided fallback value.
func (r resolvedReservationSpec) ResourceGroup(fallback string) string {
	if r.FlavorGroupName == "" {
		return fallback
	}
	return r.FlavorGroupName
}
+
+// HypervisorResources returns the reservation spec resources as a map suitable for the reservation CRD.
+// We use "cpu" (not "vcpus") as the canonical key because the scheduling capacity logic
+// (e.g., nova filter_has_enough_capacity) uses "cpu".
+func (r resolvedReservationSpec) HypervisorResources() map[hv1.ResourceName]resource.Quantity {
+ return map[hv1.ResourceName]resource.Quantity{
+ hv1.ResourceMemory: *resource.NewQuantity(int64(r.MemoryMB)*1024*1024, resource.BinarySI), //nolint:gosec // flavor memory (MiB) from specs, realistically bounded; use binary to match commitments/state.go and vm_source.go
+ hv1.ResourceCPU: *resource.NewQuantity(int64(r.VCPUs), resource.DecimalSI), //nolint:gosec // flavor vcpus from specs, realistically bounded
+ }
+}
+
+// resolveVMSpecForScheduling resolves the VM's resources for scheduling.
+// When useFlavorGroupResources is true and the flavor is found in a group,
+// returns the LargestFlavor's name and size. Otherwise falls back to VM resources.
+func resolveVMSpecForScheduling(
+ ctx context.Context,
+ vm VM,
+ useFlavorGroupResources bool,
+ flavorGroups map[string]compute.FlavorGroupFeature,
+) resolvedReservationSpec {
+
+ logger := LoggerFromContext(ctx)
+
+ if useFlavorGroupResources && flavorGroups != nil {
+ groupName, _, err := reservations.FindFlavorInGroups(vm.FlavorName, flavorGroups)
+ if err == nil {
+ fg := flavorGroups[groupName]
+ largest := fg.LargestFlavor
+ logger.V(1).Info("resolved VM resources from flavor group LargestFlavor",
+ "vmFlavor", vm.FlavorName,
+ "flavorGroup", groupName,
+ "largestFlavor", largest.Name,
+ "memoryMB", largest.MemoryMB,
+ "vcpus", largest.VCPUs)
+ return resolvedReservationSpec{
+ FlavorName: largest.Name,
+ FlavorGroupName: groupName,
+ MemoryMB: largest.MemoryMB,
+ VCPUs: largest.VCPUs,
+ }
+ }
+ logger.V(1).Info("flavor group lookup failed, falling back to VM resources",
+ "vmFlavor", vm.FlavorName,
+ "error", err)
+ }
+
+ // Fallback: use VM's own resources
+ var memoryMB, vcpus uint64
+ if memory, ok := vm.Resources["memory"]; ok {
+ memoryMB = uint64(memory.Value() / (1024 * 1024)) //nolint:gosec // memory values won't overflow; binary MiB matches commitments/state.go and vm_source.go
+ }
+ if v, ok := vm.Resources["vcpus"]; ok {
+ vcpus = uint64(v.Value()) //nolint:gosec // vcpus values won't overflow
+ }
+ return resolvedReservationSpec{
+ FlavorName: vm.FlavorName,
+ MemoryMB: memoryMB,
+ VCPUs: vcpus,
+ }
+}
+
// getFailoverAllocations safely returns the allocations map from a failover reservation.
// Returns an empty map if the reservation has no failover status or allocations.
func getFailoverAllocations(res *v1alpha1.Reservation) map[string]string {
@@ -90,23 +170,20 @@ func ValidateFailoverReservationResources(res *v1alpha1.Reservation) error {
// newFailoverReservation creates a new failover reservation for a VM on a specific hypervisor.
// This does NOT persist the reservation to the cluster - it only creates the in-memory object.
// The caller is responsible for persisting the reservation.
-func newFailoverReservation(ctx context.Context, vm VM, hypervisor, creator string) *v1alpha1.Reservation {
- logger := LoggerFromContext(ctx)
+//
+// The resSpec parameter contains the pre-computed resources (from resolveVMSpecForScheduling),
+// which may come from the VM's flavor group LargestFlavor or from the VM's own resources.
+// This ensures the same sizing is used for both the scheduler query and the reservation CRD.
+func newFailoverReservation(
+ ctx context.Context,
+ vm VM,
+ hypervisor, creator string,
+ resSpec resolvedReservationSpec,
+) *v1alpha1.Reservation {
- // Build resources from VM's Resources map
- // The VM struct uses "vcpus" and "memory" keys (see vm_source.go)
- // We convert "vcpus" to "cpu" for the reservation because the scheduling capacity logic
- // (e.g., nova filter_has_enough_capacity) uses "cpu" as the canonical key.
+ logger := LoggerFromContext(ctx)
- // TODO we may want to use different resource (bigger) to enable better sharing
- resources := make(map[hv1.ResourceName]resource.Quantity)
- if memory, ok := vm.Resources["memory"]; ok {
- resources["memory"] = memory
- }
- if vcpus, ok := vm.Resources["vcpus"]; ok {
- // todo check if that is correct, i.e. that the cpu reported on e.g. hypervisors is vcpu and not pcpu
- resources["cpu"] = vcpus
- }
+ resources := resSpec.HypervisorResources()
reservation := &v1alpha1.Reservation{
ObjectMeta: metav1.ObjectMeta{
@@ -123,7 +200,7 @@ func newFailoverReservation(ctx context.Context, vm VM, hypervisor, creator stri
Resources: resources,
TargetHost: hypervisor, // Set the desired hypervisor from scheduler response
FailoverReservation: &v1alpha1.FailoverReservationSpec{
- ResourceGroup: vm.FlavorName,
+ ResourceGroup: resSpec.ResourceGroup(vm.FlavorName),
},
},
}
diff --git a/internal/scheduling/reservations/failover/reservation_scheduling.go b/internal/scheduling/reservations/failover/reservation_scheduling.go
index 6859d2cc4..f482f3393 100644
--- a/internal/scheduling/reservations/failover/reservation_scheduling.go
+++ b/internal/scheduling/reservations/failover/reservation_scheduling.go
@@ -30,7 +30,7 @@ const (
PipelineAcknowledgeFailoverReservation = "kvm-acknowledge-failover-reservation"
)
-func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx context.Context, vm VM, allHypervisors []string, pipeline string) ([]string, error) {
+func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx context.Context, vm VM, allHypervisors []string, pipeline string, resSpec resolvedReservationSpec) ([]string, error) {
logger := LoggerFromContext(ctx)
// Build list of eligible hypervisors (excluding VM's current hypervisor)
@@ -52,18 +52,6 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex
ignoreHypervisors := []string{vm.CurrentHypervisor}
- // Get memory and vcpus from VM resources
- // The VM struct uses "vcpus" and "memory" keys (see vm_source.go)
- var memoryMB uint64
- var vcpus uint64
- if memory, ok := vm.Resources["memory"]; ok {
- // Convert from bytes to MB
- memoryMB = uint64(memory.Value() / (1024 * 1024)) //nolint:gosec // memory values won't overflow
- }
- if vcpusRes, ok := vm.Resources["vcpus"]; ok {
- vcpus = uint64(vcpusRes.Value()) //nolint:gosec // vcpus values won't overflow
- }
-
// Build flavor extra specs from VM's extra specs
// Start with the VM's actual extra specs, then ensure required defaults are set
flavorExtraSpecs := make(map[string]string)
@@ -78,18 +66,23 @@ func (c *FailoverReservationController) queryHypervisorsFromScheduler(ctx contex
// Schedule the reservation using the SchedulerClient.
// Note: We pass all hypervisors (from all AZs) in EligibleHosts. The scheduler pipeline's
// filter_correct_az filter will exclude hosts that are not in the VM's availability zone.
+ // Use resSpec.FlavorName and reservation spec resources so the scheduler checks capacity for the
+ // correct flavor size (which may be the LargestFlavor from the flavor group).
scheduleReq := reservations.ScheduleReservationRequest{
InstanceUUID: vm.UUID,
ProjectID: vm.ProjectID,
- FlavorName: vm.FlavorName,
+ FlavorName: resSpec.FlavorName,
FlavorExtraSpecs: flavorExtraSpecs,
- MemoryMB: memoryMB,
- VCPUs: vcpus,
+ MemoryMB: resSpec.MemoryMB,
+ VCPUs: resSpec.VCPUs,
EligibleHosts: eligibleHypervisors,
IgnoreHosts: ignoreHypervisors,
Pipeline: pipeline,
AvailabilityZone: vm.AvailabilityZone,
- SchedulerHints: map[string]any{"_nova_check_type": string(api.ReserveForFailoverIntent)},
+ SchedulerHints: map[string]any{
+ "_nova_check_type": string(api.ReserveForFailoverIntent),
+ api.HintKeyResourceGroup: resSpec.ResourceGroup(vm.FlavorName),
+ },
}
logger.V(1).Info("scheduling failover reservation",
@@ -123,11 +116,12 @@ func (c *FailoverReservationController) tryReuseExistingReservation(
vm VM,
failoverReservations []v1alpha1.Reservation,
allHypervisors []string,
+ resSpec resolvedReservationSpec,
) *v1alpha1.Reservation {
logger := LoggerFromContext(ctx)
- validHypervisors, err := c.queryHypervisorsFromScheduler(ctx, vm, allHypervisors, PipelineReuseFailoverReservation)
+ validHypervisors, err := c.queryHypervisorsFromScheduler(ctx, vm, allHypervisors, PipelineReuseFailoverReservation, resSpec)
if err != nil {
logger.Error(err, "failed to get potential hypervisors for VM", "vmUUID", vm.UUID)
return nil
@@ -263,12 +257,14 @@ func (c *FailoverReservationController) scheduleAndBuildNewFailoverReservation(
allHypervisors []string,
failoverReservations []v1alpha1.Reservation,
excludeHypervisors map[string]bool,
+ resSpec resolvedReservationSpec,
) (*v1alpha1.Reservation, error) {
logger := LoggerFromContext(ctx)
- // Get potential hypervisors from scheduler
- validHypervisors, err := c.queryHypervisorsFromScheduler(ctx, vm, allHypervisors, PipelineNewFailoverReservation)
+ // Get potential hypervisors from scheduler using the reservation spec resources
+ // (which may be sized to the LargestFlavor from the flavor group)
+ validHypervisors, err := c.queryHypervisorsFromScheduler(ctx, vm, allHypervisors, PipelineNewFailoverReservation, resSpec)
if err != nil {
return nil, fmt.Errorf("failed to get potential hypervisors for VM: %w", err)
}
@@ -307,8 +303,8 @@ func (c *FailoverReservationController) scheduleAndBuildNewFailoverReservation(
"selectedHypervisor", selectedHypervisor,
"allReturnedHypervisors", validHypervisors)
- // Build the failover reservation on the selected hypervisor (in-memory only)
- reservation := newFailoverReservation(ctx, vm, selectedHypervisor, c.Config.Creator)
+ // Build the failover reservation using the same reservation spec resources
+ reservation := newFailoverReservation(ctx, vm, selectedHypervisor, c.Config.Creator, resSpec)
return reservation, nil
}
diff --git a/internal/scheduling/reservations/failover/reservation_scheduling_test.go b/internal/scheduling/reservations/failover/reservation_scheduling_test.go
index 0ae69f8db..fa987d34b 100644
--- a/internal/scheduling/reservations/failover/reservation_scheduling_test.go
+++ b/internal/scheduling/reservations/failover/reservation_scheduling_test.go
@@ -8,6 +8,7 @@ import (
"testing"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -138,7 +139,9 @@ func TestBuildNewFailoverReservation(t *testing.T) {
ctx := context.Background()
creator := "test-creator"
- result := newFailoverReservation(ctx, tt.vm, tt.hypervisor, creator)
+ // Resolve using VM's own resources (no flavor groups)
+ resolved := resolveVMSpecForScheduling(ctx, tt.vm, false, nil)
+ result := newFailoverReservation(ctx, tt.vm, tt.hypervisor, creator, resolved)
// Verify Status.Host
if result.Status.Host != tt.wantHost {
@@ -167,8 +170,10 @@ func TestBuildNewFailoverReservation(t *testing.T) {
t.Errorf("allocated host = %v, want %v", allocatedHost, tt.vm.CurrentHypervisor)
}
- // Verify resources are copied from VM
- // Note: VM uses "vcpus" but reservation uses "cpu" as the canonical key
+ // Verify resources are derived from VM
+ // Note: VM uses "vcpus" but reservation uses "cpu" as the canonical key.
+ // Memory is stored on the VM in bytes, converted to binary MiB (bytes / 1024 / 1024),
+ // then back to bytes (MiB * 1024 * 1024), matching commitments/state.go and vm_source.go.
if tt.vm.Resources != nil {
if memory, ok := tt.vm.Resources["memory"]; ok {
if resMemory, ok := result.Spec.Resources[hv1.ResourceMemory]; !ok {
@@ -221,6 +226,175 @@ func TestBuildNewFailoverReservation(t *testing.T) {
}
}
+// ============================================================================
+// Test: resolveVMSpecForScheduling + newFailoverReservation with flavor group resources
+// ============================================================================
+
+func TestResolveVMForSchedulingAndNewFailoverReservation(t *testing.T) {
+ // Build a flavor group where the VM's flavor is "hana_c60_m960" (small)
+ // but the LargestFlavor is "hana_c120_m1920" (large).
+ // When UseFlavorGroupResources is true, the resolved resources should use
+ // the LargestFlavor's name and size. The reservation should then be sized accordingly.
+ flavorGroups := map[string]compute.FlavorGroupFeature{
+ "hana_v2": {
+ Name: "hana_v2",
+ Flavors: []compute.FlavorInGroup{
+ {Name: "hana_c120_m1920", VCPUs: 120, MemoryMB: 1966080},
+ {Name: "hana_c60_m960", VCPUs: 60, MemoryMB: 983040},
+ {Name: "hana_c30_m480", VCPUs: 30, MemoryMB: 491520},
+ },
+ LargestFlavor: compute.FlavorInGroup{Name: "hana_c120_m1920", VCPUs: 120, MemoryMB: 1966080},
+ SmallestFlavor: compute.FlavorInGroup{Name: "hana_c30_m480", VCPUs: 30, MemoryMB: 491520},
+ },
+ }
+
+ tests := []struct {
+ name string
+ vm VM
+ useFlavorGroupResources bool
+ flavorGroups map[string]compute.FlavorGroupFeature
+ wantFlavorName string
+ wantFlavorGroupName string
+ wantResourceGroup string
+ wantMemoryMB uint64
+ wantVCPUs uint64
+ }{
+ {
+ name: "uses LargestFlavor resources when enabled and flavor found",
+ vm: VM{
+ UUID: "vm-1",
+ CurrentHypervisor: "host1",
+ FlavorName: "hana_c60_m960",
+ ProjectID: "test-project",
+ Resources: map[string]resource.Quantity{
+ "vcpus": *resource.NewQuantity(60, resource.DecimalSI),
+ "memory": *resource.NewQuantity(983040*1024*1024, resource.BinarySI),
+ },
+ },
+ useFlavorGroupResources: true,
+ flavorGroups: flavorGroups,
+ wantFlavorName: "hana_c120_m1920", // LargestFlavor name
+ wantFlavorGroupName: "hana_v2", // flavor group name
+ wantResourceGroup: "hana_v2", // ResourceGroup = flavor group name
+ wantMemoryMB: 1966080, // LargestFlavor memory
+ wantVCPUs: 120, // LargestFlavor vcpus
+ },
+ {
+ name: "falls back to VM resources when disabled",
+ vm: VM{
+ UUID: "vm-2",
+ CurrentHypervisor: "host1",
+ FlavorName: "hana_c60_m960",
+ ProjectID: "test-project",
+ Resources: map[string]resource.Quantity{
+ "vcpus": *resource.NewQuantity(60, resource.DecimalSI),
+ "memory": *resource.NewQuantity(983040*1024*1024, resource.BinarySI),
+ },
+ },
+ useFlavorGroupResources: false,
+ flavorGroups: flavorGroups,
+ wantFlavorName: "hana_c60_m960", // VM's own flavor name
+ wantFlavorGroupName: "", // no flavor group (disabled)
+ wantResourceGroup: "hana_c60_m960", // ResourceGroup = fallback to flavor name
+ wantMemoryMB: 983040, // VM's own memory (MiB, binary)
+ wantVCPUs: 60, // VM's own vcpus
+ },
+ {
+ name: "falls back to VM resources when flavor not in any group",
+ vm: VM{
+ UUID: "vm-3",
+ CurrentHypervisor: "host1",
+ FlavorName: "unknown_flavor",
+ ProjectID: "test-project",
+ Resources: map[string]resource.Quantity{
+ "vcpus": *resource.NewQuantity(8, resource.DecimalSI),
+ "memory": *resource.NewQuantity(16384*1024*1024, resource.BinarySI),
+ },
+ },
+ useFlavorGroupResources: true,
+ flavorGroups: flavorGroups,
+ wantFlavorName: "unknown_flavor", // VM's own flavor name (fallback)
+ wantFlavorGroupName: "", // no flavor group (not found)
+ wantResourceGroup: "unknown_flavor", // ResourceGroup = fallback to flavor name
+ wantMemoryMB: 16384, // VM's own memory (MiB, binary)
+ wantVCPUs: 8, // VM's own vcpus (fallback)
+ },
+ {
+ name: "falls back to VM resources when flavorGroups is nil",
+ vm: VM{
+ UUID: "vm-4",
+ CurrentHypervisor: "host1",
+ FlavorName: "hana_c60_m960",
+ ProjectID: "test-project",
+ Resources: map[string]resource.Quantity{
+ "vcpus": *resource.NewQuantity(60, resource.DecimalSI),
+ "memory": *resource.NewQuantity(983040*1024*1024, resource.BinarySI),
+ },
+ },
+ useFlavorGroupResources: true,
+ flavorGroups: nil, // nil flavor groups
+ wantFlavorName: "hana_c60_m960", // VM's own flavor name (fallback)
+ wantFlavorGroupName: "", // no flavor group (nil groups)
+ wantResourceGroup: "hana_c60_m960", // ResourceGroup = fallback to flavor name
+ wantMemoryMB: 983040, // VM's own memory (MiB, binary)
+ wantVCPUs: 60, // VM's own vcpus (fallback)
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ctx := context.Background()
+ creator := "test-creator"
+
+ // Test resolveVMSpecForScheduling
+ resolved := resolveVMSpecForScheduling(ctx, tt.vm, tt.useFlavorGroupResources, tt.flavorGroups)
+
+ if resolved.FlavorName != tt.wantFlavorName {
+ t.Errorf("resolved.FlavorName = %q, want %q", resolved.FlavorName, tt.wantFlavorName)
+ }
+ if resolved.MemoryMB != tt.wantMemoryMB {
+ t.Errorf("resolved.MemoryMB = %d, want %d", resolved.MemoryMB, tt.wantMemoryMB)
+ }
+ if resolved.VCPUs != tt.wantVCPUs {
+ t.Errorf("resolved.VCPUs = %d, want %d", resolved.VCPUs, tt.wantVCPUs)
+ }
+ if resolved.FlavorGroupName != tt.wantFlavorGroupName {
+ t.Errorf("resolved.FlavorGroupName = %q, want %q", resolved.FlavorGroupName, tt.wantFlavorGroupName)
+ }
+
+ // Test that newFailoverReservation uses the resolved values correctly
+ result := newFailoverReservation(ctx, tt.vm, "target-host", creator, resolved)
+
+ // Verify reservation memory matches resolved
+ resMemory, ok := result.Spec.Resources[hv1.ResourceMemory]
+ if !ok {
+ t.Fatal("reservation missing memory resource")
+ }
+ wantMemoryBytes := int64(tt.wantMemoryMB) * 1024 * 1024 //nolint:gosec // test values won't overflow; binary MiB matches OpenStack convention
+ if resMemory.Value() != wantMemoryBytes {
+ t.Errorf("reservation memory = %d bytes, want %d bytes", resMemory.Value(), wantMemoryBytes)
+ }
+
+ // Verify reservation CPU matches resolved
+ resCPU, ok := result.Spec.Resources[hv1.ResourceCPU]
+ if !ok {
+ t.Fatal("reservation missing cpu resource")
+ }
+ if resCPU.Value() != int64(tt.wantVCPUs) { //nolint:gosec // test values won't overflow
+ t.Errorf("reservation cpu = %d, want %d", resCPU.Value(), tt.wantVCPUs)
+ }
+
+ // Verify ResourceGroup on the reservation
+ if result.Spec.FailoverReservation == nil {
+ t.Fatal("reservation missing FailoverReservation spec")
+ }
+ if result.Spec.FailoverReservation.ResourceGroup != tt.wantResourceGroup {
+ t.Errorf("ResourceGroup = %q, want %q", result.Spec.FailoverReservation.ResourceGroup, tt.wantResourceGroup)
+ }
+ })
+ }
+}
+
// ============================================================================
// Test Helpers (local to this test file)
// ============================================================================
diff --git a/internal/scheduling/reservations/flavor_groups.go b/internal/scheduling/reservations/flavor_groups.go
index 197406eac..b6344630a 100644
--- a/internal/scheduling/reservations/flavor_groups.go
+++ b/internal/scheduling/reservations/flavor_groups.go
@@ -15,6 +15,20 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)
+// FindFlavorInGroups searches all flavor groups for a flavor by name.
+// Returns the flavor group name and flavor details, or an error if the flavor
+// is not found in any group.
+func FindFlavorInGroups(flavorName string, flavorGroups map[string]compute.FlavorGroupFeature) (groupName string, flavor *compute.FlavorInGroup, err error) {
+ for gName, fg := range flavorGroups {
+ for i, f := range fg.Flavors {
+ if f.Name == flavorName {
+ return gName, &fg.Flavors[i], nil
+ }
+ }
+ }
+ return "", nil, fmt.Errorf("flavor %q not found in any flavor group", flavorName)
+}
+
// FlavorGroupKnowledgeClient accesses flavor group data from Knowledge CRDs.
type FlavorGroupKnowledgeClient struct {
client.Client
From 34c04f173f863fd4c44682df3e249f87e2991641 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 30 Apr 2026 07:12:25 +0000
Subject: [PATCH 27/54] Bump cortex chart appVersions to sha-edd309cd [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 4aea0d6ed..08eae6ae5 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-5a1a8838"
+appVersion: "sha-edd309cd"
icon: "https://example.com/icon.png"
dependencies: []
From d8871ab892a97ccda8a5602d7c04b9cb405b538b Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Thu, 30 Apr 2026 11:03:39 +0200
Subject: [PATCH 28/54] fix: resolve serveHandlerWithBody signature conflict in
shim placement tests (#782)
The traits and aggregates test files introduced in #766 duplicated
serveHandlerWithBody with a `string` body parameter, conflicting with
the existing `io.Reader` version in shim_test.go.
## Summary by CodeRabbit
* **Tests**
* Refactored internal test implementations for improved code
maintainability.
* **Chores**
* Enhanced test infrastructure and removed redundant test utilities.
**Note:** This release contains internal improvements with no
user-facing changes.
---
...andle_resource_provider_aggregates_test.go | 11 +++++----
.../handle_resource_provider_traits_test.go | 24 ++++---------------
2 files changed, 10 insertions(+), 25 deletions(-)
diff --git a/internal/shim/placement/handle_resource_provider_aggregates_test.go b/internal/shim/placement/handle_resource_provider_aggregates_test.go
index c4e1b1b27..5dfac22c6 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates_test.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates_test.go
@@ -6,6 +6,7 @@ package placement
import (
"encoding/json"
"net/http"
+ "strings"
"testing"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -170,7 +171,7 @@ func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
body := `{"aggregates":["new-uuid-1","new-uuid-2"],"resource_provider_generation":0}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
sPut.HandleUpdateResourceProviderAggregates,
- "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/aggregates", body)
+ "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/aggregates", strings.NewReader(body))
if w.Code != http.StatusOK {
t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
}
@@ -201,7 +202,7 @@ func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
body := `{"aggregates":["u1"],"resource_provider_generation":999}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
sConflict.HandleUpdateResourceProviderAggregates,
- "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/aggregates", body)
+ "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/aggregates", strings.NewReader(body))
if w.Code != http.StatusConflict {
t.Fatalf("status = %d, want %d", w.Code, http.StatusConflict)
}
@@ -211,7 +212,7 @@ func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
body := `{"aggregates":["u1"],"resource_provider_generation":0}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
s.HandleUpdateResourceProviderAggregates,
- "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", body)
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", strings.NewReader(body))
if w.Code != http.StatusNotFound {
t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
}
@@ -228,7 +229,7 @@ func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
body := `{"aggregates":[],"resource_provider_generation":0}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
sClear.HandleUpdateResourceProviderAggregates,
- "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", body)
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/aggregates", strings.NewReader(body))
if w.Code != http.StatusOK {
t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
}
@@ -253,7 +254,7 @@ func TestHandleResourceProviderAggregates_CRDMode(t *testing.T) {
t.Run("PUT returns 400 for malformed body", func(t *testing.T) {
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/aggregates",
s.HandleUpdateResourceProviderAggregates,
- "/resource_providers/"+validUUID+"/aggregates", "not json")
+ "/resource_providers/"+validUUID+"/aggregates", strings.NewReader("not json"))
if w.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
}
diff --git a/internal/shim/placement/handle_resource_provider_traits_test.go b/internal/shim/placement/handle_resource_provider_traits_test.go
index d7e94ae0e..9483044d4 100644
--- a/internal/shim/placement/handle_resource_provider_traits_test.go
+++ b/internal/shim/placement/handle_resource_provider_traits_test.go
@@ -6,7 +6,6 @@ package placement
import (
"encoding/json"
"net/http"
- "net/http/httptest"
"strings"
"testing"
@@ -23,21 +22,6 @@ func testHypervisorWithGroups(name, openstackID string, groups []hv1.Group) *hv1
}
}
-func serveHandlerWithBody(t *testing.T, method, pattern string, handler http.HandlerFunc, reqPath, body string) *httptest.ResponseRecorder { //nolint:unparam
- t.Helper()
- mux := http.NewServeMux()
- mux.HandleFunc(method+" "+pattern, handler)
- var req *http.Request
- if body != "" {
- req = httptest.NewRequest(method, reqPath, strings.NewReader(body))
- } else {
- req = httptest.NewRequest(method, reqPath, http.NoBody)
- }
- w := httptest.NewRecorder()
- mux.ServeHTTP(w, req)
- return w
-}
-
func TestHandleListResourceProviderTraits(t *testing.T) {
t.Run("valid uuid", func(t *testing.T) {
s := newTestShim(t, http.StatusOK, "{}", nil)
@@ -229,7 +213,7 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
body := `{"traits":["NEW_TRAIT_1","NEW_TRAIT_2"],"resource_provider_generation":0}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
sPut.HandleUpdateResourceProviderTraits,
- "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/traits", body)
+ "/resource_providers/c1c2c3c4-d5d6-e7e8-f9f0-a1a2a3a4a5a6/traits", strings.NewReader(body))
if w.Code != http.StatusOK {
t.Fatalf("status = %d, want %d; body: %s", w.Code, http.StatusOK, w.Body.String())
}
@@ -260,7 +244,7 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
body := `{"traits":["T1"],"resource_provider_generation":999}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
sConflict.HandleUpdateResourceProviderTraits,
- "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/traits", body)
+ "/resource_providers/d1d2d3d4-e5e6-f7f8-a9a0-b1b2b3b4b5b6/traits", strings.NewReader(body))
if w.Code != http.StatusConflict {
t.Fatalf("status = %d, want %d", w.Code, http.StatusConflict)
}
@@ -270,7 +254,7 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
body := `{"traits":["T1"],"resource_provider_generation":0}`
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
s.HandleUpdateResourceProviderTraits,
- "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/traits", body)
+ "/resource_providers/e1e2e3e4-f5f6-a7a8-b9b0-c1c2c3c4c5c6/traits", strings.NewReader(body))
if w.Code != http.StatusNotFound {
t.Fatalf("status = %d, want %d", w.Code, http.StatusNotFound)
}
@@ -279,7 +263,7 @@ func TestHandleResourceProviderTraits_CRDMode(t *testing.T) {
t.Run("PUT returns 400 for malformed body", func(t *testing.T) {
w := serveHandlerWithBody(t, "PUT", "/resource_providers/{uuid}/traits",
s.HandleUpdateResourceProviderTraits,
- "/resource_providers/"+validUUID+"/traits", "not json")
+ "/resource_providers/"+validUUID+"/traits", strings.NewReader("not json"))
if w.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest)
}
From 822be22fa203438a3f39e88dbdc983235856c12e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 30 Apr 2026 09:20:58 +0000
Subject: [PATCH 29/54] Bump cortex-shim chart appVersions to sha-d8871ab8
[skip ci]
---
helm/library/cortex-shim/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index d72eb882f..c7a63bacf 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
version: 0.0.3
-appVersion: "sha-5a1a8838"
+appVersion: "sha-d8871ab8"
icon: "https://example.com/icon.png"
dependencies: []
From f631f217f403c4824772e86df8c7812f96e42d41 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 30 Apr 2026 09:21:00 +0000
Subject: [PATCH 30/54] Bump cortex chart appVersions to sha-d8871ab8 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 08eae6ae5..cf75f46f5 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-edd309cd"
+appVersion: "sha-d8871ab8"
icon: "https://example.com/icon.png"
dependencies: []
From 5d0fc93f3a3f3331e71f735db1d8c5df04b87e1d Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Fri, 1 May 2026 12:46:35 +0000
Subject: [PATCH 31/54] Renovate: Update External dependencies (#785)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This PR contains the following updates:
| Package | Change |
[Age](https://docs.renovatebot.com/merge-confidence/) |
[Adoption](https://docs.renovatebot.com/merge-confidence/) |
[Passing](https://docs.renovatebot.com/merge-confidence/) |
[Confidence](https://docs.renovatebot.com/merge-confidence/) | Type |
Update |
|---|---|---|---|---|---|---|---|
|
[github.com/cobaltcore-dev/openstack-hypervisor-operator](https://redirect.github.com/cobaltcore-dev/openstack-hypervisor-operator)
| `v1.0.2-0.20260429064011-d35f2bc2c5d4` → `v1.2.0` |

|

|

|

| require | minor |
|
[github.com/mattn/go-sqlite3](https://redirect.github.com/mattn/go-sqlite3)
| `v1.14.42` → `v1.14.44` |

|

|

|

| require | patch |
| [go.uber.org/zap](https://redirect.github.com/uber-go/zap) | `v1.27.1`
→ `v1.28.0` |

|

|

|

| require | minor |
|
[kube-prometheus-stack](https://redirect.github.com/prometheus-operator/kube-prometheus)
([source](https://redirect.github.com/prometheus-community/helm-charts))
| `84.0.0` → `84.4.0` |

|

|

|

| | minor |
|
[sigs.k8s.io/controller-runtime](https://redirect.github.com/kubernetes-sigs/controller-runtime)
| `v0.23.3` → `v0.24.0` |

|

|

|

| require | minor |
---
### Release Notes
cobaltcore-dev/openstack-hypervisor-operator
(github.com/cobaltcore-dev/openstack-hypervisor-operator)
###
[`v1.2.0`](https://redirect.github.com/cobaltcore-dev/openstack-hypervisor-operator/compare/v1.1.0...v1.2.0)
[Compare
Source](https://redirect.github.com/cobaltcore-dev/openstack-hypervisor-operator/compare/v1.1.0...v1.2.0)
mattn/go-sqlite3 (github.com/mattn/go-sqlite3)
###
[`v1.14.44`](https://redirect.github.com/mattn/go-sqlite3/compare/v1.14.43...v1.14.44)
[Compare
Source](https://redirect.github.com/mattn/go-sqlite3/compare/v1.14.43...v1.14.44)
###
[`v1.14.43`](https://redirect.github.com/mattn/go-sqlite3/compare/v1.14.42...v1.14.43)
[Compare
Source](https://redirect.github.com/mattn/go-sqlite3/compare/v1.14.42...v1.14.43)
uber-go/zap (go.uber.org/zap)
###
[`v1.28.0`](https://redirect.github.com/uber-go/zap/releases/tag/v1.28.0)
[Compare
Source](https://redirect.github.com/uber-go/zap/compare/v1.27.1...v1.28.0)
Enhancements:
- [#1534][]: Add `zapcore.CheckPreWriteHook` and
`CheckedEntry.Before` method for transforming entries before they are
written to any Cores.
[#1534]: https://redirect.github.com/uber-go/zap/pull/1534
prometheus-community/helm-charts
(kube-prometheus-stack)
###
[`v84.4.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.4.0)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.3.0...kube-prometheus-stack-84.4.0)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update Helm release grafana to v12.3.0 by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6877](https://redirect.github.com/prometheus-community/helm-charts/pull/6877)
**Full Changelog**:
###
[`v84.3.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.3.0)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.2.1...kube-prometheus-stack-84.3.0)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update kube-prometheus-stack dependency
non-major updates by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6871](https://redirect.github.com/prometheus-community/helm-charts/pull/6871)
**Full Changelog**:
###
[`v84.2.1`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.2.1)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.2.0...kube-prometheus-stack-84.2.1)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] prom spec add thanos.image by
[@trouaux](https://redirect.github.com/trouaux) in
[#6849](https://redirect.github.com/prometheus-community/helm-charts/pull/6849)
**Full Changelog**:
###
[`v84.2.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.2.0)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.1.2...kube-prometheus-stack-84.2.0)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update Helm release grafana to v12.2.1 by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6868](https://redirect.github.com/prometheus-community/helm-charts/pull/6868)
**Full Changelog**:
###
[`v84.1.2`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.1.2)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.1.1...kube-prometheus-stack-84.1.2)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update Helm release grafana to v12.1.3 by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6867](https://redirect.github.com/prometheus-community/helm-charts/pull/6867)
**Full Changelog**:
###
[`v84.1.1`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.1.1)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.1.0...kube-prometheus-stack-84.1.1)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update Helm release grafana to v12.1.2 by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6866](https://redirect.github.com/prometheus-community/helm-charts/pull/6866)
**Full Changelog**:
###
[`v84.1.0`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.1.0)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.0.1...kube-prometheus-stack-84.1.0)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy to operate end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Add kubeApiServer jobNameOverride by
[@abelfodil](https://redirect.github.com/abelfodil) in
[#6865](https://redirect.github.com/prometheus-community/helm-charts/pull/6865)
**Full Changelog**:
###
[`v84.0.1`](https://redirect.github.com/prometheus-community/helm-charts/releases/tag/kube-prometheus-stack-84.0.1)
[Compare
Source](https://redirect.github.com/prometheus-community/helm-charts/compare/kube-prometheus-stack-84.0.0...kube-prometheus-stack-84.0.1)
kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide
easy-to-operate, end-to-end Kubernetes cluster monitoring with Prometheus
using the Prometheus Operator.
#### What's Changed
- \[kube-prometheus-stack] Update kube-prometheus-stack dependency
non-major updates by
[@renovate](https://redirect.github.com/renovate)\[bot] in
[#6863](https://redirect.github.com/prometheus-community/helm-charts/pull/6863)
**Full Changelog**:
kubernetes-sigs/controller-runtime
(sigs.k8s.io/controller-runtime)
###
[`v0.24.0`](https://redirect.github.com/kubernetes-sigs/controller-runtime/releases/tag/v0.24.0)
[Compare
Source](https://redirect.github.com/kubernetes-sigs/controller-runtime/compare/v0.23.3...v0.24.0)
#### :warning: Breaking Changes
- Dependencies: Update to k8s.io/\* v1.36
([#3506](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3506)
[#3462](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3462)
[#3486](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3486)
[#3450](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3450))
#### :bug: Bug Fixes
- Cache: Fix IndexField blocking until informer is synced
([#3445](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3445))
- Cache: Wait for cache sync when ReaderFailOnMissingInformer is true
([#3425](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3425))
- Client: Update typed ApplyConfigurations with server response
([#3475](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3475))
- Fakeclient: Fix SSA status patch resource version check
([#3443](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3443))
- Fakeclient: Fix panic when using CRs with embedded pointer structs
([#3431](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3431))
- Fakeclient: Fix status apply if existing object has managedFields set
([#3430](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3430))
- Fakeclient: Retry GenerateName on AlreadyExists collisions
([#3498](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3498))
- HTTP servers: Wire up base context into http servers
([#3452](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3452))
#### :seedling: Others
- Builder/Webhooks: Remove deprecated custom path function
([#3465](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3465))
- Cache: Test cache reader waits for cache sync
([#3434](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3434))
- Certwatcher: Deflake certwatcher tests
([#3457](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3457))
- Dependencies: Use forked version of btree
([#3449](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3449))
- Envtest: Ensure envtest stops the whole process group
([#3447](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3447))
- Logging: Add missing space in zap-log-level flag description
([#3492](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3492))
- Misc: Adopt new(x) over ptr.To(x) and re-enable newexpr lint
([#3489](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3489))
- Owners: Cleanup
([#3453](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3453))
- Recorder: Add logger into context for structured logging
([#3454](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3454))
- Recorder: Switch to `StartLogging` for event debug logs
([#3451](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3451))
- Scheme: Deprecate the scheme builder
([#3461](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3461))
- Source/Kind: Improve logging for dynamic type kind source
([#3494](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3494))
- Webhooks: Reduce memory usage of default webhooks
([#3463](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3463)
[#3468](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3468))
#### :seedling: CI & linters
- Chore: Update golangci-lint version to v2.8.0
([#3448](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3448))
- Chore: Update golangci-lint version to v2.10.1
([#3470](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3470))
- Chore: Update golangci-lint version to v2.11.3
([#3482](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3482))
- Migrate away from custom GitHub action approval workflow
([#3491](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3491))
- Release: Auto-create git tags for the `tools/setup-envtest` submodule
([#3476](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3476))
:book: Additionally, there has been 1 contribution to our documentation.
([#3477](https://redirect.github.com/kubernetes-sigs/controller-runtime/issues/3477))
#### Dependencies
##### Added
- github.com/cenkalti/backoff/v5:
[v5.0.3](https://redirect.github.com/cenkalti/backoff/tree/v5.0.3)
- gonum.org/v1/gonum: v0.16.0
- k8s.io/streaming: v0.36.0
##### Changed
- cel.dev/expr: v0.24.0 → v0.25.1
- cloud.google.com/go/compute/metadata: v0.6.0 → v0.9.0
-
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp:
[v1.26.0 →
v1.30.0](https://redirect.github.com/GoogleCloudPlatform/opentelemetry-operations-go/compare/detectors/gcp/v1.26.0...detectors/gcp/v1.30.0)
- github.com/alecthomas/units: [b94a6e3 →
0f3dac3](https://redirect.github.com/alecthomas/units/compare/b94a6e3...0f3dac3)
- github.com/cncf/xds/go: [2f00578 →
ee656c7](https://redirect.github.com/cncf/xds/compare/2f00578...ee656c7)
- github.com/coreos/go-oidc: [v2.3.0+incompatible →
v2.5.0+incompatible](https://redirect.github.com/coreos/go-oidc/compare/v2.3.0...v2.5.0)
- github.com/coreos/go-systemd/v22: [v22.5.0 →
v22.7.0](https://redirect.github.com/coreos/go-systemd/compare/v22.5.0...v22.7.0)
- github.com/davecgh/go-spew: [v1.1.1 →
d8f796a](https://redirect.github.com/davecgh/go-spew/compare/v1.1.1...d8f796a)
- github.com/emicklei/go-restful/v3: [v3.12.2 →
v3.13.0](https://redirect.github.com/emicklei/go-restful/compare/v3.12.2...v3.13.0)
- github.com/envoyproxy/go-control-plane/envoy: [v1.32.4 →
v1.36.0](https://redirect.github.com/envoyproxy/go-control-plane/compare/envoy/v1.32.4...envoy/v1.36.0)
- github.com/envoyproxy/go-control-plane: [v0.13.4 →
v0.14.0](https://redirect.github.com/envoyproxy/go-control-plane/compare/v0.13.4...v0.14.0)
- github.com/envoyproxy/protoc-gen-validate: [v1.2.1 →
v1.3.0](https://redirect.github.com/envoyproxy/protoc-gen-validate/compare/v1.2.1...v1.3.0)
- github.com/go-jose/go-jose/v4: [v4.0.4 →
v4.1.3](https://redirect.github.com/go-jose/go-jose/compare/v4.0.4...v4.1.3)
- github.com/golang-jwt/jwt/v5: [v5.2.2 →
v5.3.0](https://redirect.github.com/golang-jwt/jwt/compare/v5.2.2...v5.3.0)
- github.com/golang/glog: [v1.2.4 →
v1.2.5](https://redirect.github.com/golang/glog/compare/v1.2.4...v1.2.5)
- github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus:
[v1.0.1 →
v1.1.0](https://redirect.github.com/grpc-ecosystem/go-grpc-middleware/compare/providers/prometheus/v1.0.1...providers/prometheus/v1.1.0)
- github.com/grpc-ecosystem/go-grpc-middleware/v2: [v2.3.0 →
v2.3.3](https://redirect.github.com/grpc-ecosystem/go-grpc-middleware/compare/v2.3.0...v2.3.3)
- github.com/grpc-ecosystem/grpc-gateway/v2: [v2.26.3 →
v2.27.7](https://redirect.github.com/grpc-ecosystem/grpc-gateway/compare/v2.26.3...v2.27.7)
- github.com/moby/spdystream: [v0.5.0 →
v0.5.1](https://redirect.github.com/moby/spdystream/compare/v0.5.0...v0.5.1)
- github.com/onsi/ginkgo/v2: [v2.27.2 →
v2.27.4](https://redirect.github.com/onsi/ginkgo/compare/v2.27.2...v2.27.4)
- github.com/onsi/gomega: [v1.38.2 →
v1.39.0](https://redirect.github.com/onsi/gomega/compare/v1.38.2...v1.39.0)
- github.com/pmezard/go-difflib: [v1.0.0 →
5d4384e](https://redirect.github.com/pmezard/go-difflib/compare/v1.0.0...5d4384e)
- github.com/prometheus/common: [v0.66.1 →
v0.67.5](https://redirect.github.com/prometheus/common/compare/v0.66.1...v0.67.5)
- github.com/prometheus/procfs: [v0.16.1 →
v0.19.2](https://redirect.github.com/prometheus/procfs/compare/v0.16.1...v0.19.2)
- github.com/spf13/cobra: [v1.10.0 →
v1.10.2](https://redirect.github.com/spf13/cobra/compare/v1.10.0...v1.10.2)
- github.com/spiffe/go-spiffe/v2: [v2.5.0 →
v2.6.0](https://redirect.github.com/spiffe/go-spiffe/compare/v2.5.0...v2.6.0)
- go.etcd.io/etcd/api/v3: v3.6.5 → v3.6.8
- go.etcd.io/etcd/client/pkg/v3: v3.6.5 → v3.6.8
- go.etcd.io/etcd/client/v3: v3.6.5 → v3.6.8
- go.etcd.io/etcd/pkg/v3: v3.6.5 → v3.6.8
- go.etcd.io/etcd/server/v3: v3.6.5 → v3.6.8
- go.opentelemetry.io/auto/sdk: v1.1.0 → v1.2.1
- go.opentelemetry.io/contrib/detectors/gcp: v1.34.0 → v1.39.0
-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc:
v0.60.0 → v0.65.0
- go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp: v0.61.0
→ v0.65.0
- go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc:
v1.34.0 → v1.40.0
- go.opentelemetry.io/otel/exporters/otlp/otlptrace: v1.34.0 → v1.40.0
- go.opentelemetry.io/otel/metric: v1.36.0 → v1.41.0
- go.opentelemetry.io/otel/sdk/metric: v1.36.0 → v1.40.0
- go.opentelemetry.io/otel/sdk: v1.36.0 → v1.40.0
- go.opentelemetry.io/otel/trace: v1.36.0 → v1.41.0
- go.opentelemetry.io/otel: v1.36.0 → v1.41.0
- go.opentelemetry.io/proto/otlp: v1.5.0 → v1.9.0
- go.uber.org/zap: v1.27.0 → v1.27.1
- golang.org/x/crypto: v0.45.0 → v0.47.0
- golang.org/x/exp:
[`8a7402a`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/8a7402a)
→
[`944ab1f`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/944ab1f)
- golang.org/x/mod: v0.29.0 → v0.32.0
- golang.org/x/net: v0.47.0 → v0.49.0
- golang.org/x/oauth2: v0.30.0 → v0.34.0
- golang.org/x/sync: v0.18.0 → v0.19.0
- golang.org/x/sys: v0.38.0 → v0.40.0
- golang.org/x/telemetry:
[`078029d`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/078029d)
→
[`bd525da`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/bd525da)
- golang.org/x/term: v0.37.0 → v0.39.0
- golang.org/x/text: v0.31.0 → v0.33.0
- golang.org/x/time: v0.9.0 → v0.14.0
- golang.org/x/tools/go/expect: v0.1.0-deprecated → v0.1.1-deprecated
- golang.org/x/tools: v0.38.0 → v0.41.0
- google.golang.org/genproto/googleapis/api:
[`a0af3ef`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/a0af3ef)
→
[`8636f87`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/8636f87)
- google.golang.org/genproto/googleapis/rpc:
[`200df99`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/200df99)
→
[`8636f87`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/8636f87)
- google.golang.org/grpc: v1.72.2 → v1.79.3
- google.golang.org/protobuf: v1.36.8 →
[`f2248ac`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/f2248ac)
- k8s.io/api: v0.35.0 → v0.36.0
- k8s.io/apiextensions-apiserver: v0.35.0 → v0.36.0
- k8s.io/apimachinery: v0.35.0 → v0.36.0
- k8s.io/apiserver: v0.35.0 → v0.36.0
- k8s.io/client-go: v0.35.0 → v0.36.0
- k8s.io/code-generator: v0.35.0 → v0.36.0
- k8s.io/component-base: v0.35.0 → v0.36.0
- k8s.io/klog/v2: v2.130.1 → v2.140.0
- k8s.io/kms: v0.35.0 → v0.36.0
- k8s.io/kube-openapi:
[`589584f`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/589584f)
→
[`43fb72c`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/43fb72c)
- k8s.io/utils:
[`bc988d5`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/bc988d5)
→
[`b8788ab`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/b8788ab)
- sigs.k8s.io/apiserver-network-proxy/konnectivity-client: v0.31.2 →
v0.34.0
- sigs.k8s.io/structured-merge-diff/v6: v6.3.0 → v6.3.2
##### Removed
- github.com/cenkalti/backoff/v4:
[v4.3.0](https://redirect.github.com/cenkalti/backoff/tree/v4.3.0)
- github.com/gregjones/httpcache:
[901d907](https://redirect.github.com/gregjones/httpcache/tree/901d907)
- github.com/grpc-ecosystem/go-grpc-prometheus:
[v1.2.0](https://redirect.github.com/grpc-ecosystem/go-grpc-prometheus/tree/v1.2.0)
- github.com/zeebo/errs:
[v1.4.0](https://redirect.github.com/zeebo/errs/tree/v1.4.0)
- golang.org/x/xerrors:
[`9bdfabe`](https://redirect.github.com/kubernetes-sigs/controller-runtime/commit/9bdfabe)
*Thanks to all our contributors!* 😊
---
### Configuration
📅 **Schedule**: (UTC)
- Branch creation
- "before 8am on Friday"
- Automerge
- At any time (no schedule defined)
🚦 **Automerge**: Enabled.
♻ **Rebasing**: Whenever PR is behind base branch, or you tick the
rebase/retry checkbox.
👻 **Immortal**: This PR will be recreated if closed unmerged. Get
[config
help](https://redirect.github.com/renovatebot/renovate/discussions) if
that's undesired.
---
- [ ] If you want to rebase/retry this PR, check
this box
---
This PR was generated by [Mend Renovate](https://mend.io/renovate/).
View the [repository job
log](https://developer.mend.io/github/cobaltcore-dev/cortex).
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
---
go.mod | 16 ++++-----
go.sum | 36 +++++++++----------
.../dev/cortex-prometheus-operator/Chart.yaml | 2 +-
3 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/go.mod b/go.mod
index 0c3b9c736..52da7fef1 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/cobaltcore-dev/cortex
go 1.26.0
require (
- github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4
+ github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0
github.com/go-gorp/gorp v2.2.0+incompatible
github.com/gophercloud/gophercloud/v2 v2.12.0
github.com/ironcore-dev/ironcore v0.3.0
@@ -14,7 +14,7 @@ require (
k8s.io/api v0.36.0
k8s.io/apimachinery v0.36.0
k8s.io/client-go v0.36.0
- sigs.k8s.io/controller-runtime v0.23.3
+ sigs.k8s.io/controller-runtime v0.24.0
)
require (
@@ -73,7 +73,7 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/lib/pq v1.12.3
- github.com/mattn/go-sqlite3 v1.14.42
+ github.com/mattn/go-sqlite3 v1.14.44
github.com/moby/sys/user v0.4.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
@@ -104,7 +104,7 @@ require (
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
- go.uber.org/zap v1.27.1
+ go.uber.org/zap v1.28.0
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect
@@ -125,16 +125,16 @@ require (
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1
gotest.tools v2.2.0+incompatible // indirect
- k8s.io/apiextensions-apiserver v0.36.0-beta.0 // indirect
- k8s.io/apiserver v0.36.0-beta.0 // indirect
- k8s.io/component-base v0.36.0-beta.0 // indirect
+ k8s.io/apiextensions-apiserver v0.36.0 // indirect
+ k8s.io/apiserver v0.36.0 // indirect
+ k8s.io/component-base v0.36.0 // indirect
k8s.io/klog/v2 v2.140.0 // indirect
k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect
k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.34.0 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
- sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect
+ sigs.k8s.io/structured-merge-diff/v6 v6.4.0 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)
diff --git a/go.sum b/go.sum
index 66a44dd4d..638b047bd 100644
--- a/go.sum
+++ b/go.sum
@@ -20,8 +20,8 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4 h1:Umm6n7LMDnqqZ6QIMIFxzJmuBX/Bke4uvstm+KFKcaQ=
-github.com/cobaltcore-dev/openstack-hypervisor-operator v1.0.2-0.20260429064011-d35f2bc2c5d4/go.mod h1:fTJ5LAHj8NJ0AuQtsEX16Z1LXtCKqJfg+UhGfEnwImA=
+github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0 h1:XYVIKTC19dj4jck2uinYzTNXcoED5HNTvv+BJ75M2E0=
+github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0/go.mod h1:iuhqhW6ozxfYWbGlEeh9rW9xyTb/EgelkDJqzJXBclk=
github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4=
github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE=
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
@@ -155,8 +155,8 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ=
github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA=
github.com/majewsky/gg v1.6.0 h1:QyUP+a1YHlCRmcvAlyVhOnqdpeDQogmAygQaeGU0VPc=
github.com/majewsky/gg v1.6.0/go.mod h1:KC7qUlln1VBY90OE0jXMNjXW2b9B4jJ1heYQ08OzeAg=
-github.com/mattn/go-sqlite3 v1.14.42 h1:MigqEP4ZmHw3aIdIT7T+9TLa90Z6smwcthx+Azv4Cgo=
-github.com/mattn/go-sqlite3 v1.14.42/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
+github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8=
+github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs=
@@ -174,8 +174,8 @@ github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7P
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo v1.16.4 h1:29JGrr5oVBm5ulCWet69zQkzWipVXIol6ygQUe/EzNc=
-github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI=
-github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
+github.com/onsi/ginkgo/v2 v2.28.2 h1:DTrMfpqxiNUyQ3Y0zhn1n3cOO2euFgQPYIpkWwxVFps=
+github.com/onsi/ginkgo/v2 v2.28.2/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
@@ -259,8 +259,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
-go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
-go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
+go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo=
+go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
@@ -315,16 +315,16 @@ gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=
gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
k8s.io/api v0.36.0 h1:SgqDhZzHdOtMk40xVSvCXkP9ME0H05hPM3p9AB1kL80=
k8s.io/api v0.36.0/go.mod h1:m1LVrGPNYax5NBHdO+QuAedXyuzTt4RryI/qnmNvs34=
-k8s.io/apiextensions-apiserver v0.36.0-beta.0 h1:KstD2m0/HuCoyKpUo3rswTzMYB6rlliCS7bxeXleXP8=
-k8s.io/apiextensions-apiserver v0.36.0-beta.0/go.mod h1:puKb28c0ZYFZv/s+YYgzk7JIaOAhtaLAPnkuvVsVvkw=
+k8s.io/apiextensions-apiserver v0.36.0 h1:Wt7E8J+VBCbj4FjiBfDTK/neXDDjyJVJc7xfuOHImZ0=
+k8s.io/apiextensions-apiserver v0.36.0/go.mod h1:kGDjH0msuiIB3tgsYRV0kS9GqpMYMUsQ3GHv7TApyug=
k8s.io/apimachinery v0.36.0 h1:jZyPzhd5Z+3h9vJLt0z9XdzW9VzNzWAUw+P1xZ9PXtQ=
k8s.io/apimachinery v0.36.0/go.mod h1:FklypaRJt6n5wUIwWXIP6GJlIpUizTgfo1T/As+Tyxc=
-k8s.io/apiserver v0.36.0-beta.0 h1:p1ow5wWRkn8e2vLhInHv1vLEgsEK0rEoCiH+d/dcuc8=
-k8s.io/apiserver v0.36.0-beta.0/go.mod h1:ogUX8mFvFY1xoTFrd16A1P5IP9RM6cVPIee4L4ILOw0=
+k8s.io/apiserver v0.36.0 h1:Jg5OFAENUACByUCg15CmhZAYrr5ZyJ+jodyA1mHl3YE=
+k8s.io/apiserver v0.36.0/go.mod h1:mHvwdHf+qKEm+1/hYm756SV+oREOKSPnsjagOpx6Vho=
k8s.io/client-go v0.36.0 h1:pOYi7C4RHChYjMiHpZSpSbIM6ZxVbRXBy7CuiIwqA3c=
k8s.io/client-go v0.36.0/go.mod h1:ZKKcpwF0aLYfkHFCjillCKaTK/yBkEDHTDXCFY6AS9Y=
-k8s.io/component-base v0.36.0-beta.0 h1:jQIJsypS9vwTg1y2YCSovj5wr3ua4yFq/FT7rhV6/7M=
-k8s.io/component-base v0.36.0-beta.0/go.mod h1:WHn3mIHRrU6cfgf4vt/XsBBS0H/0eXwrDqosH4O43Ow=
+k8s.io/component-base v0.36.0 h1:hFjEktssxiJhrK1zfybkH4kJOi8iZuF+mIDCqS5+jRo=
+k8s.io/component-base v0.36.0/go.mod h1:JZvIfcNHk+uck+8LhJzhSBtydWXaZNQwX2OdL+Mnwsk=
k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc=
k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0=
k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg=
@@ -335,13 +335,13 @@ k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 h1:kBawHLSnx/mYHmRnNUf9d4CpjREbe
k8s.io/utils v0.0.0-20260319190234-28399d86e0b5/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.34.0 h1:hSfpvjjTQXQY2Fol2CS0QHMNs/WI1MOSGzCm1KhM5ec=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.34.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
-sigs.k8s.io/controller-runtime v0.23.1-0.20260418192536-e4a998cc6b09 h1:wGxxs0wawYxrBIc8BrwZCugW+4lx7SkPBi+70sJI7KE=
-sigs.k8s.io/controller-runtime v0.23.1-0.20260418192536-e4a998cc6b09/go.mod h1:bTZXYvH6eWv12M5PgRYYTSMw/LDN1xBf0oNTycNo1YY=
+sigs.k8s.io/controller-runtime v0.24.0 h1:Ck6N2LdS8Lovy1o25BB4r1xjvLEKUl1s2o9kU+KWDE4=
+sigs.k8s.io/controller-runtime v0.24.0/go.mod h1:vFkfY5fGt5xAC/sKb8IBFKgWPNKG9OUG29dR8Y2wImw=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
-sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8=
-sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
+sigs.k8s.io/structured-merge-diff/v6 v6.4.0 h1:qmp2e3ZfFi1/jJbDGpD4mt3wyp6PE1NfKHCYLqgNQJo=
+sigs.k8s.io/structured-merge-diff/v6 v6.4.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
diff --git a/helm/dev/cortex-prometheus-operator/Chart.yaml b/helm/dev/cortex-prometheus-operator/Chart.yaml
index 7fdaa8932..fb2ea1e8d 100644
--- a/helm/dev/cortex-prometheus-operator/Chart.yaml
+++ b/helm/dev/cortex-prometheus-operator/Chart.yaml
@@ -10,4 +10,4 @@ dependencies:
# CRDs of the prometheus operator, such as PrometheusRule, ServiceMonitor, etc.
- name: kube-prometheus-stack
repository: oci://ghcr.io/prometheus-community/charts
- version: 84.0.0
+ version: 84.4.0
From 16573d5a80a80073829cbe226bd02ba466454aef Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 1 May 2026 13:03:44 +0000
Subject: [PATCH 32/54] Bump cortex chart appVersions to sha-5d0fc93f [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index cf75f46f5..e67ea0462 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-d8871ab8"
+appVersion: "sha-5d0fc93f"
icon: "https://example.com/icon.png"
dependencies: []
From b6fbbcf1dc5812873911d6e9ee4cd74b2a8c83c5 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 4 May 2026 07:47:07 +0200
Subject: [PATCH 33/54] Make /release claude command idempotent
---
.claude/commands/release.md | 28 ++++++++--
.claude/commands/{review-pr.md => review.md} | 0
.claude/commands/update-changelog.md | 54 --------------------
3 files changed, 24 insertions(+), 58 deletions(-)
rename .claude/commands/{review-pr.md => review.md} (100%)
delete mode 100644 .claude/commands/update-changelog.md
diff --git a/.claude/commands/release.md b/.claude/commands/release.md
index 38ff4a60f..d770d7e24 100644
--- a/.claude/commands/release.md
+++ b/.claude/commands/release.md
@@ -141,7 +141,16 @@ Prepare chart version bumps so GitHub pushes bumped charts to the registry immed
For each changed library chart, patch-bump its `version` in `helm/library//Chart.yaml` (e.g. `0.0.43` → `0.1.0`), if there was no breaking change, otherwise minor-bump it. Do not touch `appVersion`. Then update the matching `dependencies[].version` entry in every `helm/bundles/*/Chart.yaml` that references it.
-Open a single PR to `main` with all the bumps, branch `release/bump-charts-`, noting in the body that it should be merged before the release PR. Use the pull-request-creator agent for this subtask, and include the chart changes in the motivation so they are included in the PR description.
+### Check for existing bump PR
+
+Before creating a new PR, check if one already exists for this release:
+
+```
+gh pr list --head release/bump-charts- --state open --json number,url
+```
+
+- **If a PR already exists**: check out the existing `release/bump-charts-` branch, reset it to `main` (`git reset --hard origin/main`), apply the version bumps on top, force-push the branch. Then update the existing PR title and body with `gh pr edit` to reflect the latest changes.
+- **If no PR exists**: create branch `release/bump-charts-` from `main`, apply the bumps, and open a new PR noting in the body that it should be merged before the release PR. Use the pull-request-creator agent for this subtask, and include the chart changes in the motivation so they are included in the PR description.
## Phase 4: Update the PR Description
@@ -149,7 +158,18 @@ Use `gh pr edit` with `--body` to update the PR description with the changelog.
## Phase 5: Create a Changelog PR
-If the CHANGELOG.md does not exists, create it with a `# Changelog` header. Then create a new PR to `main` with branch `release/changelog-`, title `Update changelog for release PR #`, and a body noting it should be merged after the release PR. Use the pull-request-creator agent for this subtask.
+If the CHANGELOG.md does not exist, create it with a `# Changelog` header. Then prepend the new changelog entry below the header.
+
+### Check for existing changelog PR
+
+Before creating a new PR, check if one already exists for this release:
+
+```
+gh pr list --head release/changelog- --state open --json number,url
+```
+
+- **If a PR already exists**: check out the existing `release/changelog-` branch, reset it to `main` (`git reset --hard origin/main`), apply the changelog update on top, force-push the branch. Then update the existing PR title and body with `gh pr edit` to reflect the latest changes.
+- **If no PR exists**: create branch `release/changelog-` from `main`, apply the changelog, and open a new PR to `main` with title `Update changelog for release PR #` and a body noting it should be merged after the release PR. Use the pull-request-creator agent for this subtask.
## Phase 6: Summarize — Report what happened
@@ -159,6 +179,6 @@ After all subagents return, produce a short summary:
## Release #NNN Post-Open Summary
- PR description updated with changelog and bump PR reference
-- Bump PR #XXX opened to update chart versions
-- Changelog PR #YYY opened to update CHANGELOG.md
+- Bump PR #XXX opened/updated to update chart versions
+- Changelog PR #YYY opened/updated to update CHANGELOG.md
```
diff --git a/.claude/commands/review-pr.md b/.claude/commands/review.md
similarity index 100%
rename from .claude/commands/review-pr.md
rename to .claude/commands/review.md
diff --git a/.claude/commands/update-changelog.md b/.claude/commands/update-changelog.md
deleted file mode 100644
index 79a5e1f44..000000000
--- a/.claude/commands/update-changelog.md
+++ /dev/null
@@ -1,54 +0,0 @@
----
-allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch
-description: Create a changelog entry for a merged release PR and open a PR to main. Usage: /update-changelog PR_NUMBER
----
-
-A release PR (#$ARGUMENTS) was merged into the `release` branch. Create a changelog entry for it and open a PR to `main`.
-
-To build the entry, use the PR's commit subjects (no diffs) and the changed Helm charts as your sources. Only include charts whose Chart.yaml actually changed in this PR.
-
-Format each entry as:
-
-## {merged_at date in UTC, formatted YYYY-MM-DD} — {PR title} ([#NNN](https://github.com/cobaltcore-dev/cortex/pull/NNN))
-
-One `###` section per changed chart: `### v ()`
-Under each section, bullet the commit subjects that relate to that chart.
-
-Attribution: for each commit, inspect its changed files with `git show --name-only ` and map to the chart whose files were touched:
-
-- `postgres/**` → cortex-postgres
-- `cmd/shim/**` or `internal/shim/**` → cortex-shim
-- `helm/bundles/cortex-/**` → that specific bundle chart
-- anything else → cortex (core)
-
-Commits that only touch CI, docs, or tooling go into `### General`. Skip commits containing "[skip ci]" or that are pure version-bump message.
-
-For bundle chart sections (helm/bundles/*), add a note listing which library chart versions they now include (read the bundle's Chart.yaml dependencies). Then inspect the actual diff of the bundle's own files with `git show -- helm/bundles//` for any commit that touched that bundle, and surface specific changes:
-
-- **values.yaml** changes: call out new, removed, or renamed keys and changed defaults
-- **templates/** or **crds/** changes: call out added, removed, or modified resources by kind and name
-
-Prepend the new entry below the `# Changelog` header in `CHANGELOG.md` (create the file if it doesn't exist). Then open a PR to `main` referencing this release PR.
-
-## Example
-
-```markdown
-## 2026-04-24 — Release libs cortex v0.0.43 + bundles v0.0.56 ([#722](https://github.com/cobaltcore-dev/cortex/pull/722))
-
-### cortex v0.0.43 (sha-xxxxxxxx)
-- Commitments usage API uses postgres database instead of calling nova
-- Check hypervisor resources against reservations
-- Add committed resource reservations to capacity calculation
-
-### cortex-postgres v0.5.14 (sha-xxxxxxxx)
-- Add commitments table migration
-
-### cortex-nova v0.0.56 (sha-xxxxxxxx)
-- Update nova bundle for committed reservations support
-
-### cortex-manila v0.0.56 (sha-xxxxxxxx)
-- Update manila bundle for committed reservations support
-
-### General
-- Update golangci-lint to v2.1.0
-```
From 95641bc2c711426fa7ccbfe6d28ccbb39e7e1666 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 05:56:24 +0000
Subject: [PATCH 34/54] Bump cortex chart appVersions to sha-b6fbbcf1 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index e67ea0462..f7fc666d7 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-5d0fc93f"
+appVersion: "sha-b6fbbcf1"
icon: "https://example.com/icon.png"
dependencies: []
From e51b5de06cf32e1e08f732841012f0e0dc25ac88 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 4 May 2026 08:37:44 +0200
Subject: [PATCH 35/54] Alert only on new vm faults
The alert now uses increase(cortex_vm_faults{...}[5m]) > 0 instead of the raw gauge. This means:
- It only fires when the fault count is actively increasing (new faults appearing within the 5m window).
- Once errored instances stop appearing (even if old ones linger undeleted), the increase() drops to 0 and the alert resolves.
- The 5m window is consistent with other alerts in this file and should be safely above your scrape interval.
---
helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 75e6deb65..bc5fd20ce 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -615,7 +615,7 @@ groups:
investigated if this alert persists.
- alert: CortexNovaDoesntFindValidKVMHosts
- expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
+ expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}[5m])) > 0
for: 5m
labels:
context: scheduling
@@ -627,7 +627,7 @@ groups:
annotations:
summary: "Nova scheduling cannot find valid KVM hosts"
description: >
- Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
+ Cortex is seeing new faulty vms in `{{$labels.az}}` where Nova scheduling
failed to find a valid `{{$labels.hvtype}}` host. This may indicate
capacity issues, misconfigured filters, or resource constraints in the
datacenter. Investigate the affected VMs and hypervisor availability.
From f88c708daa440a9ac86cdad4ae8c10f7705bc726 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 06:47:52 +0000
Subject: [PATCH 36/54] Bump cortex chart appVersions to sha-e51b5de0 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index f7fc666d7..2256c0135 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-b6fbbcf1"
+appVersion: "sha-e51b5de0"
icon: "https://example.com/icon.png"
dependencies: []
From 6a5e16254a22a936e686e5758546b2370e8de741 Mon Sep 17 00:00:00 2001
From: "cortex-ai-agents[bot]"
<279748396+cortex-ai-agents[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 10:06:42 +0200
Subject: [PATCH 37/54] Fix stale documentation: traits model, pipeline name,
and API path (#788)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
- **architecture.md**: Update traits section to describe the current
single-ConfigMap model with `TraitSyncer` and `forwardWithHook` pattern
(replaces obsolete two-ConfigMap description removed in ebbf9d44)
- **failover-reservations.md**: Correct pipeline name from
`kvm-valid-host-new-failover-reservation` to
`kvm-new-failover-reservation` to match the
`PipelineNewFailoverReservation` constant
- **committed-resource-reservations.md**: Update API endpoint reference
from `api_*.go` to `api/` subdirectory
## Test plan
- [x] Verified changes are documentation-only (no code changes)
- [x] Confirmed correct pipeline name against
`internal/scheduling/reservations/failover/reservation_scheduling.go`
line 25
- [x] Confirmed API files live in
`internal/scheduling/reservations/commitments/api/` directory
- [x] Confirmed single-ConfigMap model against
`internal/shim/placement/syncer_traits.go` and `handle_traits.go`
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: Claude
Co-authored-by: Claude Opus 4.7
---
docs/architecture.md | 13 ++++++-------
.../reservations/committed-resource-reservations.md | 2 +-
docs/reservations/failover-reservations.md | 2 +-
3 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/docs/architecture.md b/docs/architecture.md
index 0e91385c2..6c6ba83aa 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -120,20 +120,19 @@ Both `hybrid` and `crd` modes require a `versioning` config block with `id`, `mi
### Traits
-When `features.traits` is set to `hybrid` or `crd`, the shim serves OpenStack Placement traits from a pair of Kubernetes ConfigMaps instead of forwarding to upstream:
+When `features.traits` is set to `hybrid` or `crd`, the shim serves OpenStack Placement traits from a single Kubernetes ConfigMap instead of forwarding to upstream. The ConfigMap name is set by `traits.configMapName` in the shim config and is owned by the shim.
-- **Static ConfigMap** (Helm-managed): Contains the standard OpenStack traits deployed via Helm. Its name is set by `traits.configMapName` in the shim config.
-- **Custom ConfigMap** (shim-managed): Stores `CUSTOM_*` traits created at runtime through PUT requests. Named `{configMapName}-custom`.
+On startup, a `TraitSyncer` initializes the ConfigMap (creating it if it does not exist). In the background, the syncer periodically fetches traits from upstream placement (every 60 seconds with jitter) and writes them into the ConfigMap, keeping the local view in sync.
The trait endpoints support the full OpenStack Placement traits API:
-- `GET /traits` returns a sorted, merged list from both ConfigMaps, with optional filtering via the `name` query parameter (`in:TRAIT_A,TRAIT_B` or `startswith:CUSTOM_`).
-- `GET /traits/{name}` checks both ConfigMaps for existence.
+- `GET /traits` returns a sorted list from the ConfigMap, with optional filtering via the `name` query parameter (`in:TRAIT_A,TRAIT_B` or `startswith:CUSTOM_`).
+- `GET /traits/{name}` checks the ConfigMap for existence.
- `PUT /traits/{name}` creates custom traits (only `CUSTOM_*` prefixed names are allowed).
- `DELETE /traits/{name}` removes custom traits.
-Writes to the custom ConfigMap are serialized across replicas using a Kubernetes Lease-backed distributed lock (see `pkg/resourcelock`). This prevents concurrent writes from corrupting the ConfigMap data.
+Writes to the ConfigMap are serialized across replicas using a Kubernetes Lease-backed distributed lock (see `pkg/resourcelock`). This prevents concurrent writes from corrupting the ConfigMap data.
-In **hybrid** mode, `GET`, `PUT`, and `DELETE` trait requests are forwarded to upstream Placement (so upstream always has the latest data), and a **periodic sync loop** runs in the background (every 60 seconds with jitter) to fetch traits from upstream and write them into the static ConfigMap. This keeps the local view in sync with upstream and prepares for cutover to `crd` mode. In **crd** mode, traits are served exclusively from the local ConfigMaps with no upstream dependency.
+In **hybrid** mode, PUT and DELETE requests are forwarded to upstream placement via the `forwardWithHook` pattern; on success, the trait is eagerly added to or removed from the local ConfigMap so the local view is immediately consistent. GET requests in hybrid mode are also forwarded to upstream. In **crd** mode, traits are served exclusively from the local ConfigMap with no upstream dependency.
### Authentication
diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md
index 95f8b8bd5..132e2acfd 100644
--- a/docs/reservations/committed-resource-reservations.md
+++ b/docs/reservations/committed-resource-reservations.md
@@ -20,7 +20,7 @@ Cortex reserves hypervisor capacity for customers who pre-commit resources (comm
The CR reservation implementation is located in `internal/scheduling/reservations/commitments/`. Key components include:
- `CommittedResource` controller (`committed_resource_controller.go`) — acceptance, rejection, child Reservation CRUD
- `Reservation` controller (`reservation_controller.go`) — placement, VM allocation verification
-- API endpoints (`api_*.go`)
+- API endpoints (`api/`)
- Capacity and usage calculation logic (`capacity.go`, `usage.go`)
- Syncer for periodic state sync (`syncer.go`)
diff --git a/docs/reservations/failover-reservations.md b/docs/reservations/failover-reservations.md
index 1fa36d79d..14f785ae1 100644
--- a/docs/reservations/failover-reservations.md
+++ b/docs/reservations/failover-reservations.md
@@ -144,7 +144,7 @@ We use three different scheduler pipelines for failover reservations, each servi
**Why:** When reusing a reservation, capacity is already reserved on the target host. We only need to verify that the VM is compatible with the host (traits, capabilities, AZ, etc.) without checking if there's enough free capacity.
-### `kvm-valid-host-new-failover-reservation`
+### `kvm-new-failover-reservation`
**Used when:** Creating a new failover reservation.
**Why:** When creating a new reservation, we need to find a host that:
From d8bb12ef58035b4ecf73f1234c2abb90f791550c Mon Sep 17 00:00:00 2001
From: "cortex-ai-agents[bot]"
<279748396+cortex-ai-agents[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 10:07:40 +0200
Subject: [PATCH 38/54] fix: prevent nil pointer panic in feature mode override
guard (#789)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
- Fix nil pointer dereference in `featureModeFromConfOrHeader` when a
per-request feature mode override escalates to hybrid/crd for an
endpoint whose backing config is nil
- The previous guard (`s.config.Versioning == nil && s.config.Traits ==
nil && s.config.ResourceClasses == nil`) was too permissive — it only
rejected overrides when ALL configs were nil, allowing panics when only
some were configured
- Add a `hasBackingConfig bool` parameter so each handler explicitly
declares whether its endpoint's infrastructure is available
## Test plan
- [x] Existing `TestFeatureModeFromConfOrHeader` tests updated and
passing
- [x] `go build ./...` passes
- [x] `go test ./internal/shim/placement/...` passes
- [x] `go vet ./internal/shim/placement/...` passes
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: Claude
---
.../shim/placement/handle_resource_classes.go | 10 +++++-----
.../handle_resource_provider_aggregates.go | 4 ++--
.../handle_resource_provider_traits.go | 6 +++---
.../placement/handle_resource_providers.go | 10 +++++-----
internal/shim/placement/handle_root.go | 2 +-
internal/shim/placement/handle_traits.go | 8 ++++----
internal/shim/placement/shim.go | 18 ++++++++++--------
internal/shim/placement/shim_test.go | 12 ++++++------
8 files changed, 36 insertions(+), 34 deletions(-)
diff --git a/internal/shim/placement/handle_resource_classes.go b/internal/shim/placement/handle_resource_classes.go
index 9a3a9d4c2..f9f4f8b5d 100644
--- a/internal/shim/placement/handle_resource_classes.go
+++ b/internal/shim/placement/handle_resource_classes.go
@@ -34,7 +34,7 @@ func (s *Shim) HandleListResourceClasses(w http.ResponseWriter, r *http.Request)
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses, s.config.ResourceClasses != nil) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -77,7 +77,7 @@ func (s *Shim) HandleCreateResourceClass(w http.ResponseWriter, r *http.Request)
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses, s.config.ResourceClasses != nil)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -170,7 +170,7 @@ func (s *Shim) HandleShowResourceClass(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses, s.config.ResourceClasses != nil) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -215,7 +215,7 @@ func (s *Shim) HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request)
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses, s.config.ResourceClasses != nil)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -297,7 +297,7 @@ func (s *Shim) HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request)
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceClasses, s.config.ResourceClasses != nil)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
diff --git a/internal/shim/placement/handle_resource_provider_aggregates.go b/internal/shim/placement/handle_resource_provider_aggregates.go
index 8d509f732..3fbea5b97 100644
--- a/internal/shim/placement/handle_resource_provider_aggregates.go
+++ b/internal/shim/placement/handle_resource_provider_aggregates.go
@@ -50,7 +50,7 @@ func (s *Shim) HandleListResourceProviderAggregates(w http.ResponseWriter, r *ht
if !ok {
return
}
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
@@ -135,7 +135,7 @@ func (s *Shim) HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *
if !ok {
return
}
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Aggregates, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
diff --git a/internal/shim/placement/handle_resource_provider_traits.go b/internal/shim/placement/handle_resource_provider_traits.go
index 463edfed7..8cc61b421 100644
--- a/internal/shim/placement/handle_resource_provider_traits.go
+++ b/internal/shim/placement/handle_resource_provider_traits.go
@@ -44,7 +44,7 @@ func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.R
if !ok {
return
}
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
@@ -127,7 +127,7 @@ func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http
if !ok {
return
}
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
@@ -281,7 +281,7 @@ func (s *Shim) HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http
if !ok {
return
}
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviderTraits, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
diff --git a/internal/shim/placement/handle_resource_providers.go b/internal/shim/placement/handle_resource_providers.go
index 04955fd72..6f73e27d6 100644
--- a/internal/shim/placement/handle_resource_providers.go
+++ b/internal/shim/placement/handle_resource_providers.go
@@ -115,7 +115,7 @@ func (s *Shim) HandleCreateResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders, true)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -210,7 +210,7 @@ func (s *Shim) HandleShowResourceProvider(w http.ResponseWriter, r *http.Request
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders, true)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -280,7 +280,7 @@ func (s *Shim) HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders, true)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -376,7 +376,7 @@ func (s *Shim) HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Reque
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders, true)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -452,7 +452,7 @@ type listResourceProvidersResponse struct {
//
// See: https://docs.openstack.org/api-ref/placement/#list-resource-providers
func (s *Shim) HandleListResourceProviders(w http.ResponseWriter, r *http.Request) {
- switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.ResourceProviders, true) {
case FeatureModePassthrough:
s.forward(w, r)
case FeatureModeHybrid:
diff --git a/internal/shim/placement/handle_root.go b/internal/shim/placement/handle_root.go
index 9f9b510e7..45ada64fe 100644
--- a/internal/shim/placement/handle_root.go
+++ b/internal/shim/placement/handle_root.go
@@ -50,7 +50,7 @@ func (s *Shim) HandleGetRoot(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Root) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Root, s.config.Versioning != nil) {
case FeatureModePassthrough:
log.Info("forwarding GET / to upstream placement")
s.forward(w, r)
diff --git a/internal/shim/placement/handle_traits.go b/internal/shim/placement/handle_traits.go
index 0cd4e659e..c64d6887e 100644
--- a/internal/shim/placement/handle_traits.go
+++ b/internal/shim/placement/handle_traits.go
@@ -41,7 +41,7 @@ func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits, s.config.Traits != nil) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -113,7 +113,7 @@ func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits) {
+ switch s.featureModeFromConfOrHeader(r, s.config.Features.Traits, s.config.Traits != nil) {
case FeatureModePassthrough, FeatureModeHybrid:
s.forward(w, r)
return
@@ -155,7 +155,7 @@ func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits, s.config.Traits != nil)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
@@ -237,7 +237,7 @@ func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := logf.FromContext(ctx)
- mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits)
+ mode := s.featureModeFromConfOrHeader(r, s.config.Features.Traits, s.config.Traits != nil)
switch mode {
case FeatureModePassthrough:
s.forward(w, r)
diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go
index 53de85056..77ff72a66 100644
--- a/internal/shim/placement/shim.go
+++ b/internal/shim/placement/shim.go
@@ -100,9 +100,11 @@ func (m FeatureMode) valid() bool {
}
// dispatchPassthroughOnly forwards in passthrough mode, returns 501 for
-// hybrid/crd, and 500 for unknown modes.
+// hybrid/crd, and 500 for unknown modes. These endpoints have no backing
+// config requirement so we pass true — the 501 response already guards
+// against actual nil dereferences.
func (s *Shim) dispatchPassthroughOnly(w http.ResponseWriter, r *http.Request, mode FeatureMode) {
- resolved := s.featureModeFromConfOrHeader(r, mode)
+ resolved := s.featureModeFromConfOrHeader(r, mode, true)
switch resolved {
case FeatureModePassthrough:
s.forward(w, r)
@@ -116,18 +118,18 @@ func (s *Shim) dispatchPassthroughOnly(w http.ResponseWriter, r *http.Request, m
// featureModeFromConfOrHeader returns the effective feature mode for the
// current request. If a valid override is present in the request context
// (injected by wrapHandler from the X-Cortex-Feature-Mode header), the
-// override takes precedence — unless it would escalate from passthrough into
-// a mode that requires backing config (Versioning, Traits) that was not
-// validated at startup. In that case the override is ignored and the
-// configured default is returned.
-func (s *Shim) featureModeFromConfOrHeader(r *http.Request, configured FeatureMode) FeatureMode {
+// override takes precedence — unless it would escalate to hybrid/crd without
+// the endpoint's backing config being available. Callers pass hasBackingConfig
+// to indicate whether the infrastructure required by hybrid/crd mode for their
+// specific endpoint was validated at startup.
+func (s *Shim) featureModeFromConfOrHeader(r *http.Request, configured FeatureMode, hasBackingConfig bool) FeatureMode {
override, ok := r.Context().Value(featureModeOverrideKey).(FeatureMode)
if !ok {
return configured.orDefault()
}
resolved := override.orDefault()
if resolved == FeatureModeHybrid || resolved == FeatureModeCRD {
- if s.config.Versioning == nil && s.config.Traits == nil && s.config.ResourceClasses == nil {
+ if !hasBackingConfig {
return configured.orDefault()
}
}
diff --git a/internal/shim/placement/shim_test.go b/internal/shim/placement/shim_test.go
index 46805676a..786831553 100644
--- a/internal/shim/placement/shim_test.go
+++ b/internal/shim/placement/shim_test.go
@@ -605,7 +605,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
t.Run("returns configured mode when no override", func(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
- got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid, true)
if got != FeatureModeHybrid {
t.Fatalf("got %q, want %q", got, FeatureModeHybrid)
}
@@ -613,7 +613,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
t.Run("defaults empty configured mode to passthrough", func(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
- got := s.featureModeFromConfOrHeader(req, "")
+ got := s.featureModeFromConfOrHeader(req, "", true)
if got != FeatureModePassthrough {
t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
}
@@ -623,7 +623,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModeCRD)
req = req.WithContext(ctx)
- got := s.featureModeFromConfOrHeader(req, FeatureModePassthrough)
+ got := s.featureModeFromConfOrHeader(req, FeatureModePassthrough, true)
if got != FeatureModeCRD {
t.Fatalf("got %q, want %q", got, FeatureModeCRD)
}
@@ -634,7 +634,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModeCRD)
req = req.WithContext(ctx)
- got := bare.featureModeFromConfOrHeader(req, FeatureModePassthrough)
+ got := bare.featureModeFromConfOrHeader(req, FeatureModePassthrough, false)
if got != FeatureModePassthrough {
t.Fatalf("got %q, want %q (override should be rejected without backing config)", got, FeatureModePassthrough)
}
@@ -645,7 +645,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureModePassthrough)
req = req.WithContext(ctx)
- got := bare.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ got := bare.featureModeFromConfOrHeader(req, FeatureModeHybrid, false)
if got != FeatureModePassthrough {
t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
}
@@ -655,7 +655,7 @@ func TestFeatureModeFromConfOrHeader(t *testing.T) {
req := httptest.NewRequest(http.MethodGet, "/", http.NoBody)
ctx := context.WithValue(req.Context(), featureModeOverrideKey, FeatureMode(""))
req = req.WithContext(ctx)
- got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid)
+ got := s.featureModeFromConfOrHeader(req, FeatureModeHybrid, true)
if got != FeatureModePassthrough {
t.Fatalf("got %q, want %q", got, FeatureModePassthrough)
}
From 8224141e2e6cf3c911bdb94f97544ebf97d4cea8 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 08:23:13 +0000
Subject: [PATCH 39/54] Bump cortex-shim chart appVersions to sha-d8bb12ef
[skip ci]
---
helm/library/cortex-shim/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index c7a63bacf..346a83021 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
version: 0.0.3
-appVersion: "sha-d8871ab8"
+appVersion: "sha-d8bb12ef"
icon: "https://example.com/icon.png"
dependencies: []
From b5b93883e2c5cb6f790daab5bab54324df11b622 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 08:23:15 +0000
Subject: [PATCH 40/54] Bump cortex chart appVersions to sha-d8bb12ef [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 2256c0135..3f1923a7d 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-e51b5de0"
+appVersion: "sha-d8bb12ef"
icon: "https://example.com/icon.png"
dependencies: []
From c26705a8df9db644cc3b292e2cc940fff17ef72c Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Mon, 4 May 2026 10:23:36 +0200
Subject: [PATCH 41/54] feat: Controllers and http API use CR CRD for managing
CR reservation CRDs (#773)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduces the CommittedResource CRD as the durable contract between the
LIQUID API and the reservation system, replacing direct Reservation
writes with a controller-owned lifecycle. Accepting or rejecting a
commitment is now a write to a CommittedResource CRD followed by
watching its status — the API layer no longer touches Reservation CRDs
directly.
---
api/v1alpha1/committed_resource_types.go | 6 +
api/v1alpha1/reservation_types.go | 15 +
cmd/manager/main.go | 12 +-
.../committed-resource-reservations.md | 39 +-
.../cortex-nova/alerts/nova.alerts.yaml | 105 +-
helm/bundles/cortex-nova/values.yaml | 66 +-
.../files/crds/cortex.cloud_reservations.yaml | 17 +
.../commitments/api/change_commitments.go | 454 ++--
.../api/change_commitments_e2e_test.go | 411 ++++
.../api/change_commitments_metrics.go | 2 +-
.../api/change_commitments_test.go | 2181 ++++++-----------
.../reservations/commitments/api/handler.go | 12 +-
.../reservations/commitments/api/info.go | 5 +-
.../commitments/api/report_capacity.go | 2 +-
.../commitments/api/report_usage.go | 2 +-
.../commitments/api/report_usage_test.go | 2 +-
.../committed_resource_controller.go | 153 +-
.../committed_resource_controller_test.go | 251 +-
.../committed_resource_integration_test.go | 628 +++--
.../reservations/commitments/config.go | 144 +-
.../reservations/commitments/e2e_checks.go | 230 +-
.../reservations/commitments/field_index.go | 22 +-
.../commitments/integration_test.go | 618 +++++
.../commitments/reservation_controller.go | 56 +-
.../reservation_controller_test.go | 10 +-
.../commitments/reservation_manager.go | 28 +-
.../reservations/commitments/state.go | 17 +-
.../reservations/commitments/syncer.go | 285 ++-
.../commitments/syncer_monitor.go | 124 +-
.../commitments/syncer_monitor_test.go | 11 +-
.../reservations/commitments/syncer_test.go | 502 ++--
tools/visualize-committed-resources/main.go | 596 +++++
32 files changed, 4528 insertions(+), 2478 deletions(-)
create mode 100644 internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go
create mode 100644 internal/scheduling/reservations/commitments/integration_test.go
create mode 100644 tools/visualize-committed-resources/main.go
diff --git a/api/v1alpha1/committed_resource_types.go b/api/v1alpha1/committed_resource_types.go
index a6f1bd217..31365887f 100644
--- a/api/v1alpha1/committed_resource_types.go
+++ b/api/v1alpha1/committed_resource_types.go
@@ -147,6 +147,12 @@ const (
// CommittedResourceConditionReady indicates whether the CommittedResource has been
// successfully reconciled into active Reservation CRDs.
CommittedResourceConditionReady = "Ready"
+
+ // Condition reasons set by the CommittedResource controller.
+ CommittedResourceReasonAccepted = "Accepted"
+ CommittedResourceReasonPlanned = "Planned"
+ CommittedResourceReasonReserving = "Reserving"
+ CommittedResourceReasonRejected = "Rejected"
)
// +kubebuilder:object:root=true
diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go
index 21c96efad..988d4b97d 100644
--- a/api/v1alpha1/reservation_types.go
+++ b/api/v1alpha1/reservation_types.go
@@ -81,6 +81,15 @@ type CommittedResourceReservationSpec struct {
// +kubebuilder:validation:Optional
Creator string `json:"creator,omitempty"`
+ // ParentGeneration is the Generation of the CommittedResource CRD at the time this
+ // reservation was last written by the CommittedResource controller. The Reservation
+ // controller echoes it to Status.CommittedResourceReservation.ObservedParentGeneration
+ // once it has processed the reservation, allowing the CR controller to wait until
+ // all child reservations are up-to-date before accepting.
+ // Zero means the field is not set (syncer-created reservations, no parent CR).
+ // +kubebuilder:validation:Optional
+ ParentGeneration int64 `json:"parentGeneration,omitempty"`
+
// Allocations maps workload identifiers to their allocation details.
// Key: Workload UUID (VM UUID for Nova, Pod UID for Pods, Machine UID for IronCore, etc.)
// Value: allocation state and metadata
@@ -148,6 +157,12 @@ const (
// CommittedResourceReservationStatus defines the status fields specific to committed resource reservations.
type CommittedResourceReservationStatus struct {
+ // ObservedParentGeneration is the Spec.CommittedResourceReservation.ParentGeneration value
+ // that this Reservation controller last processed. When it matches ParentGeneration in spec,
+ // the CR controller knows this reservation is up-to-date for the current CR spec version.
+ // +kubebuilder:validation:Optional
+ ObservedParentGeneration int64 `json:"observedParentGeneration,omitempty"`
+
// Allocations maps VM/instance UUIDs to the host they are currently running on.
// Key: VM/instance UUID, Value: Host name where the VM is currently running.
// +kubebuilder:validation:Optional
diff --git a/cmd/manager/main.go b/cmd/manager/main.go
index ba1cd52e8..e031366f8 100644
--- a/cmd/manager/main.go
+++ b/cmd/manager/main.go
@@ -367,7 +367,7 @@ func main() {
if commitmentsConfig.DatasourceName != "" {
commitmentsUsageDB = commitments.NewDBUsageClient(multiclusterClient, commitmentsConfig.DatasourceName)
}
- commitmentsAPI := commitmentsapi.NewAPIWithConfig(multiclusterClient, commitmentsConfig, commitmentsUsageDB)
+ commitmentsAPI := commitmentsapi.NewAPIWithConfig(multiclusterClient, commitmentsConfig.API, commitmentsUsageDB)
commitmentsAPI.Init(mux, metrics.Registry, ctrl.Log.WithName("commitments-api"))
if slices.Contains(mainConfig.EnabledControllers, "nova-pipeline-controllers") {
@@ -538,12 +538,11 @@ func main() {
monitor := reservations.NewMonitor(multiclusterClient)
metrics.Registry.MustRegister(&monitor)
commitmentsConfig := conf.GetConfigOrDie[commitments.Config]()
- commitmentsConfig.ApplyDefaults()
if err := (&commitments.CommitmentReservationController{
Client: multiclusterClient,
Scheme: mgr.GetScheme(),
- Conf: commitmentsConfig,
+ Conf: commitmentsConfig.ReservationController,
}).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "CommitmentReservation")
os.Exit(1)
@@ -552,7 +551,7 @@ func main() {
if err := (&commitments.CommittedResourceController{
Client: multiclusterClient,
Scheme: mgr.GetScheme(),
- Conf: commitmentsConfig,
+ Conf: commitmentsConfig.CommittedResourceController,
}).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "CommittedResource")
os.Exit(1)
@@ -716,13 +715,12 @@ func main() {
os.Exit(1)
}
- syncerMonitor := commitments.NewSyncerMonitor()
- must.Succeed(metrics.Registry.Register(syncerMonitor))
if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") {
setupLog.Info("starting commitments syncer")
+ syncerMonitor := commitments.NewSyncerMonitor()
+ must.Succeed(metrics.Registry.Register(syncerMonitor))
syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor)
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
- syncerConfig.ApplyDefaults()
if err := (&task.Runner{
Client: multiclusterClient,
Interval: syncerConfig.SyncInterval,
diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md
index 132e2acfd..cccd55cbf 100644
--- a/docs/reservations/committed-resource-reservations.md
+++ b/docs/reservations/committed-resource-reservations.md
@@ -55,8 +55,8 @@ flowchart LR
UsageAPI[Usage API]
Scheduler[Scheduler API]
- ChangeAPI -->|CRUD| CR
- Syncer -->|CRUD| CR
+ ChangeAPI -->|upsert + poll status| CR
+ Syncer -->|upsert| CR
UsageAPI -->|read| CR
UsageAPI -->|read| Res
CapacityAPI -->|read| Res
@@ -117,12 +117,31 @@ The controller's job is to keep child `Reservation` CRDs in sync with the desire
- **`pending`**: Cortex is being asked for a yes/no decision. If placement fails for any reason, child Reservations are removed and the CR is marked Rejected. The caller (e.g. the change-commitments API) reads the outcome and reports back to Limes. No retry.
-- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer (e.g. the change-commitments API on a resize request) set `Spec.AllowRejection=true`; the controller then rejects on failure instead of retrying.
+- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer set `Spec.AllowRejection=true` (the change-commitments API sets this for confirming requests — new commitments, resizes); the controller then rejects on failure instead of retrying.
- **On rejection**: rolls back child Reservations to the last successfully placed quantity (`Status.AcceptedAmount`). For a CR that was never accepted, this means removing all child Reservations.
The controller communicates with the Reservation controller only through CRDs — no direct calls.
+**Reconcile trigger flow:**
+
+```mermaid
+sequenceDiagram
+ participant API as Change-Commitments API
+ participant CRCtrl as CR Controller
+ participant CRCRD as CommittedResource CRD
+ participant ResCRD as Reservation CRD
+ participant ResCtrl as Reservation Controller
+
+ API->>CRCRD: write (create/update)
+ CRCRD-->>CRCtrl: watch fires
+ CRCtrl->>ResCRD: create/update child slots
+ ResCRD-->>ResCtrl: watch fires
+ ResCtrl->>ResCRD: update (ObservedParentGeneration, Ready=True/False)
+ ResCRD-->>CRCtrl: watch fires (Reservation→parent CR lookup)
+ CRCtrl->>CRCRD: update status (Accepted / Reserving / Rejected)
+```
+
### Reservation Lifecycle
| Component | Event | Timing | Action |
@@ -228,18 +247,20 @@ The `Reservation` controller (`CommitmentReservationController`) watches `Reserv
### Change-Commitments API
-The change-commitments API receives batched commitment changes from Limes and manages reservations accordingly.
+The change-commitments API receives batched commitment changes from Limes and applies them using a **write-intent, watch-for-outcome** pattern: the handler creates or updates `CommittedResource` CRDs and polls their `Status.Conditions` until each reaches a terminal state — it does not interact with `Reservation` CRDs directly.
**Request Semantics**: A request can contain multiple commitment changes across different projects and flavor groups. The semantic is **all-or-nothing** — if any commitment in the batch cannot be fulfilled (e.g., insufficient capacity), the entire request is rejected and rolled back.
-**Operations**: Cortex performs CRUD operations on local Reservation CRDs to match the new desired state:
-- Creates new reservations for increased commitment amounts
-- Deletes existing reservations for decreased commitments
-- Preserves existing reservations that already have VMs allocated when possible
+**Operations**:
+1. For each commitment in the batch, create or update a `CommittedResource` CRD. `Spec.AllowRejection` mirrors the request's `RequiresConfirmation` flag: `true` for changes where Limes needs a yes/no answer (new commitments, resizes), `false` for non-confirming changes (deletions, status-only transitions) where Limes doesn't act on the rejection reason
+2. Poll `CommittedResource.Status.Conditions[Ready]` until each reaches a terminal state: `Reason=Accepted` (success), `Reason=Planned` (deferred; accepted), or `Reason=Rejected` (failure) — only for confirming changes; non-confirming changes return immediately without polling
+3. On any failure or timeout, restore all modified `CommittedResource` CRDs to their pre-request specs (or delete newly-created ones)
+
+The `CommittedResource` controller handles all downstream `Reservation` CRUD. `AllowRejection=true` tells it to reject and roll back child Reservations on placement failure rather than retrying indefinitely.
### Syncer Task
-The syncer task runs periodically and syncs local Reservation CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts.
+The syncer task runs periodically and syncs local `CommittedResource` CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts. It writes `CommittedResource` CRDs only — Reservation CRUD is the controller's responsibility.
### Usage API
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index bc5fd20ce..f48f0cb28 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -505,52 +505,10 @@ groups:
CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
# Committed Resource Syncer Alerts
- - alert: CortexNovaCommittedResourceSyncerErrorsHigh
- expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
- for: 5m
- labels:
- context: committed-resource-syncer
- dashboard: cortex-status-dashboard/cortex-status-dashboard
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer experiencing errors"
- description: >
- The committed resource syncer has encountered multiple errors in the last hour.
- This may indicate connectivity issues with Limes, malformed API responses,
- or failures writing reservation CRDs. Check the syncer logs for error details.
-
- - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
- expr: |
- (
- sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
- / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
- ) > 0.05
- and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex-status-dashboard/cortex-status-dashboard
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer unit mismatch rate >5%"
- description: >
- More than 5% of commitments are being skipped due to unit mismatches between
- Limes and Cortex flavor groups. This happens when Limes has not yet been
- updated to use the new unit format after a flavor group change. The affected
- commitments will keep their existing reservations until Limes notices the update.
- Check the logs if this error persists for longer time.
-
- - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
- expr: |
- (
- sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
- / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
- ) > 0
- and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
+ # These alerts only fire when the syncer is enabled (metrics are only registered when enabled).
+ # Absent metrics = syncer disabled = alerts inactive by design.
+ - alert: CortexNovaCommittedResourceSyncerNotRunning
+ expr: increase(cortex_committed_resource_syncer_duration_seconds_count{service="cortex-nova-metrics"}[3h]) < 1
for: 15m
labels:
context: committed-resource-syncer
@@ -559,46 +517,15 @@ groups:
severity: warning
support_group: workload-management
annotations:
- summary: "Committed Resource syncer unknown flavor group rate >0%"
+ summary: "Committed Resource syncer has not run in 3 hours"
description: >
- Some commitments reference flavor groups that don't exist in
- Cortex Knowledge (anymore). This may indicate that flavor group configuration is
- out of sync between Limes and Cortex, or that Knowledge extraction is failing.
- Check the flavor group Knowledge CRD and history to see what was changed.
+ No commitment sync has completed in the last 3 hours. The syncer runs hourly,
+ so at least 2 runs should appear in this window. Check that the syncer task
+ is healthy and Limes is reachable.
- - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
- expr: |
- (
- (
- rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
- rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
- rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
- ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
- ) > 0.01
- and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex-status-dashboard/cortex-status-dashboard
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer local change rate >1%"
- description: >
- More than 1% of synced commitments are requiring reservation changes
- (creates, deletes, or repairs). This is higher than expected for steady-state
- operation and may indicate data inconsistencies, external modifications to
- reservations, or issues with the CRDs. Check Cortex logs for details.
-
- - alert: CortexNovaCommittedResourceSyncerRepairRateHigh
- expr: |
- (
- rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
- / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
- ) > 0
- and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
- for: 15m
+ - alert: CortexNovaCommittedResourceSyncerErrors
+ expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
+ for: 5m
labels:
context: committed-resource-syncer
dashboard: cortex-status-dashboard/cortex-status-dashboard
@@ -606,13 +533,11 @@ groups:
severity: warning
support_group: workload-management
annotations:
- summary: "Committed Resource syncer repair rate >0%"
+ summary: "Committed Resource syncer is repeatedly failing"
description: >
- Some commitments have reservations that needed repair
- (wrong metadata like project ID or flavor group). This may indicate data
- corruption, bugs in reservation creation, or external modifications.
- Reservations are automatically repaired, but the root cause should be
- investigated if this alert persists.
+ The committed resource syncer has encountered more than 3 errors in the last
+ hour. Check syncer logs for details; common causes are connectivity issues
+ with Limes or failures writing CommittedResource CRDs.
- alert: CortexNovaDoesntFindValidKVMHosts
expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}[5m])) > 0
diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index 65ad879dd..d694bdfba 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -141,35 +141,39 @@ cortex-scheduling-controllers:
# Number of top hosts to shuffle for evacuation requests.
# Set to 0 or negative to disable shuffling.
evacuationShuffleK: 3
- # CommittedResourceFlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations
- # This allows different scheduling strategies per flavor group (e.g., HANA vs GP)
- committedResourceFlavorGroupPipelines:
- "2152": "kvm-hana-bin-packing" # HANA flavor group
- "2101": "kvm-general-purpose-load-balancing" # General Purpose flavor group
- "*": "kvm-general-purpose-load-balancing" # Catch-all fallback
- # Default pipeline for CR reservations when no CommittedResourceFlavorGroupPipelines entry matches
- committedResourcePipelineDefault: "kvm-general-purpose-load-balancing"
- # How often to re-verify active reservations
- # 5m = 300000000000 nanoseconds
- committedResourceRequeueIntervalActive: 300000000000
- # How often to retry when knowledge is not ready
- # 1m = 60000000000 nanoseconds
- committedResourceRequeueIntervalRetry: 60000000000
- # Timeout for watching reservations to become ready before rolling back
- # 10s = 10000000000 nanoseconds
- committedResourceChangeAPIWatchReservationsTimeout: 10000000000
- # How often to poll reservation status during watch
- # 500ms = 500000000 nanoseconds
- committedResourceChangeAPIWatchReservationsPollInterval: 500000000
- # Whether the change-commitments API endpoint is active
- # When false, the endpoint returns HTTP 503. The info endpoint remains available.
- committedResourceEnableChangeCommitmentsAPI: true
- # Whether the report-usage API endpoint is active
- # When false, the endpoint returns HTTP 503.
- committedResourceEnableReportUsageAPI: true
- # Whether the report-capacity API endpoint is active
- # When false, the endpoint returns HTTP 503.
- committedResourceEnableReportCapacityAPI: true
+ committedResourceReservationController:
+ # Maps flavor group IDs to pipeline names; "*" acts as catch-all fallback
+ flavorGroupPipelines:
+ "2152": "kvm-hana-bin-packing" # HANA flavor group
+ "2101": "kvm-general-purpose-load-balancing" # General Purpose flavor group
+ "*": "kvm-general-purpose-load-balancing" # Catch-all fallback
+ # Fallback pipeline when no flavorGroupPipelines entry matches
+ pipelineDefault: "kvm-general-purpose-load-balancing"
+ # How often to re-verify active Reservation CRDs (healthy state)
+ requeueIntervalActive: "5m"
+ # Back-off interval when knowledge is unavailable
+ requeueIntervalRetry: "1m"
+ # Back-off interval while a VM allocation is still within allocationGracePeriod
+ requeueIntervalGracePeriod: "1m"
+ # How long after a VM is allocated to a reservation it is expected to appear
+ # on the target host; allocations not confirmed within this window are removed
+ allocationGracePeriod: "15m"
+ # URL of the nova external scheduler API for placement decisions
+ schedulerURL: "http://localhost:8080/scheduler/nova/external"
+ committedResourceController:
+ # Back-off interval while CommittedResource placement is pending or failed
+ requeueIntervalRetry: "1m"
+ committedResourceAPI:
+ # Timeout for watching CommittedResource CRDs before rolling back
+ watchTimeout: "10s"
+ # How often to poll CommittedResource CRD conditions during watch
+ watchPollInterval: "500ms"
+ # When false, the endpoint returns HTTP 503; the info endpoint remains available.
+ enableChangeCommitments: true
+ # When false, the endpoint returns HTTP 503.
+ enableReportUsage: true
+ # When false, the endpoint returns HTTP 503.
+ enableReportCapacity: true
# OvercommitMappings is a list of mappings that map hypervisor traits to
# overcommit ratios. Note that this list is applied in order, so if there
# are multiple mappings applying to the same hypervisors, the last mapping
@@ -178,7 +182,7 @@ cortex-scheduling-controllers:
# Failover reservations controller configuration
# Name of the Datasource CRD that provides database connection info for Nova VM data
datasourceName: nova-servers
- # URL of the nova external scheduler API for placement decisions
+ # URL of the nova external scheduler API for placement decisions (used by failover controller)
schedulerURL: "http://localhost:8080/scheduler/nova/external"
# Maps flavor name patterns (glob) to required failover count
# Example: {"hana_*": 2, "m1.xlarge": 1}
@@ -205,6 +209,8 @@ cortex-scheduling-controllers:
limitOneNewReservationPerHypervisor: false
# Size failover reservations based on LargestFlavor in the flavor group
useFlavorGroupResources: false
+ # How often the commitments syncer reconciles Limes commitments to CommittedResource CRDs
+ committedResourceSyncInterval: "1h"
cortex-knowledge-controllers:
<<: *cortex
diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
index 686aa60fe..a30b7d221 100644
--- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
+++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml
@@ -138,6 +138,16 @@ spec:
type: string
domainID:
type: string
+ parentGeneration:
+ description: |-
+ ParentGeneration is the Generation of the CommittedResource CRD at the time this
+ reservation was last written by the CommittedResource controller. The Reservation
+ controller echoes it to Status.CommittedResourceReservation.ObservedParentGeneration
+ once it has processed the reservation, allowing the CR controller to wait until
+ all child reservations are up-to-date before accepting.
+ Zero means the field is not set (syncer-created reservations, no parent CR).
+ format: int64
+ type: integer
projectID:
type: string
resourceGroup:
@@ -212,6 +222,13 @@ spec:
Allocations maps VM/instance UUIDs to the host they are currently running on.
Key: VM/instance UUID, Value: Host name where the VM is currently running.
type: object
+ observedParentGeneration:
+ description: |-
+ ObservedParentGeneration is the Spec.CommittedResourceReservation.ParentGeneration value
+ that this Reservation controller last processed. When it matches ParentGeneration in spec,
+ the CR controller knows this reservation is up-to-date for the current CR spec version.
+ format: int64
+ type: integer
type: object
conditions:
description: |-
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments.go b/internal/scheduling/reservations/commitments/api/change_commitments.go
index e076e41c9..9849075b9 100644
--- a/internal/scheduling/reservations/commitments/api/change_commitments.go
+++ b/internal/scheduling/reservations/commitments/api/change_commitments.go
@@ -19,10 +19,13 @@ import (
"github.com/go-logr/logr"
"github.com/google/uuid"
"github.com/sapcc/go-api-declarations/liquid"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
+ "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)
// sortedKeys returns map keys sorted alphabetically for deterministic iteration.
@@ -37,42 +40,47 @@ func sortedKeys[K ~string, V any](m map[K]V) []K {
return keys
}
-// implements POST /commitments/v1/change-commitments from Limes LIQUID API:
+// crSnapshot captures a CommittedResource CRD's prior state for batch rollback.
+// prevSpec is nil when the CRD was newly created (i.e. did not exist before the batch).
+// wasDeleted is true when the batch operation deleted the CRD; rollback must re-create it.
+type crSnapshot struct {
+ crName string
+ prevSpec *v1alpha1.CommittedResourceSpec
+ wasDeleted bool
+}
+
+// HandleChangeCommitments implements POST /commitments/v1/change-commitments from the Limes LIQUID API.
+// It writes CommittedResource CRDs (one per commitment) and polls their status conditions until
+// the controller confirms or rejects each one. On any failure the whole batch is rolled back.
+//
// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
-//
-// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle.
-// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back.
func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) {
startTime := time.Now()
- // Initialize
resp := liquid.CommitmentChangeResponse{}
req := liquid.CommitmentChangeRequest{}
statusCode := http.StatusOK
- // Extract or generate request ID for tracing - always set in response header
requestID := r.Header.Get("X-Request-ID")
if requestID == "" {
requestID = uuid.New().String()
}
w.Header().Set("X-Request-ID", requestID)
- // Check if API is enabled
- if !api.config.EnableChangeCommitmentsAPI {
+ if !api.config.EnableChangeCommitments {
statusCode = http.StatusServiceUnavailable
http.Error(w, "change-commitments API is disabled", statusCode)
api.recordMetrics(req, resp, statusCode, startTime)
return
}
- // Serialize all change-commitments requests
+ // Serialize all change-commitments requests so the controller sees a consistent world.
api.changeMutex.Lock()
defer api.changeMutex.Unlock()
ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID)
logger := commitments.LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/change-commitments")
- // Only accept POST method
if r.Method != http.MethodPost {
statusCode = http.StatusMethodNotAllowed
http.Error(w, "Method not allowed", statusCode)
@@ -80,7 +88,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
return
}
- // Parse request body
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
logger.Error(err, "invalid request body")
statusCode = http.StatusBadRequest
@@ -91,7 +98,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ)
- // Check for dry run -> early reject, not supported yet
if req.DryRun {
resp.RejectionReason = "Dry run not supported yet"
api.recordMetrics(req, resp, statusCode, startTime)
@@ -104,26 +110,17 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
return
}
- // Process commitment changes
- // For now, we'll implement a simplified path that checks capacity for immediate start CRs
-
if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil {
- // Error already written to response by processCommitmentChanges
- // Determine status code from error context (409 or 503)
if strings.Contains(err.Error(), "version mismatch") {
statusCode = http.StatusConflict
} else if strings.Contains(err.Error(), "caches not ready") {
statusCode = http.StatusServiceUnavailable
}
- // Record metrics for error cases
api.recordMetrics(req, resp, statusCode, startTime)
return
}
- // Record metrics
api.recordMetrics(req, resp, statusCode, startTime)
-
- // Return response
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(statusCode)
if err := json.NewEncoder(w).Encode(resp); err != nil {
@@ -132,11 +129,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque
}
func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.ResponseWriter, logger logr.Logger, req liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error {
- manager := commitments.NewReservationManager(api.client)
- requireRollback := false
- failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback
- creatorRequestID := reservations.GlobalRequestIDFromContext(ctx)
-
knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client}
flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
if err != nil {
@@ -145,12 +137,10 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons
return errors.New("caches not ready")
}
- // Validate InfoVersion from request matches current version (= last content change of flavor group knowledge)
var currentVersion int64 = -1
if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() {
currentVersion = knowledgeCRD.Status.LastContentChange.Unix()
}
-
if req.InfoVersion != currentVersion {
logger.Info("version mismatch in commitment change request",
"requestVersion", req.InfoVersion,
@@ -160,160 +150,166 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons
return errors.New("version mismatch")
}
- statesBefore := make(map[string]*commitments.CommitmentState) // map of commitmentID to existing state for rollback
- var reservationsToWatch []v1alpha1.Reservation
+ // If Limes does not require confirmation for this batch (e.g. deletions, status-only transitions),
+ // the controller must not reject — it must retry until it succeeds (AllowRejection=false).
+ // Conversely, when Limes requires confirmation, the controller may reject and report back.
+ allowRejection := req.RequiresConfirmation()
- if req.DryRun {
- resp.RejectionReason = "Dry run not supported yet"
- return nil
- }
+ var (
+ toWatch []string // CRD names to poll for terminal conditions (upserts only)
+ snapshots []crSnapshot // ordered list for deterministic rollback
+ failedReason string
+ rollback bool
+ )
ProcessLoop:
for _, projectID := range sortedKeys(req.ByProject) {
projectChanges := req.ByProject[projectID]
+
+ // Extract domain ID from Keystone project metadata if Limes provided it.
+ domainID := ""
+ if pm := projectChanges.ProjectMetadata.UnwrapOr(liquid.ProjectMetadata{}); pm.Domain.UUID != "" {
+ domainID = pm.Domain.UUID
+ }
+
for _, resourceName := range sortedKeys(projectChanges.ByResource) {
resourceChanges := projectChanges.ByResource[resourceName]
- // Validate resource name pattern (instances_group_*)
+
flavorGroupName, err := commitments.GetFlavorGroupNameFromResource(string(resourceName))
if err != nil {
- resp.RejectionReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err)
- requireRollback = true
+ failedReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err)
+ rollback = true
break ProcessLoop
}
- // Verify flavor group exists in Knowledge CRDs
- flavorGroup, flavorGroupExists := flavorGroups[flavorGroupName]
- if !flavorGroupExists {
- resp.RejectionReason = "flavor group not found: " + flavorGroupName
- requireRollback = true
+ flavorGroup, ok := flavorGroups[flavorGroupName]
+ if !ok {
+ failedReason = "flavor group not found: " + flavorGroupName
+ rollback = true
break ProcessLoop
}
- // Reject commitments for flavor groups that don't accept CRs
if !commitments.FlavorGroupAcceptsCommitments(&flavorGroup) {
- resp.RejectionReason = commitments.FlavorGroupCommitmentRejectionReason(&flavorGroup)
- requireRollback = true
+ failedReason = commitments.FlavorGroupCommitmentRejectionReason(&flavorGroup)
+ rollback = true
break ProcessLoop
}
for _, commitment := range resourceChanges.Commitments {
- logger.V(1).Info("processing commitment", "commitmentUUID", commitment.UUID, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none"))
-
- // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation
- // TODO add domain
-
- // List all committed resource reservations, then filter by name prefix
- var all_reservations v1alpha1.ReservationList
- if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- }); err != nil {
- failedCommitments[string(commitment.UUID)] = "failed to list reservations"
- logger.Info("failed to list reservations for commitment", "commitmentUUID", commitment.UUID, "error", err)
- requireRollback = true
- break ProcessLoop
- }
-
- // Filter by name prefix to find reservations for this commitment
- namePrefix := fmt.Sprintf("commitment-%s-", string(commitment.UUID))
- var existing_reservations v1alpha1.ReservationList
- for _, res := range all_reservations.Items {
- if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix {
- existing_reservations.Items = append(existing_reservations.Items, res)
+ isDelete := commitment.NewStatus.IsNone()
+ crName := "commitment-" + string(commitment.UUID)
+
+ logger.V(1).Info("processing commitment",
+ "commitmentUUID", commitment.UUID,
+ "oldStatus", commitment.OldStatus.UnwrapOr("none"),
+ "newStatus", commitment.NewStatus.UnwrapOr("none"),
+ "delete", isDelete)
+
+ // Snapshot the current spec before mutation so we can restore it on rollback.
+ snap := crSnapshot{crName: crName}
+ existing := &v1alpha1.CommittedResource{}
+ if err := api.client.Get(ctx, types.NamespacedName{Name: crName}, existing); err != nil {
+ if !apierrors.IsNotFound(err) {
+ failedReason = fmt.Sprintf("commitment %s: failed to read pre-update snapshot: %v", commitment.UUID, err)
+ rollback = true
+ break ProcessLoop
}
+ // Not found: CR is new (or already absent for deletes), prevSpec stays nil.
+ } else {
+ specCopy := existing.Spec
+ snap.prevSpec = &specCopy
}
- var stateBefore *commitments.CommitmentState
- if len(existing_reservations.Items) == 0 {
- stateBefore = &commitments.CommitmentState{
- CommitmentUUID: string(commitment.UUID),
- ProjectID: string(projectID),
- FlavorGroupName: flavorGroupName,
- TotalMemoryBytes: 0,
- }
- } else {
- stateBefore, err = commitments.FromReservations(existing_reservations.Items)
- if err != nil {
- failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations"
- logger.Info("failed to get existing state for commitment", "commitmentUUID", commitment.UUID, "error", err)
- requireRollback = true
- break ProcessLoop
+ if isDelete {
+ // Limes is removing this commitment; delete the CRD if it exists.
+ snap.wasDeleted = true
+ if snap.prevSpec != nil {
+ if err := api.client.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) {
+ failedReason = fmt.Sprintf("commitment %s: failed to delete CommittedResource CRD: %v", commitment.UUID, err)
+ rollback = true
+ break ProcessLoop
+ }
+ logger.V(1).Info("deleted CommittedResource CRD", "name", crName)
}
+ snapshots = append(snapshots, snap)
+ continue
}
- statesBefore[string(commitment.UUID)] = stateBefore
- // get desired state
- stateDesired, err := commitments.FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ))
+ stateDesired, err := commitments.FromChangeCommitmentTargetState(
+ commitment, string(projectID), domainID, flavorGroupName, flavorGroup, string(req.AZ))
if err != nil {
- failedCommitments[string(commitment.UUID)] = err.Error()
- logger.Info("failed to get desired state for commitment", "commitmentUUID", commitment.UUID, "error", err)
- requireRollback = true
+ failedReason = fmt.Sprintf("commitment %s: %s", commitment.UUID, err)
+ rollback = true
break ProcessLoop
}
- // Set creator request ID for traceability across controller reconciles
- stateDesired.CreatorRequestID = creatorRequestID
-
- logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes)
- applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi")
- if err != nil {
- failedCommitments[string(commitment.UUID)] = "failed to apply commitment state"
- logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err)
- requireRollback = true
+ cr := &v1alpha1.CommittedResource{}
+ cr.Name = crName
+ if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error {
+ applyCRSpec(cr, stateDesired, allowRejection)
+ return nil
+ }); err != nil {
+ failedReason = fmt.Sprintf("commitment %s: failed to write CommittedResource CRD: %v", commitment.UUID, err)
+ rollback = true
break ProcessLoop
}
- logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations))
- reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...)
+
+ toWatch = append(toWatch, crName)
+ snapshots = append(snapshots, snap)
+ logger.V(1).Info("upserted CommittedResource CRD", "name", crName)
}
}
}
- // TODO make the rollback defer safe
- if !requireRollback {
- logger.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch))
+ if !rollback {
+ // Non-confirming changes (RequiresConfirmation=false): Limes ignores our RejectionReason,
+ // so there is no point blocking on the controller outcome. The CRDs are written with
+ // AllowRejection=false, meaning the controller will retry indefinitely in the background.
+ if !allowRejection {
+ logger.Info("non-confirming changes applied, returning without polling", "count", len(toWatch))
+ return nil
+ }
+
+ logger.Info("CommittedResource CRDs written, polling for controller outcome", "count", len(toWatch))
+ watchStart := time.Now()
- time_start := time.Now()
+ rejected, watchErrs := watchCRsUntilReady(
+ ctx, logger, api.client, toWatch,
+ api.config.WatchTimeout.Duration,
+ api.config.WatchPollInterval.Duration,
+ )
- if failedReservations, errors := watchReservationsUntilReady(ctx, logger, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 {
- logger.Info("reservations failed to become ready, initiating rollback",
- "failedReservations", len(failedReservations),
- "errors", errors)
+ logger.Info("polling complete", "duration", time.Since(watchStart).Round(time.Millisecond))
- for _, res := range failedReservations {
- failedCommitments[res.Spec.CommittedResourceReservation.CommitmentUUID] = "not sufficient capacity"
+ switch {
+ case len(rejected) > 0:
+ var b strings.Builder
+ fmt.Fprintf(&b, "%d commitment(s) failed to apply:", len(rejected))
+ for _, crName := range toWatch { // iterate toWatch for deterministic order
+ if reason, ok := rejected[crName]; ok {
+ fmt.Fprintf(&b, "\n- commitment %s: %s", strings.TrimPrefix(crName, "commitment-"), reason)
+ }
}
- if len(failedReservations) == 0 {
- resp.RejectionReason += "timeout reached while processing commitment changes"
- api.monitor.timeouts.Inc()
+ failedReason = b.String()
+ rollback = true
+ case len(watchErrs) > 0:
+ msgs := make([]string, len(watchErrs))
+ for i, e := range watchErrs {
+ msgs[i] = e.Error()
}
- requireRollback = true
+ failedReason = "timeout reached while processing commitment changes: " + strings.Join(msgs, "; ")
+ api.monitor.timeouts.Inc()
+ rollback = true
}
-
- logger.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds())
}
- if requireRollback {
- // Build rejection reason from failed commitments
- if len(failedCommitments) > 0 {
- var reasonBuilder strings.Builder
- fmt.Fprintf(&reasonBuilder, "%d commitment(s) failed to apply: ", len(failedCommitments))
- for commitmentUUID, reason := range failedCommitments {
- fmt.Fprintf(&reasonBuilder, "\n- commitment %s: %s", commitmentUUID, reason)
- }
- resp.RejectionReason = reasonBuilder.String()
+ if rollback {
+ resp.RejectionReason = failedReason
+ logger.Info("rolling back CommittedResource CRDs", "reason", failedReason, "count", len(snapshots))
+ for i := len(snapshots) - 1; i >= 0; i-- {
+ rollbackCR(ctx, logger, api.client, snapshots[i])
}
-
- logger.Info("rollback of commitment changes")
- for commitmentUUID, state := range statesBefore {
- // Rollback to statesBefore for this commitment
- logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state)
- _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback")
- if err != nil {
- logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err)
- // continue with best effort rollback for other projects
- }
- }
-
- logger.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason)
+ logger.Info("rollback complete")
return nil
}
@@ -321,111 +317,139 @@ ProcessLoop:
return nil
}
-// watchReservationsUntilReady polls until all reservations reach Ready=True or timeout.
-// Returns failed reservations and any errors encountered.
-func watchReservationsUntilReady(
+// watchCRsUntilReady polls CommittedResource conditions until each CRD reaches a terminal state:
+// - Ready=True (Accepted) — success
+// - Ready=False, Reason=Planned — success; controller reserves capacity at activation time
+// - Ready=False, Reason=Rejected — failure; reason reported to caller
+//
+// Returns a map of crName → rejection reason for failed CRDs, and any polling errors (e.g. timeout).
+func watchCRsUntilReady(
ctx context.Context,
logger logr.Logger,
k8sClient client.Client,
- reservations []v1alpha1.Reservation,
+ crNames []string,
timeout time.Duration,
pollInterval time.Duration,
-) (failedReservations []v1alpha1.Reservation, errors []error) {
+) (rejected map[string]string, errs []error) {
- if len(reservations) == 0 {
- return failedReservations, nil
+ if len(crNames) == 0 {
+ return nil, nil
}
+ rejected = make(map[string]string)
deadline := time.Now().Add(timeout)
- startTime := time.Now()
- totalReservations := len(reservations)
- reservationsToWatch := make([]v1alpha1.Reservation, len(reservations))
- copy(reservationsToWatch, reservations)
-
- // Track successful reservations for summary
- var successfulReservations []string
- pollCount := 0
+ pending := make(map[string]struct{}, len(crNames))
+ for _, name := range crNames {
+ pending[name] = struct{}{}
+ }
for {
- pollCount++
- var stillWaiting []v1alpha1.Reservation
if time.Now().After(deadline) {
- errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout))
- // Log summary on timeout
- logger.Info("reservation watch completed (timeout)",
- "total", totalReservations,
- "ready", len(successfulReservations),
- "failed", len(failedReservations),
- "timedOut", len(reservationsToWatch),
- "duration", time.Since(startTime).Round(time.Millisecond),
- "polls", pollCount)
- return failedReservations, errors
+ errs = append(errs, fmt.Errorf("timeout after %v waiting for %d CommittedResource CRD(s)", timeout, len(pending)))
+ return rejected, errs
}
- for _, res := range reservationsToWatch {
- // Fetch current state
- var current v1alpha1.Reservation
- nn := types.NamespacedName{
- Name: res.Name,
- Namespace: res.Namespace,
+ for name := range pending {
+ cr := &v1alpha1.CommittedResource{}
+ if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, cr); err != nil {
+ continue // transient; keep waiting
}
- if err := k8sClient.Get(ctx, nn, ¤t); err != nil {
- // Reservation is still in process of being created, or there is a transient error
- stillWaiting = append(stillWaiting, res)
- continue
+ cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ continue // controller hasn't reconciled yet
}
- // Check Ready condition
- readyCond := meta.FindStatusCondition(
- current.Status.Conditions,
- v1alpha1.ReservationConditionReady,
- )
-
- if readyCond == nil {
- // Condition not set yet, keep waiting
- stillWaiting = append(stillWaiting, res)
- continue
- }
-
- switch readyCond.Status {
- case metav1.ConditionTrue:
- // Only consider truly ready if Status.Host is populated
- if current.Spec.TargetHost == "" || current.Status.Host == "" {
- stillWaiting = append(stillWaiting, res)
- continue
- }
- // Reservation is successfully scheduled - track for summary
- successfulReservations = append(successfulReservations, current.Name)
-
- case metav1.ConditionFalse:
- // Any failure reason counts as failed
- failedReservations = append(failedReservations, current)
- case metav1.ConditionUnknown:
- stillWaiting = append(stillWaiting, res)
+ switch {
+ case cond.Status == metav1.ConditionTrue:
+ delete(pending, name)
+ case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonPlanned:
+ delete(pending, name) // planned = accepted; controller will reserve at activation
+ case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonRejected:
+ delete(pending, name)
+ rejected[name] = cond.Message
+ // Reason=Reserving: controller is placing slots; keep waiting.
}
}
- if len(stillWaiting) == 0 {
- // All reservations have reached a terminal state - log summary
- logger.Info("reservation watch completed",
- "total", totalReservations,
- "ready", len(successfulReservations),
- "failed", len(failedReservations),
- "duration", time.Since(startTime).Round(time.Millisecond),
- "polls", pollCount)
- return failedReservations, errors
+ if len(pending) == 0 {
+ return rejected, nil
}
- reservationsToWatch = stillWaiting
-
- // Wait before next poll
select {
case <-time.After(pollInterval):
- // Continue polling
case <-ctx.Done():
- return failedReservations, append(errors, fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err()))
+ return rejected, append(errs, fmt.Errorf("context cancelled: %w", ctx.Err()))
}
+ logger.V(1).Info("polling CommittedResource CRDs", "pending", len(pending))
+ }
+}
+
+// rollbackCR reverses the batch-local change to a single CommittedResource CRD.
+// - wasDeleted=true, prevSpec!=nil: CRD was deleted; re-create it from the snapshot.
+// - wasDeleted=true, prevSpec==nil: CRD was absent before and after; nothing to do.
+// - wasDeleted=false, prevSpec==nil: CRD was newly created; delete it.
+// - wasDeleted=false, prevSpec!=nil: CRD was updated; restore its spec.
+func rollbackCR(ctx context.Context, logger logr.Logger, k8sClient client.Client, snap crSnapshot) {
+ if snap.wasDeleted {
+ if snap.prevSpec == nil {
+ return // was absent before deletion attempt; nothing to undo
+ }
+ cr := &v1alpha1.CommittedResource{}
+ cr.Name = snap.crName
+ cr.Spec = *snap.prevSpec
+ if err := k8sClient.Create(ctx, cr); client.IgnoreAlreadyExists(err) != nil {
+ logger.Error(err, "failed to re-create CommittedResource CRD during rollback", "name", snap.crName)
+ }
+ return
+ }
+
+ if snap.prevSpec == nil {
+ cr := &v1alpha1.CommittedResource{}
+ cr.Name = snap.crName
+ if err := k8sClient.Delete(ctx, cr); client.IgnoreNotFound(err) != nil {
+ logger.Error(err, "failed to delete CommittedResource CRD during rollback", "name", snap.crName)
+ }
+ return
+ }
+
+ cr := &v1alpha1.CommittedResource{}
+ if err := k8sClient.Get(ctx, types.NamespacedName{Name: snap.crName}, cr); err != nil {
+ logger.Error(err, "failed to fetch CommittedResource CRD for rollback", "name", snap.crName)
+ return
+ }
+ cr.Spec = *snap.prevSpec
+ if err := k8sClient.Update(ctx, cr); err != nil {
+ logger.Error(err, "failed to restore CommittedResource CRD spec during rollback", "name", snap.crName)
+ }
+}
+
+// applyCRSpec writes CommitmentState fields into a CommittedResource CRD spec.
+// allowRejection=true for the change-commitments API path: the controller may reject
+// on failure and the API reports the outcome to Limes.
+func applyCRSpec(cr *v1alpha1.CommittedResource, state *commitments.CommitmentState, allowRejection bool) {
+ cr.Spec.CommitmentUUID = state.CommitmentUUID
+ cr.Spec.SchedulingDomain = v1alpha1.SchedulingDomainNova
+ cr.Spec.FlavorGroupName = state.FlavorGroupName
+ cr.Spec.ResourceType = v1alpha1.CommittedResourceTypeMemory
+ cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI)
+ cr.Spec.AvailabilityZone = state.AvailabilityZone
+ cr.Spec.ProjectID = state.ProjectID
+ cr.Spec.DomainID = state.DomainID
+ cr.Spec.State = state.State
+ cr.Spec.AllowRejection = allowRejection
+
+ if state.StartTime != nil {
+ t := metav1.NewTime(*state.StartTime)
+ cr.Spec.StartTime = &t
+ } else {
+ cr.Spec.StartTime = nil
+ }
+ if state.EndTime != nil {
+ t := metav1.NewTime(*state.EndTime)
+ cr.Spec.EndTime = &t
+ } else {
+ cr.Spec.EndTime = nil
}
}
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go
new file mode 100644
index 000000000..ee546655b
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go
@@ -0,0 +1,411 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package api
+
+// End-to-end tests: HTTP → CommittedResource CRD → Reservation CRDs → scheduler → controllers → HTTP response.
+//
+// Unlike change_commitments_test.go which uses fakeControllerClient (which immediately sets
+// conditions), these tests wire real CommittedResourceController and CommitmentReservationController
+// against a fake k8s client. A background goroutine drives reconcile loops so the API polling
+// loop can observe terminal conditions within its timeout window.
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+ "time"
+
+ schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova"
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/prometheus/client_golang/prometheus"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ apimeta "k8s.io/apimachinery/pkg/api/meta"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/types"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+ "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/controller-runtime/pkg/log/zap"
+)
+
+// Field index paths for the fake client — must match the unexported constants in the commitments package.
+const (
+ e2eIdxCommittedResourceByUUID = "spec.commitmentUUID"
+ e2eIdxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID"
+)
+
+// e2eEnv is a full end-to-end test environment: real controllers, fake k8s client,
+// mock scheduler, and a background reconcile driver goroutine.
+type e2eEnv struct {
+ t *testing.T
+ k8sClient client.Client
+ httpServer *httptest.Server
+ schedulerSrv *httptest.Server
+ crCtrl *commitments.CommittedResourceController
+ resCtrl *commitments.CommitmentReservationController
+ cancelBg context.CancelFunc
+ bgDone chan struct{}
+}
+
+// newE2EEnv creates an e2eEnv with the given flavors and scheduler handler.
+// The scheduler handler controls what the mock Nova scheduler returns.
+func newE2EEnv(t *testing.T, flavors []*TestFlavor, infoVersion int64, schedulerHandler http.HandlerFunc) *e2eEnv {
+ t.Helper()
+ log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true)))
+
+ // Scheme: v1alpha1 for CR/Reservation/Knowledge types; hv1 for Hypervisor.
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("add v1alpha1 scheme: %v", err)
+ }
+ if err := hv1.AddToScheme(scheme); err != nil {
+ t.Fatalf("add hv1 scheme: %v", err)
+ }
+
+ // One hypervisor so the reservation controller can build a non-empty eligible-hosts list.
+ hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}}
+
+ k8sClient := fake.NewClientBuilder().
+ WithScheme(scheme).
+ WithObjects(createKnowledgeCRD(buildFlavorGroupsKnowledge(flavors, infoVersion)), hypervisor).
+ WithStatusSubresource(
+ &v1alpha1.CommittedResource{},
+ &v1alpha1.Reservation{},
+ &v1alpha1.Knowledge{},
+ ).
+ WithIndex(&v1alpha1.Reservation{}, e2eIdxReservationByCommitmentUUID, func(obj client.Object) []string {
+ res, ok := obj.(*v1alpha1.Reservation)
+ if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{res.Spec.CommittedResourceReservation.CommitmentUUID}
+ }).
+ WithIndex(&v1alpha1.CommittedResource{}, e2eIdxCommittedResourceByUUID, func(obj client.Object) []string {
+ cr, ok := obj.(*v1alpha1.CommittedResource)
+ if !ok || cr.Spec.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{cr.Spec.CommitmentUUID}
+ }).
+ Build()
+
+ schedulerSrv := httptest.NewServer(schedulerHandler)
+
+ crCtrl := &commitments.CommittedResourceController{
+ Client: k8sClient,
+ Scheme: scheme,
+ Conf: commitments.CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 100 * time.Millisecond}},
+ }
+
+ resCtrl := &commitments.CommitmentReservationController{
+ Client: k8sClient,
+ Scheme: scheme,
+ Conf: commitments.ReservationControllerConfig{
+ SchedulerURL: schedulerSrv.URL,
+ AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute},
+ RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute},
+ RequeueIntervalRetry: metav1.Duration{Duration: 100 * time.Millisecond},
+ },
+ }
+ if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil {
+ t.Fatalf("resCtrl.Init: %v", err)
+ }
+
+ // HTTPAPI wired directly to the real k8s client (no fakeControllerClient wrapper).
+ cfg := commitments.DefaultAPIConfig()
+ cfg.WatchTimeout = metav1.Duration{Duration: 5 * time.Second}
+ cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond}
+ api := NewAPIWithConfig(k8sClient, cfg, nil)
+ mux := http.NewServeMux()
+ api.Init(mux, prometheus.NewRegistry(), log.Log)
+ httpServer := httptest.NewServer(mux)
+
+ ctx, cancel := context.WithCancel(context.Background())
+ env := &e2eEnv{
+ t: t,
+ k8sClient: k8sClient,
+ httpServer: httpServer,
+ schedulerSrv: schedulerSrv,
+ crCtrl: crCtrl,
+ resCtrl: resCtrl,
+ cancelBg: cancel,
+ bgDone: make(chan struct{}),
+ }
+ go env.driveReconciles(ctx)
+ return env
+}
+
+func (e *e2eEnv) close() {
+ e.cancelBg()
+ <-e.bgDone
+ e.httpServer.Close()
+ e.schedulerSrv.Close()
+}
+
+// asCRTestEnv wraps e2eEnv as a CRTestEnv to reuse its HTTP-call and assertion helpers.
+func (e *e2eEnv) asCRTestEnv() *CRTestEnv {
+ return &CRTestEnv{T: e.t, K8sClient: e.k8sClient, HTTPServer: e.httpServer}
+}
+
+// driveReconciles runs in the background, reconciling pending CRs and Reservations until ctx is cancelled.
+func (e *e2eEnv) driveReconciles(ctx context.Context) {
+ defer close(e.bgDone)
+ ticker := time.NewTicker(50 * time.Millisecond)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-ticker.C:
+ e.reconcileAll(ctx)
+ }
+ }
+}
+
+// reconcileAll drives one round of reconciles:
+// 1. CR pass 1 — adds finalizer and creates Reservation CRDs.
+// 2. Reservation pass — calls the scheduler, sets TargetHost (first reconcile) then Ready=True (second).
+// 3. CR pass 2 — re-fetches each CR and picks up Reservation outcomes (placed or rejected).
+//
+// CRs and Reservations that have already reached a terminal state are skipped to avoid
+// overwriting the rejection signal the API polling loop needs to read.
+func (e *e2eEnv) reconcileAll(ctx context.Context) {
+ var crList v1alpha1.CommittedResourceList
+ if err := e.k8sClient.List(ctx, &crList); err != nil {
+ return
+ }
+
+ // CR pass 1.
+ for _, cr := range crList.Items {
+ if e2eIsTerminalCR(cr) {
+ continue
+ }
+ e.crCtrl.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}}) //nolint:errcheck
+ }
+
+ // Reservation pass (two reconciles per slot: first sets TargetHost, second sets Ready=True).
+ var resList v1alpha1.ReservationList
+ if err := e.k8sClient.List(ctx, &resList, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ }); err != nil {
+ return
+ }
+ for _, res := range resList.Items {
+ if e2eIsTerminalReservation(res) {
+ continue
+ }
+ req := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+ e.resCtrl.Reconcile(ctx, req) //nolint:errcheck
+ e.resCtrl.Reconcile(ctx, req) //nolint:errcheck
+ }
+
+ // CR pass 2: re-fetch so we see any condition changes made during the Reservation pass.
+ for _, cr := range crList.Items {
+ var latest v1alpha1.CommittedResource
+ if err := e.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &latest); err != nil {
+ continue // deleted or transient
+ }
+ if e2eIsTerminalCR(latest) {
+ continue
+ }
+ e.crCtrl.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: latest.Name}}) //nolint:errcheck
+ }
+}
+
+// e2eIsTerminalCR returns true for states the API polling loop treats as final:
+// Accepted (Ready=True), Rejected, or Planned.
+// CRs with DeletionTimestamp are never terminal here: they need one more reconcile to remove
+// their finalizer (set by the controller on first reconcile) so the fake client can delete them.
+func e2eIsTerminalCR(cr v1alpha1.CommittedResource) bool {
+ if !cr.DeletionTimestamp.IsZero() {
+ return false
+ }
+ cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ return false
+ }
+ if cond.Status == metav1.ConditionTrue {
+ return true
+ }
+ return cond.Reason == v1alpha1.CommittedResourceReasonRejected ||
+ cond.Reason == v1alpha1.CommittedResourceReasonPlanned
+}
+
+// waitForCRAbsent polls until the named CommittedResource no longer exists or the 1s deadline passes.
+// Used after rollback calls because the finalizer removal happens asynchronously in the background reconcile loop.
+func (e *e2eEnv) waitForCRAbsent(t *testing.T, crName string) {
+ t.Helper()
+ deadline := time.Now().Add(1 * time.Second)
+ for {
+ cr := &v1alpha1.CommittedResource{}
+ err := e.k8sClient.Get(context.Background(), types.NamespacedName{Name: crName}, cr)
+ if apierrors.IsNotFound(err) {
+ return
+ }
+ if time.Now().After(deadline) {
+ t.Errorf("expected CommittedResource %q to be absent after rollback, but it still exists", crName)
+ return
+ }
+ time.Sleep(50 * time.Millisecond)
+ }
+}
+
+// e2eIsTerminalReservation returns true when a Reservation is fully placed (Ready=True).
+func e2eIsTerminalReservation(res v1alpha1.Reservation) bool {
+ cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+ return cond != nil && cond.Status == metav1.ConditionTrue
+}
+
+// ============================================================================
+// Scheduler handlers
+// ============================================================================
+
+func e2eAcceptScheduler(t *testing.T) http.HandlerFunc {
+ t.Helper()
+ return func(w http.ResponseWriter, r *http.Request) {
+ resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}}
+ if err := json.NewEncoder(w).Encode(resp); err != nil {
+ t.Errorf("scheduler encode: %v", err)
+ }
+ }
+}
+
+func e2eRejectScheduler(t *testing.T) http.HandlerFunc {
+ t.Helper()
+ return func(w http.ResponseWriter, r *http.Request) {
+ // Return an empty hosts list — the reservation controller treats this as NoHostsFound.
+ resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{}}
+ if err := json.NewEncoder(w).Encode(resp); err != nil {
+ t.Errorf("scheduler encode: %v", err)
+ }
+ }
+}
+
+// ============================================================================
+// E2E test cases
+// ============================================================================
+
+const e2eInfoVersion = int64(1234)
+
+var e2eFlavor = &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4}
+
+// TestE2EChangeCommitments is the full end-to-end suite: HTTP → CRD → controller → scheduler → HTTP response.
+func TestE2EChangeCommitments(t *testing.T) {
+	testCases := []struct {
+		Name string
+		Scheduler func(*testing.T) http.HandlerFunc // fake external scheduler wired into the env
+		ReqJSON string // request body sent to the ChangeCommitments API
+		WantResp APIResponseExpectation
+		WantAbsent []string // CR names that must eventually be absent (rolled back / deleted)
+		Verify func(*testing.T, *e2eEnv) // optional extra post-call assertions
+	}{
+		{
+			Name: "scheduler accepts: CR placed, Reservation on host-1",
+			Scheduler: e2eAcceptScheduler,
+			ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion,
+				createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-ok", "confirmed", 1))),
+			WantResp: newAPIResponse(),
+			Verify: func(t *testing.T, env *e2eEnv) {
+				t.Helper()
+				env.asCRTestEnv().VerifyCRsExist([]string{"commitment-uuid-e2e-ok"})
+
+				var cr v1alpha1.CommittedResource
+				if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: "commitment-uuid-e2e-ok"}, &cr); err != nil {
+					t.Fatalf("get CR: %v", err)
+				}
+				if !apimeta.IsStatusConditionTrue(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+					t.Errorf("expected CR Ready=True")
+				}
+
+				var resList v1alpha1.ReservationList
+				if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{
+					v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+				}); err != nil {
+					t.Fatalf("list reservations: %v", err)
+				}
+				if len(resList.Items) != 1 {
+					t.Fatalf("expected 1 Reservation, got %d", len(resList.Items))
+				}
+				res := resList.Items[0]
+				if !apimeta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) {
+					t.Errorf("expected Reservation Ready=True")
+				}
+				if res.Status.Host != "host-1" {
+					t.Errorf("Reservation Status.Host: want host-1, got %q", res.Status.Host)
+				}
+			},
+		},
+		{
+			Name: "scheduler rejects: rejection propagates to API response, CR rolled back",
+			Scheduler: e2eRejectScheduler,
+			ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion,
+				createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-rej", "confirmed", 2))),
+			WantResp: newAPIResponse("no hosts found"),
+			WantAbsent: []string{"commitment-uuid-e2e-rej"},
+		},
+		{
+			Name: "batch with one rejection: entire batch rolled back",
+			Scheduler: e2eRejectScheduler,
+			ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion,
+				createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-batch-a", "confirmed", 2),
+				createCommitment("hw_version_hana_1_ram", "project-B", "uuid-e2e-batch-b", "confirmed", 2),
+			)),
+			WantResp: newAPIResponse("no hosts found"),
+			WantAbsent: []string{"commitment-uuid-e2e-batch-a", "commitment-uuid-e2e-batch-b"},
+		},
+		{
+			Name: "lifecycle: create then delete, CR and child Reservations cleaned up",
+			Scheduler: e2eAcceptScheduler,
+			ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion,
+				createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-lifecycle", "confirmed", 1))),
+			WantResp: newAPIResponse(),
+			Verify: func(t *testing.T, env *e2eEnv) {
+				t.Helper()
+				env.asCRTestEnv().VerifyCRsExist([]string{"commitment-uuid-e2e-lifecycle"})
+
+				te := env.asCRTestEnv()
+				deleteJSON := buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion,
+					deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-lifecycle", "confirmed", 1)))
+				resp, _, statusCode := te.CallChangeCommitmentsAPI(deleteJSON)
+				te.VerifyAPIResponse(newAPIResponse(), resp, statusCode)
+
+				env.waitForCRAbsent(t, "commitment-uuid-e2e-lifecycle")
+
+				var resList v1alpha1.ReservationList
+				if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{
+					v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+				}); err != nil {
+					t.Fatalf("list reservations: %v", err)
+				}
+				if len(resList.Items) != 0 {
+					t.Errorf("expected 0 Reservations after delete, got %d", len(resList.Items))
+				}
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.Name, func(t *testing.T) {
+			env := newE2EEnv(t, []*TestFlavor{e2eFlavor}, e2eInfoVersion, tc.Scheduler(t)) // fresh env per subtest: isolated CRDs, controllers, scheduler stub
+			defer env.close()
+
+			te := env.asCRTestEnv()
+			resp, _, statusCode := te.CallChangeCommitmentsAPI(tc.ReqJSON)
+			te.VerifyAPIResponse(tc.WantResp, resp, statusCode)
+			for _, name := range tc.WantAbsent {
+				env.waitForCRAbsent(t, name) // rollback must have removed the CR
+			}
+			if tc.Verify != nil {
+				tc.Verify(t, env)
+			}
+		})
+	}
+}
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go
index 2c9562ee8..1afeea5f5 100644
--- a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go
+++ b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go
@@ -23,7 +23,7 @@ func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liqui
commitmentCount := countCommitments(req)
// Determine result based on response
- result := "success"
+ result := "accepted"
if resp.RejectionReason != "" {
result = "rejected"
}
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_test.go
index deffc91c3..579173460 100644
--- a/internal/scheduling/reservations/commitments/api/change_commitments_test.go
+++ b/internal/scheduling/reservations/commitments/api/change_commitments_test.go
@@ -1,14 +1,13 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0
-//nolint:unparam,unused // test helper functions have fixed parameters for simplicity
+//nolint:unparam // test helper functions have fixed parameters for simplicity
package api
import (
"bytes"
"context"
"encoding/json"
- "fmt"
"io"
"net/http"
"net/http/httptest"
@@ -23,10 +22,11 @@ import (
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments"
- hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ . "github.com/majewsky/gg/option"
"github.com/prometheus/client_golang/prometheus"
"github.com/sapcc/go-api-declarations/liquid"
- corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -40,657 +40,420 @@ import (
// Integration Tests
// ============================================================================
-func TestCommitmentChangeIntegration(t *testing.T) {
- m1Tiny := &TestFlavor{Name: "m1.tiny", Group: "gp_1", MemoryMB: 256, VCPUs: 1}
+func TestHandleChangeCommitments(t *testing.T) {
m1Small := &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4}
- m1Large := &TestFlavor{Name: "m1.large", Group: "hana_1", MemoryMB: 4096, VCPUs: 16}
- m1XL := &TestFlavor{Name: "m1.xl", Group: "hana_1", MemoryMB: 8192, VCPUs: 32}
testCases := []CommitmentChangeTestCase{
+ // --- Basic flow ---
{
- Name: "Shrinking CR - unused reservations removed, used reservations untouched",
- VMs: []*TestVM{{UUID: "vm-a1", Flavor: m1Large, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}},
- Flavors: []*TestFlavor{m1Small, m1Large},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}},
- {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"},
- },
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-123", "confirmed", 2)),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}},
- {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse(),
- },
- {
- Name: "Insufficient capacity when increasing CR",
- VMs: []*TestVM{},
- Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 3)),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}},
- ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}},
- ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"),
- },
- {
- Name: "Invalid CR name - too long",
- VMs: []*TestVM{},
- Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("long-", 13), "confirmed", 3),
- ),
- AvailableResources: &AvailableResources{},
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment long-long-long-long-long-long-long-long-long-long-long-long-long-: unexpected commitment format"),
- },
- {
- Name: "Planned CR is ignored in validation, no scheduling or capacity reservation",
- VMs: []*TestVM{},
+ Name: "New CR: controller accepts → API returns accepted",
Flavors: []*TestFlavor{m1Small},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "planned", 200),
- ),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-new"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-new": true},
},
{
- Name: "Invalid CR name - spaces",
- VMs: []*TestVM{},
- Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{},
+ Name: "New CR: controller rejects → API returns rejection reason",
+ Flavors: []*TestFlavor{m1Small},
+ CROutcomes: map[string]string{
+ "commitment-uuid-rej": "not sufficient capacity",
+ },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid with space", "confirmed", 3),
- ),
- AvailableResources: &AvailableResources{},
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid with space: unexpected commitment format"),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-rej", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse("commitment uuid-rej: not sufficient capacity"),
},
+ // --- Planned state ---
{
- Name: "Swap capacity between CRs - order dependent - delete-first succeeds",
+ Name: "Planned CR: controller sets Ready=False/Planned → API accepts",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-456", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}},
+ CROutcomes: map[string]string{
+ "commitment-uuid-plan": v1alpha1.CommittedResourceReasonPlanned,
+ },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 0),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 2),
- ),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}},
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}},
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-plan", "planned", 2)),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-plan"},
},
+ // --- Update existing CR ---
{
- Name: "Swap capacity between CRs - order dependent - create-first fails",
+ Name: "Resize up: existing CR updated with new amount, accepted",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-resize", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
+ },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 2),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 0),
- ),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}},
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}},
- ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-resize", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-resize"},
},
+ // --- Rollback: new CR deleted on batch failure ---
{
- Name: "Flavor bin-packing - mixed sizes when largest doesn't fit",
- // Greedy selection: 10GB request with 8/4/1GB flavors → picks 1×8GB + 2×1GB
- Flavors: []*TestFlavor{m1XL, m1Large, m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-binpack", "confirmed", 10),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"},
- {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"},
+ Name: "Rollback new CR: newly created CRD deleted on rejection",
+ Flavors: []*TestFlavor{m1Small},
+ CROutcomes: map[string]string{
+ "commitment-uuid-rollback": "not sufficient capacity",
},
- ExpectedAPIResponse: newAPIResponse(),
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-rollback", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse("uuid-rollback: not sufficient capacity"),
+ ExpectedDeletedCRs: []string{"commitment-uuid-rollback"},
},
+ // --- Rollback: updated CR spec restored on batch failure ---
{
- Name: "Version mismatch - request rejected with 409 Conflict",
- // InfoVersion validation prevents stale requests (1233 vs 1234)
+ Name: "Rollback updated CR: spec restored on rejection",
Flavors: []*TestFlavor{m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1233,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-version", "confirmed", 2),
- ),
- EnvInfoVersion: 1234,
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-restore", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
+ },
+ CROutcomes: map[string]string{
+ "commitment-uuid-restore": "not sufficient capacity",
+ },
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-restore", "confirmed", 4)),
+ ExpectedAPIResponse: newAPIResponse("uuid-restore: not sufficient capacity"),
+ // CRD still exists but amount restored to 1024 MiB
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-restore": 1024 * 1024 * 1024},
},
+ // --- Batch rollback: one failure rolls back all ---
{
- Name: "Multi-project rollback - one failure rolls back all",
- // Transactional: project-B fails (insufficient capacity) → both projects rollback
+ Name: "Batch rollback: project-B fails → project-A new CR also rolled back",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
+ CROutcomes: map[string]string{
+ "commitment-uuid-b": "not sufficient capacity",
},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-project-a", "confirmed", 2),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-project-b", "confirmed", 2),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2),
+ createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 2),
),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}},
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse("uuid-project-b", "not sufficient capacity"),
+ ExpectedAPIResponse: newAPIResponse("uuid-b: not sufficient capacity"),
+ ExpectedDeletedCRs: []string{"commitment-uuid-a", "commitment-uuid-b"},
},
+ // --- Timeout ---
{
- Name: "Rollback with VMs allocated - limitation: VM allocations not rolled back",
- // Controller will eventually clean up and repair inconsistent state
- VMs: []*TestVM{{UUID: "vm-rollback", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}},
+ Name: "Timeout: no condition set → rollback and timeout error",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-rollback"}},
- {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
+ CROutcomes: map[string]string{
+ "commitment-uuid-timeout": "", // empty string = no condition set (controller not responding)
},
+ NoCondition: []string{"commitment-uuid-timeout"},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "commitment-A", "confirmed", 0),
- createCommitment("hw_version_hana_1_ram", "project-B", "commitment-B", "confirmed", 6),
- ),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}},
- ExpectedReservations: []*TestReservation{
- // Rollback creates unscheduled reservations (empty Host accepts any in matching)
- {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse("commitment-B", "not sufficient capacity"),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2)),
+ CustomConfig: func() *commitments.APIConfig {
+ cfg := commitments.DefaultAPIConfig()
+ cfg.WatchTimeout = metav1.Duration{}
+ cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond}
+ return &cfg
+ }(),
+ ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"),
+ ExpectedDeletedCRs: []string{"commitment-uuid-timeout"},
},
+ // --- Input validation ---
{
- Name: "New commitment creation - from zero to N reservations",
+ Name: "Invalid commitment UUID: rejected before CRD write",
Flavors: []*TestFlavor{m1Small},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 3),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("x", 50), "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse("unexpected commitment format"),
+ ExpectedDeletedCRs: []string{"commitment-" + strings.Repeat("x", 50)},
},
{
- Name: "New commitment creation - large batch",
+ Name: "Unknown flavor group: rejected without CRD write",
Flavors: []*TestFlavor{m1Small},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 200),
- ),
- ExpectedReservations: func() []*TestReservation {
- var reservations []*TestReservation
- for range 200 {
- reservations = append(reservations, &TestReservation{
- CommitmentID: "uuid-new",
- Flavor: m1Small,
- ProjectID: "project-A",
- })
- }
- return reservations
- }(),
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unk", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse("flavor group not found"),
},
+ // --- Infrastructure ---
{
- Name: "With reservations of custom size - total unchanged",
- // Preserves custom-sized reservations when total matches (2×2GB = 4GB)
+ Name: "Version mismatch: 409 Conflict",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- },
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 4),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- },
- ExpectedAPIResponse: newAPIResponse(),
+ CommitmentRequest: newCommitmentRequest("az-a", false, 9999,
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-v", "confirmed", 2)),
+ EnvInfoVersion: 1234, // env is at 1234, request claims 9999 → mismatch
+ ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409},
},
{
- Name: "With reservations of custom size - increase total",
- // 4GB (2×2GB custom) → 6GB: preserves custom sizes, adds standard-sized reservations
+ Name: "API disabled: 503 Service Unavailable",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 6),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dis", "confirmed", 2)),
+ CustomConfig: func() *commitments.APIConfig {
+ cfg := commitments.DefaultAPIConfig()
+ cfg.EnableChangeCommitments = false
+ return &cfg
+ }(),
+ ExpectedAPIResponse: APIResponseExpectation{StatusCode: 503},
},
{
- Name: "With reservations of custom size - decrease total",
- // 4GB (2×2GB custom) → 3GB: removes 1×2GB custom, adds 1×1GB standard
+ Name: "Knowledge not ready: 503 Service Unavailable",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 3),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048},
- {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-kr", "confirmed", 2)),
+ EnvInfoVersion: -1, // skip Knowledge CRD creation
+ ExpectedAPIResponse: APIResponseExpectation{StatusCode: 503},
},
{
- Name: "Complete commitment deletion - N to zero reservations",
+ Name: "Dry run: not supported yet",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-delete", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-delete", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-delete", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"},
- },
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-delete", "confirmed", 0),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"},
- },
+ CommitmentRequest: newCommitmentRequest("az-a", true, 1234,
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dry", "confirmed", 2)),
+ ExpectedAPIResponse: newAPIResponse("Dry run not supported"),
+ },
+ {
+ Name: "Empty request: no CRDs created",
+ Flavors: []*TestFlavor{m1Small},
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234),
ExpectedAPIResponse: newAPIResponse(),
},
+ // --- Deletion ---
{
- Name: "VM allocation preservation - keep VMs during growth",
- VMs: []*TestVM{{UUID: "vm-existing", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}},
+ Name: "Deletion: existing CRD is deleted",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}},
- {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-del", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-growth", "confirmed", 3),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}},
- {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-growth", Flavor: m1Small, ProjectID: "project-A"},
- },
+ deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-del", "confirmed", 2)),
ExpectedAPIResponse: newAPIResponse(),
+ ExpectedDeletedCRs: []string{"commitment-uuid-del"},
},
{
- Name: "Multi-project success - both projects succeed",
+ Name: "Deletion: non-existing CRD is a no-op",
Flavors: []*TestFlavor{m1Small},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 2),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"},
- {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"},
- },
+ deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-absent", "confirmed", 2)),
ExpectedAPIResponse: newAPIResponse(),
},
{
- Name: "Multiple flavor groups - hw_version_hana_1_ram and hw_version_hana_2_ram",
- // Amount in multiples of smallest flavor: hana_1 (2×1GB), hana_2 (2×2GB)
- Flavors: []*TestFlavor{
- m1Small,
- {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8},
+ Name: "Deletion rollback: delete succeeds but later commitment fails → CRD re-created",
+ Flavors: []*TestFlavor{m1Small},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-del-rb", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-hana1", "confirmed", 2),
- createCommitment("hw_version_hana_2_ram", "project-A", "uuid-hana2", "confirmed", 2),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"},
- {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"},
+ CROutcomes: map[string]string{
+ "commitment-uuid-new-rb": "not enough capacity",
},
- ExpectedAPIResponse: newAPIResponse(),
- },
- {
- Name: "Unknown flavor group - clear rejection message",
- Flavors: []*TestFlavor{m1Small},
+ // project-A deletion sorts before project-B creation; deletion succeeds then creation fails.
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unknown", "confirmed", 2),
+ deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-del-rb", "confirmed", 2),
+ createCommitment("hw_version_hana_1_ram", "project-B", "uuid-new-rb", "confirmed", 2),
),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse("flavor group not found"),
+ ExpectedAPIResponse: newAPIResponse("not enough capacity"),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-del-rb"}, // re-created during rollback
},
+ // --- Non-confirming changes (RequiresConfirmation=false → AllowRejection=false, no watch) ---
{
- Name: "Three-way capacity swap - complex reallocation",
- // A:2→0, B:1→0, C:0→3 in single transaction
+ Name: "Non-confirming: guaranteed→confirmed, AllowRejection=false, watch skipped",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-a", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
- {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-guar", State: v1alpha1.CommitmentStatusGuaranteed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
+ // Controller would reject, but we skip watching for non-confirming changes.
+ CROutcomes: map[string]string{"commitment-uuid-guar": "not enough capacity"},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 0),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 0),
- createCommitment("hw_version_hana_1_ram", "project-C", "uuid-c", "confirmed", 3),
- ),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}},
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-c", Host: "host-1", Flavor: m1Small, ProjectID: "project-C"},
- {CommitmentID: "uuid-c", Host: "host-2", Flavor: m1Small, ProjectID: "project-C"},
- {CommitmentID: "uuid-c", Host: "host-3", Flavor: m1Small, ProjectID: "project-C"},
- },
- ExpectedAPIResponse: newAPIResponse(),
+ TestCommitment{
+ ResourceName: "hw_version_hana_1_ram",
+ ProjectID: "project-A",
+ ConfirmationID: "uuid-guar",
+ OldState: "guaranteed",
+ State: "confirmed",
+ Amount: 2,
+ }),
+ ExpectedAPIResponse: newAPIResponse(), // no rejection even though controller would reject
+ ExpectedCreatedCRNames: []string{"commitment-uuid-guar"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-guar": false},
},
{
- Name: "Reservation repair - existing reservations with wrong metadata",
- Flavors: []*TestFlavor{m1Small, m1Large},
- ExistingReservations: []*TestReservation{
- {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Host: "host-1", Flavor: m1Small, ProjectID: "wrong-project", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Host: "host-2", Flavor: &TestFlavor{Name: "m1.small", Group: "hana_13", MemoryMB: 1024, VCPUs: 4}, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"},
- },
+ Name: "Non-confirming: planned, AllowRejection=false",
+ Flavors: []*TestFlavor{m1Small},
+ // CROutcomes not set: controller accepts (irrelevant since watch is skipped).
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-repair", "confirmed", 8, "az-a"),
- ),
- ExpectedReservations: []*TestReservation{
- {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"},
- {CommitmentID: "uuid-repair", Flavor: m1Large, ProjectID: "project-A", AZ: "az-a"},
- },
- ExpectedAPIResponse: newAPIResponse(),
- },
- {
- Name: "Empty request - no commitment changes",
- Flavors: []*TestFlavor{m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse(),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-plan-nc", "planned", 2)),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-plan-nc"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-plan-nc": false},
},
+ // --- Pending state ---
{
- Name: "Dry run request - feature not yet implemented",
+ Name: "None→pending: non-confirming, AllowRejection=false, watch skipped",
Flavors: []*TestFlavor{m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", true, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dryrun", "confirmed", 2),
- ),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse("Dry run not supported"),
+ // pending creates Reservation slots (like confirmed) but RequiresConfirmation=false.
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-pend", "pending", 2)),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-pend"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-pend": false},
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-pend": 2 * 1024 * 1024 * 1024},
},
+ // --- Inactive state transitions via upsert ---
{
- Name: "Knowledge not ready - clear rejection with RetryAt",
+ Name: "confirmed→expired: non-confirming upsert, AllowRejection=false, watch skipped",
Flavors: []*TestFlavor{m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-knowledge", "confirmed", 2),
- ),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: APIResponseExpectation{
- StatusCode: 503,
- RetryAtPresent: false,
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-to-exp", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
- EnvInfoVersion: -1, // Skip Knowledge CRD creation
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
+ TestCommitment{
+ ResourceName: "hw_version_hana_1_ram",
+ ProjectID: "project-A",
+ ConfirmationID: "uuid-to-exp",
+ OldState: "confirmed",
+ State: "expired",
+ Amount: 1,
+ }),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-to-exp"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-to-exp": false},
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-to-exp": 0},
},
{
- Name: "API disabled - returns 503 Service Unavailable",
+ Name: "confirmed→superseded: confirming upsert, AllowRejection=true, controller accepts",
Flavors: []*TestFlavor{m1Small},
- CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-disabled", "confirmed", 2),
- ),
- CustomConfig: func() *commitments.Config {
- cfg := commitments.DefaultConfig()
- cfg.EnableChangeCommitmentsAPI = false
- return &cfg
- }(),
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: APIResponseExpectation{
- StatusCode: 503,
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-to-sup", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
+ // confirmed→superseded is a confirming change (not in the liquid API's free-transition list).
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
+ TestCommitment{
+ ResourceName: "hw_version_hana_1_ram",
+ ProjectID: "project-A",
+ ConfirmationID: "uuid-to-sup",
+ OldState: "confirmed",
+ State: "superseded",
+ Amount: 1,
+ }),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-to-sup"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-to-sup": true},
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-to-sup": 0},
},
+ // --- Resize ---
{
- Name: "Multiple commitments insufficient capacity - all listed in error",
- // Tests that multiple failed commitments are all mentioned in the rejection reason
- Flavors: []*TestFlavor{m1Small, m1Tiny},
+ Name: "Resize down: confirmed→confirmed with less capacity, RequiresConfirmation=true",
+ Flavors: []*TestFlavor{m1Small},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-dn", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 4 * 1024, ProjectID: "project-A", AZ: "az-a"},
+ },
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-multi-fail-1", "confirmed", 3),
- createCommitment("hw_version_hana_1_ram", "project-B", "uuid-multi-fail-2", "confirmed", 3),
- createCommitment("hw_version_gp_1_ram", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason
- ),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}},
- ExpectedReservations: []*TestReservation{},
- ExpectedAPIResponse: newAPIResponse("2 commitment(s) failed", "commitment uuid-multi-fail-1: not sufficient capacity", "commitment uuid-multi-fail-2: not sufficient capacity"),
+ TestCommitment{
+ ResourceName: "hw_version_hana_1_ram",
+ ProjectID: "project-A",
+ ConfirmationID: "uuid-dn",
+ OldState: "confirmed",
+ OldAmount: 4,
+ State: "confirmed",
+ Amount: 2,
+ }),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedCreatedCRNames: []string{"commitment-uuid-dn"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-dn": true},
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-dn": 2 * 1024 * 1024 * 1024},
},
+ // --- Mixed batch success ---
{
- Name: "Deletion priority during rollback - unscheduled removed first",
- // Tests that during rollback, unscheduled reservations (no TargetHost) are deleted first,
- // preserving scheduled reservations (with TargetHost), especially those with VM allocations
- VMs: []*TestVM{{UUID: "vm-priority", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}},
+ Name: "Mixed batch: delete + create both succeed without rollback",
Flavors: []*TestFlavor{m1Small},
- ExistingReservations: []*TestReservation{
- // Reservation with VM allocation - should be preserved (lowest deletion priority)
- {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}},
- // Scheduled but unused - medium deletion priority
- {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
+ ExistingCRs: []*TestCR{
+ {CommitmentUUID: "uuid-mbdel", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"},
},
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "commitment-1", "confirmed", 4),
+ deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-mbdel", "confirmed", 1),
+ createCommitment("hw_version_hana_1_ram", "project-B", "uuid-mbnew", "confirmed", 2),
),
- AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 1024}},
- ExpectedReservations: []*TestReservation{
- // After rollback, should preserve the scheduled reservations (especially with VMs)
- // and remove unscheduled ones first
- {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}},
- {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"},
- },
- ExpectedAPIResponse: newAPIResponse("commitment commitment-1: not sufficient capacity"),
+ ExpectedAPIResponse: newAPIResponse(),
+ ExpectedDeletedCRs: []string{"commitment-uuid-mbdel"},
+ ExpectedCreatedCRNames: []string{"commitment-uuid-mbnew"},
+ ExpectedAllowRejection: map[string]bool{"commitment-uuid-mbnew": true},
},
+ // --- Pre-write validation failure rollback ---
{
- Name: "Watch timeout with custom config - triggers rollback with timeout error",
+ Name: "Pre-write validation failure: first CR written then rolled back on second CR's unknown flavor group",
Flavors: []*TestFlavor{m1Small},
+ // project-A (valid) sorts before project-B (invalid): A's CR is written, then B's
+ // unknown flavor group triggers a pre-watch rollback that deletes A's CR.
CommitmentRequest: newCommitmentRequest("az-a", false, 1234,
- createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2),
+ createCommitment("hw_version_hana_1_ram", "project-A", "uuid-pva", "confirmed", 2),
+ createCommitment("hw_version_nonexistent_ram", "project-B", "uuid-pvb", "confirmed", 2),
),
- // With 0ms timeout, the watch will timeout immediately before reservations become ready
- CustomConfig: func() *commitments.Config {
- cfg := commitments.DefaultConfig()
- cfg.ChangeAPIWatchReservationsTimeout = 0 * time.Millisecond
- cfg.ChangeAPIWatchReservationsPollInterval = 100 * time.Millisecond
- return &cfg
- }(),
- ExpectedReservations: []*TestReservation{}, // Rollback removes all reservations
- ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"),
+ ExpectedAPIResponse: newAPIResponse("flavor group not found"),
+ ExpectedDeletedCRs: []string{"commitment-uuid-pva"},
},
}
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
- runCommitmentChangeTest(t, tc)
+ runChangeCommitmentsTest(t, tc)
})
}
}
-// runCommitmentChangeTest executes a single commitment change integration test case.
-func runCommitmentChangeTest(t *testing.T, tc CommitmentChangeTestCase) {
+func runChangeCommitmentsTest(t *testing.T, tc CommitmentChangeTestCase) {
t.Helper()
- // Convert test types to actual types
- var vms []VM
- for _, testVM := range tc.VMs {
- vms = append(vms, testVM.ToVM())
- }
-
- var flavorInGroups []compute.FlavorInGroup
- for _, testFlavor := range tc.Flavors {
- flavorInGroups = append(flavorInGroups, testFlavor.ToFlavorInGroup())
- }
-
- // Use EnvInfoVersion if specified (non-zero), otherwise default to CommitmentRequest.InfoVersion
- envInfoVersion := tc.CommitmentRequest.InfoVersion
- if tc.EnvInfoVersion != 0 {
- envInfoVersion = tc.EnvInfoVersion
- }
-
- flavorGroups := TestFlavorGroup{
- infoVersion: envInfoVersion,
- flavors: flavorInGroups,
- }.ToFlavorGroupsKnowledge()
-
- // Convert existing reservations with auto-numbering per commitment
- var existingReservations []*v1alpha1.Reservation
- numberCounters := make(map[string]int)
- for _, testRes := range tc.ExistingReservations {
- number := numberCounters[testRes.CommitmentID]
- numberCounters[testRes.CommitmentID]++
- existingReservations = append(existingReservations, testRes.toReservation(number))
- }
-
- // Create test environment with available resources and custom config if provided
- env := newCommitmentTestEnv(t, vms, nil, existingReservations, flavorGroups, tc.AvailableResources, tc.CustomConfig)
+ env := newCRTestEnv(t, tc)
defer env.Close()
- t.Log("Initial state:")
- env.LogStateSummary()
-
- // Call commitment change API
reqJSON := buildRequestJSON(tc.CommitmentRequest)
- resp, respJSON, statusCode := env.CallChangeCommitmentsAPI(reqJSON)
+ resp, _, statusCode := env.CallChangeCommitmentsAPI(reqJSON)
- t.Log("After API call:")
- env.LogStateSummary()
+ env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, statusCode)
- // Verify API response
- env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, respJSON, statusCode)
-
- // Verify reservations using content-based matching
- env.VerifyReservationsMatch(tc.ExpectedReservations)
-
- // Log final test result
- if t.Failed() {
- t.Log("❌ Test FAILED")
- } else {
- t.Log("✅ Test PASSED")
+ if len(tc.ExpectedCreatedCRNames) > 0 {
+ env.VerifyCRsExist(tc.ExpectedCreatedCRNames)
+ }
+ if tc.ExpectedAllowRejection != nil {
+ env.VerifyAllowRejection(tc.ExpectedAllowRejection)
+ }
+ for crName, expectedAmountBytes := range tc.ExpectedCRSpecs {
+ env.VerifyCRAmountBytes(crName, expectedAmountBytes)
+ }
+ for _, crName := range tc.ExpectedDeletedCRs {
+ env.VerifyCRAbsent(crName)
}
}
// ============================================================================
-// Test Types & Constants
+// Test Types
// ============================================================================
const (
- defaultFlavorDiskGB = 40
- flavorGroupsKnowledgeName = "flavor-groups"
- knowledgeRecencyDuration = 60 * time.Second
- defaultCommitmentExpiryYears = 1
+ defaultFlavorDiskGB = 40
+ flavorGroupsKnowledgeName = "flavor-groups"
+ knowledgeRecencyDuration = 60 * time.Second
)
type CommitmentChangeTestCase struct {
- Name string
- VMs []*TestVM
- Flavors []*TestFlavor
- ExistingReservations []*TestReservation
- CommitmentRequest CommitmentChangeRequest
- ExpectedReservations []*TestReservation
- ExpectedAPIResponse APIResponseExpectation
- AvailableResources *AvailableResources // If nil, all reservations accepted without checks
- EnvInfoVersion int64 // Override InfoVersion for version mismatch tests
- CustomConfig *commitments.Config // Override default config for testing timeout behavior
-}
-
-// AvailableResources defines available memory per host (MB).
-// Scheduler uses first-come-first-serve. CPU is ignored.
-type AvailableResources struct {
- PerHost map[string]int64 // host -> available memory MB
-}
-
-type TestFlavorGroup struct {
- infoVersion int64
- flavors []compute.FlavorInGroup
-}
-
-func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge {
- groupMap := make(map[string][]compute.FlavorInGroup)
-
- for _, flavor := range tfg.flavors {
- groupName := flavor.ExtraSpecs["quota:hw_version"]
- if groupName == "" {
- panic("Flavor " + flavor.Name + " is missing quota:hw_version in extra specs")
- }
- groupMap[groupName] = append(groupMap[groupName], flavor)
- }
-
- // Sort group names for deterministic iteration
- sortedGroupNames := make([]string, 0, len(groupMap))
- for groupName := range groupMap {
- sortedGroupNames = append(sortedGroupNames, groupName)
- }
- sort.Strings(sortedGroupNames)
-
- var groups []compute.FlavorGroupFeature
- for _, groupName := range sortedGroupNames {
- groupFlavors := groupMap[groupName]
- if len(groupFlavors) == 0 {
- continue
- }
-
- // Sort descending: required by reservation manager's flavor selection
- sort.Slice(groupFlavors, func(i, j int) bool {
- return groupFlavors[i].MemoryMB > groupFlavors[j].MemoryMB
- })
-
- smallest := groupFlavors[len(groupFlavors)-1]
- largest := groupFlavors[0]
-
- // Compute RAM/core ratio (MiB per vCPU)
- var minRatio, maxRatio uint64 = ^uint64(0), 0
- for _, f := range groupFlavors {
- if f.VCPUs == 0 {
- continue
- }
- ratio := f.MemoryMB / f.VCPUs
- if ratio < minRatio {
- minRatio = ratio
- }
- if ratio > maxRatio {
- maxRatio = ratio
- }
- }
-
- var ramCoreRatio, ramCoreRatioMin, ramCoreRatioMax *uint64
- if minRatio == maxRatio && maxRatio != 0 {
- ramCoreRatio = &minRatio
- } else if maxRatio != 0 {
- ramCoreRatioMin = &minRatio
- ramCoreRatioMax = &maxRatio
- }
-
- groups = append(groups, compute.FlavorGroupFeature{
- Name: groupName,
- Flavors: groupFlavors,
- SmallestFlavor: smallest,
- LargestFlavor: largest,
- RamCoreRatio: ramCoreRatio,
- RamCoreRatioMin: ramCoreRatioMin,
- RamCoreRatioMax: ramCoreRatioMax,
- })
- }
-
- return FlavorGroupsKnowledge{
- InfoVersion: tfg.infoVersion,
- Groups: groups,
- }
+ Name string
+ Flavors []*TestFlavor
+ // ExistingCRs: CommittedResource CRDs present before the API call.
+ ExistingCRs []*TestCR
+ // CROutcomes: what condition the fake controller sets per crName.
+ // Value = rejection reason if non-empty and not a named reason constant.
+ // Value = CommittedResourceReasonPlanned to simulate a planned outcome.
+ // Absent entry = controller accepts (Ready=True).
+ CROutcomes map[string]string
+ // NoCondition: crNames for which the fake controller sets no condition (simulate stall/timeout).
+ NoCondition []string
+ CommitmentRequest CommitmentChangeRequest
+ ExpectedAPIResponse APIResponseExpectation
+ // Post-call assertions.
+ ExpectedCreatedCRNames []string
+ ExpectedAllowRejection map[string]bool // crName → expected AllowRejection value
+ ExpectedCRSpecs map[string]int64 // crName → expected Amount.Value() in bytes
+ ExpectedDeletedCRs []string
+ CustomConfig *commitments.APIConfig
+ EnvInfoVersion int64
}
-type FlavorGroupsKnowledge struct {
- InfoVersion int64
- Groups []compute.FlavorGroupFeature
+// TestCR defines a pre-existing CommittedResource CRD.
+type TestCR struct {
+ CommitmentUUID string
+ State v1alpha1.CommitmentStatus
+ AmountMiB int64
+ ProjectID string
+ AZ string
}
type CommitmentChangeRequest struct {
@@ -704,29 +467,15 @@ type TestCommitment struct {
ResourceName liquid.ResourceName
ProjectID string
ConfirmationID string
- State string
+ OldState string // empty = None (no prior status)
+ State string // empty = None (deletion)
Amount uint64
+ OldAmount uint64 // if non-zero, used for TotalBefore totals instead of Amount (for resize-down)
}
type APIResponseExpectation struct {
StatusCode int
RejectReasonSubstrings []string
- RetryAtPresent bool
-}
-
-type ReservationVerification struct {
- Host string
- Allocations map[string]string
-}
-
-type VM struct {
- UUID string
- FlavorName string
- ProjectID string
- CurrentHypervisor string
- AvailabilityZone string
- Resources map[string]int64
- FlavorExtraSpecs map[string]string
}
type TestFlavor struct {
@@ -735,7 +484,7 @@ type TestFlavor struct {
MemoryMB int64
VCPUs int64
DiskGB uint64
- VideoRAMMiB *uint64 // optional, from flavor extra_specs hw_video:ram_max_mb
+ VideoRAMMiB *uint64
}
func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup {
@@ -758,1098 +507,576 @@ func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup {
}
}
-type TestVM struct {
- UUID string
- Flavor *TestFlavor
- ProjectID string
- Host string
- AZ string
-}
-
-func (vm *TestVM) ToVM() VM {
- return VM{
- UUID: vm.UUID,
- FlavorName: vm.Flavor.Name,
- ProjectID: vm.ProjectID,
- CurrentHypervisor: vm.Host,
- AvailabilityZone: vm.AZ,
- Resources: map[string]int64{
- "memory": vm.Flavor.MemoryMB,
- "vcpus": vm.Flavor.VCPUs,
- },
- FlavorExtraSpecs: map[string]string{
- "quota:hw_version": vm.Flavor.Group,
- },
- }
+type FlavorGroupsKnowledge struct {
+ InfoVersion int64
+ Groups []compute.FlavorGroupFeature
}
-type TestReservation struct {
- CommitmentID string
- Host string // Empty = any host accepted in matching
- Flavor *TestFlavor
- ProjectID string
- VMs []string // VM UUIDs
- MemoryMB int64 // If 0, uses Flavor.MemoryMB; else custom size
- AZ string
+// TestFlavorGroup groups a flat list of FlavorInGroup by hw_version extra spec
+// and builds a FlavorGroupsKnowledge. Used by usage_test.go and report_usage_test.go.
+type TestFlavorGroup struct {
+ infoVersion int64
+ flavors []compute.FlavorInGroup
}
-func (tr *TestReservation) toReservation(number int) *v1alpha1.Reservation {
- name := fmt.Sprintf("commitment-%s-%d", tr.CommitmentID, number)
-
- memoryMB := tr.MemoryMB
- if memoryMB == 0 {
- memoryMB = tr.Flavor.MemoryMB
- }
-
- specAllocations := make(map[string]v1alpha1.CommittedResourceAllocation)
- statusAllocations := make(map[string]string)
- for _, vmUUID := range tr.VMs {
- specAllocations[vmUUID] = v1alpha1.CommittedResourceAllocation{
- CreationTimestamp: metav1.Now(),
- Resources: map[hv1.ResourceName]resource.Quantity{
- "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"),
- "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)),
- },
- }
- statusAllocations[vmUUID] = tr.Host
- }
-
- spec := v1alpha1.ReservationSpec{
- Type: v1alpha1.ReservationTypeCommittedResource,
- TargetHost: tr.Host,
- Resources: map[hv1.ResourceName]resource.Quantity{
- "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"),
- "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)),
- },
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- CommitmentUUID: tr.CommitmentID,
- ProjectID: tr.ProjectID,
- ResourceName: tr.Flavor.Name,
- ResourceGroup: tr.Flavor.Group,
- Allocations: specAllocations,
- },
+func (tg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge {
+ groupMap := make(map[string][]compute.FlavorInGroup)
+ for _, f := range tg.flavors {
+ name := f.ExtraSpecs["quota:hw_version"]
+ groupMap[name] = append(groupMap[name], f)
}
- if tr.AZ != "" {
- spec.AvailabilityZone = tr.AZ
+ sortedNames := make([]string, 0, len(groupMap))
+ for n := range groupMap {
+ sortedNames = append(sortedNames, n)
}
+ sort.Strings(sortedNames)
- return &v1alpha1.Reservation{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: spec,
- Status: v1alpha1.ReservationStatus{
- Conditions: []metav1.Condition{
- {
- Type: v1alpha1.ReservationConditionReady,
- Status: metav1.ConditionTrue,
- Reason: "ReservationActive",
- },
- },
- Host: tr.Host,
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{
- Allocations: statusAllocations,
- },
- },
+ var groups []compute.FlavorGroupFeature
+ for _, name := range sortedNames {
+ gFlavors := groupMap[name]
+ sort.Slice(gFlavors, func(i, j int) bool { return gFlavors[i].MemoryMB > gFlavors[j].MemoryMB })
+ smallest := gFlavors[len(gFlavors)-1]
+ largest := gFlavors[0]
+
+ var minR, maxR uint64 = ^uint64(0), 0
+ for _, f := range gFlavors {
+ if f.VCPUs == 0 {
+ continue
+ }
+ r := f.MemoryMB / f.VCPUs
+ if r < minR {
+ minR = r
+ }
+ if r > maxR {
+ maxR = r
+ }
+ }
+ var ratio, ratioMin, ratioMax *uint64
+ if minR == maxR && maxR != 0 {
+ ratio = &minR
+ } else if maxR != 0 {
+ ratioMin = &minR
+ ratioMax = &maxR
+ }
+ groups = append(groups, compute.FlavorGroupFeature{
+ Name: name,
+ Flavors: gFlavors,
+ SmallestFlavor: smallest,
+ LargestFlavor: largest,
+ RamCoreRatio: ratio,
+ RamCoreRatioMin: ratioMin,
+ RamCoreRatioMax: ratioMax,
+ })
}
+ return FlavorGroupsKnowledge{InfoVersion: tg.infoVersion, Groups: groups}
}
// ============================================================================
-// Test Environment
+// Fake Controller Client
// ============================================================================
-type CommitmentTestEnv struct {
- T *testing.T
- Scheme *runtime.Scheme
- K8sClient client.Client
- VMSource *MockVMSource
- FlavorGroups FlavorGroupsKnowledge
- HTTPServer *httptest.Server
- API *HTTPAPI
- availableResources map[string]int64 // host -> available memory MB
- processedReserv map[string]bool // track processed reservations
- mu sync.Mutex // protects availableResources and processedReserv
+// fakeControllerClient wraps a client.Client and simulates the CommittedResource
+// controller by immediately setting conditions after any Create or Update of a
+// CommittedResource CRD. Entries in noCondition suppress condition-setting to
+// simulate a stalled controller (used for timeout tests).
+type fakeControllerClient struct {
+ client.Client
+ outcomes map[string]string // crName → rejection reason (or reason constant); absent = accept
+ noCondition map[string]struct{}
+ mu sync.Mutex
}
-// FakeReservationController simulates synchronous reservation controller.
-type FakeReservationController struct {
- env *CommitmentTestEnv
+func (c *fakeControllerClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error {
+ if err := c.Client.Create(ctx, obj, opts...); err != nil {
+ return err
+ }
+ if cr, ok := obj.(*v1alpha1.CommittedResource); ok {
+ c.setConditionFor(ctx, cr.Name)
+ }
+ return nil
}
-func (c *FakeReservationController) OnReservationCreated(res *v1alpha1.Reservation) {
- c.env.processNewReservation(res)
+func (c *fakeControllerClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error {
+ if err := c.Client.Update(ctx, obj, opts...); err != nil {
+ return err
+ }
+ if cr, ok := obj.(*v1alpha1.CommittedResource); ok {
+ c.setConditionFor(ctx, cr.Name)
+ }
+ return nil
}
-func (c *FakeReservationController) OnReservationDeleted(res *v1alpha1.Reservation) {
- c.env.mu.Lock()
- defer c.env.mu.Unlock()
+func (c *fakeControllerClient) setConditionFor(ctx context.Context, crName string) {
+ c.mu.Lock()
+ _, skip := c.noCondition[crName]
+ outcome, hasOutcome := c.outcomes[crName]
+ c.mu.Unlock()
- // Return memory when Delete() is called directly (before deletion timestamp is set)
- if c.env.availableResources != nil && res.Status.Host != "" {
- memoryQuantity := res.Spec.Resources["memory"]
- memoryBytes := memoryQuantity.Value()
- memoryMB := memoryBytes / (1024 * 1024)
+ if skip {
+ return
+ }
- if _, exists := c.env.availableResources[res.Status.Host]; exists {
- c.env.availableResources[res.Status.Host] += memoryMB
- c.env.T.Logf("↩ Returned %d MB to %s (now %d MB available) via OnReservationDeleted for %s",
- memoryMB, res.Status.Host, c.env.availableResources[res.Status.Host], res.Name)
+ var cond metav1.Condition
+ switch {
+ case !hasOutcome || outcome == "":
+ // Default: controller accepts.
+ cond = metav1.Condition{
+ Type: v1alpha1.CommittedResourceConditionReady,
+ Status: metav1.ConditionTrue,
+ Reason: v1alpha1.CommittedResourceReasonAccepted,
+ Message: "accepted",
+ }
+ case outcome == v1alpha1.CommittedResourceReasonPlanned:
+ cond = metav1.Condition{
+ Type: v1alpha1.CommittedResourceConditionReady,
+ Status: metav1.ConditionFalse,
+ Reason: v1alpha1.CommittedResourceReasonPlanned,
+ Message: "commitment is not yet active",
+ }
+ default:
+ cond = metav1.Condition{
+ Type: v1alpha1.CommittedResourceConditionReady,
+ Status: metav1.ConditionFalse,
+ Reason: v1alpha1.CommittedResourceReasonRejected,
+ Message: outcome,
}
}
- // Clear tracking so recreated reservations with same name are processed
- delete(c.env.processedReserv, res.Name)
+ cr := &v1alpha1.CommittedResource{}
+ if err := c.Get(ctx, client.ObjectKey{Name: crName}, cr); err != nil {
+ return
+ }
+ meta.SetStatusCondition(&cr.Status.Conditions, cond)
+ if err := c.Client.Status().Update(ctx, cr); err != nil {
+ return // best-effort: if the update races with another write, the polling loop retries
+ }
}
-// operationInterceptorClient routes reservation events to FakeReservationController.
-type operationInterceptorClient struct {
- client.Client
- controller *FakeReservationController
+// ============================================================================
+// Test Environment
+// ============================================================================
+
+type CRTestEnv struct {
+ T *testing.T
+ K8sClient client.Client
+ HTTPServer *httptest.Server
}
-func (d *operationInterceptorClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error {
- err := d.Client.Create(ctx, obj, opts...)
- if err != nil {
- return err
- }
-
- if res, ok := obj.(*v1alpha1.Reservation); ok {
- d.controller.OnReservationCreated(res)
- }
-
- return nil
-}
-
-func (d *operationInterceptorClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error {
- if res, ok := obj.(*v1alpha1.Reservation); ok {
- d.controller.OnReservationDeleted(res)
- }
-
- return d.Client.Delete(ctx, obj, opts...)
-}
-
-func (env *CommitmentTestEnv) Close() {
- if env.HTTPServer != nil {
- env.HTTPServer.Close()
- }
-}
-
-func newCommitmentTestEnv(
- t *testing.T,
- vms []VM,
- hypervisors []*hv1.Hypervisor,
- reservations []*v1alpha1.Reservation,
- flavorGroups FlavorGroupsKnowledge,
- resources *AvailableResources,
- customConfig *commitments.Config,
-) *CommitmentTestEnv {
-
+func newCRTestEnv(t *testing.T, tc CommitmentChangeTestCase) *CRTestEnv {
t.Helper()
-
log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true)))
- objects := make([]client.Object, 0, len(hypervisors)+len(reservations))
- for _, hv := range hypervisors {
- objects = append(objects, hv)
- }
- for _, res := range reservations {
- objects = append(objects, res)
- }
-
scheme := runtime.NewScheme()
if err := v1alpha1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add v1alpha1 scheme: %v", err)
+ t.Fatalf("failed to add v1alpha1 scheme: %v", err)
}
- if err := hv1.AddToScheme(scheme); err != nil {
- t.Fatalf("Failed to add hv1 scheme: %v", err)
+
+ objects := make([]client.Object, 0)
+
+ // Knowledge CRD (InfoVersion=-1 simulates "not ready").
+ envInfoVersion := tc.CommitmentRequest.InfoVersion
+ if tc.EnvInfoVersion != 0 {
+ envInfoVersion = tc.EnvInfoVersion
+ }
+ if envInfoVersion != -1 {
+ objects = append(objects, createKnowledgeCRD(buildFlavorGroupsKnowledge(tc.Flavors, envInfoVersion)))
}
- // InfoVersion of -1 skips Knowledge CRD creation (tests "not ready" scenario)
- if flavorGroups.InfoVersion != -1 {
- knowledgeCRD := createKnowledgeCRD(flavorGroups)
- objects = append(objects, knowledgeCRD)
+ // Pre-existing CommittedResource CRDs.
+ for _, tcr := range tc.ExistingCRs {
+ objects = append(objects, tcr.toCommittedResource())
}
- baseK8sClient := fake.NewClientBuilder().
+ baseClient := fake.NewClientBuilder().
WithScheme(scheme).
WithObjects(objects...).
- WithStatusSubresource(&v1alpha1.Reservation{}).
- WithStatusSubresource(&v1alpha1.Knowledge{}).
- WithIndex(&v1alpha1.Reservation{}, "spec.type", func(obj client.Object) []string {
- res := obj.(*v1alpha1.Reservation)
- return []string{string(res.Spec.Type)}
- }).
+ WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Knowledge{}).
Build()
- var availableResources map[string]int64
- if resources != nil && resources.PerHost != nil {
- availableResources = make(map[string]int64)
- for host, memMB := range resources.PerHost {
- availableResources[host] = memMB
- }
+ noCondition := make(map[string]struct{})
+ for _, name := range tc.NoCondition {
+ noCondition[name] = struct{}{}
}
- env := &CommitmentTestEnv{
- T: t,
- Scheme: scheme,
- K8sClient: nil, // Will be set below
- VMSource: NewMockVMSource(vms),
- FlavorGroups: flavorGroups,
- HTTPServer: nil, // Will be set below
- API: nil, // Will be set below
- availableResources: availableResources,
- processedReserv: make(map[string]bool),
+ wrapped := &fakeControllerClient{
+ Client: baseClient,
+ outcomes: tc.CROutcomes,
+ noCondition: noCondition,
}
- controller := &FakeReservationController{env: env}
- wrappedClient := &operationInterceptorClient{
- Client: baseK8sClient,
- controller: controller,
- }
- env.K8sClient = wrappedClient
-
- // Use custom config if provided, otherwise use default
var api *HTTPAPI
- if customConfig != nil {
- api = NewAPIWithConfig(wrappedClient, *customConfig, nil)
+ if tc.CustomConfig != nil {
+ api = NewAPIWithConfig(wrapped, *tc.CustomConfig, nil)
} else {
- api = NewAPI(wrappedClient)
+ api = NewAPI(wrapped)
}
mux := http.NewServeMux()
registry := prometheus.NewRegistry()
api.Init(mux, registry, log.Log)
- httpServer := httptest.NewServer(mux)
-
- env.HTTPServer = httpServer
- env.API = api
-
- return env
-}
-
-// ============================================================================
-// Environment Helper Methods
-// ============================================================================
-
-// ListVMs returns all VMs from the VMSource.
-func (env *CommitmentTestEnv) ListVMs() []VM {
- vms, err := env.VMSource.ListVMs(context.Background())
- if err != nil {
- env.T.Fatalf("Failed to list VMs: %v", err)
- }
- return vms
-}
-
-// ListReservations returns all reservations.
-func (env *CommitmentTestEnv) ListReservations() []v1alpha1.Reservation {
- var list v1alpha1.ReservationList
- if err := env.K8sClient.List(context.Background(), &list); err != nil {
- env.T.Fatalf("Failed to list reservations: %v", err)
- }
- return list.Items
-}
-// ListHypervisors returns all hypervisors.
-func (env *CommitmentTestEnv) ListHypervisors() []hv1.Hypervisor {
- var list hv1.HypervisorList
- if err := env.K8sClient.List(context.Background(), &list); err != nil {
- env.T.Fatalf("Failed to list hypervisors: %v", err)
+ return &CRTestEnv{
+ T: t,
+ K8sClient: wrapped,
+ HTTPServer: httptest.NewServer(mux),
}
- return list.Items
}
-// LogStateSummary logs a summary of the current state.
-func (env *CommitmentTestEnv) LogStateSummary() {
- env.T.Helper()
-
- hypervisors := env.ListHypervisors()
- vms := env.ListVMs()
- reservations := env.ListReservations()
-
- env.T.Log("=== State Summary ===")
- env.T.Logf("Hypervisors: %d", len(hypervisors))
- env.T.Logf("VMs: %d", len(vms))
- env.T.Logf("Reservations: %d", len(reservations))
-
- for _, res := range reservations {
- allocCount := 0
- if res.Status.CommittedResourceReservation != nil {
- allocCount = len(res.Status.CommittedResourceReservation.Allocations)
- }
- env.T.Logf(" - %s (host: %s, allocations: %d)", res.Name, res.Status.Host, allocCount)
+func (env *CRTestEnv) Close() {
+ if env.HTTPServer != nil {
+ env.HTTPServer.Close()
}
- env.T.Log("=====================")
}
-// CallChangeCommitmentsAPI calls the change commitments API endpoint with JSON.
-// Reservation processing is fully synchronous via operationInterceptorClient hooks.
-func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respJSON string, statusCode int) {
+func (env *CRTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respBody string, statusCode int) {
env.T.Helper()
-
- // Make HTTP request - reservation processing happens synchronously via Create/Delete hooks
url := env.HTTPServer.URL + "/commitments/v1/change-commitments"
- httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input
+ httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx
if err != nil {
- env.T.Fatalf("Failed to make HTTP request: %v", err)
+ env.T.Fatalf("HTTP request failed: %v", err)
}
defer httpResp.Body.Close()
-
- // Read response body
- respBytes, err := io.ReadAll(httpResp.Body)
+ raw, err := io.ReadAll(httpResp.Body)
if err != nil {
- env.T.Fatalf("Failed to read response body: %v", err)
+ env.T.Fatalf("failed to read response: %v", err)
}
-
- respJSON = string(respBytes)
-
- // Parse response - only for 200 OK responses
- // Non-200 responses (like 409 Conflict for version mismatch) use plain text via http.Error()
if httpResp.StatusCode == http.StatusOK {
- if err := json.Unmarshal(respBytes, &resp); err != nil {
- env.T.Fatalf("Failed to unmarshal response: %v", err)
+ if err := json.Unmarshal(raw, &resp); err != nil {
+ env.T.Fatalf("failed to unmarshal response: %v", err)
}
}
-
- // Final pass to handle any deletions (finalizer removal)
- env.processReservations()
-
- statusCode = httpResp.StatusCode
- return resp, respJSON, statusCode
+ return resp, string(raw), httpResp.StatusCode
}
-// processReservations handles all reservation lifecycle events synchronously.
-// This includes marking reservations as Ready/Failed and removing finalizers from deleted reservations.
-func (env *CommitmentTestEnv) processReservations() {
- ctx := context.Background()
- reservations := env.ListReservations()
-
- for _, res := range reservations {
- // Handle deletion - return memory to host and remove finalizers
- if !res.DeletionTimestamp.IsZero() {
- env.T.Logf("Processing deletion for reservation %s (host: %s)", res.Name, res.Status.Host)
-
- env.mu.Lock()
- // Return memory to host if resource tracking is enabled
- if env.availableResources != nil {
- env.T.Logf("Resource tracking enabled, returning memory for %s", res.Name)
- memoryQuantity := res.Spec.Resources["memory"]
- memoryBytes := memoryQuantity.Value()
- memoryMB := memoryBytes / (1024 * 1024)
-
- env.T.Logf("Reservation %s has host=%s, memory=%d MB", res.Name, res.Status.Host, memoryMB)
-
- // Check if host exists in our tracking
- if _, exists := env.availableResources[res.Status.Host]; !exists {
- env.mu.Unlock()
- env.T.Fatalf("Host %s not found in available resources for reservation %s - this indicates an inconsistency",
- res.Status.Host, res.Name)
- }
-
- // Return memory to host
- env.availableResources[res.Status.Host] += memoryMB
- env.T.Logf("↩ Returned %d MB to %s (now %d MB available) from deleted reservation %s",
- memoryMB, res.Status.Host, env.availableResources[res.Status.Host], res.Name)
- } else {
- env.T.Logf("Resource tracking NOT enabled for %s", res.Name)
- }
-
- // Clear tracking so recreated reservations with same name are processed
- delete(env.processedReserv, res.Name)
- env.mu.Unlock()
-
- // Remove finalizers to allow deletion
- if len(res.Finalizers) > 0 {
- res.Finalizers = []string{}
- if err := env.K8sClient.Update(ctx, &res); err != nil {
- // Ignore errors - might be already deleted
- continue
- }
- }
- continue
- }
-
- // Skip if already processed (has a condition set)
- if env.hasCondition(&res) {
- continue
- }
-
- env.mu.Lock()
- alreadyProcessed := env.processedReserv[res.Name]
- env.mu.Unlock()
-
- // Skip if already tracked as processed
- if alreadyProcessed {
- continue
- }
-
- // Process new reservation with resource-based scheduling
- env.processNewReservation(&res)
- }
-}
-
-// hasCondition checks if a reservation has any Ready condition set.
-func (env *CommitmentTestEnv) hasCondition(res *v1alpha1.Reservation) bool {
- for _, cond := range res.Status.Conditions {
- if cond.Type == v1alpha1.ReservationConditionReady {
- return true
- }
- }
- return false
-}
-
-// processNewReservation implements first-come-first-serve scheduling based on available resources.
-// It tries to find a host with enough memory capacity and assigns the reservation to that host.
-func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) {
- env.mu.Lock()
- defer env.mu.Unlock()
-
- env.processedReserv[res.Name] = true
-
- if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.ResourceGroup == "" || res.Spec.Resources == nil || res.Spec.Resources["memory"] == (resource.Quantity{}) {
- env.markReservationFailedStatus(res, "invalid reservation spec")
- env.T.Logf("✗ Invalid reservation spec for %s: marking as failed (resource group: %s, resources: %v)", res.Name, res.Spec.CommittedResourceReservation.ResourceGroup, res.Spec.Resources)
- return
- }
-
- // If no available resources configured, accept all reservations without host assignment
- if env.availableResources == nil {
- env.T.Logf("✓ Scheduled reservation %s - no resource tracking, simply accept", res.Name)
- env.markReservationSchedulerProcessedStatus(res, "some-host")
- return
- }
-
- // Get required memory from reservation spec
- memoryQuantity := res.Spec.Resources["memory"]
- memoryBytes := memoryQuantity.Value()
- memoryMB := memoryBytes / (1024 * 1024)
-
- // First-come-first-serve: find first host with enough capacity
- // Sort hosts to ensure deterministic behavior (Go map iteration is random)
- hosts := make([]string, 0, len(env.availableResources))
- for host := range env.availableResources {
- hosts = append(hosts, host)
+func (env *CRTestEnv) VerifyAPIResponse(expected APIResponseExpectation, resp liquid.CommitmentChangeResponse, statusCode int) {
+ env.T.Helper()
+ expectedCode := expected.StatusCode
+ if expectedCode == 0 {
+ expectedCode = http.StatusOK
}
- sort.Strings(hosts)
-
- var selectedHost string
- for _, host := range hosts {
- if env.availableResources[host] >= memoryMB {
- selectedHost = host
- break
- }
+ if statusCode != expectedCode {
+ env.T.Errorf("expected status %d, got %d", expectedCode, statusCode)
}
-
- if selectedHost != "" {
- // SUCCESS: Schedule on this host
- env.availableResources[selectedHost] -= memoryMB
-
- // Update reservation with selected host
- ctx := context.Background()
-
- // Update spec (TargetHost)
- res.Spec.TargetHost = selectedHost
- if err := env.K8sClient.Update(ctx, res); err != nil {
- env.T.Logf("Warning: Failed to update reservation spec: %v", err)
- }
-
- // Update status (Host) - requires Status().Update
- res.Status.Host = selectedHost
- if err := env.K8sClient.Status().Update(ctx, res); err != nil {
- env.T.Logf("Warning: Failed to update reservation status host: %v", err)
+ for _, sub := range expected.RejectReasonSubstrings {
+ if !strings.Contains(resp.RejectionReason, sub) {
+ env.T.Errorf("rejection reason %q does not contain %q", resp.RejectionReason, sub)
}
-
- env.markReservationSchedulerProcessedStatus(res, selectedHost)
- env.T.Logf("✓ Scheduled reservation %s on %s (%d MB used, %d MB remaining)",
- res.Name, selectedHost, memoryMB, env.availableResources[selectedHost])
- } else {
- env.markReservationSchedulerProcessedStatus(res, "")
- env.T.Logf("✗ Failed to schedule reservation %s (needs %d MB, no host has capacity)",
- res.Name, memoryMB)
}
}
-// markReservationSchedulerProcessedStatus updates a reservation status based on scheduling result.
-// If host is non-empty, sets Ready=True (success). If host is empty, sets Ready=False with NoHostsFound (failure).
-func (env *CommitmentTestEnv) markReservationSchedulerProcessedStatus(res *v1alpha1.Reservation, host string) {
- ctx := context.Background()
-
- // Update spec first
- res.Spec.TargetHost = host
- if err := env.K8sClient.Update(ctx, res); err != nil {
- env.T.Logf("Warning: Failed to update reservation spec: %v", err)
- return
- }
-
- // Then update status - Ready=True only if host was found, Ready=False otherwise
- res.Status.Host = host
- if host != "" {
- res.Status.Conditions = []metav1.Condition{
- {
- Type: v1alpha1.ReservationConditionReady,
- Status: metav1.ConditionTrue,
- Reason: "ReservationActive",
- Message: "Reservation is ready (set by test controller)",
- LastTransitionTime: metav1.Now(),
- },
- }
- } else {
- res.Status.Conditions = []metav1.Condition{
- {
- Type: v1alpha1.ReservationConditionReady,
- Status: metav1.ConditionFalse,
- Reason: "NoHostsFound",
- Message: "No hosts with sufficient capacity (set by test controller)",
- LastTransitionTime: metav1.Now(),
- },
+func (env *CRTestEnv) VerifyCRsExist(names []string) {
+ env.T.Helper()
+ for _, name := range names {
+ cr := &v1alpha1.CommittedResource{}
+ if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: name}, cr); err != nil {
+ env.T.Errorf("expected CommittedResource %q to exist, but got: %v", name, err)
}
}
- if err := env.K8sClient.Status().Update(ctx, res); err != nil {
- env.T.Logf("Warning: Failed to update reservation status: %v", err)
- }
}
-// markReservationFailedStatus updates a reservation to have Ready=False status
-func (env *CommitmentTestEnv) markReservationFailedStatus(res *v1alpha1.Reservation, reason string) {
- res.Status.Conditions = []metav1.Condition{
- {
- Type: v1alpha1.ReservationConditionReady,
- Status: metav1.ConditionFalse,
- Reason: "Reservation invalid",
- Message: reason,
- LastTransitionTime: metav1.Now(),
- },
- }
-
- if err := env.K8sClient.Status().Update(context.Background(), res); err != nil {
- // Ignore errors - might be deleted during update
- return
+func (env *CRTestEnv) VerifyCRAbsent(name string) {
+ env.T.Helper()
+ cr := &v1alpha1.CommittedResource{}
+ err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: name}, cr)
+ if err == nil {
+ env.T.Errorf("expected CommittedResource %q to be absent after rollback, but it still exists", name)
+ } else if !apierrors.IsNotFound(err) {
+ env.T.Errorf("unexpected error checking if CommittedResource %q is absent: %v", name, err)
}
}
-// VerifyAPIResponse verifies the API response matches expectations.
-// For rejection reasons, it checks if ALL expected substrings are present in the actual rejection reason.
-func (env *CommitmentTestEnv) VerifyAPIResponse(expected APIResponseExpectation, actual liquid.CommitmentChangeResponse, respJSON string, statusCode int) {
+func (env *CRTestEnv) VerifyAllowRejection(expected map[string]bool) {
env.T.Helper()
-
- if statusCode != expected.StatusCode {
- env.T.Errorf("Expected status code %d, got %d", expected.StatusCode, statusCode)
- }
-
- if len(expected.RejectReasonSubstrings) > 0 {
- if actual.RejectionReason == "" {
- env.T.Errorf("Expected rejection reason containing substrings %v, got none", expected.RejectReasonSubstrings)
- } else {
- // Check that ALL expected substrings are present
- for _, substring := range expected.RejectReasonSubstrings {
- if !strings.Contains(actual.RejectionReason, substring) {
- env.T.Errorf("Expected rejection reason to contain %q, but got %q", substring, actual.RejectionReason)
- }
- }
- }
- } else {
- if actual.RejectionReason != "" {
- env.T.Errorf("Expected no rejection reason, got %q", actual.RejectionReason)
- }
- }
-
- // Check RetryAt field presence in JSON (avoids dealing with option.Option type)
- retryAtPresent := strings.Contains(respJSON, `"retryAt"`)
- if expected.RetryAtPresent {
- if !retryAtPresent {
- env.T.Error("Expected retryAt field to be present in JSON response, but it was not found")
+ for crName, want := range expected {
+ cr := &v1alpha1.CommittedResource{}
+ if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: crName}, cr); err != nil {
+ env.T.Errorf("CommittedResource %q not found: %v", crName, err)
+ continue
}
- } else {
- if retryAtPresent {
- env.T.Error("Expected retryAt field to be absent from JSON response, but it was found")
+ if cr.Spec.AllowRejection != want {
+ env.T.Errorf("CommittedResource %q: AllowRejection=%v, want %v", crName, cr.Spec.AllowRejection, want)
}
}
}
-// VerifyReservationsMatch verifies that actual reservations match expected reservations by content.
-func (env *CommitmentTestEnv) VerifyReservationsMatch(expected []*TestReservation) {
+func (env *CRTestEnv) VerifyCRAmountBytes(crName string, wantBytes int64) {
env.T.Helper()
-
- actualReservations := env.ListReservations()
-
- // Make copies of both lists so we can remove matched items
- expectedCopy := make([]*TestReservation, len(expected))
- copy(expectedCopy, expected)
-
- actualCopy := make([]v1alpha1.Reservation, len(actualReservations))
- copy(actualCopy, actualReservations)
-
- // Track unmatched items for detailed reporting
- var unmatchedExpected []*TestReservation
- var unmatchedActual []v1alpha1.Reservation
-
- // Greedy matching: while there are expected items, find matches and remove
- for len(expectedCopy) > 0 {
- exp := expectedCopy[0]
- found := false
-
- // Find first actual that matches this expected
- for i, actual := range actualCopy {
- if env.reservationMatches(exp, &actual) {
- expectedCopy = expectedCopy[1:]
- actualCopy = append(actualCopy[:i], actualCopy[i+1:]...)
- found = true
- break
- }
- }
-
- if !found {
- unmatchedExpected = append(unmatchedExpected, exp)
- expectedCopy = expectedCopy[1:]
- }
+ cr := &v1alpha1.CommittedResource{}
+ if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: crName}, cr); err != nil {
+ env.T.Errorf("CommittedResource %q not found: %v", crName, err)
+ return
}
-
- unmatchedActual = actualCopy
-
- // If there are any mismatches, print detailed comparison
- if len(unmatchedExpected) > 0 || len(unmatchedActual) > 0 {
- env.T.Error("❌ Reservation mismatch detected!")
- env.T.Log("")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.T.Log("EXPECTED RESERVATIONS:")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.printExpectedReservations(expected, unmatchedExpected)
-
- env.T.Log("")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.T.Log("ACTUAL RESERVATIONS:")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.printActualReservations(actualReservations, unmatchedActual)
-
- env.T.Log("")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.T.Log("DIFF SUMMARY:")
- env.T.Log("═══════════════════════════════════════════════════════════════")
- env.printDiffSummary(unmatchedExpected, unmatchedActual)
- env.T.Log("═══════════════════════════════════════════════════════════════")
+ got := cr.Spec.Amount.Value()
+ if got != wantBytes {
+ env.T.Errorf("CommittedResource %q: Amount=%d bytes, want %d bytes", crName, got, wantBytes)
}
}
-// String returns a compact string representation of a TestReservation.
-func (tr *TestReservation) String() string {
- flavorName := ""
- flavorGroup := ""
- if tr.Flavor != nil {
- flavorName = tr.Flavor.Name
- flavorGroup = tr.Flavor.Group
- }
-
- host := tr.Host
- if host == "" {
- host = ""
- }
-
- az := tr.AZ
- if az == "" {
- az = ""
- }
+// ============================================================================
+// TestCR → v1alpha1.CommittedResource
+// ============================================================================
- vmInfo := ""
- if len(tr.VMs) > 0 {
- vmInfo = fmt.Sprintf(" VMs=%v", tr.VMs)
+func (tc *TestCR) toCommittedResource() *v1alpha1.CommittedResource {
+ amount := resource.NewQuantity(tc.AmountMiB*1024*1024, resource.BinarySI)
+ return &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "commitment-" + tc.CommitmentUUID,
+ },
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: tc.CommitmentUUID,
+ FlavorGroupName: "hana_1",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: *amount,
+ AvailabilityZone: tc.AZ,
+ ProjectID: tc.ProjectID,
+ State: tc.State,
+ },
}
-
- return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", tr.CommitmentID, tr.ProjectID, flavorName, flavorGroup, host, az, vmInfo)
}
-// compactReservationString returns a compact string representation of an actual Reservation.
-func compactReservationString(res *v1alpha1.Reservation) string {
- commitmentID := ""
- projectID := ""
- flavorName := ""
- flavorGroup := ""
- vmCount := 0
-
- if res.Spec.CommittedResourceReservation != nil {
- commitmentID = res.Spec.CommittedResourceReservation.CommitmentUUID
- projectID = res.Spec.CommittedResourceReservation.ProjectID
- flavorName = res.Spec.CommittedResourceReservation.ResourceName
- flavorGroup = res.Spec.CommittedResourceReservation.ResourceGroup
- if res.Status.CommittedResourceReservation != nil {
- vmCount = len(res.Status.CommittedResourceReservation.Allocations)
- }
- }
-
- host := res.Status.Host
- if host == "" {
- host = ""
- }
-
- az := res.Spec.AvailabilityZone
- if az == "" {
- az = ""
- }
+// ============================================================================
+// Request / Response helpers
+// ============================================================================
- vmInfo := ""
- if vmCount > 0 {
- vmInfo = fmt.Sprintf(" VMs=%d", vmCount)
+func newAPIResponse(rejectSubstrings ...string) APIResponseExpectation {
+ return APIResponseExpectation{
+ StatusCode: http.StatusOK,
+ RejectReasonSubstrings: rejectSubstrings,
}
-
- return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", commitmentID, projectID, flavorName, flavorGroup, host, az, vmInfo)
}
-// printExpectedReservations prints all expected reservations with markers for unmatched ones.
-func (env *CommitmentTestEnv) printExpectedReservations(all, unmatched []*TestReservation) {
- env.T.Helper()
-
- unmatchedMap := make(map[*TestReservation]bool)
- for _, res := range unmatched {
- unmatchedMap[res] = true
- }
-
- if len(all) == 0 {
- env.T.Log(" (none)")
- return
- }
-
- for i, res := range all {
- marker := "✓"
- if unmatchedMap[res] {
- marker = "✗"
- }
- env.T.Logf(" %s [%d] %s", marker, i+1, res.String())
+func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest {
+ return CommitmentChangeRequest{
+ AZ: az,
+ DryRun: dryRun,
+ InfoVersion: infoVersion,
+ Commitments: commitments,
}
-
- env.T.Logf(" Total: %d (%d matched, %d missing)",
- len(all), len(all)-len(unmatched), len(unmatched))
}
-// printActualReservations prints all actual reservations with markers for unmatched ones.
-func (env *CommitmentTestEnv) printActualReservations(all, unmatched []v1alpha1.Reservation) {
- env.T.Helper()
-
- unmatchedMap := make(map[string]bool)
- for _, res := range unmatched {
- unmatchedMap[res.Name] = true
- }
-
- if len(all) == 0 {
- env.T.Log(" (none)")
- return
+func createCommitment(resourceName, projectID, uuid, state string, amount uint64, _ ...string) TestCommitment {
+ return TestCommitment{
+ ResourceName: liquid.ResourceName(resourceName),
+ ProjectID: projectID,
+ ConfirmationID: uuid,
+ State: state,
+ Amount: amount,
}
+}
- for i, res := range all {
- marker := "✓"
- if unmatchedMap[res.Name] {
- marker = "⊕"
- }
- env.T.Logf(" %s [%d] %s", marker, i+1, compactReservationString(&res))
+// deleteCommitment builds a TestCommitment representing a removal (OldStatus=oldState, NewStatus=None).
+func deleteCommitment(resourceName, projectID, uuid, oldState string, amount uint64) TestCommitment {
+ return TestCommitment{
+ ResourceName: liquid.ResourceName(resourceName),
+ ProjectID: projectID,
+ ConfirmationID: uuid,
+ OldState: oldState,
+ State: "", // NewStatus = None
+ Amount: amount,
}
-
- env.T.Logf(" Total: %d (%d matched, %d unexpected)",
- len(all), len(all)-len(unmatched), len(unmatched))
}
-// printDiffSummary prints a summary of differences between expected and actual.
-func (env *CommitmentTestEnv) printDiffSummary(unmatchedExpected []*TestReservation, unmatchedActual []v1alpha1.Reservation) {
- env.T.Helper()
-
- if len(unmatchedExpected) > 0 {
- env.T.Logf(" MISSING (%d expected, not found):", len(unmatchedExpected))
- for _, res := range unmatchedExpected {
- env.T.Logf(" • %s", res.String())
+func buildRequestJSON(req CommitmentChangeRequest) string {
+ byProject := make(map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset)
+ for _, tc := range req.Commitments {
+ pid := liquid.ProjectUUID(tc.ProjectID)
+ if byProject[pid].ByResource == nil {
+ byProject[pid] = liquid.ProjectCommitmentChangeset{
+ ByResource: make(map[liquid.ResourceName]liquid.ResourceCommitmentChangeset),
+ }
}
- }
-
- if len(unmatchedActual) > 0 {
- env.T.Logf(" UNEXPECTED (%d found, not expected):", len(unmatchedActual))
- for _, res := range unmatchedActual {
- env.T.Logf(" • %s", compactReservationString(&res))
+ var oldStatus Option[liquid.CommitmentStatus]
+ if tc.OldState != "" {
+ oldStatus = Some(liquid.CommitmentStatus(tc.OldState))
+ } else {
+ oldStatus = None[liquid.CommitmentStatus]()
}
- }
-
- if len(unmatchedExpected) == 0 && len(unmatchedActual) == 0 {
- env.T.Log(" ✓ All match!")
- }
-}
-
-// reservationMatches checks if an actual reservation matches an expected one.
-// All fields are checked comprehensively for complete validation.
-func (env *CommitmentTestEnv) reservationMatches(expected *TestReservation, actual *v1alpha1.Reservation) bool {
- // Check CommitmentID (from reservation name prefix)
- if !strings.HasPrefix(actual.Name, "commitment-"+expected.CommitmentID+"-") {
- return false
- }
-
- // Check that CommittedResourceReservation spec exists
- if actual.Spec.CommittedResourceReservation == nil {
- return false
- }
-
- // Check CommitmentUUID in spec matches
- if actual.Spec.CommittedResourceReservation.CommitmentUUID != expected.CommitmentID {
- return false
- }
-
- // Check ProjectID
- if actual.Spec.CommittedResourceReservation.ProjectID != expected.ProjectID {
- return false
- }
-
- // Check ResourceName (flavor name)
- if expected.Flavor != nil {
- if actual.Spec.CommittedResourceReservation.ResourceName != expected.Flavor.Name {
- return false
+ var newStatus Option[liquid.CommitmentStatus]
+ if tc.State != "" {
+ newStatus = Some(liquid.CommitmentStatus(tc.State))
+ } else {
+ newStatus = None[liquid.CommitmentStatus]()
}
- }
-
- // Check ResourceGroup (flavor group)
- if expected.Flavor != nil {
- if actual.Spec.CommittedResourceReservation.ResourceGroup != expected.Flavor.Group {
- return false
+ commitment := liquid.Commitment{
+ UUID: liquid.CommitmentUUID(tc.ConfirmationID),
+ Amount: tc.Amount,
+ OldStatus: oldStatus,
+ NewStatus: newStatus,
+ ExpiresAt: time.Now().Add(365 * 24 * time.Hour),
+ }
+ byResource := byProject[pid].ByResource[tc.ResourceName]
+ byResource.Commitments = append(byResource.Commitments, commitment)
+
+ // Compute per-resource totals so RequiresConfirmation() behaves correctly.
+ // OldAmount overrides Amount for TotalBefore (resize-down: old amount != new amount).
+ oldAmt := tc.Amount
+ if tc.OldAmount != 0 {
+ oldAmt = tc.OldAmount
+ }
+ if oldStatus == Some(liquid.CommitmentStatusConfirmed) {
+ byResource.TotalConfirmedBefore += oldAmt
+ }
+ if newStatus == Some(liquid.CommitmentStatusConfirmed) {
+ byResource.TotalConfirmedAfter += tc.Amount
+ }
+ if oldStatus == Some(liquid.CommitmentStatusGuaranteed) {
+ byResource.TotalGuaranteedBefore += oldAmt
+ }
+ if newStatus == Some(liquid.CommitmentStatusGuaranteed) {
+ byResource.TotalGuaranteedAfter += tc.Amount
}
- }
-
- // Check Host (if specified in expected)
- if expected.Host != "" && actual.Status.Host != expected.Host {
- return false
- }
- // Check AZ (if specified in expected)
- if expected.AZ != "" && actual.Spec.AvailabilityZone != expected.AZ {
- return false
+ byProject[pid].ByResource[tc.ResourceName] = byResource
}
- // Check Memory (use custom MemoryMB if non-zero, otherwise use flavor size)
- expectedMemoryMB := expected.MemoryMB
- if expectedMemoryMB == 0 && expected.Flavor != nil {
- expectedMemoryMB = expected.Flavor.MemoryMB
+ request := liquid.CommitmentChangeRequest{
+ InfoVersion: req.InfoVersion,
+ AZ: liquid.AvailabilityZone(req.AZ),
+ DryRun: req.DryRun,
+ ByProject: byProject,
}
- memoryQuantity := actual.Spec.Resources["memory"]
- actualMemoryBytes := memoryQuantity.Value()
- actualMemoryMB := actualMemoryBytes / (1024 * 1024)
- if actualMemoryMB != expectedMemoryMB {
- return false
+ raw, err := json.Marshal(request)
+ if err != nil {
+ panic("failed to marshal request: " + err.Error())
}
+ return string(raw)
+}
- // Check CPU (from flavor if available)
- if expected.Flavor != nil {
- cpuQuantity := actual.Spec.Resources["cpu"]
- actualCPU := cpuQuantity.Value()
- if actualCPU != expected.Flavor.VCPUs {
- return false
- }
- }
+// ============================================================================
+// FlavorGroup Knowledge helpers
+// ============================================================================
- // Check VM allocations (set comparison - order doesn't matter)
- if !env.vmAllocationsMatch(expected.VMs, actual) {
- return false
+func buildFlavorGroupsKnowledge(flavors []*TestFlavor, infoVersion int64) FlavorGroupsKnowledge {
+ groupMap := make(map[string][]compute.FlavorInGroup)
+ for _, f := range flavors {
+ groupMap[f.Group] = append(groupMap[f.Group], f.ToFlavorInGroup())
}
- // Check reservation type
- if actual.Spec.Type != v1alpha1.ReservationTypeCommittedResource {
- return false
+ sortedNames := make([]string, 0, len(groupMap))
+ for n := range groupMap {
+ sortedNames = append(sortedNames, n)
}
+ sort.Strings(sortedNames)
- return true
-}
+ var groups []compute.FlavorGroupFeature
+ for _, name := range sortedNames {
+ gFlavors := groupMap[name]
+ sort.Slice(gFlavors, func(i, j int) bool { return gFlavors[i].MemoryMB > gFlavors[j].MemoryMB })
-// vmAllocationsMatch checks if VM allocations match (set comparison).
-func (env *CommitmentTestEnv) vmAllocationsMatch(expectedVMs []string, actual *v1alpha1.Reservation) bool {
- if actual.Status.CommittedResourceReservation == nil {
- return len(expectedVMs) == 0
- }
+ smallest := gFlavors[len(gFlavors)-1]
+ largest := gFlavors[0]
- actualVMs := make(map[string]bool)
- for vmUUID := range actual.Status.CommittedResourceReservation.Allocations {
- actualVMs[vmUUID] = true
+ var minR, maxR uint64 = ^uint64(0), 0
+ for _, f := range gFlavors {
+ if f.VCPUs == 0 {
+ continue
+ }
+ r := f.MemoryMB / f.VCPUs
+ if r < minR {
+ minR = r
+ }
+ if r > maxR {
+ maxR = r
+ }
+ }
+ var ratio, ratioMin, ratioMax *uint64
+ if minR == maxR && maxR != 0 {
+ ratio = &minR
+ } else if maxR != 0 {
+ ratioMin = &minR
+ ratioMax = &maxR
+ }
+ groups = append(groups, compute.FlavorGroupFeature{
+ Name: name,
+ Flavors: gFlavors,
+ SmallestFlavor: smallest,
+ LargestFlavor: largest,
+ RamCoreRatio: ratio,
+ RamCoreRatioMin: ratioMin,
+ RamCoreRatioMax: ratioMax,
+ })
}
+ return FlavorGroupsKnowledge{InfoVersion: infoVersion, Groups: groups}
+}
- // Check counts match
- if len(expectedVMs) != len(actualVMs) {
- return false
+func createKnowledgeCRD(fgk FlavorGroupsKnowledge) *v1alpha1.Knowledge {
+ raw, err := v1alpha1.BoxFeatureList(fgk.Groups)
+ if err != nil {
+ panic("failed to box flavor group features: " + err.Error())
}
- // Check all expected VMs are in actual
- for _, vmUUID := range expectedVMs {
- if !actualVMs[vmUUID] {
- return false
- }
+ lastChange := metav1.NewTime(time.Unix(fgk.InfoVersion, 0))
+ return &v1alpha1.Knowledge{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: flavorGroupsKnowledgeName,
+ },
+ Spec: v1alpha1.KnowledgeSpec{
+ SchedulingDomain: v1alpha1.SchedulingDomainNova,
+ },
+ Status: v1alpha1.KnowledgeStatus{
+ Conditions: []metav1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: metav1.ConditionTrue, Reason: "Extracted"}},
+ Raw: raw,
+ LastContentChange: lastChange,
+ },
}
-
- return true
}
// ============================================================================
-// Mock VM Source
+// MockVMSource (kept for compatibility with handler.go / report_usage tests)
// ============================================================================
-// MockVMSource implements VMSource for testing.
type MockVMSource struct {
- VMs []VM
+ vms []VM
+ mu sync.Mutex
+}
+
+type VM struct {
+ UUID string
+ FlavorName string
+ ProjectID string
+ CurrentHypervisor string
+ AvailabilityZone string
+ Resources map[string]int64
+ FlavorExtraSpecs map[string]string
}
-// NewMockVMSource creates a new MockVMSource with the given VMs.
func NewMockVMSource(vms []VM) *MockVMSource {
- return &MockVMSource{VMs: vms}
+ return &MockVMSource{vms: vms}
}
-// ListVMs returns the configured VMs.
-func (s *MockVMSource) ListVMs(_ context.Context) ([]VM, error) {
- return s.VMs, nil
+func (m *MockVMSource) ListVMs(_ context.Context) ([]VM, error) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ result := make([]VM, len(m.vms))
+ copy(result, m.vms)
+ return result, nil
}
// ============================================================================
-// Helper Functions
+// TestVM (kept for tests in other files that still use it)
// ============================================================================
-// newHypervisorWithAZ creates a Hypervisor CRD with the given parameters including availability zone.
-func newHypervisorWithAZ(name string, cpuCap, memoryGi, cpuAlloc, memoryGiAlloc int, instances []hv1.Instance, traits []string, az string) *hv1.Hypervisor {
- labels := make(map[string]string)
- if az != "" {
- labels[corev1.LabelTopologyZone] = az
- }
- return &hv1.Hypervisor{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- Labels: labels,
- },
- Status: hv1.HypervisorStatus{
- Capacity: map[hv1.ResourceName]resource.Quantity{
- "cpu": resource.MustParse(strconv.Itoa(cpuCap)),
- "memory": resource.MustParse(strconv.Itoa(memoryGi) + "Gi"),
- },
- Allocation: map[hv1.ResourceName]resource.Quantity{
- "cpu": resource.MustParse(strconv.Itoa(cpuAlloc)),
- "memory": resource.MustParse(strconv.Itoa(memoryGiAlloc) + "Gi"),
- },
- NumInstances: len(instances),
- Instances: instances,
- Traits: traits,
- },
- }
-}
-
-// createCommitment creates a TestCommitment for use in test cases.
-// The az parameter is optional - if empty string, no AZ constraint is set.
-func createCommitment(resourceName, projectID, confirmationID, state string, amount uint64, az ...string) TestCommitment {
- return TestCommitment{
- ResourceName: liquid.ResourceName(resourceName),
- ProjectID: projectID,
- ConfirmationID: confirmationID,
- State: state,
- Amount: amount,
- }
-}
-
-// newCommitmentRequest creates a CommitmentChangeRequest with the given commitments.
-func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest {
- return CommitmentChangeRequest{
- AZ: az,
- DryRun: dryRun,
- InfoVersion: infoVersion,
- Commitments: commitments,
- }
-}
-
-// newAPIResponse creates an APIResponseExpectation with 200 OK status.
-func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation {
- return APIResponseExpectation{
- StatusCode: 200,
- RejectReasonSubstrings: rejectReasonSubstrings,
- }
-}
-
-// buildRequestJSON converts a test CommitmentChangeRequest to JSON string.
-// Builds the nested JSON structure directly for simplicity.
-// Uses sorted iteration to ensure deterministic JSON output.
-func buildRequestJSON(req CommitmentChangeRequest) string {
- // Group commitments by project and resource for nested structure
- type projectResources map[liquid.ResourceName][]TestCommitment
- byProject := make(map[string]projectResources)
-
- for _, commit := range req.Commitments {
- if byProject[commit.ProjectID] == nil {
- byProject[commit.ProjectID] = make(projectResources)
- }
- byProject[commit.ProjectID][commit.ResourceName] = append(
- byProject[commit.ProjectID][commit.ResourceName],
- commit,
- )
- }
-
- // Sort projects for deterministic iteration
- sortedProjects := make([]string, 0, len(byProject))
- for projectID := range byProject {
- sortedProjects = append(sortedProjects, projectID)
- }
- sort.Strings(sortedProjects)
-
- // Build nested JSON structure with sorted iteration
- var projectParts []string
- for _, projectID := range sortedProjects {
- resources := byProject[projectID]
-
- // Sort resource names for deterministic iteration
- sortedResources := make([]liquid.ResourceName, 0, len(resources))
- for resourceName := range resources {
- sortedResources = append(sortedResources, resourceName)
- }
- sort.Slice(sortedResources, func(i, j int) bool {
- return string(sortedResources[i]) < string(sortedResources[j])
- })
-
- var resourceParts []string
- for _, resourceName := range sortedResources {
- commits := resources[resourceName]
- var commitParts []string
- for _, c := range commits {
- expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour)
- commitParts = append(commitParts, fmt.Sprintf(`{"uuid":"%s","newStatus":"%s","amount":%d,"expiresAt":"%s"}`,
- c.ConfirmationID, c.State, c.Amount, expiryTime.Format(time.RFC3339)))
- }
- resourceParts = append(resourceParts, fmt.Sprintf(`"%s":{"commitments":[%s]}`,
- resourceName, strings.Join(commitParts, ",")))
- }
- projectParts = append(projectParts, fmt.Sprintf(`"%s":{"byResource":{%s}}`,
- projectID, strings.Join(resourceParts, ",")))
- }
-
- return fmt.Sprintf(`{"az":"%s","dryRun":%t,"infoVersion":%d,"byProject":{%s}}`,
- req.AZ, req.DryRun, req.InfoVersion, strings.Join(projectParts, ","))
+type TestVM struct {
+ UUID string
+ Flavor *TestFlavor
+ ProjectID string
+ Host string
+ AZ string
}
-// createKnowledgeCRD creates a Knowledge CRD populated with flavor groups.
-func createKnowledgeCRD(flavorGroups FlavorGroupsKnowledge) *v1alpha1.Knowledge {
- rawExt, err := v1alpha1.BoxFeatureList(flavorGroups.Groups)
- if err != nil {
- panic("Failed to box flavor groups: " + err.Error())
- }
-
- lastContentChange := time.Unix(flavorGroups.InfoVersion, 0)
-
- return &v1alpha1.Knowledge{
- ObjectMeta: metav1.ObjectMeta{
- Name: flavorGroupsKnowledgeName,
- },
- Spec: v1alpha1.KnowledgeSpec{
- SchedulingDomain: v1alpha1.SchedulingDomainNova,
- Extractor: v1alpha1.KnowledgeExtractorSpec{
- Name: flavorGroupsKnowledgeName,
- },
- Recency: metav1.Duration{Duration: knowledgeRecencyDuration},
+func (vm *TestVM) ToVM() VM {
+ return VM{
+ UUID: vm.UUID,
+ FlavorName: vm.Flavor.Name,
+ ProjectID: vm.ProjectID,
+ CurrentHypervisor: vm.Host,
+ AvailabilityZone: vm.AZ,
+ Resources: map[string]int64{
+ "memory": vm.Flavor.MemoryMB,
+ "vcpus": vm.Flavor.VCPUs,
},
- Status: v1alpha1.KnowledgeStatus{
- LastExtracted: metav1.Time{Time: lastContentChange},
- LastContentChange: metav1.Time{Time: lastContentChange},
- Raw: rawExt,
- RawLength: len(flavorGroups.Groups),
- Conditions: []metav1.Condition{
- {
- Type: v1alpha1.KnowledgeConditionReady,
- Status: metav1.ConditionTrue,
- Reason: "KnowledgeReady",
- Message: "Flavor groups knowledge is ready",
- LastTransitionTime: metav1.Time{Time: lastContentChange},
- },
- },
+ FlavorExtraSpecs: map[string]string{
+ "quota:hw_version": vm.Flavor.Group,
},
}
}
diff --git a/internal/scheduling/reservations/commitments/api/handler.go b/internal/scheduling/reservations/commitments/api/handler.go
index f0eb24110..051a82fa2 100644
--- a/internal/scheduling/reservations/commitments/api/handler.go
+++ b/internal/scheduling/reservations/commitments/api/handler.go
@@ -20,7 +20,7 @@ var apiLog = ctrl.Log.WithName("committed-resource")
// HTTPAPI implements Limes LIQUID commitment validation endpoints.
type HTTPAPI struct {
client client.Client
- config commitments.Config
+ config commitments.APIConfig
usageDB commitments.UsageDBClient
monitor ChangeCommitmentsAPIMonitor
usageMonitor ReportUsageAPIMonitor
@@ -31,11 +31,11 @@ type HTTPAPI struct {
}
func NewAPI(client client.Client) *HTTPAPI {
- return NewAPIWithConfig(client, commitments.DefaultConfig(), nil)
+ return NewAPIWithConfig(client, commitments.DefaultAPIConfig(), nil)
}
// NewAPIWithConfig creates an HTTPAPI with the given config and optional usageDB client.
-func NewAPIWithConfig(k8sClient client.Client, config commitments.Config, usageDB commitments.UsageDBClient) *HTTPAPI {
+func NewAPIWithConfig(k8sClient client.Client, config commitments.APIConfig, usageDB commitments.UsageDBClient) *HTTPAPI {
return &HTTPAPI{
client: k8sClient,
config: config,
@@ -58,9 +58,9 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log
mux.HandleFunc("/commitments/v1/projects/", api.handleProjectEndpoint) // routes to report-usage or quota
log.Info("commitments API initialized",
- "changeCommitmentsEnabled", api.config.EnableChangeCommitmentsAPI,
- "reportUsageEnabled", api.config.EnableReportUsageAPI,
- "reportCapacityEnabled", api.config.EnableReportCapacityAPI)
+ "changeCommitmentsEnabled", api.config.EnableChangeCommitments,
+ "reportUsageEnabled", api.config.EnableReportUsage,
+ "reportCapacityEnabled", api.config.EnableReportCapacity)
}
// handleProjectEndpoint routes /commitments/v1/projects/:project_id/... requests to the appropriate handler.
diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go
index 6999b38d6..2e8ddc8a8 100644
--- a/internal/scheduling/reservations/commitments/api/info.go
+++ b/internal/scheduling/reservations/commitments/api/info.go
@@ -219,7 +219,8 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l
"version", version)
return liquid.ServiceInfo{
- Version: version,
- Resources: resources,
+ Version: version,
+ Resources: resources,
+ CommitmentHandlingNeedsProjectMetadata: true,
}, nil
}
diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go
index f846fea8e..9f0966cce 100644
--- a/internal/scheduling/reservations/commitments/api/report_capacity.go
+++ b/internal/scheduling/reservations/commitments/api/report_capacity.go
@@ -31,7 +31,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request)
w.Header().Set("X-Request-ID", requestID)
// Check if API is enabled
- if !api.config.EnableReportCapacityAPI {
+ if !api.config.EnableReportCapacity {
statusCode = http.StatusServiceUnavailable
http.Error(w, "report-capacity API is disabled", statusCode)
api.recordCapacityMetrics(statusCode, startTime)
diff --git a/internal/scheduling/reservations/commitments/api/report_usage.go b/internal/scheduling/reservations/commitments/api/report_usage.go
index d87f7c24a..bf48dfe00 100644
--- a/internal/scheduling/reservations/commitments/api/report_usage.go
+++ b/internal/scheduling/reservations/commitments/api/report_usage.go
@@ -36,7 +36,7 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
log := apiLog.WithValues("requestID", requestID, "endpoint", "report-usage")
// Check if API is enabled
- if !api.config.EnableReportUsageAPI {
+ if !api.config.EnableReportUsage {
statusCode = http.StatusServiceUnavailable
log.Info("report-usage API is disabled, rejecting request")
http.Error(w, "report-usage API is disabled", statusCode)
diff --git a/internal/scheduling/reservations/commitments/api/report_usage_test.go b/internal/scheduling/reservations/commitments/api/report_usage_test.go
index 4cafdc213..719a7bbb1 100644
--- a/internal/scheduling/reservations/commitments/api/report_usage_test.go
+++ b/internal/scheduling/reservations/commitments/api/report_usage_test.go
@@ -580,7 +580,7 @@ func newUsageTestEnv(
}
// Create API with mock DB client
- api := NewAPIWithConfig(k8sClient, commitments.DefaultConfig(), dbClient)
+ api := NewAPIWithConfig(k8sClient, commitments.DefaultAPIConfig(), dbClient)
mux := http.NewServeMux()
registry := prometheus.NewRegistry()
api.Init(mux, registry, log.Log)
diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go
index a25d63e3a..0481395fc 100644
--- a/internal/scheduling/reservations/commitments/committed_resource_controller.go
+++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go
@@ -29,7 +29,7 @@ const crFinalizer = "committed-resource.reservations.cortex.cloud/cleanup"
type CommittedResourceController struct {
client.Client
Scheme *runtime.Scheme
- Conf Config
+ Conf CommittedResourceControllerConfig
}
func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
@@ -58,7 +58,7 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re
switch cr.Spec.State {
case v1alpha1.CommitmentStatusPlanned:
- return ctrl.Result{}, r.setNotReady(ctx, &cr, "Planned", "commitment is not yet active")
+ return ctrl.Result{}, r.setNotReady(ctx, &cr, v1alpha1.CommittedResourceReasonPlanned, "commitment is not yet active")
case v1alpha1.CommitmentStatusPending:
return r.reconcilePending(ctx, logger, &cr)
case v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommitmentStatusConfirmed:
@@ -71,16 +71,41 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re
}
}
-// reconcilePending handles a one-shot confirmation attempt (Limes state: pending).
-// If placement fails for any reason, all partial reservations are removed and the
-// CR is marked Rejected so the HTTP API can report the outcome back to Limes.
+// reconcilePending handles a confirmation attempt (Limes state: pending).
+// If AllowRejection=true (API path), placement failure marks the CR Rejected so the HTTP API
+// can report the outcome back to Limes. If AllowRejection=false (syncer path), the controller
+// retries indefinitely — Limes does not require confirmation for these transitions.
func (r *CommittedResourceController) reconcilePending(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) {
- if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil {
- logger.Error(applyErr, "pending commitment placement failed, rejecting")
- if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil {
- return ctrl.Result{}, rollbackErr
+ result, applyErr := r.applyReservationState(ctx, logger, cr)
+ if applyErr != nil {
+ if cr.Spec.AllowRejection {
+ logger.Error(applyErr, "pending commitment placement failed, rejecting")
+ if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil {
+ return ctrl.Result{}, rollbackErr
+ }
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error())
+ }
+ logger.Error(applyErr, "pending commitment placement failed, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration)
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error())
+ }
+ allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots)
+ if err != nil {
+ return ctrl.Result{}, err
+ }
+ if anyFailed {
+ if cr.Spec.AllowRejection {
+ logger.Info("pending commitment rejected: reservation placement failed", "reason", failReason)
+ if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil {
+ return ctrl.Result{}, rollbackErr
+ }
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason)
}
- return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error())
+ logger.Info("pending commitment placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration)
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason)
+ }
+ if !allReady {
+ // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue.
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement")
}
return ctrl.Result{}, r.setAccepted(ctx, cr)
}
@@ -89,42 +114,113 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo
// Spec errors are permanent regardless of AllowRejection — a bad spec won't fix itself.
if _, err := FromCommittedResource(*cr); err != nil {
logger.Error(err, "invalid commitment spec, rejecting")
- return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", err.Error())
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, err.Error())
}
- if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil {
+ result, applyErr := r.applyReservationState(ctx, logger, cr)
+ if applyErr != nil {
if cr.Spec.AllowRejection {
logger.Error(applyErr, "committed placement failed, rolling back to accepted amount")
if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil {
return ctrl.Result{}, rollbackErr
}
- return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error())
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error())
+ }
+ logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration)
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error())
+ }
+ allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots)
+ if err != nil {
+ return ctrl.Result{}, err
+ }
+ if anyFailed {
+ if cr.Spec.AllowRejection {
+ logger.Info("committed placement failed, rolling back to accepted amount", "reason", failReason)
+ if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil {
+ return ctrl.Result{}, rollbackErr
+ }
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason)
}
- logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry)
- return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, r.setNotReady(ctx, cr, "Reserving", applyErr.Error())
+ logger.Info("committed placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration)
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason)
+ }
+ if !allReady {
+ // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue.
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement")
}
return ctrl.Result{}, r.setAccepted(ctx, cr)
}
-func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) error {
+func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (*ApplyResult, error) {
knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client}
flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
if err != nil {
- return fmt.Errorf("flavor knowledge not ready: %w", err)
+ return nil, fmt.Errorf("flavor knowledge not ready: %w", err)
}
state, err := FromCommittedResource(*cr)
if err != nil {
- return fmt.Errorf("invalid commitment spec: %w", err)
+ return nil, fmt.Errorf("invalid commitment spec: %w", err)
}
state.NamePrefix = cr.Name + "-"
state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx)
+ state.ParentGeneration = cr.Generation
result, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller")
if err != nil {
- return err
+ return nil, err
}
logger.Info("commitment state applied", "created", result.Created, "deleted", result.Deleted, "repaired", result.Repaired)
- return nil
+ return result, nil
+}
+
+// checkChildReservationStatus inspects the Ready conditions of all child Reservations for cr.
+// Returns allReady=true when every child has observed the current generation and has Ready=True.
+// Returns anyFailed=true (and the first failure message) when any child at the current generation has Ready=False.
+// Returns allReady=false, anyFailed=false while placement is still pending (fewer children than expected, stale ObservedParentGeneration, or Ready condition not yet set).
+func (r *CommittedResourceController) checkChildReservationStatus(ctx context.Context, cr *v1alpha1.CommittedResource, expectedSlots int) (allReady, anyFailed bool, failReason string, err error) {
+ var list v1alpha1.ReservationList
+ if err := r.List(ctx, &list,
+ client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource},
+ client.MatchingFields{idxReservationByCommitmentUUID: cr.Spec.CommitmentUUID},
+ ); err != nil {
+ return false, false, "", fmt.Errorf("failed to list reservations: %w", err)
+ }
+
+ // Cache hasn't caught up yet; Reservation watch will re-enqueue.
+ if len(list.Items) < expectedSlots {
+ return false, false, "", nil
+ }
+
+ if len(list.Items) == 0 {
+ return true, false, "", nil
+ }
+
+ // First pass: failures take priority over pending — but only for the current generation.
+ // A Ready=False condition from a previous generation means the reservation controller
+ // hasn't reprocessed this slot yet; treat it as still-pending, not as a current failure.
+ for _, res := range list.Items {
+ if res.Status.CommittedResourceReservation == nil ||
+ res.Status.CommittedResourceReservation.ObservedParentGeneration != cr.Generation {
+ continue
+ }
+ cond := meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+ if cond != nil && cond.Status == metav1.ConditionFalse {
+ return false, true, cond.Message, nil
+ }
+ }
+ // Second pass: check generation and readiness for all slots.
+ for _, res := range list.Items {
+ // ObservedParentGeneration must match cr.Generation before we trust the Ready condition.
+ if res.Status.CommittedResourceReservation == nil ||
+ res.Status.CommittedResourceReservation.ObservedParentGeneration != cr.Generation {
+ return false, false, "", nil
+ }
+ cond := meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+ if cond == nil || cond.Status != metav1.ConditionTrue {
+ return false, false, "", nil
+ }
+ }
+ return true, false, "", nil
}
func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alpha1.CommittedResource) error {
@@ -136,7 +232,7 @@ func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alp
meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
Type: v1alpha1.CommittedResourceConditionReady,
Status: metav1.ConditionTrue,
- Reason: "Accepted",
+ Reason: v1alpha1.CommittedResourceReasonAccepted,
Message: "commitment successfully reserved",
LastTransitionTime: now,
})
@@ -170,17 +266,14 @@ func (r *CommittedResourceController) reconcileDeletion(ctx context.Context, log
// identified by matching CommitmentUUID in the reservation spec.
func (r *CommittedResourceController) deleteChildReservations(ctx context.Context, cr *v1alpha1.CommittedResource) error {
var list v1alpha1.ReservationList
- if err := r.List(ctx, &list, client.MatchingLabels{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- }); err != nil {
+ if err := r.List(ctx, &list,
+ client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource},
+ client.MatchingFields{idxReservationByCommitmentUUID: cr.Spec.CommitmentUUID},
+ ); err != nil {
return fmt.Errorf("failed to list reservations: %w", err)
}
for i := range list.Items {
res := &list.Items[i]
- if res.Spec.CommittedResourceReservation == nil ||
- res.Spec.CommittedResourceReservation.CommitmentUUID != cr.Spec.CommitmentUUID {
- continue
- }
if err := r.Delete(ctx, res); client.IgnoreNotFound(err) != nil {
return fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
}
@@ -210,6 +303,7 @@ func (r *CommittedResourceController) rollbackToAccepted(ctx context.Context, lo
state.TotalMemoryBytes = cr.Status.AcceptedAmount.Value()
state.NamePrefix = cr.Name + "-"
state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx)
+ state.ParentGeneration = cr.Generation
if _, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller-rollback"); err != nil {
return fmt.Errorf("rollback apply failed: %w", err)
}
@@ -271,6 +365,9 @@ func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *mu
if err != nil {
return err
}
+ // MaxConcurrentReconciles=1: the change-commitments API handler snapshots each CR's spec
+ // before writing and restores it on rollback. Concurrent reconciles across overlapping
+ // batch requests could interleave those snapshots and produce incorrect rollback state.
return bldr.Named("committed-resource").
WithOptions(controller.Options{
MaxConcurrentReconciles: 1,
diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go
index 6e6103972..471c013e3 100644
--- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go
+++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go
@@ -10,6 +10,7 @@ import (
"time"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -106,6 +107,20 @@ func newCRTestClient(scheme *runtime.Scheme, objects ...client.Object) client.Cl
WithScheme(scheme).
WithObjects(objects...).
WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Reservation{}).
+ WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string {
+ res, ok := obj.(*v1alpha1.Reservation)
+ if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{res.Spec.CommittedResourceReservation.CommitmentUUID}
+ }).
+ WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string {
+ cr, ok := obj.(*v1alpha1.CommittedResource)
+ if !ok || cr.Spec.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{cr.Spec.CommitmentUUID}
+ }).
Build()
}
@@ -153,6 +168,39 @@ func countChildReservations(t *testing.T, k8sClient client.Client, commitmentUUI
return count
}
+// setChildReservationsReady simulates the reservation controller by marking all child
+// Reservations for the given commitmentUUID as Ready=True and echoing ParentGeneration
+// into ObservedParentGeneration (matching what echoParentGeneration does in production).
+func setChildReservationsReady(t *testing.T, k8sClient client.Client, commitmentUUID string) {
+ t.Helper()
+ var list v1alpha1.ReservationList
+ if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ }); err != nil {
+ t.Fatalf("list reservations: %v", err)
+ }
+ for i := range list.Items {
+ res := &list.Items[i]
+ if res.Spec.CommittedResourceReservation == nil ||
+ res.Spec.CommittedResourceReservation.CommitmentUUID != commitmentUUID {
+ continue
+ }
+ res.Status.Conditions = []metav1.Condition{{
+ Type: v1alpha1.ReservationConditionReady,
+ Status: metav1.ConditionTrue,
+ Reason: "ReservationActive",
+ LastTransitionTime: metav1.Now(),
+ }}
+ if res.Status.CommittedResourceReservation == nil {
+ res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{}
+ }
+ res.Status.CommittedResourceReservation.ObservedParentGeneration = res.Spec.CommittedResourceReservation.ParentGeneration
+ if err := k8sClient.Status().Update(context.Background(), res); err != nil {
+ t.Fatalf("set reservation Ready=True: %v", err)
+ }
+ }
+}
+
// ============================================================================
// Tests: per-state reconcile paths
// ============================================================================
@@ -208,10 +256,21 @@ func TestCommittedResourceController_Reconcile(t *testing.T) {
objects = append(objects, newTestFlavorKnowledge())
}
k8sClient := newCRTestClient(scheme, objects...)
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
+ // First reconcile: creates Reservation CRDs; if slots are expected, controller
+ // waits for the reservation controller to set Ready=True before accepting.
if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
- t.Fatalf("reconcile: %v", err)
+ t.Fatalf("reconcile 1: %v", err)
+ }
+
+ if tt.expectedSlots > 0 {
+ // Simulate reservation controller: mark all child reservations as Ready=True.
+ setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID)
+ // Second reconcile: sees all Ready=True and accepts.
+ if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+ t.Fatalf("reconcile 2: %v", err)
+ }
}
assertCondition(t, k8sClient, cr.Name, tt.expectedStatus, tt.expectedReason)
@@ -260,7 +319,7 @@ func TestCommittedResourceController_InactiveStates(t *testing.T) {
},
}
k8sClient := newCRTestClient(scheme, cr, existing)
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
t.Fatalf("reconcile: %v", err)
@@ -288,11 +347,19 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) {
expectRequeue bool
}{
{
- name: "pending: always rejects on failure, no retry",
+ name: "pending AllowRejection=true: rejects on failure, no retry",
state: v1alpha1.CommitmentStatusPending,
+ allowRejection: true,
expectedReason: "Rejected",
expectRequeue: false,
},
+ {
+ name: "pending AllowRejection=false: retries on failure",
+ state: v1alpha1.CommitmentStatusPending,
+ allowRejection: false,
+ expectedReason: "Reserving",
+ expectRequeue: true,
+ },
{
name: "guaranteed AllowRejection=true: rejects on failure, no retry",
state: v1alpha1.CommitmentStatusGuaranteed,
@@ -332,7 +399,7 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) {
controller := &CommittedResourceController{
Client: k8sClient,
Scheme: scheme,
- Conf: Config{RequeueIntervalRetry: 1 * time.Minute},
+ Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 1 * time.Minute}},
}
result, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name))
@@ -354,8 +421,58 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) {
}
}
+func TestCommittedResourceController_Rollback(t *testing.T) {
+ scheme := newCRTestScheme(t)
+
+ // CR at generation 2; AcceptedAmount reflects what was accepted at generation 1.
+ cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
+ cr.Generation = 2
+ accepted := resource.MustParse("4Gi")
+ cr.Status.AcceptedAmount = &accepted
+
+ // Existing reservation with stale ParentGeneration from the previous generation.
+ existing := &v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-cr-0",
+ Labels: map[string]string{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ },
+ },
+ Spec: v1alpha1.ReservationSpec{
+ Type: v1alpha1.ReservationTypeCommittedResource,
+ SchedulingDomain: v1alpha1.SchedulingDomainNova,
+ AvailabilityZone: "test-az",
+ Resources: map[hv1.ResourceName]resource.Quantity{
+ hv1.ResourceMemory: resource.MustParse("4Gi"),
+ hv1.ResourceCPU: resource.MustParse("2"),
+ },
+ CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+ CommitmentUUID: "test-uuid-1234",
+ ProjectID: "test-project",
+ DomainID: "test-domain",
+ ResourceGroup: "test-group",
+ ParentGeneration: 1, // stale
+ },
+ },
+ }
+
+ k8sClient := newCRTestClient(scheme, cr, existing, newTestFlavorKnowledge())
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
+
+ if err := controller.rollbackToAccepted(context.Background(), logr.Discard(), cr); err != nil {
+ t.Fatalf("rollbackToAccepted: %v", err)
+ }
+
+ var res v1alpha1.Reservation
+ if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: "test-cr-0"}, &res); err != nil {
+ t.Fatalf("get reservation: %v", err)
+ }
+ if got := res.Spec.CommittedResourceReservation.ParentGeneration; got != cr.Generation {
+ t.Errorf("ParentGeneration: want %d, got %d", cr.Generation, got)
+ }
+}
+
func TestCommittedResourceController_BadSpec(t *testing.T) {
- // Invalid UUID fails commitmentUUIDPattern — permanently broken regardless of AllowRejection.
scheme := newCRTestScheme(t)
cr := &v1alpha1.CommittedResource{
ObjectMeta: metav1.ObjectMeta{
@@ -374,7 +491,7 @@ func TestCommittedResourceController_BadSpec(t *testing.T) {
},
}
k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge())
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
t.Fatalf("reconcile: %v", err)
@@ -390,11 +507,18 @@ func TestCommittedResourceController_Idempotent(t *testing.T) {
scheme := newCRTestScheme(t)
cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge())
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
- for i := range 3 {
+ // Round 1: creates reservation, waits for placement.
+ if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+ t.Fatalf("reconcile 1: %v", err)
+ }
+ // Simulate reservation controller setting Ready=True.
+ setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID)
+ // Rounds 2 and 3: accepts, then stays accepted.
+ for i := 2; i <= 3; i++ {
if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
- t.Fatalf("reconcile %d: %v", i+1, err)
+ t.Fatalf("reconcile %d: %v", i, err)
}
}
@@ -404,6 +528,111 @@ func TestCommittedResourceController_Idempotent(t *testing.T) {
assertCondition(t, k8sClient, cr.Name, metav1.ConditionTrue, "Accepted")
}
+// ============================================================================
+// Tests: checkChildReservationStatus generation guard
+// ============================================================================
+
+// TestCheckChildReservationStatus_GenerationGuard verifies the two-pass logic that
+// distinguishes a stale Ready=False (previous generation) from a current failure.
+func TestCheckChildReservationStatus_GenerationGuard(t *testing.T) {
+ tests := []struct {
+ name string
+ obsGen int64
+ condStatus metav1.ConditionStatus // "" = no condition set
+ condMessage string
+ wantAllReady bool
+ wantAnyFailed bool
+ wantReason string
+ }{
+ {
+ name: "Ready=False at stale generation: treated as pending",
+ obsGen: 1,
+ condStatus: metav1.ConditionFalse,
+ condMessage: "no hosts available",
+ wantAllReady: false,
+ wantAnyFailed: false,
+ },
+ {
+ name: "Ready=False at current generation: is a current failure",
+ obsGen: 2,
+ condStatus: metav1.ConditionFalse,
+ condMessage: "no hosts available",
+ wantAllReady: false,
+ wantAnyFailed: true,
+ wantReason: "no hosts available",
+ },
+ {
+ name: "Ready=True at current generation: allReady",
+ obsGen: 2,
+ condStatus: metav1.ConditionTrue,
+ wantAllReady: true,
+ },
+ {
+ name: "no condition yet at current generation: still pending",
+ obsGen: 2,
+ condStatus: "", // no condition
+ wantAllReady: false,
+ wantAnyFailed: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ scheme := newCRTestScheme(t)
+ cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
+ cr.Generation = 2
+
+ child := &v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-cr-0",
+ Labels: map[string]string{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ },
+ },
+ Spec: v1alpha1.ReservationSpec{
+ Type: v1alpha1.ReservationTypeCommittedResource,
+ CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+ CommitmentUUID: cr.Spec.CommitmentUUID,
+ ParentGeneration: cr.Generation,
+ },
+ },
+ }
+ k8sClient := newCRTestClient(scheme, child)
+
+ child.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{
+ ObservedParentGeneration: tt.obsGen,
+ }
+ if tt.condStatus != "" {
+ child.Status.Conditions = []metav1.Condition{{
+ Type: v1alpha1.ReservationConditionReady,
+ Status: tt.condStatus,
+ Reason: "Test",
+ Message: tt.condMessage,
+ LastTransitionTime: metav1.Now(),
+ }}
+ }
+ if err := k8sClient.Status().Update(context.Background(), child); err != nil {
+ t.Fatalf("set reservation status: %v", err)
+ }
+
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme}
+ allReady, anyFailed, reason, err := controller.checkChildReservationStatus(context.Background(), cr, 1)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if allReady != tt.wantAllReady {
+ t.Errorf("allReady: want %v, got %v", tt.wantAllReady, allReady)
+ }
+ if anyFailed != tt.wantAnyFailed {
+ t.Errorf("anyFailed: want %v, got %v", tt.wantAnyFailed, anyFailed)
+ }
+ if reason != tt.wantReason {
+ t.Errorf("reason: want %q, got %q", tt.wantReason, reason)
+ }
+ })
+ }
+}
+
func TestCommittedResourceController_Deletion(t *testing.T) {
scheme := newCRTestScheme(t)
cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
@@ -422,7 +651,7 @@ func TestCommittedResourceController_Deletion(t *testing.T) {
},
}
k8sClient := newCRTestClient(scheme, cr, child)
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}}
+ controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
if err := k8sClient.Delete(context.Background(), cr); err != nil {
t.Fatalf("delete CR: %v", err)
diff --git a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go
index 01a0b4199..0090e45f5 100644
--- a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go
+++ b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go
@@ -24,6 +24,7 @@ import (
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
"k8s.io/apimachinery/pkg/api/meta"
+ "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
@@ -55,6 +56,20 @@ func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv {
&v1alpha1.Reservation{},
&v1alpha1.Knowledge{},
).
+ WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string {
+ res, ok := obj.(*v1alpha1.Reservation)
+ if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{res.Spec.CommittedResourceReservation.CommitmentUUID}
+ }).
+ WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string {
+ cr, ok := obj.(*v1alpha1.CommittedResource)
+ if !ok || cr.Spec.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{cr.Spec.CommitmentUUID}
+ }).
Build()
schedulerServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -67,19 +82,19 @@ func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv {
crCtrl := &CommittedResourceController{
Client: k8sClient,
Scheme: scheme,
- Conf: Config{RequeueIntervalRetry: 5 * time.Minute},
+ Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 5 * time.Minute}},
}
resCtrl := &CommitmentReservationController{
Client: k8sClient,
Scheme: scheme,
- Conf: Config{
+ Conf: ReservationControllerConfig{
SchedulerURL: schedulerServer.URL,
- AllocationGracePeriod: 15 * time.Minute,
- RequeueIntervalActive: 5 * time.Minute,
+ AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute},
+ RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute},
},
}
- if err := resCtrl.Init(context.Background(), k8sClient, resCtrl.Conf); err != nil {
+ if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil {
t.Fatalf("resCtrl.Init: %v", err)
}
@@ -136,196 +151,483 @@ func (e *crIntegrationEnv) getCR(t *testing.T, name string) v1alpha1.CommittedRe
return cr
}
+// reconcileChildReservations runs the reservation controller twice on every child Reservation
+// for crName (first reconcile sets TargetHost, second sets Ready=True), then re-reconciles
+// the CR so it can observe the placement outcomes.
+func (e *crIntegrationEnv) reconcileChildReservations(t *testing.T, crName string) {
+ t.Helper()
+ for _, res := range e.listChildReservations(t, crName) {
+ e.reconcileReservation(t, res.Name) // calls scheduler → sets TargetHost
+ e.reconcileReservation(t, res.Name) // syncs TargetHost to Status → Ready=True
+ }
+ e.reconcileCR(t, crName)
+}
+
// ============================================================================
// Integration tests
// ============================================================================
-// TestCRLifecycle_PlannedToConfirmed verifies that transitioning a CR from planned
-// to confirmed causes the CR controller to create child Reservation CRDs.
-func TestCRLifecycle_PlannedToConfirmed(t *testing.T) {
- env := newCRIntegrationEnv(t)
- defer env.close()
+// TestCRLifecycle covers the multi-step state transitions that require imperative
+// mid-test patches and cannot be expressed as a purely declarative table.
+func TestCRLifecycle(t *testing.T) {
+ t.Run("planned→confirmed: child Reservations created and placed", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
- cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned)
- if err := env.k8sClient.Create(context.Background(), cr); err != nil {
- t.Fatalf("create CR: %v", err)
- }
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
- // Reconcile as planned: finalizer added, no Reservations.
- env.reconcileCR(t, cr.Name)
- env.reconcileCR(t, cr.Name)
- if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
- t.Fatalf("planned: expected 0 reservations, got %d", len(got))
- }
- crState := env.getCR(t, cr.Name)
- cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
- if cond == nil || cond.Reason != "Planned" {
- t.Errorf("planned: expected Reason=Planned, got %v", cond)
- }
+ // Reconcile as planned: finalizer added, no Reservations.
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+ if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+ t.Fatalf("planned: expected 0 reservations, got %d", len(got))
+ }
+ crState := env.getCR(t, cr.Name)
+ cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil || cond.Reason != "Planned" {
+ t.Errorf("planned: expected Reason=Planned, got %v", cond)
+ }
- // Transition to confirmed.
- patch := client.MergeFrom(crState.DeepCopy())
- crState.Spec.State = v1alpha1.CommitmentStatusConfirmed
- if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
- t.Fatalf("patch state to confirmed: %v", err)
- }
+ // Transition to confirmed.
+ patch := client.MergeFrom(crState.DeepCopy())
+ crState.Spec.State = v1alpha1.CommitmentStatusConfirmed
+ if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+ t.Fatalf("patch state to confirmed: %v", err)
+ }
+ env.reconcileCR(t, cr.Name)
- env.reconcileCR(t, cr.Name)
+ children := env.listChildReservations(t, cr.Name)
+ if len(children) != 1 {
+ t.Fatalf("confirmed: expected 1 reservation, got %d", len(children))
+ }
+ env.reconcileChildReservations(t, cr.Name)
- children := env.listChildReservations(t, cr.Name)
- if len(children) != 1 {
- t.Fatalf("confirmed: expected 1 reservation, got %d", len(children))
- }
- crState = env.getCR(t, cr.Name)
- if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
- t.Errorf("confirmed: expected Ready=True")
- }
-}
+ crState = env.getCR(t, cr.Name)
+ if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+ t.Errorf("confirmed: expected Ready=True")
+ }
+ })
-// TestCRLifecycle_ConfirmedToExpired verifies that transitioning a CR to expired
-// deletes all child Reservation CRDs and marks Ready=False.
-func TestCRLifecycle_ConfirmedToExpired(t *testing.T) {
- env := newCRIntegrationEnv(t)
- defer env.close()
+ t.Run("confirmed→expired: child Reservations deleted, CR marked inactive", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
- cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
- if err := env.k8sClient.Create(context.Background(), cr); err != nil {
- t.Fatalf("create CR: %v", err)
- }
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
- // Bring to confirmed+Ready=True.
- env.reconcileCR(t, cr.Name) // adds finalizer
- env.reconcileCR(t, cr.Name) // creates Reservations
+ // Bring to confirmed+Ready=True.
+ env.reconcileCR(t, cr.Name) // adds finalizer
+ env.reconcileCR(t, cr.Name) // creates Reservations
+ env.reconcileChildReservations(t, cr.Name) // places slots → Ready=True
- if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
- t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got))
- }
+ if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+ t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got))
+ }
- // Transition to expired.
- crState := env.getCR(t, cr.Name)
- patch := client.MergeFrom(crState.DeepCopy())
- crState.Spec.State = v1alpha1.CommitmentStatusExpired
- if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
- t.Fatalf("patch state to expired: %v", err)
- }
+ // Transition to expired.
+ crState := env.getCR(t, cr.Name)
+ patch := client.MergeFrom(crState.DeepCopy())
+ crState.Spec.State = v1alpha1.CommitmentStatusExpired
+ if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+ t.Fatalf("patch state to expired: %v", err)
+ }
+ env.reconcileCR(t, cr.Name)
- env.reconcileCR(t, cr.Name)
+ if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+ t.Errorf("expired: expected 0 reservations, got %d", len(got))
+ }
+ crState = env.getCR(t, cr.Name)
+ cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil || cond.Status != metav1.ConditionFalse {
+ t.Errorf("expired: expected Ready=False, got %v", cond)
+ }
+ if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) {
+ t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason)
+ }
+ })
- if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
- t.Errorf("expired: expected 0 reservations, got %d", len(got))
- }
- crState = env.getCR(t, cr.Name)
- cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
- if cond == nil || cond.Status != metav1.ConditionFalse {
- t.Errorf("expired: expected Ready=False, got %v", cond)
- }
- if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) {
- t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason)
- }
-}
+ t.Run("reservation placement: two reconciles set TargetHost then Ready=True", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
-// TestCRLifecycle_ReservationControllerPlacesChild verifies that after the CR controller
-// creates a child Reservation, the ReservationController can place it (scheduler call →
-// TargetHost set → Ready=True on the Reservation).
-func TestCRLifecycle_ReservationControllerPlacesChild(t *testing.T) {
- env := newCRIntegrationEnv(t)
- defer env.close()
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
- cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
- if err := env.k8sClient.Create(context.Background(), cr); err != nil {
- t.Fatalf("create CR: %v", err)
- }
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
- // CR controller creates child Reservation.
- env.reconcileCR(t, cr.Name)
- env.reconcileCR(t, cr.Name)
+ children := env.listChildReservations(t, cr.Name)
+ if len(children) != 1 {
+ t.Fatalf("expected 1 child reservation, got %d", len(children))
+ }
+ child := children[0]
- children := env.listChildReservations(t, cr.Name)
- if len(children) != 1 {
- t.Fatalf("expected 1 child reservation, got %d", len(children))
- }
- child := children[0]
+ // First reconcile: scheduler call → TargetHost written to Spec.
+ env.reconcileReservation(t, child.Name)
+ var afterFirst v1alpha1.Reservation
+ if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil {
+ t.Fatalf("get reservation after first reconcile: %v", err)
+ }
+ if afterFirst.Spec.TargetHost == "" {
+ t.Fatalf("expected TargetHost set after first reservation reconcile")
+ }
- // Reservation controller places it (first reconcile: calls scheduler → sets TargetHost).
- env.reconcileReservation(t, child.Name)
+ // Second reconcile: TargetHost synced to Status, Ready=True.
+ env.reconcileReservation(t, child.Name)
+ var afterSecond v1alpha1.Reservation
+ if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil {
+ t.Fatalf("get reservation after second reconcile: %v", err)
+ }
+ if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) {
+ t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions)
+ }
+ if afterSecond.Status.Host != "host-1" {
+ t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host)
+ }
+ })
- var afterFirst v1alpha1.Reservation
- if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil {
- t.Fatalf("get reservation after first reconcile: %v", err)
- }
- if afterFirst.Spec.TargetHost == "" {
- t.Fatalf("expected TargetHost set after first reservation reconcile")
- }
+ t.Run("deletion: finalizer removed, child Reservations cleaned up", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
- // Second reconcile: syncs TargetHost to Status, sets Ready=True.
- env.reconcileReservation(t, child.Name)
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
- var afterSecond v1alpha1.Reservation
- if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil {
- t.Fatalf("get reservation after second reconcile: %v", err)
- }
- if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) {
- t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions)
- }
- if afterSecond.Status.Host != "host-1" {
- t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host)
- }
-}
+ // Pre-create a child Reservation to verify it gets cleaned up on deletion.
+ // newTestCommittedResource pre-populates the finalizer, so Delete() immediately sets DeletionTimestamp.
+ child := &v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "my-cr-0",
+ Labels: map[string]string{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ },
+ },
+ Spec: v1alpha1.ReservationSpec{
+ Type: v1alpha1.ReservationTypeCommittedResource,
+ CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+ CommitmentUUID: "test-uuid-1234",
+ },
+ },
+ }
+ if err := env.k8sClient.Create(context.Background(), child); err != nil {
+ t.Fatalf("create child reservation: %v", err)
+ }
-// TestCRLifecycle_Deletion verifies that deleting a CR cleans up all child Reservations.
-func TestCRLifecycle_Deletion(t *testing.T) {
- env := newCRIntegrationEnv(t)
- defer env.close()
+ crState := env.getCR(t, cr.Name)
+ if err := env.k8sClient.Delete(context.Background(), &crState); err != nil {
+ t.Fatalf("delete CR: %v", err)
+ }
+ env.reconcileCR(t, cr.Name)
- cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
- if err := env.k8sClient.Create(context.Background(), cr); err != nil {
- t.Fatalf("create CR: %v", err)
- }
+ if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+ t.Errorf("post-deletion: expected 0 reservations, got %d", len(got))
+ }
+ var final v1alpha1.CommittedResource
+ err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final)
+ if client.IgnoreNotFound(err) != nil {
+ t.Fatalf("unexpected error after deletion: %v", err)
+ }
+ if err == nil {
+ for _, f := range final.Finalizers {
+ if f == crFinalizer {
+ t.Errorf("finalizer not removed after deletion reconcile")
+ }
+ }
+ }
+ })
+
+ t.Run("confirmed→superseded: child Reservations deleted, CR marked inactive", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
+
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
- // newTestCommittedResource pre-populates the finalizer, so Delete() will set
- // DeletionTimestamp without needing a prior reconcile.
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+ env.reconcileChildReservations(t, cr.Name)
+
+ if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+ t.Fatalf("pre-supersede: expected 1 reservation, got %d", len(got))
+ }
+
+ crState := env.getCR(t, cr.Name)
+ patch := client.MergeFrom(crState.DeepCopy())
+ crState.Spec.State = v1alpha1.CommitmentStatusSuperseded
+ if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+ t.Fatalf("patch state to superseded: %v", err)
+ }
+ env.reconcileCR(t, cr.Name)
+
+ if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
+ t.Errorf("superseded: expected 0 reservations, got %d", len(got))
+ }
+ crState = env.getCR(t, cr.Name)
+ cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil || cond.Status != metav1.ConditionFalse {
+ t.Errorf("superseded: expected Ready=False, got %v", cond)
+ }
+ if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusSuperseded) {
+ t.Errorf("superseded: expected Reason=%s, got %s", v1alpha1.CommitmentStatusSuperseded, cond.Reason)
+ }
+ })
- // Pre-create a child Reservation to verify it gets cleaned up on deletion.
- child := &v1alpha1.Reservation{
- ObjectMeta: metav1.ObjectMeta{
- Name: "my-cr-0",
- Labels: map[string]string{
+ t.Run("idempotency: extra reconciles after Accepted do not create extra slots", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
+
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+ env.reconcileChildReservations(t, cr.Name)
+
+ if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+ t.Fatalf("pre-idempotency check: expected 1 reservation, got %d", len(got))
+ }
+
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+
+ if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+ t.Errorf("idempotency: expected 1 reservation after extra reconciles, got %d", len(got))
+ }
+ crState := env.getCR(t, cr.Name)
+ if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+ t.Errorf("idempotency: expected CR to remain Ready=True after extra reconciles")
+ }
+ })
+
+ t.Run("AllowRejection=false: stays Reserving when scheduler rejects", func(t *testing.T) {
+ hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}}
+ env := newIntgEnv(t, []client.Object{newTestFlavorKnowledge(), hypervisor}, intgRejectScheduler)
+ defer env.close()
+
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ // AllowRejection stays false (the default), so placement failure must requeue, not reject.
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ ctx := context.Background()
+ crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}}
+ for range 3 {
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
+ var resList v1alpha1.ReservationList
+ env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck
v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- Type: v1alpha1.ReservationTypeCommittedResource,
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- CommitmentUUID: "test-uuid-1234",
- },
- },
- }
- if err := env.k8sClient.Create(context.Background(), child); err != nil {
- t.Fatalf("create child reservation: %v", err)
- }
+ })
+ for _, res := range resList.Items {
+ resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
+ }
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
+ }
- // Delete sets DeletionTimestamp (object has finalizer, so it is not removed yet).
- crState := env.getCR(t, cr.Name)
- if err := env.k8sClient.Delete(context.Background(), &crState); err != nil {
- t.Fatalf("delete CR: %v", err)
- }
+ var final v1alpha1.CommittedResource
+ if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil {
+ t.Fatalf("get CR: %v", err)
+ }
+ cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ t.Fatalf("no Ready condition")
+ }
+ if cond.Reason == v1alpha1.CommittedResourceReasonRejected {
+ t.Errorf("AllowRejection=false: CR must not transition to Rejected, got Reason=%s", cond.Reason)
+ }
+ if cond.Reason != v1alpha1.CommittedResourceReasonReserving {
+ t.Errorf("AllowRejection=false: expected Reason=Reserving, got %s", cond.Reason)
+ }
+ })
- env.reconcileCR(t, cr.Name)
+ t.Run("externally deleted child Reservation is recreated by CR controller", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
- if got := env.listChildReservations(t, cr.Name); len(got) != 0 {
- t.Errorf("post-deletion: expected 0 reservations, got %d", len(got))
- }
- // Finalizer removed — object either gone or has no finalizer.
- var final v1alpha1.CommittedResource
- err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final)
- if client.IgnoreNotFound(err) != nil {
- t.Fatalf("unexpected error after deletion: %v", err)
- }
- if err == nil {
- for _, f := range final.Finalizers {
- if f == crFinalizer {
- t.Errorf("finalizer not removed after deletion reconcile")
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+ env.reconcileChildReservations(t, cr.Name)
+
+ children := env.listChildReservations(t, cr.Name)
+ if len(children) != 1 {
+ t.Fatalf("expected 1 child reservation before deletion, got %d", len(children))
+ }
+
+ // Simulate out-of-band deletion of the slot.
+ child := children[0]
+ if err := env.k8sClient.Delete(context.Background(), &child); err != nil {
+ t.Fatalf("delete child reservation: %v", err)
+ }
+
+ // CR controller detects the missing slot and recreates it.
+ env.reconcileCR(t, cr.Name)
+ // Place the new slot.
+ env.reconcileChildReservations(t, cr.Name)
+ // CR controller observes Ready=True on the recreated slot.
+ env.reconcileCR(t, cr.Name)
+
+ if got := env.listChildReservations(t, cr.Name); len(got) != 1 {
+ t.Errorf("expected 1 reservation after recreation, got %d", len(got))
+ }
+ crState := env.getCR(t, cr.Name)
+ if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+ t.Errorf("expected CR to be Ready=True after slot recreation")
+ }
+ })
+
+ t.Run("AcceptedAt: set when CR accepted", func(t *testing.T) {
+ env := newCRIntegrationEnv(t)
+ defer env.close()
+
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ env.reconcileCR(t, cr.Name)
+ env.reconcileCR(t, cr.Name)
+ env.reconcileChildReservations(t, cr.Name)
+
+ crState := env.getCR(t, cr.Name)
+ if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+ t.Fatalf("expected CR to be Ready=True")
+ }
+ if crState.Status.AcceptedAt == nil {
+ t.Errorf("expected AcceptedAt to be set on acceptance")
+ }
+ if crState.Status.AcceptedAmount == nil {
+ t.Errorf("expected AcceptedAmount to be set on acceptance")
+ } else if crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 {
+ t.Errorf("AcceptedAmount: want 4Gi, got %s", crState.Status.AcceptedAmount.String())
+ }
+ })
+
+ t.Run("resize failure: rolls back to AcceptedAmount, prior slot preserved", func(t *testing.T) {
+ // Scheduler: accepts the first placement call (initial 4 GiB slot), rejects all subsequent.
+ objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")}
+ env := newIntgEnv(t, objects, intgAcceptFirstScheduler(1))
+ defer env.close()
+
+ cr := intgCRAllowRejection("my-cr", "uuid-resize-0001", v1alpha1.CommitmentStatusConfirmed)
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ // Phase 1: accept at 4 GiB (1 slot). Uses 1 scheduler call.
+ intgDriveToTerminal(t, env, []string{cr.Name})
+ var crState v1alpha1.CommittedResource
+ if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &crState); err != nil {
+ t.Fatalf("get CR: %v", err)
+ }
+ if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) {
+ t.Fatalf("phase 1: expected CR to be Ready=True after initial placement")
+ }
+ if crState.Status.AcceptedAmount == nil || crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 {
+ t.Fatalf("phase 1: AcceptedAmount must be 4Gi, got %v", crState.Status.AcceptedAmount)
+ }
+
+ // Phase 2: resize to 8 GiB (needs 2 slots). Scheduler has no more accepts.
+ patch := client.MergeFrom(crState.DeepCopy())
+ crState.Spec.Amount = resource.MustParse("8Gi")
+ if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil {
+ t.Fatalf("patch CR to 8Gi: %v", err)
+ }
+
+ ctx := context.Background()
+ crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}}
+
+ // CR controller: applyReservationState bumps gen on existing slot, creates 2nd slot.
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
+ // Reservation controller: existing slot echoes new ParentGeneration (no scheduler call);
+ // new slot calls scheduler → rejected.
+ var resList v1alpha1.ReservationList
+ env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ })
+ for _, res := range resList.Items {
+ resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
+ }
+ // CR controller: detects 2nd slot Ready=False → rollbackToAccepted (keeps 1 slot) → Rejected.
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
+
+ // Rollback must preserve 1 slot (matching AcceptedAmount=4Gi), not delete all.
+ var finalList v1alpha1.ReservationList
+ if err := env.k8sClient.List(ctx, &finalList, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ }); err != nil {
+ t.Fatalf("list reservations: %v", err)
+ }
+ if len(finalList.Items) != 1 {
+ t.Errorf("resize rollback: want 1 slot (AcceptedAmount), got %d", len(finalList.Items))
+ }
+ intgAssertCRCondition(t, env.k8sClient, []string{cr.Name}, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected)
+ })
+
+ t.Run("AllowRejection=false: eventually accepted after scheduler starts accepting", func(t *testing.T) {
+ // Scheduler rejects the first 2 calls (one per reservation controller reconcile pair),
+ // then accepts all subsequent. AllowRejection=false means the CR controller retries rather
+ // than rejecting, so the CR must eventually reach Accepted once the scheduler cooperates.
+ objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")}
+ env := newIntgEnv(t, objects, intgRejectFirstScheduler(2))
+ defer env.close()
+
+ cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed)
+ // AllowRejection stays false (default), so placement failure must requeue, not reject.
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+ t.Fatalf("create CR: %v", err)
+ }
+
+ ctx := context.Background()
+ crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}}
+ for range 3 {
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
+ var resList v1alpha1.ReservationList
+ env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ })
+ for _, res := range resList.Items {
+ resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
+ env.resController.Reconcile(ctx, resReq) //nolint:errcheck
}
+ env.crController.Reconcile(ctx, crReq) //nolint:errcheck
}
- }
+
+ var final v1alpha1.CommittedResource
+ if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil {
+ t.Fatalf("get CR: %v", err)
+ }
+ cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ t.Fatalf("no Ready condition after retries")
+ }
+ if cond.Reason == v1alpha1.CommittedResourceReasonRejected {
+ t.Errorf("AllowRejection=false: CR must not be Rejected, got Reason=%s", cond.Reason)
+ }
+ if cond.Status != metav1.ConditionTrue || cond.Reason != v1alpha1.CommittedResourceReasonAccepted {
+ t.Errorf("AllowRejection=false: expected Ready=True/Accepted after retries, got Ready=%s/Reason=%s", cond.Status, cond.Reason)
+ }
+ })
}
diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go
index 888d37018..fe05fcc20 100644
--- a/internal/scheduling/reservations/commitments/config.go
+++ b/internal/scheduling/reservations/commitments/config.go
@@ -5,103 +5,73 @@ package commitments
import (
"time"
+
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
+// Config aggregates configuration for all commitments components.
+// Each controller and the API have their own sub-struct so that unrelated
+// fields are never visible to the wrong component.
type Config struct {
+ ReservationController ReservationControllerConfig `json:"committedResourceReservationController"`
+ CommittedResourceController CommittedResourceControllerConfig `json:"committedResourceController"`
+ API APIConfig `json:"committedResourceAPI"`
- // RequeueIntervalActive is the interval for requeueing active reservations for periodic verification.
- RequeueIntervalActive time.Duration `json:"committedResourceRequeueIntervalActive"`
- // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready.
- RequeueIntervalRetry time.Duration `json:"committedResourceRequeueIntervalRetry"`
- // AllocationGracePeriod is the time window after a VM is allocated to a reservation
- // during which it's expected to appear on the target host. VMs not confirmed within
- // this period are considered stale and removed from the reservation.
- AllocationGracePeriod time.Duration `json:"committedResourceAllocationGracePeriod"`
- // RequeueIntervalGracePeriod is the interval for requeueing when VMs are in grace period.
- // Shorter than RequeueIntervalActive for faster verification of new allocations.
- RequeueIntervalGracePeriod time.Duration `json:"committedResourceRequeueIntervalGracePeriod"`
- // PipelineDefault is the default pipeline used for scheduling committed resource reservations.
- PipelineDefault string `json:"committedResourcePipelineDefault"`
-
- // SchedulerURL is the endpoint of the nova external scheduler
- SchedulerURL string `json:"schedulerURL"`
-
- // DatasourceName is the name of the Datasource CRD that provides database connection info.
- // Used to query VM state for report-usage. If empty, report-usage returns an error.
+ // DatasourceName is the name of the Datasource CRD that provides database
+ // connection info. Used to construct the UsageDBClient for report-usage.
DatasourceName string `json:"datasourceName,omitempty"`
+}
- // FlavorGroupPipelines maps flavor group names to pipeline names.
- // Example: {"2152": "kvm-hana-bin-packing", "2101": "kvm-general-purpose-load-balancing", "*": "kvm-general-purpose-load-balancing"}
- // Used to select different scheduling pipelines based on flavor group characteristics.
- FlavorGroupPipelines map[string]string `json:"committedResourceFlavorGroupPipelines,omitempty"`
-
- // API configuration
-
- // ChangeAPIWatchReservationsTimeout defines how long to wait for reservations to become ready before timing out and rolling back.
- ChangeAPIWatchReservationsTimeout time.Duration `json:"committedResourceChangeAPIWatchReservationsTimeout"`
-
- // ChangeAPIWatchReservationsPollInterval defines how frequently to poll reservation status during watch.
- ChangeAPIWatchReservationsPollInterval time.Duration `json:"committedResourceChangeAPIWatchReservationsPollInterval"`
-
- // EnableChangeCommitmentsAPI controls whether the change-commitments API endpoint is active.
- // When false, the endpoint will return HTTP 503 Service Unavailable.
- // The info endpoint remains available for health checks.
- EnableChangeCommitmentsAPI bool `json:"committedResourceEnableChangeCommitmentsAPI"`
-
- // EnableReportUsageAPI controls whether the report-usage API endpoint is active.
- // When false, the endpoint will return HTTP 503 Service Unavailable.
- // This can be used as an emergency switch if the usage reporting is causing issues.
- EnableReportUsageAPI bool `json:"committedResourceEnableReportUsageAPI"`
+// ReservationControllerConfig holds tuning knobs for the Reservation CRD controller.
+type ReservationControllerConfig struct {
+ // RequeueIntervalActive is how often to re-verify a healthy Reservation CRD.
+ RequeueIntervalActive metav1.Duration `json:"requeueIntervalActive"`
+ // RequeueIntervalRetry is the back-off interval when knowledge is unavailable.
+ RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"`
+ // RequeueIntervalGracePeriod is how often to re-check while a VM allocation
+ // is still within AllocationGracePeriod. Shorter than RequeueIntervalActive.
+ RequeueIntervalGracePeriod metav1.Duration `json:"requeueIntervalGracePeriod"`
+ // AllocationGracePeriod is the time window after a VM is allocated to a
+ // reservation during which it's expected to appear on the target host.
+ // VMs not confirmed within this period are considered stale and removed.
+ AllocationGracePeriod metav1.Duration `json:"allocationGracePeriod"`
+ // SchedulerURL is the endpoint of the nova external scheduler.
+ SchedulerURL string `json:"schedulerURL"`
+ // PipelineDefault is the fallback pipeline when no FlavorGroupPipelines entry matches.
+ PipelineDefault string `json:"pipelineDefault"`
+ // FlavorGroupPipelines maps flavor group IDs to pipeline names; "*" acts as catch-all.
+ FlavorGroupPipelines map[string]string `json:"flavorGroupPipelines,omitempty"`
+}
- // EnableReportCapacityAPI controls whether the report-capacity API endpoint is active.
- // When false, the endpoint will return HTTP 503 Service Unavailable.
- // This can be used as an emergency switch if the capacity reporting is causing issues.
- EnableReportCapacityAPI bool `json:"committedResourceEnableReportCapacityAPI"`
+// CommittedResourceControllerConfig holds tuning knobs for the CommittedResource CRD controller.
+type CommittedResourceControllerConfig struct {
+ // RequeueIntervalRetry is the back-off interval when placement is pending or failed.
+ RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"`
}
-// ApplyDefaults fills in any unset values with defaults.
-func (c *Config) ApplyDefaults() {
- defaults := DefaultConfig()
- if c.RequeueIntervalActive == 0 {
- c.RequeueIntervalActive = defaults.RequeueIntervalActive
- }
- if c.RequeueIntervalRetry == 0 {
- c.RequeueIntervalRetry = defaults.RequeueIntervalRetry
- }
- if c.RequeueIntervalGracePeriod == 0 {
- c.RequeueIntervalGracePeriod = defaults.RequeueIntervalGracePeriod
- }
- if c.AllocationGracePeriod == 0 {
- c.AllocationGracePeriod = defaults.AllocationGracePeriod
- }
- if c.PipelineDefault == "" {
- c.PipelineDefault = defaults.PipelineDefault
- }
- if c.SchedulerURL == "" {
- c.SchedulerURL = defaults.SchedulerURL
- }
- if c.ChangeAPIWatchReservationsTimeout == 0 {
- c.ChangeAPIWatchReservationsTimeout = defaults.ChangeAPIWatchReservationsTimeout
- }
- if c.ChangeAPIWatchReservationsPollInterval == 0 {
- c.ChangeAPIWatchReservationsPollInterval = defaults.ChangeAPIWatchReservationsPollInterval
- }
- // Note: EnableChangeCommitmentsAPI, EnableReportUsageAPI, EnableReportCapacityAPI
- // are booleans where false is a valid value, so we don't apply defaults for them
+// APIConfig holds configuration for the LIQUID commitment HTTP endpoints.
+type APIConfig struct {
+ // EnableChangeCommitments controls whether the change-commitments endpoint is active.
+ // When false the endpoint returns HTTP 503; the info endpoint remains available.
+ EnableChangeCommitments bool `json:"enableChangeCommitments"`
+ // EnableReportUsage controls whether the report-usage endpoint is active.
+ EnableReportUsage bool `json:"enableReportUsage"`
+ // EnableReportCapacity controls whether the report-capacity endpoint is active.
+ EnableReportCapacity bool `json:"enableReportCapacity"`
+ // WatchTimeout is how long the change-commitments handler polls CommittedResource
+ // CRD conditions before giving up and rolling back.
+ WatchTimeout metav1.Duration `json:"watchTimeout"`
+ // WatchPollInterval is how frequently the change-commitments handler polls
+ // CommittedResource CRD conditions while waiting for the controller outcome.
+ WatchPollInterval metav1.Duration `json:"watchPollInterval"`
}
-func DefaultConfig() Config {
- return Config{
- RequeueIntervalActive: 5 * time.Minute,
- RequeueIntervalRetry: 1 * time.Minute,
- RequeueIntervalGracePeriod: 1 * time.Minute,
- AllocationGracePeriod: 15 * time.Minute,
- PipelineDefault: "kvm-general-purpose-load-balancing",
- SchedulerURL: "http://localhost:8080/scheduler/nova/external",
- ChangeAPIWatchReservationsTimeout: 10 * time.Second,
- ChangeAPIWatchReservationsPollInterval: 500 * time.Millisecond,
- EnableChangeCommitmentsAPI: true,
- EnableReportUsageAPI: true,
- EnableReportCapacityAPI: true,
+func DefaultAPIConfig() APIConfig {
+ return APIConfig{
+ EnableChangeCommitments: true,
+ EnableReportUsage: true,
+ EnableReportCapacity: true,
+ WatchTimeout: metav1.Duration{Duration: 10 * time.Second},
+ WatchPollInterval: metav1.Duration{Duration: 500 * time.Millisecond},
}
}
diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go
index 2292bcaa1..cd4b15d05 100644
--- a/internal/scheduling/reservations/commitments/e2e_checks.go
+++ b/internal/scheduling/reservations/commitments/e2e_checks.go
@@ -4,13 +4,17 @@
package commitments
import (
+ "bytes"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
+ "strings"
+ "time"
+ . "github.com/majewsky/gg/option"
liquid "github.com/sapcc/go-api-declarations/liquid"
"github.com/sapcc/go-bits/must"
)
@@ -19,38 +23,46 @@ const (
// Default URL for the commitments API endpoint.
// This should match the service name in the helm chart.
defaultCommitmentsAPIURL = "http://cortex-nova-scheduler:8080"
+
+ // defaultE2EProjectUUID is a well-known fake project UUID used when no TestProjectID is configured.
+ // It is intentionally not a real OpenStack project — commitments created under it self-expire.
+ defaultE2EProjectUUID = "00000000-0000-0000-0000-000000000e2e"
)
// E2EChecksConfig holds the configuration for CR e2e checks.
type E2EChecksConfig struct {
- // Base URL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL.
+ // BaseURL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL.
BaseURL string `json:"baseURL"`
+ // RoundTripCheck holds optional overrides for the round-trip check.
+ // If nil, defaults are used: testProjectID = defaultE2EProjectUUID, az = "".
+ RoundTripCheck *E2ERoundTripConfig `json:"roundTripCheck,omitempty"`
+}
+
+// E2ERoundTripConfig holds optional overrides for the create→delete round-trip e2e check.
+type E2ERoundTripConfig struct {
+ // AZ is the availability zone to use (e.g. "qa-de-1d"). Defaults to "" if not set.
+ AZ string `json:"az"`
+ // TestProjectID is the OpenStack project UUID to create test commitments under.
+ // Defaults to defaultE2EProjectUUID if not set.
+ TestProjectID string `json:"testProjectID"`
}
-// CheckCommitmentsInfoEndpoint sends a GET request to the /commitments/v1/info endpoint
-// and verifies that it returns HTTP 200 with a valid ServiceInfo response.
+// CheckCommitmentsInfoEndpoint verifies that GET /commitments/v1/info returns 200 with a valid ServiceInfo.
func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) {
- baseURL := config.BaseURL
- if baseURL == "" {
- baseURL = defaultCommitmentsAPIURL
- }
+ baseURL := e2eBaseURL(config)
apiURL := baseURL + "/commitments/v1/info"
slog.Info("checking commitments info endpoint", "apiURL", apiURL)
httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody))
httpReq.Header.Set("Accept", "application/json")
- //nolint:bodyclose // Body is closed in the deferred function below.
+	//nolint:bodyclose // Body is closed in the deferred function below.
resp := must.Return(http.DefaultClient.Do(httpReq))
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
bodyBytes := must.Return(io.ReadAll(resp.Body))
- slog.Error("commitments info API returned non-200 status code",
- "statusCode", resp.StatusCode,
- "responseBody", string(bodyBytes),
- )
- panic(fmt.Sprintf("commitments info API returned status %d, expected 200", resp.StatusCode))
+ panic(fmt.Sprintf("commitments info API returned status %d: %s", resp.StatusCode, bodyBytes))
}
var serviceInfo liquid.ServiceInfo
@@ -58,20 +70,208 @@ func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) {
panic(fmt.Sprintf("failed to decode ServiceInfo response: %v", err))
}
- // Basic validation of the response
if serviceInfo.Version < 0 {
slog.Warn("commitments info returned version -1, knowledge may not be ready yet")
}
-
slog.Info("commitments info endpoint check passed",
"version", serviceInfo.Version,
"resourceCount", len(serviceInfo.Resources),
)
}
+// CheckCommitmentsRoundTrip iterates all HandlesCommitments resources from /info and for each one:
+// 1. Creates a confirmed test commitment (amount=2, expires in 5 minutes)
+// 2. If accepted: calls the usage API to verify it returns 200, then deletes the commitment
+// 3. If rejected: logs the reason and continues — capacity rejection is not an error
+//
+// Panics on infrastructure failures (non-200 from the API, deletion failure after acceptance).
+func CheckCommitmentsRoundTrip(ctx context.Context, config E2EChecksConfig) {
+ baseURL := e2eBaseURL(config)
+ az := liquid.AvailabilityZone("")
+ projectID := liquid.ProjectUUID(defaultE2EProjectUUID)
+ if rt := config.RoundTripCheck; rt != nil {
+ if rt.AZ != "" {
+ az = liquid.AvailabilityZone(rt.AZ)
+ }
+ if rt.TestProjectID != "" {
+ projectID = liquid.ProjectUUID(rt.TestProjectID)
+ }
+ }
+
+ serviceInfo := e2eFetchServiceInfo(ctx, baseURL)
+
+ checked := 0
+ for resourceName, resInfo := range serviceInfo.Resources {
+ if !resInfo.HandlesCommitments {
+ continue
+ }
+ e2eRoundTripResource(ctx, baseURL, serviceInfo.Version, az, projectID, resourceName)
+ checked++
+ }
+
+ if checked == 0 {
+ slog.Warn("round-trip check: no HandlesCommitments resources found in /info — nothing checked")
+ }
+}
+
+// e2eRoundTripResource runs the create→usageCheck→delete cycle for one resource.
+func e2eRoundTripResource(
+ ctx context.Context,
+ baseURL string,
+ infoVersion int64,
+ az liquid.AvailabilityZone,
+ projectID liquid.ProjectUUID,
+ resourceName liquid.ResourceName,
+) {
+
+ testUUID := liquid.CommitmentUUID(fmt.Sprintf("e2e-%d", time.Now().UnixMilli()))
+ expiresAt := time.Now().Add(5 * time.Minute)
+ const amount = uint64(2)
+
+ createReq := liquid.CommitmentChangeRequest{
+ InfoVersion: infoVersion,
+ AZ: az,
+ ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{
+ projectID: {
+ ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{
+ resourceName: {
+ TotalConfirmedAfter: amount,
+ Commitments: []liquid.Commitment{{
+ UUID: testUUID,
+ Amount: amount,
+ NewStatus: Some(liquid.CommitmentStatusConfirmed),
+ ExpiresAt: expiresAt,
+ }},
+ },
+ },
+ },
+ },
+ }
+
+ slog.Info("round-trip check: creating test commitment",
+ "resource", resourceName, "uuid", testUUID, "project", projectID, "az", az)
+
+ rejectionReason := e2eSendChangeCommitments(ctx, baseURL, createReq)
+ if rejectionReason != "" {
+ // Only capacity rejections (no hosts available) are expected in production clusters.
+ // Any other reason (flavor group ineligible, config error, timeout) indicates a
+ // regression and should surface as a failure.
+ if !strings.Contains(rejectionReason, "no hosts found") {
+ panic(fmt.Sprintf("round-trip check: commitment rejected with unexpected reason for resource %s: %s", resourceName, rejectionReason))
+ }
+ slog.Info("round-trip check: commitment rejected — no capacity, continuing",
+ "resource", resourceName, "reason", rejectionReason)
+ return
+ }
+ slog.Info("round-trip check: commitment accepted", "resource", resourceName, "uuid", testUUID)
+
+ // Register cleanup immediately so it runs even if the usage check panics.
+ defer func() {
+ deleteReq := liquid.CommitmentChangeRequest{
+ InfoVersion: infoVersion,
+ AZ: az,
+ ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{
+ projectID: {
+ ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{
+ resourceName: {
+ TotalConfirmedBefore: amount,
+ Commitments: []liquid.Commitment{{
+ UUID: testUUID,
+ Amount: amount,
+ OldStatus: Some(liquid.CommitmentStatusConfirmed),
+ NewStatus: None[liquid.CommitmentStatus](),
+ ExpiresAt: expiresAt,
+ }},
+ },
+ },
+ },
+ },
+ }
+ slog.Info("round-trip check: deleting test commitment", "resource", resourceName, "uuid", testUUID)
+ if reason := e2eSendChangeCommitments(ctx, baseURL, deleteReq); reason != "" {
+ panic(fmt.Sprintf("round-trip check: delete of test commitment %s was rejected: %s", testUUID, reason))
+ }
+ slog.Info("round-trip check: commitment deleted", "resource", resourceName, "uuid", testUUID)
+ }()
+
+ // Smoke-check the usage API: verifies the usage calculation pipeline works for this project.
+ e2eCheckUsageAPI(ctx, baseURL, az, projectID)
+}
+
+// e2eCheckUsageAPI calls POST /commitments/v1/projects/:id/report-usage and verifies 200.
+// The usage report for a project with no VMs will show zero usage — we only verify the endpoint works.
+func e2eCheckUsageAPI(ctx context.Context, baseURL string, az liquid.AvailabilityZone, projectID liquid.ProjectUUID) {
+ usageReq := liquid.ServiceUsageRequest{AllAZs: []liquid.AvailabilityZone{az}}
+ body := must.Return(json.Marshal(usageReq))
+ url := fmt.Sprintf("%s/commitments/v1/projects/%s/report-usage", baseURL, projectID)
+ httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)))
+ httpReq.Header.Set("Content-Type", "application/json")
+
+	//nolint:bodyclose // Body is closed in the deferred function below.
+ resp := must.Return(http.DefaultClient.Do(httpReq))
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ bodyBytes := must.Return(io.ReadAll(resp.Body))
+ panic(fmt.Sprintf("usage API returned %d: %s", resp.StatusCode, bodyBytes))
+ }
+ slog.Info("round-trip check: usage API returned 200", "project", projectID)
+}
+
+// e2eSendChangeCommitments sends a change-commitments request.
+// Panics on HTTP non-200 (infrastructure error).
+// Returns the rejection reason on 200+rejection (expected for capacity-constrained clusters).
+// Returns "" on success.
+func e2eSendChangeCommitments(ctx context.Context, baseURL string, req liquid.CommitmentChangeRequest) string {
+ body := must.Return(json.Marshal(req))
+ httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodPost,
+ baseURL+"/commitments/v1/change-commitments", bytes.NewReader(body)))
+ httpReq.Header.Set("Content-Type", "application/json")
+
+	//nolint:bodyclose // Body is closed in the deferred function below.
+ resp := must.Return(http.DefaultClient.Do(httpReq))
+ defer resp.Body.Close()
+ respBody := must.Return(io.ReadAll(resp.Body))
+
+ if resp.StatusCode != http.StatusOK {
+ panic(fmt.Sprintf("change-commitments returned %d: %s", resp.StatusCode, respBody))
+ }
+ var result liquid.CommitmentChangeResponse
+ if err := json.Unmarshal(respBody, &result); err != nil {
+ panic(fmt.Sprintf("failed to decode change-commitments response: %v", err))
+ }
+ return result.RejectionReason
+}
+
+// e2eFetchServiceInfo fetches and decodes /info. Panics on failure.
+func e2eFetchServiceInfo(ctx context.Context, baseURL string) liquid.ServiceInfo {
+ httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet,
+ baseURL+"/commitments/v1/info", http.NoBody))
+ httpReq.Header.Set("Accept", "application/json")
+	//nolint:bodyclose // Body is closed in the deferred function below.
+ resp := must.Return(http.DefaultClient.Do(httpReq))
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusOK {
+ bodyBytes := must.Return(io.ReadAll(resp.Body))
+ panic(fmt.Sprintf("info endpoint returned %d: %s", resp.StatusCode, bodyBytes))
+ }
+ var info liquid.ServiceInfo
+ if err := json.NewDecoder(resp.Body).Decode(&info); err != nil {
+ panic(fmt.Sprintf("failed to decode ServiceInfo: %v", err))
+ }
+ return info
+}
+
+func e2eBaseURL(config E2EChecksConfig) string {
+ if config.BaseURL != "" {
+ return config.BaseURL
+ }
+ return defaultCommitmentsAPIURL
+}
+
// RunCommitmentsE2EChecks runs all e2e checks for the commitments API.
func RunCommitmentsE2EChecks(ctx context.Context, config E2EChecksConfig) {
slog.Info("running commitments e2e checks")
CheckCommitmentsInfoEndpoint(ctx, config)
+ CheckCommitmentsRoundTrip(ctx, config)
slog.Info("all commitments e2e checks passed")
}
diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go
index 9e3fde378..40760655d 100644
--- a/internal/scheduling/reservations/commitments/field_index.go
+++ b/internal/scheduling/reservations/commitments/field_index.go
@@ -14,6 +14,7 @@ import (
)
const idxCommittedResourceByUUID = "spec.commitmentUUID"
+const idxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID"
// IndexFields registers field indexes required by the CommittedResource controller.
func IndexFields(ctx context.Context, mcl *multicluster.Client) error {
@@ -38,6 +39,25 @@ func IndexFields(ctx context.Context, mcl *multicluster.Client) error {
log.Error(err, "failed to set up index for commitmentUUID")
return err
}
- log.Info("Successfully set up index for commitmentUUID")
+ if err := mcl.IndexField(ctx,
+ &v1alpha1.Reservation{},
+ &v1alpha1.ReservationList{},
+ idxReservationByCommitmentUUID,
+ func(obj client.Object) []string {
+ res, ok := obj.(*v1alpha1.Reservation)
+ if !ok {
+ log.Error(errors.New("unexpected type"), "expected Reservation", "object", obj)
+ return nil
+ }
+ if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" {
+ return nil
+ }
+ return []string{res.Spec.CommittedResourceReservation.CommitmentUUID}
+ },
+ ); err != nil {
+ log.Error(err, "failed to set up index for reservation commitmentUUID")
+ return err
+ }
+ log.Info("Successfully set up field indexes")
return nil
}
diff --git a/internal/scheduling/reservations/commitments/integration_test.go b/internal/scheduling/reservations/commitments/integration_test.go
new file mode 100644
index 000000000..138f3c74c
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/integration_test.go
@@ -0,0 +1,618 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+// Table-driven integration tests for the committed-resource lifecycle.
+//
+// Each test case wires CommittedResourceController and CommitmentReservationController
+// against a shared fake k8s client and a mock Nova scheduler, then drives both
+// controllers synchronously until every CR reaches a terminal condition.
+//
+// Terminal conditions (no further reconcile expected without external input):
+// - Ready=True / Accepted
+// - Ready=False / Rejected
+// - Ready=False / Planned (controller waits for StartTime)
+// - Ready=False / Expired (controller has cleaned up children)
+// - Ready=False / Superseded
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova"
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "k8s.io/apimachinery/pkg/api/meta"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/types"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+// ============================================================================
+// Test cases
+// ============================================================================
+
+// CRIntegrationTestCase defines one end-to-end scenario for the committed-resource
+// lifecycle spanning both controllers and the mock scheduler.
+type CRIntegrationTestCase struct {
+ Name string
+
+ // Initial cluster state.
+ Hypervisors []*hv1.Hypervisor
+ ExistingReservations []*v1alpha1.Reservation // pre-placed slots (for expiry/supersede scenarios)
+
+ // CRs to create and drive to terminal state.
+ CommittedResources []*v1alpha1.CommittedResource
+
+ // When true the mock scheduler returns an empty hosts list (NoHostsFound).
+ SchedulerRejects bool
+ // SchedulerAcceptFirst, when > 0, makes the mock scheduler accept only the first N
+ // placement calls and reject all subsequent ones. Used to test partial placement
+ // (e.g. first slot placed, second slot rejected). Takes precedence over SchedulerRejects.
+ SchedulerAcceptFirst int
+
+ // Expected state after all CRs reach a terminal condition.
+ ExpectedSlots int // total Reservation CRDs remaining in the store
+ AcceptedCRs []string // CRs expected Ready=True / Accepted
+ RejectedCRs []string // CRs expected Ready=False / Rejected
+ PlannedCRs []string // CRs expected Ready=False / Planned
+ ExpiredCRs []string // CRs expected Ready=False / Expired
+ SupersededCRs []string // CRs expected Ready=False / Superseded
+}
+
+func TestCRIntegration(t *testing.T) {
+ testCases := []CRIntegrationTestCase{
+ // ------------------------------------------------------------------
+ // Acceptance: slot count from commitment amount
+ // ------------------------------------------------------------------
+ {
+ Name: "single confirmed CR: one slot placed, CR accepted",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-1", "uuid-intg-0001", v1alpha1.CommitmentStatusConfirmed),
+ },
+ ExpectedSlots: 1,
+ AcceptedCRs: []string{"cr-1"},
+ },
+ {
+ // 8 GiB commitment with the default 4 GiB test flavor → 2 slots
+ Name: "large CR: commitment amount spans multiple flavors, two slots placed",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCRAmount("cr-large", "uuid-intg-0002", v1alpha1.CommitmentStatusConfirmed, "8Gi"),
+ },
+ ExpectedSlots: 2,
+ AcceptedCRs: []string{"cr-large"},
+ },
+ // ------------------------------------------------------------------
+ // Pending / guaranteed: same placement path as confirmed
+ // ------------------------------------------------------------------
+ {
+ Name: "pending CR: slot placed, CR accepted",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-pending", "uuid-intg-0003", v1alpha1.CommitmentStatusPending),
+ },
+ ExpectedSlots: 1,
+ AcceptedCRs: []string{"cr-pending"},
+ },
+ {
+ Name: "guaranteed CR: slot placed, CR accepted",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-guaranteed", "uuid-intg-0004", v1alpha1.CommitmentStatusGuaranteed),
+ },
+ ExpectedSlots: 1,
+ AcceptedCRs: []string{"cr-guaranteed"},
+ },
+ // ------------------------------------------------------------------
+ // Planned: no slots, condition stays Planned
+ // ------------------------------------------------------------------
+ {
+ Name: "planned CR: no slots created, condition stays Planned",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-planned", "uuid-intg-0005", v1alpha1.CommitmentStatusPlanned),
+ },
+ ExpectedSlots: 0,
+ PlannedCRs: []string{"cr-planned"},
+ },
+ // ------------------------------------------------------------------
+ // Rejection paths
+ // ------------------------------------------------------------------
+ {
+ Name: "scheduler returns no hosts: CR rejected and slots cleaned up",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCRAllowRejection("cr-rej", "uuid-intg-0006", v1alpha1.CommitmentStatusConfirmed),
+ },
+ SchedulerRejects: true,
+ ExpectedSlots: 0,
+ RejectedCRs: []string{"cr-rej"},
+ },
+ {
+ // Reservation controller detects the empty hosts list before calling the scheduler.
+ Name: "no hypervisors in cluster: CR rejected with NoHostsAvailable",
+ Hypervisors: []*hv1.Hypervisor{},
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCRAllowRejection("cr-nohosts", "uuid-intg-0007", v1alpha1.CommitmentStatusConfirmed),
+ },
+ ExpectedSlots: 0,
+ RejectedCRs: []string{"cr-nohosts"},
+ },
+ // ------------------------------------------------------------------
+ // Multiple independent CRs
+ // ------------------------------------------------------------------
+ {
+ Name: "two CRs with different UUIDs: each gets its own slot, both accepted",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ intgHypervisor("host-2"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-a", "uuid-intg-0008", v1alpha1.CommitmentStatusConfirmed),
+ intgCR("cr-b", "uuid-intg-0009", v1alpha1.CommitmentStatusConfirmed),
+ },
+ ExpectedSlots: 2,
+ AcceptedCRs: []string{"cr-a", "cr-b"},
+ },
+ {
+ // One CR in planned state should not block the other from being accepted.
+ Name: "one planned CR and one confirmed CR: only confirmed CR gets a slot",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-plan", "uuid-intg-0010", v1alpha1.CommitmentStatusPlanned),
+ intgCR("cr-conf", "uuid-intg-0011", v1alpha1.CommitmentStatusConfirmed),
+ },
+ ExpectedSlots: 1,
+ PlannedCRs: []string{"cr-plan"},
+ AcceptedCRs: []string{"cr-conf"},
+ },
+ // ------------------------------------------------------------------
+ // Inactive states: existing slots must be cleaned up
+ // ------------------------------------------------------------------
+ {
+ Name: "expired CR with existing slot: slot deleted, CR marked inactive",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ ExistingReservations: []*v1alpha1.Reservation{
+ intgExistingReservation("cr-expire-0", "uuid-intg-0012"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-expire", "uuid-intg-0012", v1alpha1.CommitmentStatusExpired),
+ },
+ ExpectedSlots: 0,
+ ExpiredCRs: []string{"cr-expire"},
+ },
+ {
+ Name: "superseded CR with existing slot: slot deleted, CR marked inactive",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ ExistingReservations: []*v1alpha1.Reservation{
+ intgExistingReservation("cr-supersede-0", "uuid-intg-0013"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCR("cr-supersede", "uuid-intg-0013", v1alpha1.CommitmentStatusSuperseded),
+ },
+ ExpectedSlots: 0,
+ SupersededCRs: []string{"cr-supersede"},
+ },
+ // ------------------------------------------------------------------
+ // Spec validation: unknown flavor group
+ // ------------------------------------------------------------------
+ {
+ // ApplyCommitmentState returns "flavor group not found" which triggers
+ // rollback+Rejected (AllowRejection=true); no child slots are ever created.
+ Name: "unknown flavor group: CR rejected, no slots created",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCRUnknownFlavorGroup("cr-unk", "uuid-intg-0014", v1alpha1.CommitmentStatusConfirmed),
+ },
+ ExpectedSlots: 0,
+ RejectedCRs: []string{"cr-unk"},
+ },
+ // ------------------------------------------------------------------
+ // Partial placement: first slot placed, second slot rejected
+ // ------------------------------------------------------------------
+ {
+ // 8 GiB CR needs 2 slots. Scheduler accepts the first call (slot 0 placed)
+ // then rejects the second (slot 1 gets NoHostsFound). With AllowRejection=true
+ // the CR controller rolls back: deletes both slots and sets Rejected.
+ Name: "partial placement: first slot placed, second slot rejected, CR rolled back",
+ Hypervisors: []*hv1.Hypervisor{
+ intgHypervisor("host-1"),
+ },
+ CommittedResources: []*v1alpha1.CommittedResource{
+ intgCRAmountAllowRejection("cr-partial", "uuid-intg-0015", v1alpha1.CommitmentStatusConfirmed, "8Gi"),
+ },
+ SchedulerAcceptFirst: 1,
+ ExpectedSlots: 0,
+ RejectedCRs: []string{"cr-partial"},
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.Name, func(t *testing.T) {
+ runCRIntegrationTestCase(t, tc)
+ })
+ }
+}
+
+// ============================================================================
+// Runner
+// ============================================================================
+
+// runCRIntegrationTestCase executes one CRIntegrationTestCase end to end: it
+// selects the scheduler stub, seeds a fake-client environment with the declared
+// hypervisors and pre-existing reservations, creates the CommittedResources,
+// drives both controllers until every CR is terminal, and finally asserts the
+// reservation slot count and the expected Ready conditions.
+func runCRIntegrationTestCase(t *testing.T, tc CRIntegrationTestCase) {
+	t.Helper()
+
+	// Pick the scheduler stub. SchedulerAcceptFirst takes precedence over the
+	// blanket SchedulerRejects flag; the default accepts every placement call.
+	schedulerFn := intgAcceptScheduler
+	switch {
+	case tc.SchedulerAcceptFirst > 0:
+		schedulerFn = intgAcceptFirstScheduler(tc.SchedulerAcceptFirst)
+	case tc.SchedulerRejects:
+		schedulerFn = intgRejectScheduler
+	}
+
+	// Seed the fake client: flavor knowledge plus any hypervisors and
+	// pre-existing reservations declared by the test case.
+	objects := []client.Object{newTestFlavorKnowledge()}
+	for _, hv := range tc.Hypervisors {
+		objects = append(objects, hv)
+	}
+	for _, res := range tc.ExistingReservations {
+		objects = append(objects, res)
+	}
+
+	env := newIntgEnv(t, objects, schedulerFn)
+	defer env.close()
+
+	// CRs are created through the client (not as initial objects) so the
+	// controllers observe them the same way they would in a live cluster.
+	crNames := make([]string, len(tc.CommittedResources))
+	for i, cr := range tc.CommittedResources {
+		if err := env.k8sClient.Create(context.Background(), cr); err != nil {
+			t.Fatalf("create CR %s: %v", cr.Name, err)
+		}
+		crNames[i] = cr.Name
+	}
+
+	intgDriveToTerminal(t, env, crNames)
+
+	// Assert total reservation slot count.
+	var resList v1alpha1.ReservationList
+	if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{
+		v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+	}); err != nil {
+		t.Fatalf("list reservations: %v", err)
+	}
+	if len(resList.Items) != tc.ExpectedSlots {
+		t.Errorf("reservation slots: want %d, got %d", tc.ExpectedSlots, len(resList.Items))
+	}
+
+	// Assert CR conditions.
+	intgAssertCRCondition(t, env.k8sClient, tc.AcceptedCRs, metav1.ConditionTrue, v1alpha1.CommittedResourceReasonAccepted)
+	intgAssertCRCondition(t, env.k8sClient, tc.RejectedCRs, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected)
+	intgAssertCRCondition(t, env.k8sClient, tc.PlannedCRs, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonPlanned)
+	intgAssertCRCondition(t, env.k8sClient, tc.ExpiredCRs, metav1.ConditionFalse, string(v1alpha1.CommitmentStatusExpired))
+	intgAssertCRCondition(t, env.k8sClient, tc.SupersededCRs, metav1.ConditionFalse, string(v1alpha1.CommitmentStatusSuperseded))
+}
+
+// ============================================================================
+// Integration environment
+// ============================================================================
+
+// intgEnv bundles everything a CR integration test case needs: the shared fake
+// Kubernetes client, both controllers under test, and the stub scheduler server.
+type intgEnv struct {
+	// k8sClient is the fake controller-runtime client shared by both controllers.
+	k8sClient client.Client
+	// crController reconciles the CommittedResource objects.
+	crController *CommittedResourceController
+	// resController reconciles the child Reservation slots.
+	resController *CommitmentReservationController
+	// schedulerSrv is the httptest server standing in for the external scheduler.
+	schedulerSrv *httptest.Server
+}
+
+// newIntgEnv builds the integration environment: a fake client pre-loaded with
+// initialObjects (with status subresources and the two commitment-UUID field
+// indexes the controllers rely on), a stub scheduler HTTP server serving
+// schedulerFn, and both controllers wired to them. Callers must defer env.close().
+func newIntgEnv(t *testing.T, initialObjects []client.Object, schedulerFn http.HandlerFunc) *intgEnv {
+	t.Helper()
+	scheme := newCRTestScheme(t)
+
+	k8sClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(initialObjects...).
+		WithStatusSubresource(
+			&v1alpha1.CommittedResource{},
+			&v1alpha1.Reservation{},
+			&v1alpha1.Knowledge{},
+		).
+		// Index Reservations by commitment UUID so the CR controller can list
+		// its children the same way it does against a real informer cache.
+		WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string {
+			res, ok := obj.(*v1alpha1.Reservation)
+			if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" {
+				return nil
+			}
+			return []string{res.Spec.CommittedResourceReservation.CommitmentUUID}
+		}).
+		// Index CommittedResources by commitment UUID for the reverse lookup.
+		WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string {
+			cr, ok := obj.(*v1alpha1.CommittedResource)
+			if !ok || cr.Spec.CommitmentUUID == "" {
+				return nil
+			}
+			return []string{cr.Spec.CommitmentUUID}
+		}).
+		Build()
+
+	schedulerSrv := httptest.NewServer(schedulerFn)
+
+	crCtrl := &CommittedResourceController{
+		Client: k8sClient,
+		Scheme: scheme,
+		Conf:   CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 5 * time.Minute}},
+	}
+	resCtrl := &CommitmentReservationController{
+		Client: k8sClient,
+		Scheme: scheme,
+		Conf: ReservationControllerConfig{
+			SchedulerURL:          schedulerSrv.URL,
+			AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute},
+			RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute},
+		},
+	}
+	// Init sets up the reservation controller's scheduler client against the stub URL.
+	if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil {
+		t.Fatalf("resCtrl.Init: %v", err)
+	}
+	return &intgEnv{k8sClient: k8sClient, crController: crCtrl, resController: resCtrl, schedulerSrv: schedulerSrv}
+}
+
+func (e *intgEnv) close() { e.schedulerSrv.Close() }
+
+// ============================================================================
+// Reconcile driver
+// ============================================================================
+
+// intgDriveToTerminal runs reconcile passes until every named CR has a terminal
+// condition or the 5 s deadline is reached.
+//
+// One pass:
+// 1. CR controller (adds finalizer / creates Reservation CRDs / handles inactive states)
+// 2. Reservation controller ×2 per slot (first call sets TargetHost, second sets Ready=True)
+// 3. CR controller again (picks up placement outcomes: Accepted or Rejected)
+func intgDriveToTerminal(t *testing.T, env *intgEnv, crNames []string) {
+	t.Helper()
+	ctx := context.Background()
+	deadline := time.Now().Add(5 * time.Second)
+
+	for {
+		// On timeout, dump the conditions of every CR before failing so the
+		// stuck state is visible in the test log.
+		if time.Now().After(deadline) {
+			for _, name := range crNames {
+				var cr v1alpha1.CommittedResource
+				if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err == nil {
+					t.Logf("CR %s: conditions=%v", name, cr.Status.Conditions)
+				}
+			}
+			t.Fatal("timed out waiting for CRs to reach terminal state")
+		}
+
+		// Exit once every CR is either terminal or no longer exists.
+		allDone := true
+		for _, name := range crNames {
+			var cr v1alpha1.CommittedResource
+			if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil {
+				continue // deleted = done
+			}
+			if !intgIsTerminalCR(cr) {
+				allDone = false
+			}
+		}
+		if allDone {
+			return
+		}
+
+		// Pass 1: CR controller.
+		for _, name := range crNames {
+			var cr v1alpha1.CommittedResource
+			if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil {
+				continue
+			}
+			if intgIsTerminalCR(cr) {
+				continue
+			}
+			env.crController.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) //nolint:errcheck
+		}
+
+		// Pass 2: Reservation controller (two reconciles per slot).
+		var resList v1alpha1.ReservationList
+		env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck
+			v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+		})
+		for _, res := range resList.Items {
+			if intgIsTerminalReservation(res) {
+				continue
+			}
+			req := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+			env.resController.Reconcile(ctx, req) //nolint:errcheck
+			env.resController.Reconcile(ctx, req) //nolint:errcheck
+		}
+
+		// Pass 3: CR controller picks up Reservation outcomes.
+		for _, name := range crNames {
+			var cr v1alpha1.CommittedResource
+			if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil {
+				continue
+			}
+			if intgIsTerminalCR(cr) {
+				continue
+			}
+			env.crController.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) //nolint:errcheck
+		}
+	}
+}
+
+// intgIsTerminalCR reports whether the CommittedResource has settled: its Ready
+// condition is True, or False with one of the known terminal reasons. A CR that
+// is being deleted is never terminal — it still needs one more reconcile to
+// remove its finalizer.
+func intgIsTerminalCR(cr v1alpha1.CommittedResource) bool {
+	if !cr.DeletionTimestamp.IsZero() {
+		return false
+	}
+	cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	switch {
+	case cond == nil:
+		return false
+	case cond.Status == metav1.ConditionTrue:
+		return true
+	}
+	switch cond.Reason {
+	case v1alpha1.CommittedResourceReasonRejected,
+		v1alpha1.CommittedResourceReasonPlanned,
+		string(v1alpha1.CommitmentStatusExpired),
+		string(v1alpha1.CommitmentStatusSuperseded):
+		return true
+	}
+	return false
+}
+
+// intgIsTerminalReservation reports whether the Reservation controller has already
+// recorded an outcome for this slot, i.e. a Ready condition of any status exists
+// (True after placement, False after rejection).
+func intgIsTerminalReservation(res v1alpha1.Reservation) bool {
+	cond := meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+	return cond != nil
+}
+
+// ============================================================================
+// Assertion helpers
+// ============================================================================
+
+// intgAssertCRCondition verifies that every named CommittedResource carries a
+// Ready condition with the expected status and reason. Failures are reported
+// with t.Errorf (not Fatalf) so all names in the list are still checked.
+func intgAssertCRCondition(t *testing.T, k8sClient client.Client, crNames []string, wantStatus metav1.ConditionStatus, wantReason string) {
+	t.Helper()
+	ctx := context.Background()
+	for _, name := range crNames {
+		var cr v1alpha1.CommittedResource
+		err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr)
+		if err != nil {
+			t.Errorf("CR %s not found: %v", name, err)
+			continue
+		}
+		cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+		if cond == nil {
+			t.Errorf("CR %s: no Ready condition", name)
+			continue
+		}
+		if cond.Status == wantStatus && cond.Reason == wantReason {
+			continue
+		}
+		t.Errorf("CR %s: want Ready=%s/Reason=%s, got Ready=%s/Reason=%s", name, wantStatus, wantReason, cond.Status, cond.Reason)
+	}
+}
+
+// ============================================================================
+// Scheduler handlers
+// ============================================================================
+
+// intgAcceptScheduler is a stub scheduler handler that always places on host-1.
+func intgAcceptScheduler(w http.ResponseWriter, r *http.Request) {
+	resp := schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}}
+	json.NewEncoder(w).Encode(&resp) //nolint:errcheck
+}
+
+// intgRejectScheduler is a stub scheduler handler that never finds a host.
+func intgRejectScheduler(w http.ResponseWriter, r *http.Request) {
+	resp := schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{}}
+	json.NewEncoder(w).Encode(&resp) //nolint:errcheck
+}
+
+// intgAcceptFirstScheduler returns a handler that accepts the first count
+// placement calls and rejects every call after that. The call counter is
+// atomic, so the handler is safe under concurrent requests.
+func intgAcceptFirstScheduler(count int) http.HandlerFunc {
+	var seen atomic.Int32
+	return func(w http.ResponseWriter, r *http.Request) {
+		nth := int(seen.Add(1))
+		if nth > count {
+			intgRejectScheduler(w, r)
+			return
+		}
+		intgAcceptScheduler(w, r)
+	}
+}
+
+// intgRejectFirstScheduler returns a handler that rejects the first count
+// placement calls and accepts every call after that. Used to exercise the
+// AllowRejection=false retry-until-success paths; the counter is atomic so
+// concurrent requests are safe.
+func intgRejectFirstScheduler(count int) http.HandlerFunc {
+	var seen atomic.Int32
+	return func(w http.ResponseWriter, r *http.Request) {
+		nth := int(seen.Add(1))
+		if nth > count {
+			intgAcceptScheduler(w, r)
+			return
+		}
+		intgRejectScheduler(w, r)
+	}
+}
+
+// ============================================================================
+// Test object builders
+// ============================================================================
+
+// intgHypervisor builds a bare Hypervisor object carrying only the given name.
+func intgHypervisor(name string) *hv1.Hypervisor {
+	objMeta := metav1.ObjectMeta{Name: name}
+	return &hv1.Hypervisor{ObjectMeta: objMeta}
+}
+
+// intgCR builds a CommittedResource in the given lifecycle state with the
+// default 4 GiB amount. Each test case must use a distinct commitmentUUID so
+// entries in the field indexes never collide across cases.
+func intgCR(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource {
+	out := newTestCommittedResource(name, state)
+	out.Spec.CommitmentUUID = commitmentUUID
+	return out
+}
+
+// intgCRAmount builds a CommittedResource whose amount is parsed from the
+// given quantity string (e.g. "8Gi") instead of the default.
+func intgCRAmount(name, commitmentUUID string, state v1alpha1.CommitmentStatus, amount string) *v1alpha1.CommittedResource {
+	out := intgCR(name, commitmentUUID, state)
+	out.Spec.Amount = resource.MustParse(amount)
+	return out
+}
+
+// intgCRAllowRejection builds a CommittedResource with AllowRejection enabled,
+// so on placement failure the controller rolls back and marks the CR Rejected
+// instead of retrying indefinitely.
+func intgCRAllowRejection(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource {
+	out := intgCR(name, commitmentUUID, state)
+	out.Spec.AllowRejection = true
+	return out
+}
+
+// intgCRAmountAllowRejection builds a CommittedResource with both a custom
+// amount string and AllowRejection enabled.
+func intgCRAmountAllowRejection(name, commitmentUUID string, state v1alpha1.CommitmentStatus, amount string) *v1alpha1.CommittedResource {
+	out := intgCRAmount(name, commitmentUUID, state, amount)
+	out.Spec.AllowRejection = true
+	return out
+}
+
+// intgCRUnknownFlavorGroup builds a CommittedResource whose flavor group does
+// not exist in the Knowledge CRD. AllowRejection is enabled so the controller
+// ends in Rejected rather than retrying forever.
+func intgCRUnknownFlavorGroup(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource {
+	out := intgCRAllowRejection(name, commitmentUUID, state)
+	out.Spec.FlavorGroupName = "nonexistent-group"
+	return out
+}
+
+// intgExistingReservation returns a pre-placed Reservation tied to the given commitment UUID,
+// used to verify that expiry/supersede paths delete children.
+func intgExistingReservation(name, commitmentUUID string) *v1alpha1.Reservation {
+ return &v1alpha1.Reservation{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Labels: map[string]string{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ },
+ },
+ Spec: v1alpha1.ReservationSpec{
+ Type: v1alpha1.ReservationTypeCommittedResource,
+ CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+ CommitmentUUID: commitmentUUID,
+ },
+ },
+ }
+}
diff --git a/internal/scheduling/reservations/commitments/reservation_controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go
index 312a65530..96d86aeb2 100644
--- a/internal/scheduling/reservations/commitments/reservation_controller.go
+++ b/internal/scheduling/reservations/commitments/reservation_controller.go
@@ -37,11 +37,24 @@ type CommitmentReservationController struct {
// Kubernetes scheme to use for the reservations.
Scheme *runtime.Scheme
// Configuration for the controller.
- Conf Config
+ Conf ReservationControllerConfig
// SchedulerClient for making scheduler API calls.
SchedulerClient *reservations.SchedulerClient
}
+// echoParentGeneration copies Spec.CommittedResourceReservation.ParentGeneration to
+// Status.CommittedResourceReservation.ObservedParentGeneration so the CommittedResource
+// controller can confirm this reservation was processed for the current CR generation.
+func echoParentGeneration(res *v1alpha1.Reservation) {
+	// Not a commitment-backed reservation: nothing to echo.
+	if res.Spec.CommittedResourceReservation == nil {
+		return
+	}
+	// Lazily initialize the status sub-struct before writing into it.
+	if res.Status.CommittedResourceReservation == nil {
+		res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{}
+	}
+	res.Status.CommittedResourceReservation.ObservedParentGeneration = res.Spec.CommittedResourceReservation.ParentGeneration
+}
+
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// Note: This controller only handles commitment reservations, as filtered by the predicate.
@@ -76,6 +89,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
Reason: "MissingResourceName",
Message: "reservation has no resource name",
})
+ echoParentGeneration(&res)
patch := client.MergeFrom(old)
if err := r.Status().Patch(ctx, &res, patch); err != nil {
// Ignore not-found errors during background deletion
@@ -92,6 +106,19 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
if res.IsReady() {
logger.V(1).Info("reservation is active, verifying allocations")
+ // Sync ObservedParentGeneration if the CR controller bumped ParentGeneration since
+ // the last time this reservation was processed (e.g. after a spec update). Without
+ // this patch the CR controller would spin in Reserving forever for already-ready slots.
+ if res.Spec.CommittedResourceReservation != nil &&
+ (res.Status.CommittedResourceReservation == nil ||
+ res.Status.CommittedResourceReservation.ObservedParentGeneration != res.Spec.CommittedResourceReservation.ParentGeneration) {
+ old := res.DeepCopy()
+ echoParentGeneration(&res)
+ if err := r.Status().Patch(ctx, &res, client.MergeFrom(old)); client.IgnoreNotFound(err) != nil {
+ return ctrl.Result{}, err
+ }
+ }
+
// Verify all allocations in Spec against actual VM state
result, err := r.reconcileAllocations(ctx, &res)
if err != nil {
@@ -102,9 +129,9 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
// Requeue with appropriate interval based on allocation state
// Use shorter interval if there are allocations in grace period for faster verification
if result.HasAllocationsInGracePeriod {
- return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalGracePeriod}, nil
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalGracePeriod.Duration}, nil
}
- return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive}, nil
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive.Duration}, nil
}
// TODO trigger re-placement of unused reservations over time
@@ -126,6 +153,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
Reason: "PreAllocated",
Message: "reservation pre-allocated with VM allocations",
})
+ echoParentGeneration(&res)
patch := client.MergeFrom(old)
if err := r.Status().Patch(ctx, &res, patch); err != nil {
// Ignore not-found errors during background deletion
@@ -155,6 +183,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
Reason: "ReservationActive",
Message: "reservation is successfully scheduled",
})
+ echoParentGeneration(&res)
patch := client.MergeFrom(old)
if err := r.Status().Patch(ctx, &res, patch); err != nil {
// Ignore not-found errors during background deletion
@@ -189,7 +218,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
logger.Info("flavor knowledge not ready, requeueing",
"resourceName", resourceName,
"error", err)
- return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, nil
}
// Search for the flavor across all flavor groups
@@ -224,11 +253,12 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
Reason: "NoHostsAvailable",
Message: "no hypervisors available for scheduling",
})
+ echoParentGeneration(&res)
patch := client.MergeFrom(old)
if err := r.Status().Patch(ctx, &res, patch); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}
- return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil
+ return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, nil
}
// Select appropriate pipeline based on flavor group
@@ -271,6 +301,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr
Reason: "NoHostsFound",
Message: "no hosts found for reservation",
})
+ echoParentGeneration(&res)
patch := client.MergeFrom(old)
if err := r.Status().Patch(ctx, &res, patch); err != nil {
// Ignore not-found errors during background deletion
@@ -370,7 +401,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte
for vmUUID, allocation := range res.Spec.CommittedResourceReservation.Allocations {
allocationAge := now.Sub(allocation.CreationTimestamp.Time)
- isInGracePeriod := allocationAge < r.Conf.AllocationGracePeriod
+ isInGracePeriod := allocationAge < r.Conf.AllocationGracePeriod.Duration
if isInGracePeriod {
// New allocation: VM may not yet appear in the HV CRD (still spawning).
@@ -395,7 +426,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte
"reservation", res.Name,
"expectedHost", expectedHost,
"allocationAge", allocationAge,
- "gracePeriod", r.Conf.AllocationGracePeriod)
+ "gracePeriod", r.Conf.AllocationGracePeriod.Duration)
}
}
@@ -498,11 +529,9 @@ func (r *CommitmentReservationController) hypervisorToReservations(ctx context.C
}
// Init initializes the reconciler with required clients and DB connection.
-func (r *CommitmentReservationController) Init(ctx context.Context, client client.Client, conf Config) error {
- // Initialize scheduler client
+func (r *CommitmentReservationController) Init(ctx context.Context, conf ReservationControllerConfig) error {
r.SchedulerClient = reservations.NewSchedulerClient(conf.SchedulerURL)
logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.SchedulerURL)
-
return nil
}
@@ -543,7 +572,7 @@ var commitmentReservationPredicate = predicate.Funcs{
// SetupWithManager sets up the controller with the Manager.
func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error {
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
- if err := r.Init(ctx, mgr.GetClient(), r.Conf); err != nil {
+ if err := r.Init(ctx, r.Conf); err != nil {
return err
}
return nil
@@ -580,7 +609,10 @@ func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl
return bldr.Named("commitment-reservation").
WithOptions(controller.Options{
- // We want to process reservations one at a time to avoid overbooking.
+ // MaxConcurrentReconciles=1: conservative default. Note that this does NOT prevent
+ // the cache-staleness race where two back-to-back reconciles both pick the same host
+ // before the first write is visible to the capacity filter — that requires pessimistic
+ // blocking at the scheduler level.
MaxConcurrentReconciles: 1,
}).
Complete(r)
diff --git a/internal/scheduling/reservations/commitments/reservation_controller_test.go b/internal/scheduling/reservations/commitments/reservation_controller_test.go
index 7c0d63ee7..df6316d46 100644
--- a/internal/scheduling/reservations/commitments/reservation_controller_test.go
+++ b/internal/scheduling/reservations/commitments/reservation_controller_test.go
@@ -80,8 +80,8 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) {
reconciler := &CommitmentReservationController{
Client: k8sClient,
Scheme: scheme,
- Conf: Config{
- RequeueIntervalActive: 5 * time.Minute,
+ Conf: ReservationControllerConfig{
+ RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute},
},
}
@@ -139,7 +139,7 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) {
recentTime := metav1.NewTime(now.Add(-5 * time.Minute)) // 5 minutes ago (within grace period)
oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) // 30 minutes ago (past grace period)
- config := Config{AllocationGracePeriod: 15 * time.Minute}
+ config := ReservationControllerConfig{AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}}
tests := []struct {
name string
@@ -474,7 +474,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
}))
defer server.Close()
- config := Config{
+ config := ReservationControllerConfig{
SchedulerURL: server.URL,
}
@@ -485,7 +485,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t
}
// Initialize the reconciler (this sets up SchedulerClient)
- if err := reconciler.Init(context.Background(), k8sClient, config); err != nil {
+ if err := reconciler.Init(context.Background(), config); err != nil {
t.Fatalf("Failed to initialize reconciler: %v", err)
}
diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go
index d7a75cc7a..d1fa28fda 100644
--- a/internal/scheduling/reservations/commitments/reservation_manager.go
+++ b/internal/scheduling/reservations/commitments/reservation_manager.go
@@ -25,6 +25,9 @@ type ApplyResult struct {
Deleted int
// Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config)
Repaired int
+ // TotalSlots is the total number of reservation slots that should exist after the apply.
+ // Used by the CR controller to wait for the correct number of children in the cache.
+ TotalSlots int
// TouchedReservations are reservations that were created or updated
TouchedReservations []v1alpha1.Reservation
// RemovedReservations are reservations that were deleted
@@ -92,6 +95,9 @@ func (m *ReservationManager) ApplyCommitmentState(
if !exists {
return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName)
}
+ if len(flavorGroup.Flavors) == 0 {
+ return nil, fmt.Errorf("flavor group %s has no flavors", desiredState.FlavorGroupName)
+ }
deltaMemoryBytes := desiredState.TotalMemoryBytes
for _, res := range existing {
memoryQuantity := res.Spec.Resources[hv1.ResourceMemory]
@@ -210,6 +216,7 @@ func (m *ReservationManager) ApplyCommitmentState(
"total", len(existing)+result.Created)
}
+ result.TotalSlots = len(existing) + result.Created
return result, nil
}
@@ -225,7 +232,8 @@ func (m *ReservationManager) syncReservationMetadata(
if (state.CommitmentUUID != "" && reservation.Spec.CommittedResourceReservation.CommitmentUUID != state.CommitmentUUID) ||
(state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) ||
(state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) ||
- (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) {
+ (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) ||
+ (state.ParentGeneration != 0 && reservation.Spec.CommittedResourceReservation.ParentGeneration != state.ParentGeneration) {
// Apply patch
logger.V(1).Info("syncing reservation metadata",
"reservation", reservation.Name,
@@ -236,6 +244,9 @@ func (m *ReservationManager) syncReservationMetadata(
if state.CommitmentUUID != "" {
reservation.Spec.CommittedResourceReservation.CommitmentUUID = state.CommitmentUUID
}
+ if state.ParentGeneration != 0 {
+ reservation.Spec.CommittedResourceReservation.ParentGeneration = state.ParentGeneration
+ }
if state.AvailabilityZone != "" {
reservation.Spec.AvailabilityZone = state.AvailabilityZone
@@ -301,13 +312,14 @@ func (m *ReservationManager) newReservation(
),
},
CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- ProjectID: state.ProjectID,
- CommitmentUUID: state.CommitmentUUID,
- DomainID: state.DomainID,
- ResourceGroup: state.FlavorGroupName,
- ResourceName: flavorInGroup.Name,
- Creator: creator,
- Allocations: nil,
+ ProjectID: state.ProjectID,
+ CommitmentUUID: state.CommitmentUUID,
+ DomainID: state.DomainID,
+ ResourceGroup: state.FlavorGroupName,
+ ResourceName: flavorInGroup.Name,
+ Creator: creator,
+ ParentGeneration: state.ParentGeneration,
+ Allocations: nil,
},
}
diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go
index 96ede88ac..149cdfc03 100644
--- a/internal/scheduling/reservations/commitments/state.go
+++ b/internal/scheduling/reservations/commitments/state.go
@@ -97,6 +97,13 @@ type CommitmentState struct {
// When set (e.g. "-"), Reservation CRDs are named "".
// Used by the CommittedResource controller; leave empty for the legacy syncer path.
NamePrefix string
+ // ParentGeneration is the Generation of the parent CommittedResource CRD. Written into
+ // Reservation spec so the Reservation controller can echo it back in status, letting
+ // the CR controller detect when all children have been processed for the current spec.
+ // Zero for syncer-created reservations (no parent CR).
+ ParentGeneration int64
+ // State is the lifecycle state from Limes (planned/pending/guaranteed/confirmed/superseded/expired).
+ State v1alpha1.CommitmentStatus
}
// FromCommitment converts Limes commitment to CommitmentState.
@@ -144,6 +151,7 @@ func FromCommitment(
AvailabilityZone: commitment.AvailabilityZone,
StartTime: startTime,
EndTime: endTime,
+ State: v1alpha1.CommitmentStatus(commitment.Status),
}, nil
}
@@ -151,6 +159,7 @@ func FromCommitment(
func FromChangeCommitmentTargetState(
commitment liquid.Commitment,
projectID string,
+ domainID string,
flavorGroupName string,
flavorGroup compute.FlavorGroupFeature,
az string,
@@ -166,8 +175,8 @@ func FromChangeCommitmentTargetState(
var endTime *time.Time
switch commitment.NewStatus.UnwrapOr("none") {
- // guaranteed and confirmed commitments are honored with start time now
- case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed:
+ // pending, guaranteed, confirmed commitments are honored with Reservation slots.
+ case liquid.CommitmentStatusPending, liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed:
amountMultiple = commitment.Amount
// Set start time: use ConfirmBy if available (when the commitment was confirmed),
// otherwise use time.Now() for immediate confirmation
@@ -187,7 +196,7 @@ func FromChangeCommitmentTargetState(
if !commitment.ExpiresAt.IsZero() {
endTime = &commitment.ExpiresAt
// check expiry time
- if commitment.ExpiresAt.Before(time.Now()) || commitment.ExpiresAt.Equal(time.Now()) {
+ if !commitment.ExpiresAt.After(time.Now()) {
// commitment is already expired, ignore capacity
amountMultiple = 0
}
@@ -203,11 +212,13 @@ func FromChangeCommitmentTargetState(
return &CommitmentState{
CommitmentUUID: string(commitment.UUID),
ProjectID: projectID,
+ DomainID: domainID,
FlavorGroupName: flavorGroupName,
TotalMemoryBytes: totalMemoryBytes,
AvailabilityZone: az,
StartTime: startTime,
EndTime: endTime,
+ State: v1alpha1.CommitmentStatus(commitment.NewStatus.UnwrapOr("")),
}, nil
}
diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go
index 60c450b9a..8d3a43adf 100644
--- a/internal/scheduling/reservations/commitments/syncer.go
+++ b/internal/scheduling/reservations/commitments/syncer.go
@@ -13,7 +13,10 @@ import (
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)
var (
@@ -30,21 +33,6 @@ type SyncerConfig struct {
SyncInterval time.Duration `json:"committedResourceSyncInterval"`
}
-func DefaultSyncerConfig() SyncerConfig {
- return SyncerConfig{
- SyncInterval: time.Hour,
- }
-}
-
-// ApplyDefaults fills in any unset values with defaults.
-func (c *SyncerConfig) ApplyDefaults() {
- defaults := DefaultSyncerConfig()
- if c.SyncInterval == 0 {
- c.SyncInterval = defaults.SyncInterval
- }
- // Note: KeystoneSecretRef and SSOSecretRef are not defaulted as they require explicit configuration
-}
-
type Syncer struct {
// Client to fetch commitments from Limes
CommitmentsClient
@@ -97,11 +85,6 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo
skippedUUIDs: make(map[string]bool),
}
for id, commitment := range commitments {
- // Record each commitment seen from Limes
- if s.monitor != nil {
- s.monitor.RecordCommitmentSeen()
- }
-
if commitment.ServiceType != "compute" {
log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType)
if s.monitor != nil {
@@ -110,12 +93,19 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo
continue
}
- // Only process commitments that are active (confirmed or guaranteed).
- // planned/pending are not yet accepted by Cortex; superseded/expired are done.
- if commitment.Status != "confirmed" && commitment.Status != "guaranteed" {
- log.Info("skipping non-active commitment", "id", id, "status", commitment.Status)
- if s.monitor != nil {
- s.monitor.RecordCommitmentSkipped(SkipReasonNonActive)
+		// Validate that the commitment status is a known enum value.
+ switch v1alpha1.CommitmentStatus(commitment.Status) {
+ case v1alpha1.CommitmentStatusPlanned,
+ v1alpha1.CommitmentStatusPending,
+ v1alpha1.CommitmentStatusGuaranteed,
+ v1alpha1.CommitmentStatusConfirmed,
+ v1alpha1.CommitmentStatusSuperseded,
+ v1alpha1.CommitmentStatusExpired:
+ // valid, continue processing
+ default:
+ log.Info("skipping commitment with unknown status", "id", id, "status", commitment.Status)
+ if commitment.UUID != "" {
+ result.skippedUUIDs[commitment.UUID] = true
}
continue
}
@@ -194,11 +184,6 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo
"totalMemoryBytes", state.TotalMemoryBytes)
result.states = append(result.states, state)
-
- // Record successfully processed commitment
- if s.monitor != nil {
- s.monitor.RecordCommitmentProcessed()
- }
}
return result, nil
@@ -215,16 +200,21 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
logger.Info("starting commitment sync")
- // Record sync run
- if s.monitor != nil {
- s.monitor.RecordSyncRun()
- }
+ startTime := time.Now()
+ defer func() {
+ if s.monitor != nil {
+ s.monitor.RecordDuration(time.Since(startTime).Seconds())
+ }
+ }()
// Check if flavor group knowledge is ready
knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client}
knowledgeCRD, err := knowledge.Get(ctx)
if err != nil {
logger.Error(err, "failed to check flavor group knowledge readiness")
+ if s.monitor != nil {
+ s.monitor.RecordError()
+ }
return err
}
if knowledgeCRD == nil {
@@ -236,6 +226,9 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD)
if err != nil {
logger.Error(err, "failed to get flavor groups from knowledge")
+ if s.monitor != nil {
+ s.monitor.RecordError()
+ }
return err
}
@@ -243,42 +236,48 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
commitmentResult, err := s.getCommitmentStates(ctx, logger, flavorGroups)
if err != nil {
logger.Error(err, "failed to get compute commitments")
+ if s.monitor != nil {
+ s.monitor.RecordError()
+ }
return err
}
- // Create ReservationManager to handle state application
- manager := NewReservationManager(s.Client)
+ if s.monitor != nil {
+ s.monitor.SetLimesCommitmentsActive(len(commitmentResult.states))
+ }
- // Apply each commitment state using the manager
- var totalCreated, totalDeleted, totalRepaired int
+ // Upsert CommittedResource CRDs for each commitment
+ var totalCreated, totalUpdated int
for _, state := range commitmentResult.states {
- logger.Info("applying commitment state",
+ logger.Info("upserting committed resource CRD",
"commitmentUUID", state.CommitmentUUID,
"projectID", state.ProjectID,
"flavorGroup", state.FlavorGroupName,
- "totalMemoryBytes", state.TotalMemoryBytes)
-
- applyResult, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue)
+ "state", state.State)
+
+ var (
+ op controllerutil.OperationResult
+ err error
+ )
+ if isTerminalCommitment(state) {
+ // Terminal commitments (superseded/expired state, or EndTime in the past): update
+ // existing CRD so the controller can clean up Reservations, but do not create a
+ // new one — if no CRD exists locally there are no Reservation slots to clean up.
+ op, err = s.updateCommittedResourceIfExists(ctx, logger, state)
+ } else {
+ op, err = s.upsertCommittedResource(ctx, logger, state)
+ }
if err != nil {
- logger.Error(err, "failed to apply commitment state",
+ logger.Error(err, "failed to upsert committed resource CRD",
"commitmentUUID", state.CommitmentUUID)
- // Continue with other commitments even if one fails
continue
}
-
- totalCreated += applyResult.Created
- totalDeleted += applyResult.Deleted
- totalRepaired += applyResult.Repaired
- }
-
- // Delete reservations that are no longer in commitments
- // Only query committed resource reservations using labels for efficiency
- var existingReservations v1alpha1.ReservationList
- if err := s.List(ctx, &existingReservations, client.MatchingLabels{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- }); err != nil {
- logger.Error(err, "failed to list existing committed resource reservations")
- return err
+ switch op {
+ case controllerutil.OperationResultCreated:
+ totalCreated++
+ case controllerutil.OperationResultUpdated:
+ totalUpdated++
+ }
}
// Build set of commitment UUIDs we should have (processed + skipped)
@@ -286,51 +285,173 @@ func (s *Syncer) SyncReservations(ctx context.Context) error {
for _, state := range commitmentResult.states {
activeCommitments[state.CommitmentUUID] = true
}
- // Also include skipped commitments - don't delete their CRDs
for uuid := range commitmentResult.skippedUUIDs {
activeCommitments[uuid] = true
}
- // Delete reservations for commitments that no longer exist
- for _, existing := range existingReservations.Items {
- // Extract commitment UUID from reservation name
- commitmentUUID := extractCommitmentUUID(existing.Name)
- if commitmentUUID == "" {
- logger.Info("skipping reservation with unparseable name", "name", existing.Name)
+ // Count CommittedResource CRDs present locally but absent from Limes (do not delete — Limes
+ // responses may be transient and deleting active CRDs would drop Reservation slots).
+ // Also GC CRDs whose EndTime has passed: the commitment is over, the controller's finalizer
+ // will clean up child Reservations on deletion.
+ var existingCRs v1alpha1.CommittedResourceList
+ if err := s.List(ctx, &existingCRs); err != nil {
+ logger.Error(err, "failed to list existing committed resource CRDs")
+ if s.monitor != nil {
+ s.monitor.RecordError()
+ }
+ return err
+ }
+ staleCRCount, gcDeleted := 0, 0
+ for i := range existingCRs.Items {
+ cr := &existingCRs.Items[i]
+ if cr.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova {
continue
}
+ isExpired := cr.Spec.EndTime != nil && !cr.Spec.EndTime.After(time.Now())
+ if !activeCommitments[cr.Spec.CommitmentUUID] && !isExpired {
+ staleCRCount++
+ }
+ if isExpired {
+ if err := s.Delete(ctx, cr); client.IgnoreNotFound(err) != nil {
+ logger.Error(err, "failed to GC expired committed resource CRD", "name", cr.Name)
+ return err
+ }
+ logger.Info("GC'd expired committed resource CRD",
+ "name", cr.Name, "endTime", cr.Spec.EndTime)
+ gcDeleted++
+ }
+ }
+ // Delete orphaned Reservation CRDs: type=committed-resource but commitment no longer active.
+ // These are left over from the pre-refactor path where the syncer wrote Reservations directly.
+ var existingReservations v1alpha1.ReservationList
+ if err := s.List(ctx, &existingReservations, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ }); err != nil {
+ logger.Error(err, "failed to list committed resource reservations")
+ return err
+ }
+ var totalReservationDeleted int
+ for i := range existingReservations.Items {
+ res := &existingReservations.Items[i]
+ if res.Spec.CommittedResourceReservation == nil {
+ logger.Info("skipping reservation without committed resource spec", "name", res.Name)
+ continue
+ }
+ commitmentUUID := res.Spec.CommittedResourceReservation.CommitmentUUID
+ if commitmentUUID == "" {
+ logger.Info("skipping reservation with empty commitment UUID", "name", res.Name)
+ continue
+ }
if !activeCommitments[commitmentUUID] {
- // This commitment no longer exists, delete the reservation
- if err := s.Delete(ctx, &existing); err != nil {
- logger.Error(err, "failed to delete reservation", "name", existing.Name)
+ if err := s.Delete(ctx, res); client.IgnoreNotFound(err) != nil {
+ logger.Error(err, "failed to delete orphaned reservation", "name", res.Name)
return err
}
- logger.Info("deleted reservation for expired commitment",
- "name", existing.Name,
- "commitmentUUID", commitmentUUID)
- totalDeleted++
+ logger.Info("deleted orphaned reservation", "name", res.Name, "commitmentUUID", commitmentUUID)
+ totalReservationDeleted++
}
}
- // Record reservation change metrics
if s.monitor != nil {
+ s.monitor.RecordStaleCRs(staleCRCount)
if totalCreated > 0 {
- s.monitor.RecordReservationsCreated(totalCreated)
+ s.monitor.RecordCRCreates(totalCreated)
}
- if totalDeleted > 0 {
- s.monitor.RecordReservationsDeleted(totalDeleted)
+ if totalUpdated > 0 {
+ s.monitor.RecordCRUpdates(totalUpdated)
}
- if totalRepaired > 0 {
- s.monitor.RecordReservationsRepaired(totalRepaired)
+ if gcDeleted > 0 {
+ s.monitor.RecordCRDeletes(gcDeleted)
}
}
- logger.Info("synced reservations",
+ if staleCRCount > 0 {
+ logger.Info("WARNING: committed resource CRDs present locally but absent from Limes — review for manual cleanup",
+ "staleCRs", staleCRCount)
+ }
+
+ logger.Info("synced committed resource CRDs",
"processedCount", len(commitmentResult.states),
"skippedCount", len(commitmentResult.skippedUUIDs),
"created", totalCreated,
- "deleted", totalDeleted,
- "repaired", totalRepaired)
+ "updated", totalUpdated,
+ "staleCRs", staleCRCount,
+ "expiredCRsGCd", gcDeleted,
+ "orphanReservationsDeleted", totalReservationDeleted)
return nil
}
+
+func (s *Syncer) applyCommittedResourceSpec(cr *v1alpha1.CommittedResource, state *CommitmentState) {
+ cr.Spec.CommitmentUUID = state.CommitmentUUID
+ cr.Spec.SchedulingDomain = v1alpha1.SchedulingDomainNova
+ cr.Spec.FlavorGroupName = state.FlavorGroupName
+ cr.Spec.ResourceType = v1alpha1.CommittedResourceTypeMemory
+ cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI)
+ cr.Spec.AvailabilityZone = state.AvailabilityZone
+ cr.Spec.ProjectID = state.ProjectID
+ cr.Spec.DomainID = state.DomainID
+ cr.Spec.State = state.State
+ cr.Spec.AllowRejection = false
+
+ if state.StartTime != nil {
+ t := metav1.NewTime(*state.StartTime)
+ cr.Spec.StartTime = &t
+ } else {
+ cr.Spec.StartTime = nil
+ }
+ if state.EndTime != nil {
+ t := metav1.NewTime(*state.EndTime)
+ cr.Spec.EndTime = &t
+ } else {
+ cr.Spec.EndTime = nil
+ }
+}
+
+func (s *Syncer) upsertCommittedResource(ctx context.Context, logger logr.Logger, state *CommitmentState) (controllerutil.OperationResult, error) {
+ cr := &v1alpha1.CommittedResource{}
+ cr.Name = "commitment-" + state.CommitmentUUID
+
+ op, err := controllerutil.CreateOrUpdate(ctx, s.Client, cr, func() error {
+ s.applyCommittedResourceSpec(cr, state)
+ return nil
+ })
+ if err != nil {
+ return op, err
+ }
+ logger.V(1).Info("upserted committed resource CRD", "name", cr.Name, "op", op)
+ return op, nil
+}
+
+// updateCommittedResourceIfExists updates an existing CommittedResource CRD but does not
+// create one if it is absent. Used for terminal commitments (superseded/expired state, or
+// EndTime in the past): we want the controller to see the state transition and clean up child
+// Reservations, but there is no point creating a CRD for a commitment Cortex has never tracked.
+func (s *Syncer) updateCommittedResourceIfExists(ctx context.Context, logger logr.Logger, state *CommitmentState) (controllerutil.OperationResult, error) {
+ cr := &v1alpha1.CommittedResource{}
+ name := "commitment-" + state.CommitmentUUID
+ if err := s.Get(ctx, client.ObjectKey{Name: name}, cr); err != nil {
+ if client.IgnoreNotFound(err) == nil {
+ logger.V(1).Info("skipping terminal state — CRD does not exist locally",
+ "commitmentUUID", state.CommitmentUUID, "state", state.State)
+ return controllerutil.OperationResultNone, nil
+ }
+ return controllerutil.OperationResultNone, err
+ }
+ s.applyCommittedResourceSpec(cr, state)
+ if err := s.Update(ctx, cr); err != nil {
+ return controllerutil.OperationResultNone, err
+ }
+ logger.V(1).Info("updated committed resource CRD (terminal state)", "name", name, "state", state.State)
+ return controllerutil.OperationResultUpdated, nil
+}
+
+// isTerminalCommitment returns true when a commitment should not result in new Reservation
+// slots: either its Limes state is already terminal, or its EndTime has passed.
+func isTerminalCommitment(state *CommitmentState) bool {
+ switch state.State {
+ case v1alpha1.CommitmentStatusSuperseded, v1alpha1.CommitmentStatusExpired:
+ return true
+ }
+ return state.EndTime != nil && !state.EndTime.After(time.Now())
+}
diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go
index 853518f81..7e13478bb 100644
--- a/internal/scheduling/reservations/commitments/syncer_monitor.go
+++ b/internal/scheduling/reservations/commitments/syncer_monitor.go
@@ -14,60 +14,55 @@ const (
SkipReasonInvalidResource = "invalid_resource_name"
SkipReasonEmptyUUID = "empty_uuid"
SkipReasonNonCompute = "non_compute"
- SkipReasonNonActive = "non_active"
)
// SyncerMonitor provides metrics for the commitment syncer.
type SyncerMonitor struct {
- // Sync lifecycle
- syncRuns prometheus.Counter
- syncErrors prometheus.Counter
-
- // Commitment processing
- commitmentsTotal prometheus.Counter // all commitments seen from Limes
- commitmentsProcessed prometheus.Counter // successfully processed
- commitmentsSkipped *prometheus.CounterVec // skipped with reason label
-
- // Reservation changes
- reservationsCreated prometheus.Counter
- reservationsDeleted prometheus.Counter
- reservationsRepaired prometheus.Counter
+ syncErrors prometheus.Counter
+ syncDuration prometheus.Histogram
+ limesCommitmentsActive prometheus.Gauge
+ staleCRs prometheus.Gauge
+ commitmentsSkipped *prometheus.CounterVec
+ crCreates prometheus.Counter
+ crUpdates prometheus.Counter
+ crDeletes prometheus.Counter
}
// NewSyncerMonitor creates a new monitor with Prometheus metrics.
func NewSyncerMonitor() *SyncerMonitor {
m := &SyncerMonitor{
- syncRuns: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_runs_total",
- Help: "Total number of commitment syncer runs",
- }),
syncErrors: prometheus.NewCounter(prometheus.CounterOpts{
Name: "cortex_committed_resource_syncer_errors_total",
- Help: "Total number of commitment syncer errors",
+ Help: "Total number of commitment syncer runs that failed",
+ }),
+ syncDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
+ Name: "cortex_committed_resource_syncer_duration_seconds",
+ Help: "Duration of each commitment syncer run",
+ Buckets: []float64{0.5, 1, 5, 10, 30, 60, 120},
}),
- commitmentsTotal: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_commitments_total",
- Help: "Total number of commitments seen from Limes",
+ limesCommitmentsActive: prometheus.NewGauge(prometheus.GaugeOpts{
+ Name: "cortex_committed_resource_syncer_limes_commitments_active",
+ Help: "Number of commitments from Limes that passed filtering and should have CR CRDs",
}),
- commitmentsProcessed: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_commitments_processed_total",
- Help: "Total number of commitments successfully processed",
+ staleCRs: prometheus.NewGauge(prometheus.GaugeOpts{
+ Name: "cortex_committed_resource_syncer_crd_unmatched",
+ Help: "Number of CommittedResource CRDs present locally but absent from Limes",
}),
commitmentsSkipped: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "cortex_committed_resource_syncer_commitments_skipped_total",
Help: "Total number of commitments skipped during sync",
}, []string{"reason"}),
- reservationsCreated: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_reservations_created_total",
- Help: "Total number of reservations created during sync",
+ crCreates: prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "cortex_committed_resource_syncer_cr_creates_total",
+ Help: "Total number of CommittedResource CRDs created by the syncer",
}),
- reservationsDeleted: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_reservations_deleted_total",
- Help: "Total number of reservations deleted during sync",
+ crUpdates: prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "cortex_committed_resource_syncer_cr_updates_total",
+ Help: "Total number of CommittedResource CRDs updated by the syncer",
}),
- reservationsRepaired: prometheus.NewCounter(prometheus.CounterOpts{
- Name: "cortex_committed_resource_syncer_reservations_repaired_total",
- Help: "Total number of reservations repaired during sync (wrong metadata)",
+ crDeletes: prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "cortex_committed_resource_syncer_cr_deletes_total",
+ Help: "Total number of CommittedResource CRDs deleted by the syncer (expired GC)",
}),
}
@@ -78,7 +73,6 @@ func NewSyncerMonitor() *SyncerMonitor {
SkipReasonInvalidResource,
SkipReasonEmptyUUID,
SkipReasonNonCompute,
- SkipReasonNonActive,
} {
m.commitmentsSkipped.WithLabelValues(reason)
}
@@ -86,66 +80,58 @@ func NewSyncerMonitor() *SyncerMonitor {
return m
}
-// RecordSyncRun records a syncer run.
-func (m *SyncerMonitor) RecordSyncRun() {
- m.syncRuns.Inc()
+func (m *SyncerMonitor) RecordError() {
+ m.syncErrors.Inc()
}
-// RecordSyncError records a syncer error.
-func (m *SyncerMonitor) RecordSyncError() {
- m.syncErrors.Inc()
+func (m *SyncerMonitor) RecordDuration(seconds float64) {
+ m.syncDuration.Observe(seconds)
}
-// RecordCommitmentSeen records a commitment seen from Limes.
-func (m *SyncerMonitor) RecordCommitmentSeen() {
- m.commitmentsTotal.Inc()
+func (m *SyncerMonitor) SetLimesCommitmentsActive(count int) {
+ m.limesCommitmentsActive.Set(float64(count))
}
-// RecordCommitmentProcessed records a commitment successfully processed.
-func (m *SyncerMonitor) RecordCommitmentProcessed() {
- m.commitmentsProcessed.Inc()
+func (m *SyncerMonitor) RecordStaleCRs(count int) {
+ m.staleCRs.Set(float64(count))
}
-// RecordCommitmentSkipped records a commitment skipped with a reason.
func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) {
m.commitmentsSkipped.WithLabelValues(reason).Inc()
}
-// RecordReservationsCreated records reservations created.
-func (m *SyncerMonitor) RecordReservationsCreated(count int) {
- m.reservationsCreated.Add(float64(count))
+func (m *SyncerMonitor) RecordCRCreates(count int) {
+ m.crCreates.Add(float64(count))
}
-// RecordReservationsDeleted records reservations deleted.
-func (m *SyncerMonitor) RecordReservationsDeleted(count int) {
- m.reservationsDeleted.Add(float64(count))
+func (m *SyncerMonitor) RecordCRUpdates(count int) {
+ m.crUpdates.Add(float64(count))
}
-// RecordReservationsRepaired records reservations repaired.
-func (m *SyncerMonitor) RecordReservationsRepaired(count int) {
- m.reservationsRepaired.Add(float64(count))
+func (m *SyncerMonitor) RecordCRDeletes(count int) {
+ m.crDeletes.Add(float64(count))
}
// Describe implements prometheus.Collector.
func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) {
- m.syncRuns.Describe(ch)
m.syncErrors.Describe(ch)
- m.commitmentsTotal.Describe(ch)
- m.commitmentsProcessed.Describe(ch)
+ m.syncDuration.Describe(ch)
+ m.limesCommitmentsActive.Describe(ch)
+ m.staleCRs.Describe(ch)
m.commitmentsSkipped.Describe(ch)
- m.reservationsCreated.Describe(ch)
- m.reservationsDeleted.Describe(ch)
- m.reservationsRepaired.Describe(ch)
+ m.crCreates.Describe(ch)
+ m.crUpdates.Describe(ch)
+ m.crDeletes.Describe(ch)
}
// Collect implements prometheus.Collector.
func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) {
- m.syncRuns.Collect(ch)
m.syncErrors.Collect(ch)
- m.commitmentsTotal.Collect(ch)
- m.commitmentsProcessed.Collect(ch)
+ m.syncDuration.Collect(ch)
+ m.limesCommitmentsActive.Collect(ch)
+ m.staleCRs.Collect(ch)
m.commitmentsSkipped.Collect(ch)
- m.reservationsCreated.Collect(ch)
- m.reservationsDeleted.Collect(ch)
- m.reservationsRepaired.Collect(ch)
+ m.crCreates.Collect(ch)
+ m.crUpdates.Collect(ch)
+ m.crDeletes.Collect(ch)
}
diff --git a/internal/scheduling/reservations/commitments/syncer_monitor_test.go b/internal/scheduling/reservations/commitments/syncer_monitor_test.go
index 853524a70..d973a95e9 100644
--- a/internal/scheduling/reservations/commitments/syncer_monitor_test.go
+++ b/internal/scheduling/reservations/commitments/syncer_monitor_test.go
@@ -36,14 +36,12 @@ func TestSyncerMonitor_MetricsRegistration(t *testing.T) {
name string
metricType dto.MetricType
}{
- {"cortex_committed_resource_syncer_runs_total", dto.MetricType_COUNTER},
{"cortex_committed_resource_syncer_errors_total", dto.MetricType_COUNTER},
- {"cortex_committed_resource_syncer_commitments_total", dto.MetricType_COUNTER},
- {"cortex_committed_resource_syncer_commitments_processed_total", dto.MetricType_COUNTER},
{"cortex_committed_resource_syncer_commitments_skipped_total", dto.MetricType_COUNTER},
- {"cortex_committed_resource_syncer_reservations_created_total", dto.MetricType_COUNTER},
- {"cortex_committed_resource_syncer_reservations_deleted_total", dto.MetricType_COUNTER},
- {"cortex_committed_resource_syncer_reservations_repaired_total", dto.MetricType_COUNTER},
+ {"cortex_committed_resource_syncer_cr_creates_total", dto.MetricType_COUNTER},
+ {"cortex_committed_resource_syncer_cr_updates_total", dto.MetricType_COUNTER},
+ {"cortex_committed_resource_syncer_cr_deletes_total", dto.MetricType_COUNTER},
+ {"cortex_committed_resource_syncer_crd_unmatched", dto.MetricType_GAUGE},
}
for _, tc := range cases {
@@ -100,7 +98,6 @@ func TestSyncerMonitor_SkipReasonsPreInitialized(t *testing.T) {
SkipReasonInvalidResource,
SkipReasonEmptyUUID,
SkipReasonNonCompute,
- SkipReasonNonActive,
} {
if !presentReasons[reason] {
t.Errorf("skip reason %q not pre-initialized in commitments_skipped_total", reason)
diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go
index e30f286c7..28a464d1e 100644
--- a/internal/scheduling/reservations/commitments/syncer_test.go
+++ b/internal/scheduling/reservations/commitments/syncer_test.go
@@ -7,14 +7,15 @@ import (
"context"
"sort"
"testing"
+ "time"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
- hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/prometheus/client_golang/prometheus"
+ dto "github.com/prometheus/client_model/go"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
- ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
@@ -273,44 +274,31 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) {
return
}
- // Verify that reservations were created
- var reservations v1alpha1.ReservationList
- err = k8sClient.List(context.Background(), &reservations)
- if err != nil {
- t.Errorf("Failed to list reservations: %v", err)
- return
+ // Verify one CommittedResource CRD was created with the correct spec
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
-
- // Should have 2 reservations (Amount = 2, each for smallest flavor)
- if len(reservations.Items) != 2 {
- t.Errorf("Expected 2 reservations, got %d", len(reservations.Items))
- return
+ if len(crList.Items) != 1 {
+ t.Fatalf("Expected 1 CommittedResource, got %d", len(crList.Items))
}
-
- // Verify the first reservation
- res := reservations.Items[0]
- if res.Spec.CommittedResourceReservation == nil {
- t.Errorf("Expected CommittedResourceReservation to be set")
- return
+ cr := crList.Items[0]
+ if cr.Name != "commitment-12345-67890-abcdef" {
+ t.Errorf("Expected name commitment-12345-67890-abcdef, got %s", cr.Name)
}
- if res.Spec.CommittedResourceReservation.ProjectID != "test-project-1" {
- t.Errorf("Expected project ID test-project-1, got %v", res.Spec.CommittedResourceReservation.ProjectID)
+ if cr.Spec.ProjectID != "test-project-1" {
+ t.Errorf("Expected projectID test-project-1, got %s", cr.Spec.ProjectID)
}
-
- if res.Spec.CommittedResourceReservation.ResourceGroup != "test_group_v1" {
- t.Errorf("Expected resource group test_group_v1, got %v", res.Spec.CommittedResourceReservation.ResourceGroup)
+ if cr.Spec.FlavorGroupName != "test_group_v1" {
+ t.Errorf("Expected flavorGroupName test_group_v1, got %s", cr.Spec.FlavorGroupName)
}
-
- // Check resource values - should be sized for the flavor that fits
- // With 2048MB total capacity, we can fit 2x 1024MB flavors
- expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes
- if !res.Spec.Resources[hv1.ResourceMemory].Equal(expectedMemory) {
- t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources[hv1.ResourceMemory])
+ if cr.Spec.State != v1alpha1.CommitmentStatusConfirmed {
+ t.Errorf("Expected state confirmed, got %s", cr.Spec.State)
}
-
- expectedVCPUs := resource.MustParse("2")
- if !res.Spec.Resources[hv1.ResourceCPU].Equal(expectedVCPUs) {
- t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources[hv1.ResourceCPU])
+ // Amount = 2 slots × 1024 MiB = 2 GiB
+ expectedAmount := resource.NewQuantity(2*1024*1024*1024, resource.BinarySI)
+ if !cr.Spec.Amount.Equal(*expectedAmount) {
+ t.Errorf("Expected amount %v, got %v", expectedAmount, cr.Spec.Amount)
}
}
@@ -320,7 +308,6 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) {
t.Fatalf("Failed to add scheme: %v", err)
}
- // Create flavor group knowledge CRD
flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
"new_group_v1": {
LargestFlavorName: "new-flavor",
@@ -332,37 +319,26 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) {
},
})
- // Create an existing reservation with mismatched project/flavor group
- // The ReservationManager will delete this and create a new one
- existingReservation := &v1alpha1.Reservation{
- ObjectMeta: ctrl.ObjectMeta{
- Name: "commitment-12345-67890-abcdef-0",
- Labels: map[string]string{
- v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
- },
- },
- Spec: v1alpha1.ReservationSpec{
- Type: v1alpha1.ReservationTypeCommittedResource,
- CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
- CommitmentUUID: "12345-67890-abcdef",
- ProjectID: "old-project",
- ResourceName: "old-flavor",
- ResourceGroup: "old_group",
- Creator: CreatorValue,
- },
- Resources: map[hv1.ResourceName]resource.Quantity{
- hv1.ResourceMemory: resource.MustParse("512Mi"),
- hv1.ResourceCPU: resource.MustParse("1"),
- },
+ // Pre-existing CommittedResource CRD with stale spec; syncer should update it.
+ existingCR := &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{Name: "commitment-12345-67890-abcdef"},
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: "12345-67890-abcdef",
+ FlavorGroupName: "old_group",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: *resource.NewQuantity(512*1024*1024, resource.BinarySI),
+ ProjectID: "old-project",
+ DomainID: "old-domain",
+ AvailabilityZone: "az1",
+ State: v1alpha1.CommitmentStatusConfirmed,
},
}
k8sClient := fake.NewClientBuilder().
WithScheme(scheme).
- WithObjects(existingReservation, flavorGroupsKnowledge).
+ WithObjects(existingCR, flavorGroupsKnowledge).
Build()
- // Create mock commitment that will replace the existing reservation
mockCommitments := []Commitment{
{
ID: 1,
@@ -387,57 +363,29 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) {
return result, nil
},
listProjectsFunc: func(ctx context.Context) ([]Project, error) {
- return []Project{
- {ID: "new-project", DomainID: "new-domain", Name: "New Project"},
- }, nil
- },
- listServersFunc: func(ctx context.Context, projects ...Project) (map[string][]Server, error) {
- return map[string][]Server{}, nil // No active servers
- },
- initFunc: func(ctx context.Context, client client.Client, conf SyncerConfig) error {
- // No-op for init
- return nil
+ return []Project{{ID: "new-project", DomainID: "new-domain"}}, nil
},
}
- syncer := &Syncer{
- CommitmentsClient: mockClient,
- Client: k8sClient,
- }
-
- err := syncer.SyncReservations(context.Background())
- if err != nil {
- t.Errorf("SyncReservations() error = %v", err)
- return
- }
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient}
- // Verify that reservations were updated (old one deleted, new one created)
- // The new reservation will be at index 0 since the old one was deleted first
- var reservations v1alpha1.ReservationList
- err = k8sClient.List(context.Background(), &reservations)
- if err != nil {
- t.Errorf("Failed to list reservations: %v", err)
- return
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
}
- if len(reservations.Items) != 1 {
- t.Errorf("Expected 1 reservation, got %d", len(reservations.Items))
- return
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
-
- newReservation := reservations.Items[0]
-
- // Verify the new reservation has correct values
- if newReservation.Spec.CommittedResourceReservation == nil {
- t.Errorf("Expected CommittedResourceReservation to be set")
- return
+ if len(crList.Items) != 1 {
+ t.Fatalf("Expected 1 CommittedResource, got %d", len(crList.Items))
}
- if newReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" {
- t.Errorf("Expected project ID new-project, got %v", newReservation.Spec.CommittedResourceReservation.ProjectID)
+ cr := crList.Items[0]
+ if cr.Spec.ProjectID != "new-project" {
+ t.Errorf("Expected projectID new-project, got %s", cr.Spec.ProjectID)
}
-
- if newReservation.Spec.CommittedResourceReservation.ResourceGroup != "new_group_v1" {
- t.Errorf("Expected resource group new_group_v1, got %v", newReservation.Spec.CommittedResourceReservation.ResourceGroup)
+ if cr.Spec.FlavorGroupName != "new_group_v1" {
+ t.Errorf("Expected flavorGroupName new_group_v1, got %s", cr.Spec.FlavorGroupName)
}
}
@@ -511,19 +459,14 @@ func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) {
return
}
- // Verify that NO reservations were created due to unit mismatch
- // The commitment is skipped and Cortex trusts existing CRDs
- var reservations v1alpha1.ReservationList
- err = k8sClient.List(context.Background(), &reservations)
- if err != nil {
- t.Errorf("Failed to list reservations: %v", err)
- return
+ // Verify that NO CommittedResource CRDs were created due to unit mismatch.
+ // The commitment is skipped and Cortex trusts existing CRDs.
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
-
- // Should have 0 reservations - commitment is skipped due to unit mismatch
- // Cortex waits for Limes to update the unit before processing
- if len(reservations.Items) != 0 {
- t.Errorf("Expected 0 reservations (commitment skipped due to unit mismatch), got %d", len(reservations.Items))
+ if len(crList.Items) != 0 {
+ t.Errorf("Expected 0 CommittedResource CRDs (commitment skipped due to unit mismatch), got %d", len(crList.Items))
}
}
@@ -595,16 +538,13 @@ func TestSyncer_SyncReservations_UnitMatch(t *testing.T) {
return
}
- // Verify that reservations were created
- var reservations v1alpha1.ReservationList
- err = k8sClient.List(context.Background(), &reservations)
- if err != nil {
- t.Errorf("Failed to list reservations: %v", err)
- return
+ // Verify that one CommittedResource CRD was created
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
-
- if len(reservations.Items) != 2 {
- t.Errorf("Expected 2 reservations, got %d", len(reservations.Items))
+ if len(crList.Items) != 1 {
+ t.Errorf("Expected 1 CommittedResource CRD, got %d", len(crList.Items))
}
}
@@ -680,16 +620,13 @@ func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) {
return
}
- // Verify that no reservations were created due to empty UUID
- var reservations v1alpha1.ReservationList
- err = k8sClient.List(context.Background(), &reservations)
- if err != nil {
- t.Errorf("Failed to list reservations: %v", err)
- return
+ // Verify that no CommittedResource CRDs were created due to empty UUID
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
-
- if len(reservations.Items) != 0 {
- t.Errorf("Expected 0 reservations due to empty UUID, got %d", len(reservations.Items))
+ if len(crList.Items) != 0 {
+ t.Errorf("Expected 0 CommittedResource CRDs due to empty UUID, got %d", len(crList.Items))
}
}
@@ -711,16 +648,16 @@ func TestSyncer_SyncReservations_StatusFilter(t *testing.T) {
})
tests := []struct {
- name string
- status string
- expectReservation bool
+ name string
+ status string
+ expectCR bool
}{
- {"confirmed is processed", "confirmed", true},
- {"guaranteed is processed", "guaranteed", true},
- {"planned is skipped", "planned", false},
- {"pending is skipped", "pending", false},
- {"superseded is skipped", "superseded", false},
- {"expired is skipped", "expired", false},
+ {"confirmed creates CR", "confirmed", true},
+ {"guaranteed creates CR", "guaranteed", true},
+ {"planned creates CR", "planned", true},
+ {"pending creates CR", "pending", true},
+ {"superseded does not create CR", "superseded", false},
+ {"expired does not create CR", "expired", false},
{"empty status is skipped", "", false},
}
@@ -769,17 +706,288 @@ func TestSyncer_SyncReservations_StatusFilter(t *testing.T) {
t.Fatalf("SyncReservations() error = %v", err)
}
- var reservations v1alpha1.ReservationList
- if err := k8sClient.List(context.Background(), &reservations); err != nil {
- t.Fatalf("Failed to list reservations: %v", err)
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
}
- if tc.expectReservation && len(reservations.Items) == 0 {
- t.Errorf("status=%q: expected reservation to be created, got none", tc.status)
+ if tc.expectCR && len(crList.Items) == 0 {
+ t.Errorf("status=%q: expected CommittedResource CRD to be created, got none", tc.status)
}
- if !tc.expectReservation && len(reservations.Items) != 0 {
- t.Errorf("status=%q: expected no reservation, got %d", tc.status, len(reservations.Items))
+ if !tc.expectCR && len(crList.Items) != 0 {
+ t.Errorf("status=%q: expected no CommittedResource CRD, got %d", tc.status, len(crList.Items))
}
})
}
}
+
+func TestSyncer_SyncReservations_StaleCRCount(t *testing.T) {
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("Failed to add scheme: %v", err)
+ }
+
+ flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
+ "test_group_v1": {
+ LargestFlavorName: "test-flavor",
+ LargestFlavorVCPUs: 2,
+ LargestFlavorMemoryMB: 1024,
+ SmallestFlavorName: "test-flavor",
+ SmallestFlavorVCPUs: 2,
+ SmallestFlavorMemoryMB: 1024,
+ },
+ })
+
+ // Pre-existing CRD whose commitment no longer appears in Limes
+ staleCR := &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{Name: "commitment-stale-uuid-1234"},
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: "stale-uuid-1234",
+ FlavorGroupName: "test_group_v1",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI),
+ ProjectID: "test-project",
+ DomainID: "test-domain",
+ AvailabilityZone: "az1",
+ State: v1alpha1.CommitmentStatusConfirmed,
+ SchedulingDomain: v1alpha1.SchedulingDomainNova,
+ },
+ }
+
+ k8sClient := fake.NewClientBuilder().
+ WithScheme(scheme).
+ WithObjects(staleCR, flavorGroupsKnowledge).
+ Build()
+
+ // Limes returns no commitments (stale-uuid-1234 is gone)
+ mockClient := &mockCommitmentsClient{
+ listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
+ return map[string]Commitment{}, nil
+ },
+ listProjectsFunc: func(ctx context.Context) ([]Project, error) {
+ return []Project{{ID: "test-project", DomainID: "test-domain"}}, nil
+ },
+ }
+
+ monitor := NewSyncerMonitor()
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient, monitor: monitor}
+
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
+ }
+
+ // Stale CRD must still exist (syncer does not delete)
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list committed resources: %v", err)
+ }
+ if len(crList.Items) != 1 {
+ t.Errorf("Expected stale CRD to be preserved, got %d CRDs", len(crList.Items))
+ }
+
+ // Gauge must reflect the stale count
+ ch := make(chan prometheus.Metric, 10)
+ monitor.staleCRs.Collect(ch)
+ close(ch)
+ m := <-ch
+ var dto dto.Metric
+ if err := m.Write(&dto); err != nil {
+ t.Fatalf("failed to read metric: %v", err)
+ }
+ if got := dto.GetGauge().GetValue(); got != 1 {
+ t.Errorf("Expected staleCRs gauge=1, got %v", got)
+ }
+}
+
+func TestSyncer_SyncReservations_TerminalState_NoCRDExists(t *testing.T) {
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("Failed to add scheme: %v", err)
+ }
+
+ flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
+ "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024,
+ LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024},
+ })
+ k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(flavorGroupsKnowledge).Build()
+
+ for _, status := range []string{"superseded", "expired"} {
+ t.Run(status, func(t *testing.T) {
+ mockClient := &mockCommitmentsClient{
+ listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
+ return map[string]Commitment{
+ "term-uuid-1234": {
+ ID: 1, UUID: "term-uuid-1234", ServiceType: "compute",
+ ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1",
+ Amount: 1, Status: status, ProjectID: "p", DomainID: "d",
+ },
+ }, nil
+ },
+ listProjectsFunc: func(ctx context.Context) ([]Project, error) {
+ return []Project{{ID: "p", DomainID: "d"}}, nil
+ },
+ }
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient}
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
+ }
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list: %v", err)
+ }
+ if len(crList.Items) != 0 {
+ t.Errorf("status=%q: expected no CRD to be created, got %d", status, len(crList.Items))
+ }
+ })
+ }
+}
+
+func TestSyncer_SyncReservations_TerminalState_ExistingCRDUpdated(t *testing.T) {
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("Failed to add scheme: %v", err)
+ }
+
+ flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
+ "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024,
+ LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024},
+ })
+
+ existingCR := &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{Name: "commitment-term-uuid-1234"},
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: "term-uuid-1234", FlavorGroupName: "test_group_v1",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI),
+ ProjectID: "p", DomainID: "d", AvailabilityZone: "az1",
+ State: v1alpha1.CommitmentStatusConfirmed,
+ },
+ }
+
+ k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR, flavorGroupsKnowledge).Build()
+
+ mockClient := &mockCommitmentsClient{
+ listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
+ return map[string]Commitment{
+ "term-uuid-1234": {
+ ID: 1, UUID: "term-uuid-1234", ServiceType: "compute",
+ ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1",
+ Amount: 1, Status: "superseded", ProjectID: "p", DomainID: "d",
+ },
+ }, nil
+ },
+ listProjectsFunc: func(ctx context.Context) ([]Project, error) {
+ return []Project{{ID: "p", DomainID: "d"}}, nil
+ },
+ }
+
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient}
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
+ }
+
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list: %v", err)
+ }
+ if len(crList.Items) != 1 {
+ t.Fatalf("Expected CRD to be preserved, got %d", len(crList.Items))
+ }
+ if crList.Items[0].Spec.State != v1alpha1.CommitmentStatusSuperseded {
+ t.Errorf("Expected state superseded, got %s", crList.Items[0].Spec.State)
+ }
+}
+
+func TestSyncer_SyncReservations_ExpiredByTime_NoCRDCreated(t *testing.T) {
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("Failed to add scheme: %v", err)
+ }
+
+ flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
+ "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024,
+ LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024},
+ })
+ k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(flavorGroupsKnowledge).Build()
+
+ pastTime := uint64(1) // Unix epoch — well in the past
+ mockClient := &mockCommitmentsClient{
+ listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
+ return map[string]Commitment{
+ "exp-uuid-1234": {
+ ID: 1, UUID: "exp-uuid-1234", ServiceType: "compute",
+ ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1",
+ Amount: 1, Status: "confirmed", ExpiresAt: pastTime,
+ ProjectID: "p", DomainID: "d",
+ },
+ }, nil
+ },
+ listProjectsFunc: func(ctx context.Context) ([]Project, error) {
+ return []Project{{ID: "p", DomainID: "d"}}, nil
+ },
+ }
+
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient}
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
+ }
+
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list: %v", err)
+ }
+ if len(crList.Items) != 0 {
+ t.Errorf("Expected no CRD created for past-expiry confirmed commitment, got %d", len(crList.Items))
+ }
+}
+
+func TestSyncer_SyncReservations_GC_ExpiredEndTime(t *testing.T) {
+ scheme := runtime.NewScheme()
+ if err := v1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatalf("Failed to add scheme: %v", err)
+ }
+
+ flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{
+ "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024,
+ LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024},
+ })
+
+ pastTime := metav1.NewTime(time.Now().Add(-time.Hour))
+ expiredCR := &v1alpha1.CommittedResource{
+ ObjectMeta: metav1.ObjectMeta{Name: "commitment-gc-uuid-1234"},
+ Spec: v1alpha1.CommittedResourceSpec{
+ CommitmentUUID: "gc-uuid-1234", FlavorGroupName: "test_group_v1",
+ ResourceType: v1alpha1.CommittedResourceTypeMemory,
+ Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI),
+ ProjectID: "p", DomainID: "d", AvailabilityZone: "az1",
+ State: v1alpha1.CommitmentStatusConfirmed,
+ EndTime: &pastTime,
+ SchedulingDomain: v1alpha1.SchedulingDomainNova,
+ },
+ }
+
+ k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(expiredCR, flavorGroupsKnowledge).Build()
+
+ // Limes no longer returns this commitment
+ mockClient := &mockCommitmentsClient{
+ listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) {
+ return map[string]Commitment{}, nil
+ },
+ listProjectsFunc: func(ctx context.Context) ([]Project, error) {
+ return []Project{{ID: "p", DomainID: "d"}}, nil
+ },
+ }
+
+ syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient}
+ if err := syncer.SyncReservations(context.Background()); err != nil {
+ t.Fatalf("SyncReservations() error = %v", err)
+ }
+
+ var crList v1alpha1.CommittedResourceList
+ if err := k8sClient.List(context.Background(), &crList); err != nil {
+ t.Fatalf("Failed to list: %v", err)
+ }
+ if len(crList.Items) != 0 {
+ t.Errorf("Expected expired CRD to be GC'd, got %d CRDs", len(crList.Items))
+ }
+}
diff --git a/tools/visualize-committed-resources/main.go b/tools/visualize-committed-resources/main.go
new file mode 100644
index 000000000..afa16a372
--- /dev/null
+++ b/tools/visualize-committed-resources/main.go
@@ -0,0 +1,596 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+// Tool to visualize CommittedResource CRDs and their child Reservation slots.
+//
+// Usage:
+//
+// go run tools/visualize-committed-resources/main.go [flags]
+//
+// Flags:
+//
+// --context=ctx Kubernetes context (default: current context)
+// --filter-project=id Show only CRs for this project ID (substring match)
+// --filter-az=az Show only CRs in this availability zone (substring match)
+// --filter-group=name Show only CRs for this flavor group (substring match)
+// --filter-state=state Show only CRs in this state (e.g. confirmed, reserving)
+// --active Shorthand: show only confirmed/guaranteed CRs
+// --views=v1,v2,... Views to show (default: all). Available: summary, commitments, reservations, allocations
+// --hide=v1,v2,... Views to hide (applied after --views)
+// --watch=interval Refresh interval (e.g. 2s, 5s). Clears screen between refreshes.
+package main
+
+import (
+ "context"
+ "flag"
+ "fmt"
+ "os"
+ "sort"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ apimeta "k8s.io/apimachinery/pkg/api/meta"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+ "k8s.io/client-go/tools/clientcmd"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/client/config"
+)
+
+var scheme = runtime.NewScheme()
+
+func init() {
+ utilruntime.Must(v1alpha1.AddToScheme(scheme))
+}
+
+// ── ANSI colours ──────────────────────────────────────────────────────────────
+
+const (
+ colReset = "\033[0m"
+ colBold = "\033[1m"
+ colGreen = "\033[32m"
+ colYellow = "\033[33m"
+ colRed = "\033[31m"
+ colCyan = "\033[36m"
+ colGray = "\033[90m"
+)
+
+func green(s string) string { return colGreen + s + colReset }
+func yellow(s string) string { return colYellow + s + colReset }
+func red(s string) string { return colRed + s + colReset }
+func cyan(s string) string { return colCyan + s + colReset }
+func gray(s string) string { return colGray + s + colReset }
+func bold(s string) string { return colBold + s + colReset }
+
+// ── Views ─────────────────────────────────────────────────────────────────────
+
+const (
+ viewSummary = "summary"
+ viewCommitments = "commitments"
+ viewReservations = "reservations"
+ viewAllocations = "allocations"
+)
+
+var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations}
+
+type viewSet map[string]bool
+
+func parseViews(s string) viewSet {
+ vs := make(viewSet)
+ if s == "all" || s == "" {
+ for _, v := range allViews {
+ vs[v] = true
+ }
+ return vs
+ }
+ for _, v := range strings.Split(s, ",") {
+ vs[strings.TrimSpace(v)] = true
+ }
+ return vs
+}
+
+func (vs viewSet) hide(s string) {
+ if s == "" {
+ return
+ }
+ for _, v := range strings.Split(s, ",") {
+ delete(vs, strings.TrimSpace(v))
+ }
+}
+
+func (vs viewSet) has(v string) bool { return vs[v] }
+
+// ── k8s client ────────────────────────────────────────────────────────────────
+
+func newClient(contextName string) (client.Client, error) {
+ if contextName == "" {
+ c, err := config.GetConfig()
+ if err != nil {
+ return nil, fmt.Errorf("getting kubeconfig: %w", err)
+ }
+ return client.New(c, client.Options{Scheme: scheme})
+ }
+ loadingRules := clientcmd.NewDefaultClientConfigLoadingRules()
+ kubeConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
+ loadingRules,
+ &clientcmd.ConfigOverrides{CurrentContext: contextName},
+ )
+ c, err := kubeConfig.ClientConfig()
+ if err != nil {
+ return nil, fmt.Errorf("getting kubeconfig for context %q: %w", contextName, err)
+ }
+ return client.New(c, client.Options{Scheme: scheme})
+}
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+func printHeader(title string) {
+ line := strings.Repeat("─", 80)
+ fmt.Println()
+ fmt.Println(bold(line))
+ fmt.Printf("%s %s\n", bold("▶"), bold(title))
+ fmt.Println(bold(line))
+}
+
+func truncate(s string, n int) string {
+ if len(s) <= n {
+ return s
+ }
+ return s[:n-1] + "…"
+}
+
+func age(t *metav1.Time) string {
+ if t == nil {
+ return gray("—")
+ }
+ d := time.Since(t.Time).Round(time.Second)
+ switch {
+ case d < time.Minute:
+ return fmt.Sprintf("%ds", int(d.Seconds()))
+ case d < time.Hour:
+ return fmt.Sprintf("%dm", int(d.Minutes()))
+ case d < 24*time.Hour:
+ return fmt.Sprintf("%dh", int(d.Hours()))
+ default:
+ return fmt.Sprintf("%dd", int(d.Hours()/24))
+ }
+}
+
+func crReadyStatus(cr v1alpha1.CommittedResource) string {
+ cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ return gray("unknown")
+ }
+ switch cond.Reason {
+ case v1alpha1.CommittedResourceReasonAccepted:
+ return green("Accepted")
+ case v1alpha1.CommittedResourceReasonRejected:
+ return red("Rejected")
+ case v1alpha1.CommittedResourceReasonReserving:
+ return yellow("Reserving")
+ case v1alpha1.CommittedResourceReasonPlanned:
+ return gray("Planned")
+ default:
+ return yellow(cond.Reason)
+ }
+}
+
+func resReadyStatus(res v1alpha1.Reservation) string {
+ cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+ if cond == nil {
+ return gray("pending")
+ }
+ if cond.Status == metav1.ConditionTrue {
+ return green("Ready")
+ }
+ return red("NotReady: " + truncate(cond.Message, 40))
+}
+
+func stateColour(state v1alpha1.CommitmentStatus) string {
+ switch state {
+ case v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommitmentStatusGuaranteed:
+ return green(string(state))
+ case v1alpha1.CommitmentStatusPlanned, v1alpha1.CommitmentStatusPending:
+ return yellow(string(state))
+ case v1alpha1.CommitmentStatusExpired, v1alpha1.CommitmentStatusSuperseded:
+ return gray(string(state))
+ default:
+ return string(state)
+ }
+}
+
+// ── filters ───────────────────────────────────────────────────────────────────
+
+type filters struct {
+ project string
+ az string
+ group string
+ state string
+ active bool
+}
+
+func (f filters) match(cr v1alpha1.CommittedResource) bool {
+ if f.project != "" && !strings.Contains(cr.Spec.ProjectID, f.project) {
+ return false
+ }
+ if f.az != "" && !strings.Contains(cr.Spec.AvailabilityZone, f.az) {
+ return false
+ }
+ if f.group != "" && !strings.Contains(cr.Spec.FlavorGroupName, f.group) {
+ return false
+ }
+ if f.state != "" && !strings.EqualFold(string(cr.Spec.State), f.state) {
+ return false
+ }
+ if f.active {
+ s := cr.Spec.State
+ if s != v1alpha1.CommitmentStatusConfirmed && s != v1alpha1.CommitmentStatusGuaranteed {
+ return false
+ }
+ }
+ return true
+}
+
+// ── views ─────────────────────────────────────────────────────────────────────
+
+func printSummary(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation) {
+ printHeader("Summary")
+
+ byState := make(map[v1alpha1.CommitmentStatus]int)
+ byReady := map[string]int{"Accepted": 0, "Reserving": 0, "Rejected": 0, "Planned": 0, "Unknown": 0}
+ for _, cr := range crs {
+ byState[cr.Spec.State]++
+ cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+ if cond == nil {
+ byReady["Unknown"]++
+ } else {
+ byReady[cond.Reason]++
+ }
+ }
+
+ resReady, resNotReady, resPending := 0, 0, 0
+ for _, res := range reservations {
+ cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+ switch {
+ case cond == nil:
+ resPending++
+ case cond.Status == metav1.ConditionTrue:
+ resReady++
+ default:
+ resNotReady++
+ }
+ }
+
+ fmt.Printf(" CommittedResources : %s\n", bold(fmt.Sprintf("%d total", len(crs))))
+ for _, s := range []v1alpha1.CommitmentStatus{
+ v1alpha1.CommitmentStatusConfirmed,
+ v1alpha1.CommitmentStatusGuaranteed,
+ v1alpha1.CommitmentStatusPending,
+ v1alpha1.CommitmentStatusPlanned,
+ v1alpha1.CommitmentStatusExpired,
+ v1alpha1.CommitmentStatusSuperseded,
+ } {
+ if n := byState[s]; n > 0 {
+ fmt.Printf(" %-14s %d\n", string(s)+":", n)
+ }
+ }
+ fmt.Println()
+ fmt.Printf(" Ready conditions : %s accepted, %s reserving, %s rejected\n",
+ green(strconv.Itoa(byReady["Accepted"])),
+ yellow(strconv.Itoa(byReady["Reserving"])),
+ red(strconv.Itoa(byReady["Rejected"])),
+ )
+ fmt.Println()
+ fmt.Printf(" Reservation slots : %s total — %s ready, %s not-ready, %s pending\n",
+ bold(strconv.Itoa(len(reservations))),
+ green(strconv.Itoa(resReady)),
+ red(strconv.Itoa(resNotReady)),
+ yellow(strconv.Itoa(resPending)),
+ )
+}
+
+func printCommitments(crs []v1alpha1.CommittedResource) {
+ printHeader(fmt.Sprintf("CommittedResources (%d)", len(crs)))
+
+ if len(crs) == 0 {
+ fmt.Println(gray(" (none)"))
+ return
+ }
+
+ for _, cr := range crs {
+ fmt.Printf("\n %s %s\n",
+ bold(cyan(cr.Spec.CommitmentUUID)),
+ crReadyStatus(cr),
+ )
+ fmt.Printf(" project=%-36s group=%-20s az=%s\n",
+ cr.Spec.ProjectID, cr.Spec.FlavorGroupName, cr.Spec.AvailabilityZone)
+ fmt.Printf(" state=%-14s amount=%-10s accepted=%s\n",
+ stateColour(cr.Spec.State),
+ cr.Spec.Amount.String(),
+ func() string {
+ if cr.Status.AcceptedAmount == nil {
+ return gray("—")
+ }
+ return cr.Status.AcceptedAmount.String()
+ }(),
+ )
+
+ if cr.Status.UsedAmount != nil {
+ fmt.Printf(" used=%-12s\n", cr.Status.UsedAmount.String())
+ }
+
+ endStr := gray("no expiry")
+ if cr.Spec.EndTime != nil {
+ remaining := time.Until(cr.Spec.EndTime.Time).Round(time.Minute)
+ if remaining < 0 {
+ endStr = red(fmt.Sprintf("expired %s ago", age(cr.Spec.EndTime)))
+ } else {
+ endStr = fmt.Sprintf("expires in %s (at %s)", remaining, cr.Spec.EndTime.Format(time.RFC3339))
+ }
+ }
+ fmt.Printf(" age=%-8s %s\n", age(&cr.CreationTimestamp), endStr)
+ }
+}
+
+func printReservations(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation, showAllocations bool) {
+ // Index reservations by CommitmentUUID for display under each CR.
+ byUUID := make(map[string][]v1alpha1.Reservation)
+ for _, res := range reservations {
+ if res.Spec.CommittedResourceReservation == nil {
+ continue
+ }
+ uuid := res.Spec.CommittedResourceReservation.CommitmentUUID
+ byUUID[uuid] = append(byUUID[uuid], res)
+ }
+
+ printHeader("Reservation Slots")
+
+ if len(reservations) == 0 {
+ fmt.Println(gray(" (none)"))
+ return
+ }
+
+ for _, cr := range crs {
+ slots := byUUID[cr.Spec.CommitmentUUID]
+ if len(slots) == 0 {
+ continue
+ }
+ fmt.Printf("\n %s %s %s\n",
+ bold(cyan(cr.Spec.CommitmentUUID)),
+ gray(cr.Spec.FlavorGroupName),
+ gray(fmt.Sprintf("%d slot(s)", len(slots))),
+ )
+
+ sort.Slice(slots, func(i, j int) bool {
+ return slots[i].Name < slots[j].Name
+ })
+
+ for _, res := range slots {
+ targetHost := res.Spec.TargetHost
+ statusHost := res.Status.Host
+ var hostStr string
+ switch {
+ case statusHost == "":
+ hostStr = yellow(targetHost) + gray(" (not yet placed)")
+ case statusHost != targetHost:
+ hostStr = red(fmt.Sprintf("target=%s status=%s (migrating?)", targetHost, statusHost))
+ default:
+ hostStr = green(targetHost)
+ }
+
+ genOK := ""
+ if s := res.Status.CommittedResourceReservation; s != nil {
+ spec := res.Spec.CommittedResourceReservation
+ if spec != nil && s.ObservedParentGeneration != spec.ParentGeneration {
+ genOK = yellow(fmt.Sprintf(" [gen: spec=%d observed=%d]",
+ spec.ParentGeneration, s.ObservedParentGeneration))
+ }
+ }
+
+ resources := ""
+ var resourcesSb391 strings.Builder
+ for rname, qty := range res.Spec.Resources {
+ fmt.Fprintf(&resourcesSb391, "%s=%s ", rname, qty.String())
+ }
+ resources += resourcesSb391.String()
+
+ fmt.Printf(" %s host=%s %s %s%s\n",
+ truncate(res.Name, 40),
+ hostStr,
+ resReadyStatus(res),
+ gray(strings.TrimSpace(resources)),
+ genOK,
+ )
+
+ if showAllocations {
+ specAllocs := 0
+ statusAllocs := 0
+ if res.Spec.CommittedResourceReservation != nil {
+ specAllocs = len(res.Spec.CommittedResourceReservation.Allocations)
+ }
+ if res.Status.CommittedResourceReservation != nil {
+ statusAllocs = len(res.Status.CommittedResourceReservation.Allocations)
+ }
+
+ if specAllocs > 0 || statusAllocs > 0 {
+ fmt.Printf(" allocations: spec=%d confirmed=%d\n", specAllocs, statusAllocs)
+ if res.Spec.CommittedResourceReservation != nil {
+ statusAlloc := map[string]string{}
+ if res.Status.CommittedResourceReservation != nil {
+ statusAlloc = res.Status.CommittedResourceReservation.Allocations
+ }
+ for vmUUID, alloc := range res.Spec.CommittedResourceReservation.Allocations {
+ resources := ""
+ var resourcesSb422 strings.Builder
+ for rname, qty := range alloc.Resources {
+ fmt.Fprintf(&resourcesSb422, "%s=%s ", rname, qty.String())
+ }
+ resources += resourcesSb422.String()
+ confirmedHost, confirmed := statusAlloc[vmUUID]
+ state := ""
+ if confirmed {
+ state = green("confirmed on " + confirmedHost)
+ } else {
+ state = yellow(fmt.Sprintf("spec-only (grace since %s)", age(&alloc.CreationTimestamp)))
+ }
+ fmt.Printf(" vm=%s %s %s\n",
+ truncate(vmUUID, 36),
+ gray(strings.TrimSpace(resources)),
+ state,
+ )
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+// ── main ──────────────────────────────────────────────────────────────────────
+
+func main() {
+ k8sContext := flag.String("context", "", "Kubernetes context (default: current context)")
+ filterProject := flag.String("filter-project", "", "Show only CRs for this project ID (substring match)")
+ filterAZ := flag.String("filter-az", "", "Show only CRs in this availability zone (substring match)")
+ filterGroup := flag.String("filter-group", "", "Show only CRs for this flavor group (substring match)")
+ filterState := flag.String("filter-state", "", "Show only CRs in this state")
+ activeOnly := flag.Bool("active", false, "Show only confirmed/guaranteed CRs")
+ viewsFlag := flag.String("views", "all", "Views: all, summary, commitments, reservations, allocations")
+ hideFlag := flag.String("hide", "", "Views to hide (applied after --views)")
+ watchInterval := flag.Duration("watch", 0, "Refresh interval (e.g. 2s, 5s). 0 = run once.")
+ limitFlag := flag.Int("limit", 200, "Max CRs to fetch (0 = unlimited)")
+ flag.Parse()
+
+ views := parseViews(*viewsFlag)
+ views.hide(*hideFlag)
+
+ f := filters{
+ project: *filterProject,
+ az: *filterAZ,
+ group: *filterGroup,
+ state: *filterState,
+ active: *activeOnly,
+ }
+
+ cl, err := newClient(*k8sContext)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "error: %v\n", err)
+ os.Exit(1)
+ }
+
+ ctx := context.Background()
+ var prevDigest string
+ first := true
+ for {
+ crs, reservations := fetchSnapshot(ctx, cl, f, *limitFlag)
+ if d := snapshotDigest(crs, reservations); first || d != prevDigest {
+ if !first {
+ fmt.Printf("\n%s %s %s\n",
+ bold("━━━ changed at"),
+ bold(time.Now().Format(time.RFC3339)),
+ bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"),
+ )
+ }
+ printSnapshot(crs, reservations, f, views)
+ prevDigest = d
+ first = false
+ }
+ if *watchInterval == 0 {
+ break
+ }
+ time.Sleep(*watchInterval)
+ }
+}
+
+// snapshotDigest returns a string that changes whenever any CRD is added, removed, or updated.
+func snapshotDigest(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation) string {
+ var b strings.Builder
+ for _, cr := range crs {
+ fmt.Fprintf(&b, "%s:%s ", cr.Name, cr.ResourceVersion)
+ }
+ for _, res := range reservations {
+ fmt.Fprintf(&b, "%s:%s ", res.Name, res.ResourceVersion)
+ }
+ return b.String()
+}
+
+func fetchSnapshot(ctx context.Context, cl client.Client, f filters, limit int) ([]v1alpha1.CommittedResource, []v1alpha1.Reservation) {
+ var listOpts []client.ListOption
+ if limit > 0 {
+ listOpts = append(listOpts, client.Limit(int64(limit)))
+ }
+
+ var crList v1alpha1.CommittedResourceList
+ if err := cl.List(ctx, &crList, listOpts...); err != nil {
+ fmt.Fprintf(os.Stderr, "error listing CommittedResources: %v\n", err)
+ os.Exit(1)
+ }
+
+ var resList v1alpha1.ReservationList
+ if err := cl.List(ctx, &resList, append(listOpts, client.MatchingLabels{
+ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+ })...); err != nil {
+ fmt.Fprintf(os.Stderr, "error listing Reservations: %v\n", err)
+ os.Exit(1)
+ }
+
+ if crList.Continue != "" {
+ fmt.Fprintf(os.Stderr, yellow("warning: CR list truncated at %d — use --limit=0 or a higher value to see all\n"), limit)
+ }
+ if resList.Continue != "" {
+ fmt.Fprintf(os.Stderr, yellow("warning: Reservation list truncated at %d — use --limit=0 or a higher value to see all\n"), limit)
+ }
+ var crs []v1alpha1.CommittedResource
+ for _, cr := range crList.Items {
+ if f.match(cr) {
+ crs = append(crs, cr)
+ }
+ }
+ sort.Slice(crs, func(i, j int) bool {
+ if crs[i].Spec.FlavorGroupName != crs[j].Spec.FlavorGroupName {
+ return crs[i].Spec.FlavorGroupName < crs[j].Spec.FlavorGroupName
+ }
+ return crs[i].Spec.CommitmentUUID < crs[j].Spec.CommitmentUUID
+ })
+
+ matchedUUIDs := make(map[string]bool, len(crs))
+ for _, cr := range crs {
+ matchedUUIDs[cr.Spec.CommitmentUUID] = true
+ }
+ var reservations []v1alpha1.Reservation
+ for _, res := range resList.Items {
+ if res.Spec.CommittedResourceReservation == nil {
+ continue
+ }
+ if matchedUUIDs[res.Spec.CommittedResourceReservation.CommitmentUUID] {
+ reservations = append(reservations, res)
+ }
+ }
+ return crs, reservations
+}
+
+func printSnapshot(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation, f filters, views viewSet) {
+ fmt.Printf("\n%s — %s\n",
+ bold("visualize-committed-resources"),
+ gray(time.Now().Format(time.RFC3339)),
+ )
+ if f.project != "" || f.az != "" || f.group != "" || f.state != "" || f.active {
+ fmt.Printf("%s project=%q az=%q group=%q state=%q active=%v\n",
+ gray("filters:"), f.project, f.az, f.group, f.state, f.active)
+ }
+
+ if views.has(viewSummary) {
+ printSummary(crs, reservations)
+ }
+ if views.has(viewCommitments) {
+ printCommitments(crs)
+ }
+ if views.has(viewReservations) {
+ printReservations(crs, reservations, views.has(viewAllocations))
+ }
+
+ fmt.Println()
+}
From 1680a08a8d10f25d758aaf9d25cb5dc9b5ce31f2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 08:40:48 +0000
Subject: [PATCH 42/54] Bump cortex chart appVersions to sha-c26705a8 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 3f1923a7d..7df04536e 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-d8bb12ef"
+appVersion: "sha-c26705a8"
icon: "https://example.com/icon.png"
dependencies: []
From ce1230b79d35464d1567c06f3cc1cbf007eeedf6 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Mon, 4 May 2026 10:50:57 +0200
Subject: [PATCH 43/54] Add versioned resource naming for zero-downtime PG
major upgrades (#790)
Add versioned resource naming to the cortex-postgres helm chart so that
major PostgreSQL upgrades can be rolled out without disrupting the
existing deployment. All resource names now include a -v{major} suffix
(e.g. cortex-nova-postgresql-v17), controlled by a new major value in
the chart. Bundle charts derive the postgres host from the subchart
config automatically instead of hardcoding it. When a major bump
happens, the new StatefulSet and Service spin up alongside the old one;
operators manually remove the old resources after confirming the new
instance is healthy.
Also adds a postgres-bumper agent that checks upstream
docker-library/postgres for newer versions and opens PRs automatically,
integrated into the weekly orchestrator command.
Assisted-by: Claude Code:claude-opus-4-20250514 [Bash] [Read] [Edit]
[Write] [Agent]
---
.claude/agents/postgres-bumper.md | 194 ++++++++++++++++++
.claude/commands/weekly.md | 14 +-
.../cortex-cinder/templates/secrets.yaml | 2 +-
helm/bundles/cortex-cinder/values.yaml | 3 +-
.../cortex-manila/templates/secrets.yaml | 2 +-
helm/bundles/cortex-manila/values.yaml | 3 +-
.../cortex-nova/templates/secrets.yaml | 2 +-
helm/bundles/cortex-nova/values.yaml | 3 +-
.../cortex-postgres/templates/_helpers.tpl | 10 +
.../cortex-postgres/templates/configmap.yaml | 4 +-
.../cortex-postgres/templates/secret.yaml | 2 +-
.../cortex-postgres/templates/service.yaml | 3 +-
.../templates/statefulset.yaml | 18 +-
.../cortex-postgres/templates/vpa.yaml | 10 +-
helm/library/cortex-postgres/values.yaml | 2 +
15 files changed, 249 insertions(+), 23 deletions(-)
create mode 100644 .claude/agents/postgres-bumper.md
diff --git a/.claude/agents/postgres-bumper.md b/.claude/agents/postgres-bumper.md
new file mode 100644
index 000000000..7c9ba6ab5
--- /dev/null
+++ b/.claude/agents/postgres-bumper.md
@@ -0,0 +1,194 @@
+---
+allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch, Agent
+description: Checks upstream docker-library/postgres for newer PG versions and debian digests, updates the Dockerfile and helm chart, and opens a PR.
+---
+
+# Postgres Bumper
+
+You are a postgres-bumper subagent. Your job is to check if the cortex-postgres Dockerfile and helm chart are up-to-date with upstream, apply any needed updates, and open a pull request. You handle both patch updates (same PG major, new minor/digest) and major upgrades (new PG major version).
+
+---
+
+## Setup
+
+Before doing any work, read the `AGENTS.md` file in the repository root. Follow all conventions described there.
+
+---
+
+## Phase 1: Determine latest upstream versions
+
+### 1a. Identify current values
+
+Read `postgres/Dockerfile` and extract:
+- The current `FROM debian:-slim@sha256:` line (codename and digest)
+- The current `ENV PG_MAJOR` value
+- The current `ENV PG_VERSION` value
+
+Read `helm/library/cortex-postgres/values.yaml` and extract the current `major` value.
+
+### 1b. Check what major versions are available upstream
+
+Fetch the upstream repository structure to determine the latest available PG major:
+
+```
+curl -sL https://api.github.com/repos/docker-library/postgres/contents/ | jq -r '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1
+```
+
+This gives the highest available major version (e.g. `18`).
+
+### 1c. Determine the target major
+
+- If a new major version exists upstream that is higher than the current PG_MAJOR, target the new major (major upgrade path).
+- Otherwise, stay on the current major (patch update path).
+
+### 1d. Fetch the upstream Dockerfile for the target major
+
+Determine the debian codename used by upstream for the target major. The upstream directory contains multiple variants (e.g. bookworm, trixie, plus alpine). Select the codename deterministically by preferring the newest non-alpine Debian suite. Use this approach:
+
+```bash
+# List available variants for the target major
+VARIANTS=$(curl -sL https://api.github.com/repos/docker-library/postgres/contents/<major> | jq -r '.[].name' | grep -v alpine)
+
+# Prefer the newest Debian codename (sorted alphabetically, last is newest for current naming)
+# Known Debian suites in order: bookworm (12), trixie (13), forky (14)
+CODENAME=$(echo "$VARIANTS" | grep -m1 'trixie' || echo "$VARIANTS" | grep -m1 'forky' || echo "$VARIANTS" | grep -m1 'bookworm' || echo "$VARIANTS" | tail -1)
+```
+
+If the current Dockerfile already uses a codename that is available for the target major, prefer that codename to minimize churn. Only switch codenames when the current one is no longer available upstream.
+
+Then fetch the upstream Dockerfile:
+
+```bash
+curl -sL https://raw.githubusercontent.com/docker-library/postgres/master/<major>/<codename>/Dockerfile
+```
+
+Extract from it:
+- The debian codename (from the path and FROM line)
+- `ENV PG_MAJOR` value
+- `ENV PG_VERSION` value
+
+### 1e. Get the latest debian digest
+
+```
+docker pull debian:<codename>-slim
+docker inspect --format='{{index .RepoDigests 0}}' debian:<codename>-slim
+```
+
+Extract the `sha256:...` digest.
+
+---
+
+## Phase 2: Compare and classify
+
+Compare current values with upstream:
+
+- If PG_MAJOR, PG_VERSION, and the debian digest are all unchanged → **no update needed**. Report this and stop.
+- If PG_MAJOR is unchanged but PG_VERSION or digest changed → **patch update**.
+- If PG_MAJOR changed → **major upgrade**.
+
+---
+
+## Phase 3: Apply updates
+
+### 3a. Check for existing PR
+
+Before making changes, check if there's already an open PR for this:
+
+```
+gh pr list --head chore/bump-postgres --state open --json number,url
+```
+
+If one exists, report it and stop (don't create duplicates).
+
+### 3b. Update the Dockerfile
+
+For **both** patch and major updates:
+1. Update the `FROM` line with the new codename (if changed) and digest.
+2. Update `ENV PG_MAJOR` (if changed).
+3. Update `ENV PG_VERSION` with the new version string.
+
+For **major upgrades** additionally:
+4. Diff the upstream Dockerfile structure against ours to identify new or removed apt packages. The key differences to preserve in our Dockerfile:
+ - We install `gosu` via apt (`apt-get install ... gosu`) instead of downloading from GitHub releases with GPG verification.
+ - We do NOT set `ENV GOSU_VERSION` or download gosu binaries.
+5. If the debian codename changed, update the `aptRepo` line in the postgres installation RUN command (e.g. `trixie-pgdg` → `forky-pgdg`).
+6. If new system packages are needed (visible in upstream's Dockerfile), add them to the appropriate `apt-get install` block.
+7. If packages were removed upstream, remove them from ours too.
+
+### 3c. Update the helm chart (major upgrades only)
+
+If PG_MAJOR changed:
+1. Update `major` in `helm/library/cortex-postgres/values.yaml` to the new major (e.g. `"18"`).
+2. Check each bundle chart's values.yaml (cortex-nova, cortex-manila, cortex-cinder) — if they override `cortex-postgres.major`, update those too.
+3. Update the `postgres.host` documentation defaults in each bundle (e.g. `cortex-nova-postgresql-v18`).
+
+---
+
+## Phase 4: Verify the build
+
+Run a docker build to confirm the image builds successfully:
+
+```
+docker build -t cortex-postgres-test postgres/
+```
+
+If the build fails, investigate and fix. Common issues:
+- Package version not yet available for the new codename
+- Missing dependencies
+
+---
+
+## Phase 5: Open a Pull Request
+
+1. Create branch and commit:
+```
+git checkout -b chore/bump-postgres
+git add postgres/Dockerfile helm/
+git commit -m "Bump postgres to PG <new version>."
+git push -u origin chore/bump-postgres
+```
+
+2. Use the **pull-request-creator** agent to open a PR. Provide the motivation including:
+ - What was updated (debian digest, PG_VERSION, PG_MAJOR)
+ - Old → new values
+ - Whether this is a patch or major upgrade
+ - For major upgrades, include the following IMPORTANT note prominently in the motivation so it appears in the PR description:
+
+ IMPORTANT: This is a major PostgreSQL upgrade. The helm chart's versioned naming will create a NEW StatefulSet and Service (e.g. cortex-nova-postgresql-v18) alongside the old one (cortex-nova-postgresql-v17). The old deployment will NOT be removed automatically. After deploying this change and confirming the new instance is healthy and re-populated by the knowledge module, operators must manually delete the old StatefulSet and its PVC (e.g. `kubectl delete statefulset cortex-nova-postgresql-v17 && kubectl delete pvc data-cortex-nova-postgresql-v17-0`).
+
+---
+
+## Phase 6: Report
+
+Return a structured report:
+
+```
+## Postgres Bumper Results
+
+### Update Type
+[Patch / Major / No update needed]
+
+### Changes
+- Debian codename: <old> → <new> (or "unchanged")
+- Debian digest: <old> → <new> (or "unchanged")
+- PG_MAJOR: <old> → <new> (or "unchanged")
+- PG_VERSION: <old> → <new> (or "unchanged")
+- Helm major: <old> → <new> (or "unchanged")
+
+### PR
+- PR #NNN: <url> (or "skipped — already up-to-date" / "skipped — existing PR found")
+
+### Notes
+
+```
+
+If no update is needed:
+
+```
+## Postgres Bumper Results
+
+No update needed. Current versions match upstream.
+- PG_MAJOR: <value>
+- PG_VERSION: <value>
+- Debian: <codename>-slim@sha256:<digest>
+```
diff --git a/.claude/commands/weekly.md b/.claude/commands/weekly.md
index 16e6a18dd..23256bad6 100644
--- a/.claude/commands/weekly.md
+++ b/.claude/commands/weekly.md
@@ -58,7 +58,7 @@ Before dispatching subagents, gather all currently open pull requests so finding
## Phase 4: Dispatch — Hand off to subagents in parallel
-Dispatch both subagents **in parallel** using the Agent tool. Each subagent investigates and reports findings — they do NOT open pull requests.
+Dispatch all subagents **in parallel** using the Agent tool. The bug detective and docs expert investigate and report findings — they do NOT open pull requests. The postgres bumper is self-contained and opens its own PR if an update is needed.
### Subagent 1: Bug Detective
@@ -76,6 +76,15 @@ Read the instructions from `.claude/agents/docs-expert.md`. Send the agent a pro
1. The full digest from Phase 2
2. The full instructions from the docs-expert agent file
+### Subagent 3: Postgres Bumper
+
+Use `subagent_type: "general-purpose"`.
+
+Read the instructions from `.claude/agents/postgres-bumper.md`. Send the agent a prompt that includes:
+1. The full instructions from the postgres-bumper agent file
+
+This agent does NOT need the weekly digest — it checks upstream independently and opens its own PR if an update is available.
+
---
## Phase 5: Deduplicate and filter findings
@@ -136,6 +145,9 @@ After all work is done, produce a short summary:
- Skipped (already covered by open PRs): N
- PRs opened: list PR numbers/titles, or "none"
+### Postgres Bumper
+- Result: <"no update needed" / "patch update PR #NNN" / "major upgrade PR #NNN" / "skipped — existing PR found">
+
### Backlog (for future runs)
- —
(items that were deprioritized this run)
diff --git a/helm/bundles/cortex-cinder/templates/secrets.yaml b/helm/bundles/cortex-cinder/templates/secrets.yaml
index d7384c6a3..0d3ba89a6 100644
--- a/helm/bundles/cortex-cinder/templates/secrets.yaml
+++ b/helm/bundles/cortex-cinder/templates/secrets.yaml
@@ -4,7 +4,7 @@ kind: Secret
metadata:
name: cortex-cinder-postgres
data:
- host: {{ .Values.postgres.host | b64enc | quote }}
+ host: {{ printf "%s-v%s" (index .Values "cortex-postgres" "fullnameOverride") (index .Values "cortex-postgres" "major") | b64enc | quote }}
user: {{ .Values.postgres.user | b64enc | quote }}
password: {{ .Values.postgres.password | b64enc | quote }}
database: {{ .Values.postgres.database | b64enc | quote }}
diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml
index b3853af04..f4f63a7f7 100644
--- a/helm/bundles/cortex-cinder/values.yaml
+++ b/helm/bundles/cortex-cinder/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-cinder-postgresql
+ host: cortex-cinder-postgresql-v17
user: postgres
password: secret
database: postgres
@@ -138,3 +138,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-cinder-postgresql
+ major: "17"
diff --git a/helm/bundles/cortex-manila/templates/secrets.yaml b/helm/bundles/cortex-manila/templates/secrets.yaml
index 3f3e93d0c..59c21b425 100644
--- a/helm/bundles/cortex-manila/templates/secrets.yaml
+++ b/helm/bundles/cortex-manila/templates/secrets.yaml
@@ -4,7 +4,7 @@ kind: Secret
metadata:
name: cortex-manila-postgres
data:
- host: {{ .Values.postgres.host | b64enc | quote }}
+ host: {{ printf "%s-v%s" (index .Values "cortex-postgres" "fullnameOverride") (index .Values "cortex-postgres" "major") | b64enc | quote }}
user: {{ .Values.postgres.user | b64enc | quote }}
password: {{ .Values.postgres.password | b64enc | quote }}
database: {{ .Values.postgres.database | b64enc | quote }}
diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml
index e6be31d4b..6a604fe6a 100644
--- a/helm/bundles/cortex-manila/values.yaml
+++ b/helm/bundles/cortex-manila/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-manila-postgresql
+ host: cortex-manila-postgresql-v17
user: postgres
password: secret
database: postgres
@@ -138,3 +138,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-manila-postgresql
+ major: "17"
diff --git a/helm/bundles/cortex-nova/templates/secrets.yaml b/helm/bundles/cortex-nova/templates/secrets.yaml
index 382fe3e0a..50ae310ff 100644
--- a/helm/bundles/cortex-nova/templates/secrets.yaml
+++ b/helm/bundles/cortex-nova/templates/secrets.yaml
@@ -4,7 +4,7 @@ kind: Secret
metadata:
name: cortex-nova-postgres
data:
- host: {{ .Values.postgres.host | b64enc | quote }}
+ host: {{ printf "%s-v%s" (index .Values "cortex-postgres" "fullnameOverride") (index .Values "cortex-postgres" "major") | b64enc | quote }}
user: {{ .Values.postgres.user | b64enc | quote }}
password: {{ .Values.postgres.password | b64enc | quote }}
database: {{ .Values.postgres.database | b64enc | quote }}
diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index d694bdfba..e73ce9ad1 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-nova-postgresql
+ host: cortex-nova-postgresql-v17
user: postgres
password: secret
database: postgres
@@ -234,3 +234,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-nova-postgresql
+ major: "17"
diff --git a/helm/library/cortex-postgres/templates/_helpers.tpl b/helm/library/cortex-postgres/templates/_helpers.tpl
index d3dea331c..48135f745 100644
--- a/helm/library/cortex-postgres/templates/_helpers.tpl
+++ b/helm/library/cortex-postgres/templates/_helpers.tpl
@@ -63,3 +63,13 @@ Create the name of the service account to use
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
+
+{{/*
+Versioned fully qualified app name (appends -v to the fullname).
+Truncates the base name to leave room for the suffix within the 63-char DNS limit.
+*/}}
+{{- define "cortex-postgres.versionedFullname" -}}
+{{- $suffix := printf "-v%s" .Values.major -}}
+{{- $base := include "cortex-postgres.fullname" . -}}
+{{- printf "%s%s" ($base | trunc (int (sub 63 (len $suffix)))) $suffix | trimSuffix "-" }}
+{{- end }}
diff --git a/helm/library/cortex-postgres/templates/configmap.yaml b/helm/library/cortex-postgres/templates/configmap.yaml
index 4aaa7a434..e785a30b2 100644
--- a/helm/library/cortex-postgres/templates/configmap.yaml
+++ b/helm/library/cortex-postgres/templates/configmap.yaml
@@ -1,6 +1,8 @@
kind: ConfigMap
apiVersion: v1
metadata:
- name: {{ include "cortex-postgres.fullname" . }}-max-conns
+ name: {{ include "cortex-postgres.versionedFullname" . }}-max-conns
+ labels:
+ {{- include "cortex-postgres.labels" . | nindent 4 }}
data:
max_conns.sql: "ALTER SYSTEM SET max_connections = 256;"
diff --git a/helm/library/cortex-postgres/templates/secret.yaml b/helm/library/cortex-postgres/templates/secret.yaml
index 58544f2ab..cf9b92388 100644
--- a/helm/library/cortex-postgres/templates/secret.yaml
+++ b/helm/library/cortex-postgres/templates/secret.yaml
@@ -4,7 +4,7 @@
apiVersion: v1
kind: Secret
metadata:
- name: {{ include "cortex-postgres.fullname" . }}-secret
+ name: {{ include "cortex-postgres.versionedFullname" . }}-secret
labels:
{{- include "cortex-postgres.labels" . | nindent 4 }}
type: Opaque
diff --git a/helm/library/cortex-postgres/templates/service.yaml b/helm/library/cortex-postgres/templates/service.yaml
index 6fdc2c3e3..67937a2ca 100644
--- a/helm/library/cortex-postgres/templates/service.yaml
+++ b/helm/library/cortex-postgres/templates/service.yaml
@@ -4,7 +4,7 @@
apiVersion: v1
kind: Service
metadata:
- name: {{ include "cortex-postgres.fullname" . }}
+ name: {{ include "cortex-postgres.versionedFullname" . }}
labels:
{{- include "cortex-postgres.labels" . | nindent 4 }}
spec:
@@ -16,3 +16,4 @@ spec:
name: postgresql
selector:
{{- include "cortex-postgres.selectorLabels" . | nindent 4 }}
+ app.kubernetes.io/pg-major: {{ .Values.major | quote }}
diff --git a/helm/library/cortex-postgres/templates/statefulset.yaml b/helm/library/cortex-postgres/templates/statefulset.yaml
index de96f38e5..43dd06430 100644
--- a/helm/library/cortex-postgres/templates/statefulset.yaml
+++ b/helm/library/cortex-postgres/templates/statefulset.yaml
@@ -1,27 +1,29 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
- name: {{ include "cortex-postgres.fullname" . }}
+ name: {{ include "cortex-postgres.versionedFullname" . }}
annotations:
checksum/image: "{{ .Values.image.tag }}"
- kubectl.kubernetes.io/default-container: {{ include "cortex-postgres.fullname" . }}
+ kubectl.kubernetes.io/default-container: {{ include "cortex-postgres.versionedFullname" . }}
labels:
- app: {{ include "cortex-postgres.fullname" . }}
+ app: {{ include "cortex-postgres.versionedFullname" . }}
{{- include "cortex-postgres.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
{{- include "cortex-postgres.selectorLabels" . | nindent 6 }}
- serviceName: 'postgresql-svc'
+ app.kubernetes.io/pg-major: {{ .Values.major | quote }}
+ serviceName: {{ include "cortex-postgres.versionedFullname" . }}
template:
metadata:
labels:
- app: {{ include "cortex-postgres.fullname" . }}
+ app: {{ include "cortex-postgres.versionedFullname" . }}
{{- include "cortex-postgres.labels" . | nindent 8 }}
+ app.kubernetes.io/pg-major: {{ .Values.major | quote }}
spec:
terminationGracePeriodSeconds: 10
containers:
- - name: {{ include "cortex-postgres.fullname" . }}
+ - name: {{ include "cortex-postgres.versionedFullname" . }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
resources:
@@ -35,7 +37,7 @@ spec:
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
- name: {{ include "cortex-postgres.fullname" . }}-secret
+ name: {{ include "cortex-postgres.versionedFullname" . }}-secret
key: postgres-password
ports:
- containerPort: 5432
@@ -49,7 +51,7 @@ spec:
volumes:
- name: max-conns
configMap:
- name: {{ include "cortex-postgres.fullname" . }}-max-conns
+ name: {{ include "cortex-postgres.versionedFullname" . }}-max-conns
volumeClaimTemplates:
- metadata:
name: data
diff --git a/helm/library/cortex-postgres/templates/vpa.yaml b/helm/library/cortex-postgres/templates/vpa.yaml
index 011cf1b54..488549a8d 100644
--- a/helm/library/cortex-postgres/templates/vpa.yaml
+++ b/helm/library/cortex-postgres/templates/vpa.yaml
@@ -5,22 +5,22 @@
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
- name: {{ include "cortex-postgres.fullname" . }}-vpa
+ name: {{ include "cortex-postgres.versionedFullname" . }}-vpa
labels:
- app: {{ include "cortex-postgres.fullname" . }}
+ app: {{ include "cortex-postgres.versionedFullname" . }}
{{- include "cortex-postgres.labels" . | nindent 4 }}
spec:
targetRef:
apiVersion: "apps/v1"
- kind: "Deployment"
- name: {{ include "cortex-postgres.fullname" . }}
+ kind: "StatefulSet"
+ name: {{ include "cortex-postgres.versionedFullname" . }}
updatePolicy:
updateMode: "Initial"
minReplicas: 1
maxReplicas: 1
resourcePolicy:
containerPolicies:
- - containerName: {{ include "cortex-postgres.fullname" . }}
+ - containerName: {{ include "cortex-postgres.versionedFullname" . }}
mode: "Auto"
minAllowed:
cpu: {{ .Values.resources.requests.cpu }}
diff --git a/helm/library/cortex-postgres/values.yaml b/helm/library/cortex-postgres/values.yaml
index d318ee97e..8c52c73ed 100644
--- a/helm/library/cortex-postgres/values.yaml
+++ b/helm/library/cortex-postgres/values.yaml
@@ -3,6 +3,8 @@
fullnameOverride: cortex-postgresql
+major: "17"
+
image:
repository: ghcr.io/cobaltcore-dev/cortex-postgres
pullPolicy: IfNotPresent
From a9b46cabda0263ade1a09f96aac6a5e77ab741b0 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 10:52:01 +0200
Subject: [PATCH 44/54] Renovate: Update actions/setup-python action to v6
(#786)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This PR contains the following updates:
| Package | Type | Update | Change |
|---|---|---|---|
|
[actions/setup-python](https://redirect.github.com/actions/setup-python)
| action | major | `v5` → `v6` |
---
### Release Notes
actions/setup-python (actions/setup-python)
###
[`v6.2.0`](https://redirect.github.com/actions/setup-python/compare/v6.1.0...v6.2.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v6.1.0...v6.2.0)
###
[`v6.1.0`](https://redirect.github.com/actions/setup-python/releases/tag/v6.1.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v6...v6.1.0)
##### What's Changed
##### Enhancements:
- Add support for `pip-install` input by
[@gowridurgad](https://redirect.github.com/gowridurgad) in
[#1201](https://redirect.github.com/actions/setup-python/pull/1201)
- Add graalpy early-access and windows builds by
[@timfel](https://redirect.github.com/timfel) in
[#880](https://redirect.github.com/actions/setup-python/pull/880)
##### Dependency and Documentation updates:
- Enhanced wording and updated example usage for `allow-prereleases` by
[@yarikoptic](https://redirect.github.com/yarikoptic) in
[#979](https://redirect.github.com/actions/setup-python/pull/979)
- Upgrade urllib3 from 1.26.19 to 2.5.0 and document breaking changes in
v6 by [@dependabot](https://redirect.github.com/dependabot) in
[#1139](https://redirect.github.com/actions/setup-python/pull/1139)
- Upgrade typescript from 5.4.2 to 5.9.3 and Documentation update by
[@dependabot](https://redirect.github.com/dependabot) in
[#1094](https://redirect.github.com/actions/setup-python/pull/1094)
- Upgrade actions/publish-action from 0.3.0 to 0.4.0 & Documentation
update for pip-install input by
[@dependabot](https://redirect.github.com/dependabot) in
[#1199](https://redirect.github.com/actions/setup-python/pull/1199)
- Upgrade requests from 2.32.2 to 2.32.4 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1130](https://redirect.github.com/actions/setup-python/pull/1130)
- Upgrade prettier from 3.5.3 to 3.6.2 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1234](https://redirect.github.com/actions/setup-python/pull/1234)
- Upgrade [@types/node](https://redirect.github.com/types/node)
from 24.1.0 to 24.9.1 and update macos-13 to macos-15-intel by
[@dependabot](https://redirect.github.com/dependabot) in
[#1235](https://redirect.github.com/actions/setup-python/pull/1235)
##### New Contributors
- [@yarikoptic](https://redirect.github.com/yarikoptic) made
their first contribution in
[#979](https://redirect.github.com/actions/setup-python/pull/979)
**Full Changelog**:
###
[`v6.0.0`](https://redirect.github.com/actions/setup-python/releases/tag/v6.0.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v6...v6)
#### What's Changed
##### Breaking Changes
- Upgrade to node 24 by
[@salmanmkc](https://redirect.github.com/salmanmkc) in
[#1164](https://redirect.github.com/actions/setup-python/pull/1164)
Make sure your runner is on version v2.327.1 or later to ensure
compatibility with this release. [See Release
Notes](https://redirect.github.com/actions/runner/releases/tag/v2.327.1)
##### Enhancements:
- Add support for `pip-version` by
[@priyagupta108](https://redirect.github.com/priyagupta108) in
[#1129](https://redirect.github.com/actions/setup-python/pull/1129)
- Enhance reading from .python-version by
[@krystof-k](https://redirect.github.com/krystof-k) in
[#787](https://redirect.github.com/actions/setup-python/pull/787)
- Add version parsing from Pipfile by
[@aradkdj](https://redirect.github.com/aradkdj) in
[#1067](https://redirect.github.com/actions/setup-python/pull/1067)
##### Bug fixes:
- Clarify pythonLocation behaviour for PyPy and GraalPy in environment
variables by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1183](https://redirect.github.com/actions/setup-python/pull/1183)
- Change missing cache directory error to warning by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1182](https://redirect.github.com/actions/setup-python/pull/1182)
- Add Architecture-Specific PATH Management for Python with --user Flag
on Windows by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1122](https://redirect.github.com/actions/setup-python/pull/1122)
- Include python version in PyPy python-version output by
[@cdce8p](https://redirect.github.com/cdce8p) in
[#1110](https://redirect.github.com/actions/setup-python/pull/1110)
- Update docs: clarification on pip authentication with setup-python by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#1156](https://redirect.github.com/actions/setup-python/pull/1156)
##### Dependency updates:
- Upgrade idna from 2.9 to 3.7 in /**tests**/data by
[@dependabot](https://redirect.github.com/dependabot)\[bot] in
[#843](https://redirect.github.com/actions/setup-python/pull/843)
- Upgrade form-data to fix critical vulnerabilities
[#182](https://redirect.github.com/actions/setup-python/issues/182)
&
[#183](https://redirect.github.com/actions/setup-python/issues/183)
by [@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y)
in
[#1163](https://redirect.github.com/actions/setup-python/pull/1163)
- Upgrade setuptools to 78.1.1 to fix path traversal vulnerability in
PackageIndex.download by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1165](https://redirect.github.com/actions/setup-python/pull/1165)
- Upgrade actions/checkout from 4 to 5 by
[@dependabot](https://redirect.github.com/dependabot)\[bot] in
[#1181](https://redirect.github.com/actions/setup-python/pull/1181)
- Upgrade
[@actions/tool-cache](https://redirect.github.com/actions/tool-cache)
from 2.0.1 to 2.0.2 by
[@dependabot](https://redirect.github.com/dependabot)\[bot] in
[#1095](https://redirect.github.com/actions/setup-python/pull/1095)
#### New Contributors
- [@krystof-k](https://redirect.github.com/krystof-k) made their
first contribution in
[#787](https://redirect.github.com/actions/setup-python/pull/787)
- [@cdce8p](https://redirect.github.com/cdce8p) made their first
contribution in
[#1110](https://redirect.github.com/actions/setup-python/pull/1110)
- [@aradkdj](https://redirect.github.com/aradkdj) made their
first contribution in
[#1067](https://redirect.github.com/actions/setup-python/pull/1067)
**Full Changelog**:
###
[`v6`](https://redirect.github.com/actions/setup-python/compare/v5...v6)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.6.0...v6)
###
[`v5.6.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.6.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.5.0...v5.6.0)
#### What's Changed
- Workflow updates related to Ubuntu 20.04 by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1065](https://redirect.github.com/actions/setup-python/pull/1065)
- Fix for Candidate Not Iterable Error by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#1082](https://redirect.github.com/actions/setup-python/pull/1082)
- Upgrade semver and
[@types/semver](https://redirect.github.com/types/semver) by
[@dependabot](https://redirect.github.com/dependabot) in
[#1091](https://redirect.github.com/actions/setup-python/pull/1091)
- Upgrade prettier from 2.8.8 to 3.5.3 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1046](https://redirect.github.com/actions/setup-python/pull/1046)
- Upgrade ts-jest from 29.1.2 to 29.3.2 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1081](https://redirect.github.com/actions/setup-python/pull/1081)
**Full Changelog**:
###
[`v5.5.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.5.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.4.0...v5.5.0)
#### What's Changed
##### Enhancements:
- Support free threaded Python versions like '3.13t' by
[@colesbury](https://redirect.github.com/colesbury) in
[#973](https://redirect.github.com/actions/setup-python/pull/973)
- Enhance Workflows: Include ubuntu-arm runners, Add e2e Testing for
free threaded and Upgrade
[@action/cache](https://redirect.github.com/action/cache) from
4.0.0 to 4.0.3 by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#1056](https://redirect.github.com/actions/setup-python/pull/1056)
- Add support for .tool-versions file in setup-python by
[@mahabaleshwars](https://redirect.github.com/mahabaleshwars) in
[#1043](https://redirect.github.com/actions/setup-python/pull/1043)
##### Bug fixes:
- Fix architecture for pypy on Linux ARM64 by
[@mayeut](https://redirect.github.com/mayeut) in
[#1011](https://redirect.github.com/actions/setup-python/pull/1011)
This update maps arm64 to aarch64 for Linux ARM64 PyPy installations.
##### Dependency updates:
- Upgrade [@vercel/ncc](https://redirect.github.com/vercel/ncc)
from 0.38.1 to 0.38.3 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1016](https://redirect.github.com/actions/setup-python/pull/1016)
- Upgrade
[@actions/glob](https://redirect.github.com/actions/glob) from
0.4.0 to 0.5.0 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1015](https://redirect.github.com/actions/setup-python/pull/1015)
#### New Contributors
- [@colesbury](https://redirect.github.com/colesbury) made their
first contribution in
[#973](https://redirect.github.com/actions/setup-python/pull/973)
- [@mahabaleshwars](https://redirect.github.com/mahabaleshwars)
made their first contribution in
[#1043](https://redirect.github.com/actions/setup-python/pull/1043)
**Full Changelog**:
###
[`v5.4.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.4.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.3.0...v5.4.0)
#### What's Changed
##### Enhancements:
- Update cache error message by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#968](https://redirect.github.com/actions/setup-python/pull/968)
- Enhance Workflows: Add Ubuntu-24, Remove Python 3.8 by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#985](https://redirect.github.com/actions/setup-python/pull/985)
- Configure Dependabot settings by
[@HarithaVattikuti](https://redirect.github.com/HarithaVattikuti)
in
[#1008](https://redirect.github.com/actions/setup-python/pull/1008)
##### Documentation changes:
- Readme update - recommended permissions by
[@benwells](https://redirect.github.com/benwells) in
[#1009](https://redirect.github.com/actions/setup-python/pull/1009)
- Improve Advanced Usage examples by
[@lrq3000](https://redirect.github.com/lrq3000) in
[#645](https://redirect.github.com/actions/setup-python/pull/645)
##### Dependency updates:
- Upgrade `undici` from 5.28.4 to 5.28.5 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1012](https://redirect.github.com/actions/setup-python/pull/1012)
- Upgrade `urllib3` from 1.25.9 to 1.26.19 in /**tests**/data by
[@dependabot](https://redirect.github.com/dependabot) in
[#895](https://redirect.github.com/actions/setup-python/pull/895)
- Upgrade `actions/publish-immutable-action` from 0.0.3 to 0.0.4 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1014](https://redirect.github.com/actions/setup-python/pull/1014)
- Upgrade `@actions/http-client` from 2.2.1 to 2.2.3 by
[@dependabot](https://redirect.github.com/dependabot) in
[#1020](https://redirect.github.com/actions/setup-python/pull/1020)
- Upgrade `requests` from 2.24.0 to 2.32.2 in /**tests**/data by
[@dependabot](https://redirect.github.com/dependabot) in
[#1019](https://redirect.github.com/actions/setup-python/pull/1019)
- Upgrade `@actions/cache` to `^4.0.0` by
[@priyagupta108](https://redirect.github.com/priyagupta108) in
[#1007](https://redirect.github.com/actions/setup-python/pull/1007)
#### New Contributors
- [@benwells](https://redirect.github.com/benwells) made their
first contribution in
[#1009](https://redirect.github.com/actions/setup-python/pull/1009)
-
[@HarithaVattikuti](https://redirect.github.com/HarithaVattikuti)
made their first contribution in
[#1008](https://redirect.github.com/actions/setup-python/pull/1008)
- [@lrq3000](https://redirect.github.com/lrq3000) made their
first contribution in
[#645](https://redirect.github.com/actions/setup-python/pull/645)
**Full Changelog**:
###
[`v5.3.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.3.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.2.0...v5.3.0)
#### What's Changed
- Add workflow file for publishing releases to immutable action package
by [@Jcambass](https://redirect.github.com/Jcambass) in
[#941](https://redirect.github.com/actions/setup-python/pull/941)
- Upgrade IA publish by
[@Jcambass](https://redirect.github.com/Jcambass) in
[#943](https://redirect.github.com/actions/setup-python/pull/943)
##### Bug Fixes:
- Normalise Line Endings to Ensure Cross-Platform Consistency by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#938](https://redirect.github.com/actions/setup-python/pull/938)
- Revise `isGhes` logic by
[@jww3](https://redirect.github.com/jww3) in
[#963](https://redirect.github.com/actions/setup-python/pull/963)
- Bump pillow from 7.2 to 10.2.0 by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#956](https://redirect.github.com/actions/setup-python/pull/956)
##### Enhancements:
- Enhance workflows and documentation updates by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#965](https://redirect.github.com/actions/setup-python/pull/965)
- Bump default versions to latest by
[@jeffwidman](https://redirect.github.com/jeffwidman) in
[#905](https://redirect.github.com/actions/setup-python/pull/905)
#### New Contributors
- [@Jcambass](https://redirect.github.com/Jcambass) made their
first contribution in
[#941](https://redirect.github.com/actions/setup-python/pull/941)
- [@jww3](https://redirect.github.com/jww3) made their first
contribution in
[#963](https://redirect.github.com/actions/setup-python/pull/963)
**Full Changelog**:
###
[`v5.2.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.2.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.1.1...v5.2.0)
#### What's Changed
##### Bug fixes:
- Add `.zip` extension to Windows package downloads for `Expand-Archive`
Compatibility by
[@priyagupta108](https://redirect.github.com/priyagupta108) in
[#916](https://redirect.github.com/actions/setup-python/pull/916)
This addresses compatibility issues on Windows self-hosted runners by
ensuring that the filenames for Python and PyPy package downloads
explicitly include the .zip extension, allowing the Expand-Archive
command to function correctly.
- Add arch to cache key by
[@Zxilly](https://redirect.github.com/Zxilly) in
[#896](https://redirect.github.com/actions/setup-python/pull/896)
This addresses issues with caching by adding the architecture (arch) to
the cache key, ensuring that cache keys are accurate to prevent
conflicts.
Note: This change may break previous cache keys as they will no longer
be compatible with the new format.
##### Documentation changes:
- Fix display of emojis in contributors doc by
[@sciencewhiz](https://redirect.github.com/sciencewhiz) in
[#899](https://redirect.github.com/actions/setup-python/pull/899)
- Documentation update for caching poetry dependencies by
[@gowridurgad](https://redirect.github.com/gowridurgad) in
[#908](https://redirect.github.com/actions/setup-python/pull/908)
##### Dependency updates:
- Bump [@iarna/toml](https://redirect.github.com/iarna/toml)
version from 2.2.5 to 3.0.0 by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#912](https://redirect.github.com/actions/setup-python/pull/912)
- Bump pyinstaller from 3.6 to 5.13.1 by
[@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in
[#923](https://redirect.github.com/actions/setup-python/pull/923)
#### New Contributors
- [@sciencewhiz](https://redirect.github.com/sciencewhiz) made
their first contribution in
[#899](https://redirect.github.com/actions/setup-python/pull/899)
- [@priyagupta108](https://redirect.github.com/priyagupta108)
made their first contribution in
[#916](https://redirect.github.com/actions/setup-python/pull/916)
- [@Zxilly](https://redirect.github.com/Zxilly) made their first
contribution in
[#896](https://redirect.github.com/actions/setup-python/pull/896)
- [@aparnajyothi-y](https://redirect.github.com/aparnajyothi-y)
made their first contribution in
[#923](https://redirect.github.com/actions/setup-python/pull/923)
**Full Changelog**:
###
[`v5.1.1`](https://redirect.github.com/actions/setup-python/releases/tag/v5.1.1)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5.1.0...v5.1.1)
#### What's Changed
##### Bug fixes:
- fix(ci): update all failing workflows by
[@mayeut](https://redirect.github.com/mayeut) in
[#863](https://redirect.github.com/actions/setup-python/pull/863)
This update ensures compatibility and optimal performance of workflows
on the latest macOS version.
##### Documentation changes:
- Documentation update for cache by
[@gowridurgad](https://redirect.github.com/gowridurgad) in
[#873](https://redirect.github.com/actions/setup-python/pull/873)
##### Dependency updates:
- Bump braces from 3.0.2 to 3.0.3 and undici from 5.28.3 to 5.28.4 by
[@dependabot](https://redirect.github.com/dependabot) in
[#893](https://redirect.github.com/actions/setup-python/pull/893)
#### New Contributors
- [@gowridurgad](https://redirect.github.com/gowridurgad) made
their first contribution in
[#873](https://redirect.github.com/actions/setup-python/pull/873)
**Full Changelog**:
###
[`v5.1.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.1.0)
[Compare
Source](https://redirect.github.com/actions/setup-python/compare/v5...v5.1.0)
#### What's Changed
- Leveraging the raw API to retrieve the version-manifest, as it does
not impose a rate limit and hence facilitates unrestricted consumption
without the need for a token for GitHub Enterprise Server by
[@Shegox](https://redirect.github.com/Shegox) in
[#766](https://redirect.github.com/actions/setup-python/pull/766).
- Dependency updates by
[@dependabot](https://redirect.github.com/dependabot) and
[@HarithaVattikuti](https://redirect.github.com/HarithaVattikuti)
in
[#817](https://redirect.github.com/actions/setup-python/pull/817)
- Documentation changes for version in README by
[@basnijholt](https://redirect.github.com/basnijholt) in
[#776](https://redirect.github.com/actions/setup-python/pull/776)
- Documentation changes for link in README by
[@ukd1](https://redirect.github.com/ukd1) in
[#793](https://redirect.github.com/actions/setup-python/pull/793)
- Documentation changes for link in Advanced Usage by
[@Jamim](https://redirect.github.com/Jamim) in
[#782](https://redirect.github.com/actions/setup-python/pull/782)
- Documentation changes for avoiding rate limit issues on GHES by
[@priya-kinthali](https://redirect.github.com/priya-kinthali) in
[#835](https://redirect.github.com/actions/setup-python/pull/835)
#### New Contributors
- [@basnijholt](https://redirect.github.com/basnijholt) made
their first contribution in
[#776](https://redirect.github.com/actions/setup-python/pull/776)
- [@ukd1](https://redirect.github.com/ukd1) made their first
contribution in
[#793](https://redirect.github.com/actions/setup-python/pull/793)
- [@Jamim](https://redirect.github.com/Jamim) made their first
contribution in
[#782](https://redirect.github.com/actions/setup-python/pull/782)
- [@Shegox](https://redirect.github.com/Shegox) made their first
contribution in
[#766](https://redirect.github.com/actions/setup-python/pull/766)
- [@priya-kinthali](https://redirect.github.com/priya-kinthali)
made their first contribution in
[#835](https://redirect.github.com/actions/setup-python/pull/835)
**Full Changelog**:
---
### Configuration
📅 **Schedule**: (UTC)
- Branch creation
- "before 8am on Friday"
- Automerge
- At any time (no schedule defined)
🚦 **Automerge**: Disabled by config. Please merge this manually once you
are satisfied.
♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the
rebase/retry checkbox.
🔕 **Ignore**: Close this PR and you won't be reminded about this update
again.
---
- [ ] If you want to rebase/retry this PR, check
this box
---
This PR was generated by [Mend Renovate](https://mend.io/renovate/).
View the [repository job
log](https://developer.mend.io/github/cobaltcore-dev/cortex).
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
---
.github/actions/setup-claude-code-action/action.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/actions/setup-claude-code-action/action.yml b/.github/actions/setup-claude-code-action/action.yml
index 840fbdbd9..dbc832aba 100644
--- a/.github/actions/setup-claude-code-action/action.yml
+++ b/.github/actions/setup-claude-code-action/action.yml
@@ -39,7 +39,7 @@ runs:
echo "$HOME/.bun/bin" >> "$GITHUB_PATH"
- name: Setup Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.14"
From bc53a6b0fdeba32136de6d319b9bdfe82803f0f4 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 09:02:01 +0000
Subject: [PATCH 45/54] Bump cortex chart appVersions to sha-a9b46cab [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 7df04536e..f07c63c7e 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-c26705a8"
+appVersion: "sha-a9b46cab"
icon: "https://example.com/icon.png"
dependencies: []
From 1fb35660ecf0f20c870385d4420342fd888946e2 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 4 May 2026 11:06:03 +0200
Subject: [PATCH 46/54] Set PGDATA to subdirectory to avoid lost+found conflict
PostgreSQL refuses to initdb when the data directory is a mount point
containing lost+found. Set PGDATA=/var/lib/postgresql/data/pgdata so
postgres uses a subdirectory under the PVC mount, matching upstream
convention.
---
helm/library/cortex-postgres/templates/statefulset.yaml | 2 ++
1 file changed, 2 insertions(+)
diff --git a/helm/library/cortex-postgres/templates/statefulset.yaml b/helm/library/cortex-postgres/templates/statefulset.yaml
index 43dd06430..06ba22a3c 100644
--- a/helm/library/cortex-postgres/templates/statefulset.yaml
+++ b/helm/library/cortex-postgres/templates/statefulset.yaml
@@ -39,6 +39,8 @@ spec:
secretKeyRef:
name: {{ include "cortex-postgres.versionedFullname" . }}-secret
key: postgres-password
+ - name: PGDATA
+ value: /var/lib/postgresql/data/pgdata
ports:
- containerPort: 5432
name: postgresql
From 93163d545ae57258c4cd10973d79af7a0a11db99 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 09:16:05 +0000
Subject: [PATCH 47/54] Bump cortex chart appVersions to sha-1fb35660 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index f07c63c7e..1cdefa7c5 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-a9b46cab"
+appVersion: "sha-1fb35660"
icon: "https://example.com/icon.png"
dependencies: []
From b54c5f75b942a4f5f287f3816cbcd361bb582402 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 4 May 2026 11:34:25 +0200
Subject: [PATCH 48/54] Don't run helm-lint workflow when release PR is in
draft
---
.github/workflows/helm-lint.yaml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml
index 37022816b..85021eda2 100644
--- a/.github/workflows/helm-lint.yaml
+++ b/.github/workflows/helm-lint.yaml
@@ -12,6 +12,7 @@ env:
jobs:
helm-lint:
+ if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
steps:
- name: Checkout PR
From 88f03a41101342413a45df4043ec02efbcbe2734 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com>
Date: Mon, 4 May 2026 11:34:43 +0200
Subject: [PATCH 49/54] Bump postgres to PG 18.3 (#791)
Major PostgreSQL upgrade from 17.9 to 18.3. Updates the Dockerfile (base
image digest, PG_MAJOR, PG_VERSION, JIT package, PGDATA path change to
/var/lib/postgresql/18/docker), syncs docker-entrypoint.sh and
docker-ensure-initdb.sh with upstream, and bumps the helm chart major
value to "18" across library and all bundle charts. The statefulset
volume mount moves from /var/lib/postgresql/data to /var/lib/postgresql
to match the PG 18 convention.
IMPORTANT: This is a major PostgreSQL upgrade. The helm chart's
versioned naming will create a NEW StatefulSet and Service (e.g.
cortex-nova-postgresql-v18) alongside the old one
(cortex-nova-postgresql-v17). The old deployment will NOT be removed
automatically. After deploying this change and confirming the new
instance is healthy and re-populated by the knowledge module, operators
must manually delete the old StatefulSet and its PVC (e.g. kubectl
delete statefulset cortex-nova-postgresql-v17 && kubectl delete pvc
data-cortex-nova-postgresql-v17-0).
---
helm/bundles/cortex-cinder/values.yaml | 4 +-
helm/bundles/cortex-manila/values.yaml | 4 +-
helm/bundles/cortex-nova/values.yaml | 4 +-
.../templates/statefulset.yaml | 4 +-
helm/library/cortex-postgres/values.yaml | 2 +-
postgres/Dockerfile | 18 +++++----
postgres/docker-ensure-initdb.sh | 2 +-
postgres/docker-entrypoint.sh | 38 +++++++++----------
8 files changed, 36 insertions(+), 40 deletions(-)
diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml
index f4f63a7f7..300322880 100644
--- a/helm/bundles/cortex-cinder/values.yaml
+++ b/helm/bundles/cortex-cinder/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-cinder-postgresql-v17
+ host: cortex-cinder-postgresql-v18
user: postgres
password: secret
database: postgres
@@ -138,4 +138,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-cinder-postgresql
- major: "17"
+ major: "18"
diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml
index 6a604fe6a..66c1c5f2d 100644
--- a/helm/bundles/cortex-manila/values.yaml
+++ b/helm/bundles/cortex-manila/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-manila-postgresql-v17
+ host: cortex-manila-postgresql-v18
user: postgres
password: secret
database: postgres
@@ -138,4 +138,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-manila-postgresql
- major: "17"
+ major: "18"
diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index e73ce9ad1..f4e9d5725 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -38,7 +38,7 @@ sharedSSOCert: &sharedSSOCert
selfSigned: "false"
postgres:
- host: cortex-nova-postgresql-v17
+ host: cortex-nova-postgresql-v18
user: postgres
password: secret
database: postgres
@@ -234,4 +234,4 @@ cortex-knowledge-controllers:
# Custom configuration for the cortex postgres chart.
cortex-postgres:
fullnameOverride: cortex-nova-postgresql
- major: "17"
+ major: "18"
diff --git a/helm/library/cortex-postgres/templates/statefulset.yaml b/helm/library/cortex-postgres/templates/statefulset.yaml
index 06ba22a3c..2a770ddf7 100644
--- a/helm/library/cortex-postgres/templates/statefulset.yaml
+++ b/helm/library/cortex-postgres/templates/statefulset.yaml
@@ -39,14 +39,12 @@ spec:
secretKeyRef:
name: {{ include "cortex-postgres.versionedFullname" . }}-secret
key: postgres-password
- - name: PGDATA
- value: /var/lib/postgresql/data/pgdata
ports:
- containerPort: 5432
name: postgresql
volumeMounts:
- name: data
- mountPath: /var/lib/postgresql/data
+ mountPath: /var/lib/postgresql
- name: max-conns
mountPath: /docker-entrypoint-initdb.d/max_conns.sql
subPath: max_conns.sql
diff --git a/helm/library/cortex-postgres/values.yaml b/helm/library/cortex-postgres/values.yaml
index 8c52c73ed..2d4cb4b40 100644
--- a/helm/library/cortex-postgres/values.yaml
+++ b/helm/library/cortex-postgres/values.yaml
@@ -3,7 +3,7 @@
fullnameOverride: cortex-postgresql
-major: "17"
+major: "18"
image:
repository: ghcr.io/cobaltcore-dev/cortex-postgres
diff --git a/postgres/Dockerfile b/postgres/Dockerfile
index 73ecfebd3..552a067da 100644
--- a/postgres/Dockerfile
+++ b/postgres/Dockerfile
@@ -1,4 +1,4 @@
-FROM debian:trixie-slim@sha256:4ffb3a1511099754cddc70eb1b12e50ffdb67619aa0ab6c13fcd800a78ef7c7a
+FROM debian:trixie-slim@sha256:cedb1ef40439206b673ee8b33a46a03a0c9fa90bf3732f54704f99cb061d2c5a
# explicitly set user/group IDs
RUN set -eux; \
@@ -64,10 +64,10 @@ RUN set -ex; \
gpgconf --kill all; \
rm -rf "$GNUPGHOME"
-ENV PG_MAJOR 17
+ENV PG_MAJOR 18
ENV PATH $PATH:/usr/lib/postgresql/$PG_MAJOR/bin
-ENV PG_VERSION 17.9-1.pgdg13+1
+ENV PG_VERSION 18.3-1.pgdg13+1
RUN set -ex; \
\
@@ -75,7 +75,7 @@ RUN set -ex; \
export PYTHONDONTWRITEBYTECODE=1; \
\
dpkgArch="$(dpkg --print-architecture)"; \
- aptRepo="[ signed-by=/usr/local/share/keyrings/postgres.gpg.asc ] http://apt.postgresql.org/pub/repos/apt/ trixie-pgdg main $PG_MAJOR"; \
+ aptRepo="[ signed-by=/usr/local/share/keyrings/postgres.gpg.asc ] http://apt.postgresql.org/pub/repos/apt trixie-pgdg main $PG_MAJOR"; \
case "$dpkgArch" in \
amd64 | arm64 | ppc64el) \
# arches officially built by upstream
@@ -135,6 +135,10 @@ RUN set -ex; \
apt-get install -y --no-install-recommends \
"postgresql-$PG_MAJOR=$PG_VERSION" \
; \
+# https://github.com/docker-library/postgres/pull/1344#issuecomment-2936578203 (JIT is a separate package in 18+, but only supported for a subset of architectures)
+ if apt-get install -s "postgresql-$PG_MAJOR-jit" > /dev/null 2>&1; then \
+ apt-get install -y --no-install-recommends "postgresql-$PG_MAJOR-jit=$PG_VERSION"; \
+ fi; \
\
rm -rf /var/lib/apt/lists/*; \
\
@@ -159,10 +163,8 @@ RUN set -eux; \
RUN install --verbose --directory --owner postgres --group postgres --mode 3777 /var/run/postgresql
-ENV PGDATA /var/lib/postgresql/data
-# this 1777 will be replaced by 0700 at runtime (allows semi-arbitrary "--user" values)
-RUN install --verbose --directory --owner postgres --group postgres --mode 1777 "$PGDATA"
-VOLUME /var/lib/postgresql/data
+ENV PGDATA /var/lib/postgresql/18/docker
+VOLUME /var/lib/postgresql
COPY docker-entrypoint.sh docker-ensure-initdb.sh /usr/local/bin/
RUN ln -sT docker-ensure-initdb.sh /usr/local/bin/docker-enforce-initdb.sh
diff --git a/postgres/docker-ensure-initdb.sh b/postgres/docker-ensure-initdb.sh
index 07c08899f..e9b15ef77 100755
--- a/postgres/docker-ensure-initdb.sh
+++ b/postgres/docker-ensure-initdb.sh
@@ -69,4 +69,4 @@ else
exit 99
;;
esac
-fi
\ No newline at end of file
+fi
diff --git a/postgres/docker-entrypoint.sh b/postgres/docker-entrypoint.sh
index 6057a20ba..d4442d8a5 100755
--- a/postgres/docker-entrypoint.sh
+++ b/postgres/docker-entrypoint.sh
@@ -103,24 +103,6 @@ docker_init_database_dir() {
# print large warning if POSTGRES_HOST_AUTH_METHOD is set to 'trust'
# assumes database is not set up, ie: [ -z "$DATABASE_ALREADY_EXISTS" ]
docker_verify_minimum_env() {
- case "${PG_MAJOR:-}" in
- 13) # https://github.com/postgres/postgres/commit/67a472d71c98c3d2fa322a1b4013080b20720b98
- # check password first so we can output the warning before postgres
- # messes it up
- if [ "${#POSTGRES_PASSWORD}" -ge 100 ]; then
- cat >&2 <<-'EOWARN'
-
- WARNING: The supplied POSTGRES_PASSWORD is 100+ characters.
-
- This will not work if used via PGPASSWORD with "psql".
-
- https://www.postgresql.org/message-id/flat/E1Rqxp2-0004Qt-PL%40wrigleys.postgresql.org (BUG #6412)
- https://github.com/docker-library/postgres/issues/507
-
- EOWARN
- fi
- ;;
- esac
if [ -z "$POSTGRES_PASSWORD" ] && [ 'trust' != "$POSTGRES_HOST_AUTH_METHOD" ]; then
# The - option suppresses leading tabs but *not* spaces. :)
cat >&2 <<-'EOE'
@@ -168,8 +150,14 @@ docker_error_old_databases() {
Counter to that, there appears to be PostgreSQL data in:
${OLD_DATABASES[*]}
- This is usually the result of upgrading the Docker image without upgrading
- the underlying database using "pg_upgrade" (which requires both versions).
+ This is usually the result of upgrading the Docker image without
+ upgrading the underlying database using "pg_upgrade" (which requires both
+ versions).
+
+ The suggested container configuration for 18+ is to place a single mount
+ at /var/lib/postgresql which will then place PostgreSQL data in a
+ subdirectory, allowing usage of "pg_upgrade --link" without mount point
+ boundary issues.
See https://github.com/docker-library/postgres/issues/37 for a (long)
discussion around this process, and suggestions for how to do so.
@@ -264,6 +252,14 @@ docker_setup_env() {
OLD_DATABASES+=( "$d" )
fi
done
+ if [ "${#OLD_DATABASES[@]}" -eq 0 ] && [ "$PG_MAJOR" -ge 18 ] && {
+ # in BusyBox, "mountpoint" only checks dev vs ino (https://github.com/tianon/mirror-busybox/blob/be7d1b7b1701d225379bc1665487ed0871b592a5/util-linux/mountpoint.c#L78) which will notably miss bind mounts entirely (which almost all Docker volume mounts are)
+ # coreutils checks /proc/self/mountinfo, so we have a fallback to mimic that and directly check "/proc/self/mountinfo" to catch that case
+ mountpoint -q /var/lib/postgresql/data \
+ || awk '$5 == "/var/lib/postgresql/data" { found = 1 } END { exit !found }' /proc/self/mountinfo
+ }; then
+ OLD_DATABASES+=( '/var/lib/postgresql/data (unused mount/volume)' )
+ fi
fi
}
@@ -388,4 +384,4 @@ _main() {
if ! _is_sourced; then
_main "$@"
-fi
\ No newline at end of file
+fi
From 89f6a6d03e0c3264643ba7c1cc2b49fb6c215617 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 09:51:33 +0000
Subject: [PATCH 50/54] Bump cortex-postgres chart appVersions to sha-88f03a41
[skip ci]
---
helm/library/cortex-postgres/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml
index 7e76ef0fb..b8c803e99 100644
--- a/helm/library/cortex-postgres/Chart.yaml
+++ b/helm/library/cortex-postgres/Chart.yaml
@@ -6,4 +6,4 @@ name: cortex-postgres
description: Postgres setup for Cortex.
type: application
version: 0.5.14
-appVersion: "sha-bdda1892"
+appVersion: "sha-88f03a41"
From c966d226e99d91e2feb782e24007867f2e53d5f2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 09:51:34 +0000
Subject: [PATCH 51/54] Bump cortex chart appVersions to sha-88f03a41 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 1cdefa7c5..8072e9c12 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.44
-appVersion: "sha-1fb35660"
+appVersion: "sha-88f03a41"
icon: "https://example.com/icon.png"
dependencies: []
From 7085ce574af80563547c6a4f30d76ee8b882bde6 Mon Sep 17 00:00:00 2001
From: "cortex-ai-agents[bot]"
<279748396+cortex-ai-agents[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 12:39:46 +0200
Subject: [PATCH 52/54] Update changelog for release PR #779 (#781)
Adds CHANGELOG.md with release notes for PR #779.
This PR should be merged **after** the release PR #779.
Co-authored-by: github-actions[bot]
Co-authored-by: Claude Opus 4.7
---
CHANGELOG.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
create mode 100644 CHANGELOG.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000..e9386d04a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,74 @@
+# Changelog
+
+## 2026-05-04 — [#779](https://github.com/cobaltcore-dev/cortex/pull/779)
+
+### cortex v0.0.45 (sha-1fb35660)
+
+Non-breaking changes:
+- Add CommittedResource CRD definition and controller that watches CommittedResource objects and manages child Reservation CRUD
+- Add `AllowRejection` field to CommittedResourceSpec for controlling placement failure behavior
+- Add vmware project utilization KPI tracking instances per project/flavor and capacity per host
+- Move vmware resource commitments KPI to new infrastructure plugins package with shared utilities
+- Move vmware host capacity KPI to infrastructure plugins package
+- Add basic support for flavor groups for failover reservation with consolidation weigher
+- Add `useFlavorGroupResources` values.yaml key for cortex-nova (default: false)
+- Update external dependencies (controller-runtime v0.24.0, go-sqlite3 v1.14.44, zap v1.28.0)
+- Alert only on new vm faults (avoid re-alerting on historical faults)
+
+### cortex-shim v0.1.0 (sha-d8bb12ef)
+
+Breaking changes:
+- Remove `traits.static` values.yaml key and Helm-managed static traits ConfigMap template — traits are now fully managed by the shim at runtime via a single ConfigMap
+
+Non-breaking changes:
+- Add per-request feature mode override via `X-Cortex-Feature-Mode` header
+- Refactor /traits API to single-ConfigMap model with reusable Syncer interface pattern
+- Implement feature-gated /resource_classes API with ConfigMap storage (passthrough, hybrid, crd modes)
+- Add ResourceClassSyncer for periodic upstream sync into local ConfigMap
+- Add `resourceClasses.configMapName` values.yaml key for configuring the resource classes ConfigMap name
+- Support traits and aggregates endpoints per resource provider with three feature modes (passthrough, hybrid, crd)
+- Exercise all three feature modes in placement shim e2e tests
+- Fix nil pointer panic in feature mode override guard
+
+### cortex-postgres v0.6.0 (sha-88f03a41)
+
+Breaking changes:
+- Upgrade PostgreSQL from 17.9 to 18.3 — resource names now include a `-v{major}` suffix for zero-downtime upgrades (e.g., `cortex-nova-postgresql-v18`). After deploy, operators must remove old StatefulSets and PVCs manually.
+
+Non-breaking changes:
+- Add versioned resource naming with `cortex-postgres.versionedFullname` helper for zero-downtime PG major upgrades
+- Add `major` values.yaml key (default: "18") to control version suffix
+- Set PGDATA to subdirectory to avoid lost+found conflict
+
+### cortex-nova v0.0.58 (sha-1fb35660)
+
+Includes updated charts cortex v0.0.45 and cortex-postgres v0.6.0.
+
+Non-breaking changes:
+- Reorganize KPI CRD templates for infrastructure dashboard metrics
+- Add `useFlavorGroupResources` values.yaml key for failover reservations (default: false)
+- Restructure committedResource config keys into nested objects (`committedResourceReservationController`, `committedResourceController`, `committedResourceAPI`)
+- Add `committedResourceSyncInterval` config key for syncer reconciliation interval
+
+### cortex-placement-shim v0.1.0 (sha-d8bb12ef)
+
+Includes updated chart cortex-shim v0.1.0.
+
+Breaking changes:
+- Remove `traits.static` values.yaml key (inherited from cortex-shim breaking change)
+
+Non-breaking changes:
+- Add `resourceClasses.configMapName` values.yaml key
+
+### General
+
+Non-breaking changes:
+- Fix bump-artifact workflow to handle concurrent changes on main with concurrency groups and freshness checks
+- Add reusable `bump-chart.sh` script for CI chart version bumps
+- Add pull-request-creator Claude agent
+- Add changelog update command and workflow for release PRs
+- Add linting workflow for scaffold completeness checks
+- Make /release claude command idempotent
+- Don't run helm-lint workflow when release PR is in draft
+- Update actions/setup-python action to v6
+- Fix stale documentation: traits model, pipeline name, and API path
From 07d25e894bf55f3f215b502ee4cce8c87ce8774e Mon Sep 17 00:00:00 2001
From: "cortex-ai-agents[bot]"
<279748396+cortex-ai-agents[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 12:45:12 +0200
Subject: [PATCH 53/54] Bump chart versions for release #779 (#780)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Bump chart versions in preparation for release PR #779.
Library charts:
- cortex: 0.0.44 → 0.0.45 (patch, no breaking changes)
- cortex-shim: 0.0.3 → 0.1.0 (minor, breaking: removed traits.static)
- cortex-postgres: 0.5.14 → 0.6.0 (minor, breaking: PG 18 upgrade with
versioned naming)
Bundle charts:
- cortex-cinder: 0.0.57 → 0.0.58
- cortex-crds: 0.0.57 → 0.0.58
- cortex-ironcore: 0.0.57 → 0.0.58
- cortex-manila: 0.0.57 → 0.0.58
- cortex-nova: 0.0.57 → 0.0.58
- cortex-pods: 0.0.57 → 0.0.58
- cortex-placement-shim: 0.0.3 → 0.1.0
This PR should be merged **before** the release PR #779.
Co-authored-by: github-actions[bot]
Co-authored-by: Claude Opus 4.7
---
helm/bundles/cortex-cinder/Chart.yaml | 8 ++++----
helm/bundles/cortex-crds/Chart.yaml | 4 ++--
helm/bundles/cortex-ironcore/Chart.yaml | 4 ++--
helm/bundles/cortex-manila/Chart.yaml | 8 ++++----
helm/bundles/cortex-nova/Chart.yaml | 8 ++++----
helm/bundles/cortex-placement-shim/Chart.yaml | 4 ++--
helm/bundles/cortex-pods/Chart.yaml | 4 ++--
helm/library/cortex-postgres/Chart.yaml | 2 +-
helm/library/cortex-shim/Chart.yaml | 2 +-
helm/library/cortex/Chart.yaml | 2 +-
10 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml
index eed40eef0..c1a93f75e 100644
--- a/helm/bundles/cortex-cinder/Chart.yaml
+++ b/helm/bundles/cortex-cinder/Chart.yaml
@@ -5,23 +5,23 @@ apiVersion: v2
name: cortex-cinder
description: A Helm chart deploying Cortex for Cinder.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.5.14
+ version: 0.6.0
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-knowledge-controllers
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-scheduling-controllers
# Owner info adds a configmap to the kubernetes cluster with information on
diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml
index ba8715c21..4972527e3 100644
--- a/helm/bundles/cortex-crds/Chart.yaml
+++ b/helm/bundles/cortex-crds/Chart.yaml
@@ -5,13 +5,13 @@ apiVersion: v2
name: cortex-crds
description: A Helm chart deploying Cortex CRDs.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
# Owner info adds a configmap to the kubernetes cluster with information on
# the service owner. This makes it easier to find out who to contact in case
diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml
index 06413a2d6..2f97392d9 100644
--- a/helm/bundles/cortex-ironcore/Chart.yaml
+++ b/helm/bundles/cortex-ironcore/Chart.yaml
@@ -5,13 +5,13 @@ apiVersion: v2
name: cortex-ironcore
description: A Helm chart deploying Cortex for IronCore.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
# Owner info adds a configmap to the kubernetes cluster with information on
# the service owner. This makes it easier to find out who to contact in case
diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml
index aac18d818..484789b26 100644
--- a/helm/bundles/cortex-manila/Chart.yaml
+++ b/helm/bundles/cortex-manila/Chart.yaml
@@ -5,23 +5,23 @@ apiVersion: v2
name: cortex-manila
description: A Helm chart deploying Cortex for Manila.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.5.14
+ version: 0.6.0
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-knowledge-controllers
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-scheduling-controllers
# Owner info adds a configmap to the kubernetes cluster with information on
diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml
index dca30d2e6..e0b941ee1 100644
--- a/helm/bundles/cortex-nova/Chart.yaml
+++ b/helm/bundles/cortex-nova/Chart.yaml
@@ -5,23 +5,23 @@ apiVersion: v2
name: cortex-nova
description: A Helm chart deploying Cortex for Nova.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.5.14
+ version: 0.6.0
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-knowledge-controllers
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
alias: cortex-scheduling-controllers
# Owner info adds a configmap to the kubernetes cluster with information on
diff --git a/helm/bundles/cortex-placement-shim/Chart.yaml b/helm/bundles/cortex-placement-shim/Chart.yaml
index 7cf70c100..d15e63025 100644
--- a/helm/bundles/cortex-placement-shim/Chart.yaml
+++ b/helm/bundles/cortex-placement-shim/Chart.yaml
@@ -5,13 +5,13 @@ apiVersion: v2
name: cortex-placement-shim
description: A Helm chart deploying the Cortex placement shim.
type: application
-version: 0.0.3
+version: 0.1.0
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex-shim
- name: cortex-shim
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.3
+ version: 0.1.0
# Owner info adds a configmap to the kubernetes cluster with information on
# the service owner. This makes it easier to find out who to contact in case
# of issues. See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info
diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml
index 2368851c4..e5f17d322 100644
--- a/helm/bundles/cortex-pods/Chart.yaml
+++ b/helm/bundles/cortex-pods/Chart.yaml
@@ -5,13 +5,13 @@ apiVersion: v2
name: cortex-pods
description: A Helm chart deploying Cortex for Pods.
type: application
-version: 0.0.57
+version: 0.0.58
appVersion: 0.1.0
dependencies:
# from: file://../../library/cortex
- name: cortex
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
- version: 0.0.44
+ version: 0.0.45
# Owner info adds a configmap to the kubernetes cluster with information on
# the service owner. This makes it easier to find out who to contact in case
diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml
index b8c803e99..2e0ab6abb 100644
--- a/helm/library/cortex-postgres/Chart.yaml
+++ b/helm/library/cortex-postgres/Chart.yaml
@@ -5,5 +5,5 @@ apiVersion: v2
name: cortex-postgres
description: Postgres setup for Cortex.
type: application
-version: 0.5.14
+version: 0.6.0
appVersion: "sha-88f03a41"
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
index 346a83021..df4c688e6 100644
--- a/helm/library/cortex-shim/Chart.yaml
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
name: cortex-shim
description: A Helm chart to distribute cortex shims.
type: application
-version: 0.0.3
+version: 0.1.0
appVersion: "sha-d8bb12ef"
icon: "https://example.com/icon.png"
dependencies: []
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 8072e9c12..b5369ae97 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
name: cortex
description: A Helm chart to distribute cortex.
type: application
-version: 0.0.44
+version: 0.0.45
appVersion: "sha-88f03a41"
icon: "https://example.com/icon.png"
dependencies: []
From 8e9faa918106201f2da34964e2f0ccbd7a47d5a9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 10:54:37 +0000
Subject: [PATCH 54/54] Bump cortex chart appVersions to sha-07d25e89 [skip ci]
---
helm/library/cortex/Chart.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index b5369ae97..463af7d34 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
description: A Helm chart to distribute cortex.
type: application
version: 0.0.45
-appVersion: "sha-88f03a41"
+appVersion: "sha-07d25e89"
icon: "https://example.com/icon.png"
dependencies: []