From 94ec411e73a69c58a8de4859cd31998a7a9e7980 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 28 Apr 2026 13:22:18 +0200 Subject: [PATCH 1/2] feat: Add quota crd and update quota api --- api/v1alpha1/project_quota_types.go | 122 ++++++ api/v1alpha1/zz_generated.deepcopy.go | 158 ++++++++ .../crds/cortex.cloud_projectquotas.yaml | 212 +++++++++++ .../reservations/commitments/api/handler.go | 3 + .../reservations/commitments/api/info.go | 12 +- .../reservations/commitments/api/info_test.go | 43 ++- .../reservations/commitments/api/quota.go | 156 +++++++- .../commitments/api/quota_monitor.go | 47 +++ .../commitments/api/quota_test.go | 354 ++++++++++++++++++ .../reservations/commitments/config.go | 6 + .../reservations/commitments/usage.go | 17 +- 11 files changed, 1113 insertions(+), 17 deletions(-) create mode 100644 api/v1alpha1/project_quota_types.go create mode 100644 helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml create mode 100644 internal/scheduling/reservations/commitments/api/quota_monitor.go create mode 100644 internal/scheduling/reservations/commitments/api/quota_test.go diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go new file mode 100644 index 000000000..cf61585c6 --- /dev/null +++ b/api/v1alpha1/project_quota_types.go @@ -0,0 +1,122 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ResourceQuota holds the quota for a single resource with per-AZ breakdown. +// Maps to liquid.ResourceQuotaRequest from the LIQUID API. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest +type ResourceQuota struct { + // Quota is the total quota across all AZs (for compatibility). + // Corresponds to liquid.ResourceQuotaRequest.Quota. + // +kubebuilder:validation:Required + Quota int64 `json:"quota"` + + // PerAZ holds the per-availability-zone quota breakdown. + // Key: availability zone name, Value: quota for that AZ. + // Only populated for AZSeparatedTopology resources. + // Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ResourceQuotaUsage holds per-AZ PAYG usage for a single resource. +type ResourceQuotaUsage struct { + // PerAZ holds per-availability-zone PAYG usage values. + // Key: availability zone name, Value: PAYG usage in that AZ. + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ProjectQuotaSpec defines the desired state of ProjectQuota. +// Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuotaSpec struct { + // ProjectID of the OpenStack project this quota belongs to. + // Corresponds to the :uuid in the PUT URL path. + // +kubebuilder:validation:Required + ProjectID string `json:"projectID"` + + // ProjectName is the human-readable name of the OpenStack project. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + // +kubebuilder:validation:Optional + ProjectName string `json:"projectName,omitempty"` + + // DomainID of the OpenStack domain this project belongs to. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. 
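+	// (ProjectQuota is registered as cluster-scoped below, so project and domain
+	// identity must live in the spec rather than in a namespace.)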
+ // +kubebuilder:validation:Required + DomainID string `json:"domainID"` + + // DomainName is the human-readable name of the OpenStack domain. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + // +kubebuilder:validation:Optional + DomainName string `json:"domainName,omitempty"` + + // Quota maps LIQUID resource names to their per-AZ quota. + // Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") + // Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + // +kubebuilder:validation:Optional + Quota map[string]ResourceQuota `json:"quota,omitempty"` +} + +// ProjectQuotaStatus defines the observed state of ProjectQuota. +// Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport +type ProjectQuotaStatus struct { + // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` + + // LastReconcileAt is when the controller last reconciled this project's quota. + // +kubebuilder:validation:Optional + LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"` + + // Conditions holds the current status conditions. + // +kubebuilder:validation:Optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.projectID" +// +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.domainID" +// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" + +// ProjectQuota is the Schema for the projectquotas API. +// It persists quota values pushed by Limes via the LIQUID quota endpoint +// (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuota struct { + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // +required + Spec ProjectQuotaSpec `json:"spec"` + + // +optional + Status ProjectQuotaStatus `json:"status,omitempty,omitzero"` +} + +// +kubebuilder:object:root=true + +// ProjectQuotaList contains a list of ProjectQuota +type ProjectQuotaList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ProjectQuota `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ProjectQuota{}, &ProjectQuotaList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d9daa7aab..1a4bc222a 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1420,6 +1420,120 @@ func (in *PlacementDatasource) DeepCopy() *PlacementDatasource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ProjectQuota) DeepCopyInto(out *ProjectQuota) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuota. +func (in *ProjectQuota) DeepCopy() *ProjectQuota { + if in == nil { + return nil + } + out := new(ProjectQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuota) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaList) DeepCopyInto(out *ProjectQuotaList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ProjectQuota, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaList. +func (in *ProjectQuotaList) DeepCopy() *ProjectQuotaList { + if in == nil { + return nil + } + out := new(ProjectQuotaList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuotaList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaSpec) DeepCopyInto(out *ProjectQuotaSpec) { + *out = *in + if in.Quota != nil { + in, out := &in.Quota, &out.Quota + *out = make(map[string]ResourceQuota, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaSpec. +func (in *ProjectQuotaSpec) DeepCopy() *ProjectQuotaSpec { + if in == nil { + return nil + } + out := new(ProjectQuotaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { + *out = *in + if in.PaygUsage != nil { + in, out := &in.PaygUsage, &out.PaygUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.LastReconcileAt != nil { + in, out := &in.LastReconcileAt, &out.LastReconcileAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaStatus. +func (in *ProjectQuotaStatus) DeepCopy() *ProjectQuotaStatus { + if in == nil { + return nil + } + out := new(ProjectQuotaStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *PrometheusDatasource) DeepCopyInto(out *PrometheusDatasource) { *out = *in @@ -1570,6 +1684,50 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceQuota) DeepCopyInto(out *ResourceQuota) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuota. +func (in *ResourceQuota) DeepCopy() *ResourceQuota { + if in == nil { + return nil + } + out := new(ResourceQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceQuotaUsage) DeepCopyInto(out *ResourceQuotaUsage) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuotaUsage. +func (in *ResourceQuotaUsage) DeepCopy() *ResourceQuotaUsage { + if in == nil { + return nil + } + out := new(ResourceQuotaUsage) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingHistoryEntry) DeepCopyInto(out *SchedulingHistoryEntry) { *out = *in diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml new file mode 100644 index 000000000..07e39aaa0 --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -0,0 +1,212 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: projectquotas.cortex.cloud +spec: + group: cortex.cloud + names: + kind: ProjectQuota + listKind: ProjectQuotaList + plural: projectquotas + singular: projectquota + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.projectID + name: Project + type: string + - jsonPath: .spec.domainID + name: Domain + type: string + - jsonPath: .status.lastReconcileAt + name: LastReconcile + type: date + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ProjectQuota is the Schema for the projectquotas API. + It persists quota values pushed by Limes via the LIQUID quota endpoint + (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + ProjectQuotaSpec defines the desired state of ProjectQuota. + Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + domainID: + description: |- + DomainID of the OpenStack domain this project belongs to. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. + type: string + domainName: + description: |- + DomainName is the human-readable name of the OpenStack domain. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + type: string + projectID: + description: |- + ProjectID of the OpenStack project this quota belongs to. + Corresponds to the :uuid in the PUT URL path. + type: string + projectName: + description: |- + ProjectName is the human-readable name of the OpenStack project. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + type: string + quota: + additionalProperties: + description: |- + ResourceQuota holds the quota for a single resource with per-AZ breakdown. + Maps to liquid.ResourceQuotaRequest from the LIQUID API. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds the per-availability-zone quota breakdown. + Key: availability zone name, Value: quota for that AZ. + Only populated for AZSeparatedTopology resources. + Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + type: object + quota: + description: |- + Quota is the total quota across all AZs (for compatibility). + Corresponds to liquid.ResourceQuotaRequest.Quota. + format: int64 + type: integer + required: + - quota + type: object + description: |- + Quota maps LIQUID resource names to their per-AZ quota. + Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") + Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + type: object + required: + - domainID + - projectID + type: object + status: + description: |- + ProjectQuotaStatus defines the observed state of ProjectQuota. + Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport + properties: + conditions: + description: Conditions holds the current status conditions. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastReconcileAt: + description: LastReconcileAt is when the controller last reconciled + this project's quota. + format: date-time + type: string + paygUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. + Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + Key: liquid.ResourceName + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/scheduling/reservations/commitments/api/handler.go b/internal/scheduling/reservations/commitments/api/handler.go index f0eb24110..413bad31f 100644 --- a/internal/scheduling/reservations/commitments/api/handler.go +++ b/internal/scheduling/reservations/commitments/api/handler.go @@ -26,6 +26,7 @@ type HTTPAPI struct { usageMonitor ReportUsageAPIMonitor capacityMonitor ReportCapacityAPIMonitor infoMonitor InfoAPIMonitor + quotaMonitor QuotaAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -44,6 +45,7 @@ func NewAPIWithConfig(k8sClient client.Client, config commitments.Config, usageD usageMonitor: NewReportUsageAPIMonitor(), capacityMonitor: NewReportCapacityAPIMonitor(), infoMonitor: NewInfoAPIMonitor(), + quotaMonitor: NewQuotaAPIMonitor(), } } @@ -52,6 +54,7 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) registry.MustRegister(&api.infoMonitor) + registry.MustRegister(&api.quotaMonitor) mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/commitments/v1/info", api.HandleInfo) diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index 6999b38d6..f16f71301 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -151,6 +151,12 @@ func (api *HTTPAPI) 
buildServiceInfo(ctx context.Context, logger logr.Logger) (l return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", errInternalServiceInfo, groupName, err) } + // Determine topology: AZSeparatedTopology only for groups that accept commitments + // (AZSeparatedTopology means quota is also AZ-aware, required when HasQuota=true) + ramTopology := liquid.AZAwareTopology + if handlesCommitments { + ramTopology = liquid.AZSeparatedTopology + } resources[ramResourceName] = liquid.ResourceInfo{ DisplayName: fmt.Sprintf( "multiples of %d MiB (usable by: %s)", @@ -158,10 +164,10 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l flavorListStr, ), Unit: ramUnit, // Non-standard unit: multiples of smallest flavor RAM - Topology: liquid.AZAwareTopology, + Topology: ramTopology, NeedsResourceDemand: false, - HasCapacity: true, // We report capacity via /commitments/v1/report-capacity - HasQuota: false, + HasCapacity: true, // We report capacity via /commitments/v1/report-capacity + HasQuota: handlesCommitments, // true only for groups that accept commitments HandlesCommitments: handlesCommitments, // Only groups with fixed ratio accept commitments Attributes: attrsJSON, } diff --git a/internal/scheduling/reservations/commitments/api/info_test.go b/internal/scheduling/reservations/commitments/api/info_test.go index 48e12fd2c..3ca0bd11c 100644 --- a/internal/scheduling/reservations/commitments/api/info_test.go +++ b/internal/scheduling/reservations/commitments/api/info_test.go @@ -224,7 +224,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Fatalf("expected 6 resources (3 per flavor group), got %d", len(serviceInfo.Resources)) } - // Test RAM resource: hw_version_hana_fixed_ram + // Test RAM resource: hw_version_hana_fixed_ram (fixed ratio → commitments + quota) ramResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { t.Fatal("expected hw_version_hana_fixed_ram resource to exist") @@ -235,8 +235,14 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if !ramResource.HandlesCommitments { t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (RAM is primary commitment resource)") } + if ramResource.Topology != liquid.AZSeparatedTopology { + t.Errorf("hw_version_hana_fixed_ram: expected Topology=%q, got %q", liquid.AZSeparatedTopology, ramResource.Topology) + } + if !ramResource.HasQuota { + t.Error("hw_version_hana_fixed_ram: expected HasQuota=true (fixed ratio groups accept quotas)") + } - // Test Cores resource: hw_version_hana_fixed_cores + // Test Cores resource: hw_version_hana_fixed_cores (always AZAwareTopology, no quota) coresResource, ok := serviceInfo.Resources["hw_version_hana_fixed_cores"] if !ok { t.Fatal("expected hw_version_hana_fixed_cores resource to exist") @@ -247,8 +253,14 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if coresResource.HandlesCommitments { t.Error("hw_version_hana_fixed_cores: expected HandlesCommitments=false (cores are derived)") } + if coresResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, coresResource.Topology) + } + if coresResource.HasQuota { + t.Error("hw_version_hana_fixed_cores: expected HasQuota=false") + } - // Test Instances resource: hw_version_hana_fixed_instances + // Test Instances resource: hw_version_hana_fixed_instances (always AZAwareTopology, no quota) instancesResource, ok := 
serviceInfo.Resources["hw_version_hana_fixed_instances"] if !ok { t.Fatal("expected hw_version_hana_fixed_instances resource to exist") @@ -259,8 +271,15 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if instancesResource.HandlesCommitments { t.Error("hw_version_hana_fixed_instances: expected HandlesCommitments=false (instances are derived)") } + if instancesResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_instances: expected Topology=%q, got %q", liquid.AZAwareTopology, instancesResource.Topology) + } + if instancesResource.HasQuota { + t.Error("hw_version_hana_fixed_instances: expected HasQuota=false") + } // Variable ratio group DOES have resources now, but HandlesCommitments=false for RAM + // Variable ratio → AZAwareTopology, no quota v2RamResource, ok := serviceInfo.Resources["hw_version_v2_variable_ram"] if !ok { t.Fatal("expected hw_version_v2_variable_ram resource to exist (all groups included)") @@ -271,6 +290,12 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2RamResource.HandlesCommitments { t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (variable ratio)") } + if v2RamResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_v2_variable_ram: expected Topology=%q, got %q", liquid.AZAwareTopology, v2RamResource.Topology) + } + if v2RamResource.HasQuota { + t.Error("hw_version_v2_variable_ram: expected HasQuota=false (variable ratio)") + } v2CoresResource, ok := serviceInfo.Resources["hw_version_v2_variable_cores"] if !ok { @@ -282,6 +307,12 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2CoresResource.HandlesCommitments { t.Error("hw_version_v2_variable_cores: expected HandlesCommitments=false") } + if v2CoresResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_v2_variable_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, v2CoresResource.Topology) + } + if v2CoresResource.HasQuota { + t.Error("hw_version_v2_variable_cores: expected HasQuota=false") + } v2InstancesResource, ok := serviceInfo.Resources["hw_version_v2_variable_instances"] if !ok { @@ -293,4 +324,10 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2InstancesResource.HandlesCommitments { t.Error("hw_version_v2_variable_instances: expected HandlesCommitments=false") } + if v2InstancesResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_v2_variable_instances: expected Topology=%q, got %q", liquid.AZAwareTopology, v2InstancesResource.Topology) + } + if v2InstancesResource.HasQuota { + t.Error("hw_version_v2_variable_instances: expected HasQuota=false") + } } diff --git a/internal/scheduling/reservations/commitments/api/quota.go b/internal/scheduling/reservations/commitments/api/quota.go index c77fdf1a6..37b57d22a 100644 --- a/internal/scheduling/reservations/commitments/api/quota.go +++ b/internal/scheduling/reservations/commitments/api/quota.go @@ -4,19 +4,35 @@ package api import ( + "encoding/json" + "fmt" + "math" "net/http" + "strconv" + "time" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/google/uuid" + "github.com/sapcc/go-api-declarations/liquid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ) +// projectQuotaCRDName returns the CRD object name for a given project UUID. 
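+// For example, project UUID "abc-123" maps to the object name "quota-abc-123".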
+// Convention: "quota-" +func projectQuotaCRDName(projectID string) string { + return "quota-" + projectID +} + // HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // -// This is a no-op endpoint that accepts quota requests but doesn't store them. -// Cortex does not enforce quotas for committed resources - quota enforcement -// happens through commitment validation at change-commitments time. -// The endpoint exists for API compatibility with the LIQUID specification. +// This endpoint receives quota requests from Limes and persists them as ProjectQuota CRDs. +// One CRD per project, named "quota-". func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { @@ -27,14 +43,138 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { log := apiLog.WithValues("requestID", requestID, "endpoint", "quota") if r.Method != http.MethodPut { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + api.quotaError(w, http.StatusMethodNotAllowed, "Method not allowed", startTime) + return + } + + // Check if quota API is enabled + if !api.config.EnableQuotaAPI { + api.quotaError(w, http.StatusServiceUnavailable, "Quota API is disabled", startTime) + return + } + + // Extract project UUID from URL path + projectID, err := extractProjectIDFromPath(r.URL.Path) + if err != nil { + log.Error(err, "failed to extract project ID from path") + api.quotaError(w, http.StatusBadRequest, "Invalid URL path: "+err.Error(), startTime) + return + } + + // Parse request body + var req liquid.ServiceQuotaRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "failed to decode quota request body") + api.quotaError(w, http.StatusBadRequest, "Invalid request body: "+err.Error(), startTime) return } - // No-op: Accept the quota request but don't store it - // Cortex handles capacity through commitments, not quotas - log.V(1).Info("received quota request (no-op)", "path", r.URL.Path) + // Extract project/domain metadata if available + var projectName, domainID, domainName string + if meta, ok := req.ProjectMetadata.Unpack(); ok { + // Consistency check: metadata UUID must match URL path UUID + if meta.UUID != "" && meta.UUID != projectID { + log.Info("project UUID mismatch", "urlProjectID", projectID, "metadataUUID", meta.UUID) + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Project UUID mismatch: URL has %q but metadata has %q", projectID, meta.UUID), startTime) + return + } + projectName = meta.Name + domainID = meta.Domain.UUID + domainName = meta.Domain.Name + } + + // Build the spec quota map from the liquid request. + // liquid API uses uint64; our CRD uses int64 (K8s convention). + // Guard against overflow: uint64 values > MaxInt64 would wrap to negative. 
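+	// For example, uint64(math.MaxInt64)+1 would convert to math.MinInt64,
+	// silently flipping the quota negative, hence the explicit checks below.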
+ specQuota := make(map[string]v1alpha1.ResourceQuota, len(req.Resources)) + for resourceName, resQuota := range req.Resources { + if resQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q exceeds int64 max", resourceName), startTime) + return + } + rq := v1alpha1.ResourceQuota{ + Quota: int64(resQuota.Quota), + } + if len(resQuota.PerAZ) > 0 { + rq.PerAZ = make(map[string]int64, len(resQuota.PerAZ)) + for az, azQuota := range resQuota.PerAZ { + if azQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q in AZ %q exceeds int64 max", resourceName, az), startTime) + return + } + rq.PerAZ[string(az)] = int64(azQuota.Quota) + } + } + specQuota[string(resourceName)] = rq + } + + // Create or update ProjectQuota CRD + crdName := projectQuotaCRDName(projectID) + ctx := r.Context() + + var existing v1alpha1.ProjectQuota + err = api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) + if err != nil { + if !apierrors.IsNotFound(err) { + // Real error + log.Error(err, "failed to get existing ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to check existing quota: %v", err), startTime) + return + } + // Not found — create new + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + }, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: projectID, + ProjectName: projectName, + DomainID: domainID, + DomainName: domainName, + Quota: specQuota, + }, + } + if err := api.client.Create(ctx, pq); err != nil { + log.Error(err, "failed to create ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create quota: %v", err), startTime) + return + } + log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + } else { + // Update existing + existing.Spec.Quota = specQuota + if projectName != "" { + existing.Spec.ProjectName = projectName + } + if domainID != "" { + existing.Spec.DomainID = domainID + } + if domainName != "" { + existing.Spec.DomainName = domainName + } + if err := api.client.Update(ctx, &existing); err != nil { + log.Error(err, "failed to update ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update quota: %v", err), startTime) + return + } + log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + } // Return 204 No Content as expected by the LIQUID API w.WriteHeader(http.StatusNoContent) + api.recordQuotaMetrics(http.StatusNoContent, startTime) +} + +// quotaError writes an HTTP error response and records metrics. Used for error paths in HandleQuota. +func (api *HTTPAPI) quotaError(w http.ResponseWriter, statusCode int, msg string, startTime time.Time) { + http.Error(w, msg, statusCode) + api.recordQuotaMetrics(statusCode, startTime) +} + +// recordQuotaMetrics records Prometheus metrics for a quota API request. 
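+// Both the counter and the histogram carry a status_code label, so per-outcome
+// rates can be queried, e.g.:
+//   rate(cortex_committed_resource_quota_api_requests_total{status_code="500"}[5m])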
+func (api *HTTPAPI) recordQuotaMetrics(statusCode int, startTime time.Time) { + duration := time.Since(startTime).Seconds() + statusCodeStr := strconv.Itoa(statusCode) + api.quotaMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc() + api.quotaMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration) } diff --git a/internal/scheduling/reservations/commitments/api/quota_monitor.go b/internal/scheduling/reservations/commitments/api/quota_monitor.go new file mode 100644 index 000000000..c06d4b788 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/quota_monitor.go @@ -0,0 +1,47 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +import "github.com/prometheus/client_golang/prometheus" + +// QuotaAPIMonitor provides metrics for the CR quota API. +type QuotaAPIMonitor struct { + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec +} + +// NewQuotaAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. +func NewQuotaAPIMonitor() QuotaAPIMonitor { + m := QuotaAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_quota_api_requests_total", + Help: "Total number of quota API requests by status code.", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_quota_api_request_duration_seconds", + Help: "Duration of quota API requests in seconds.", + Buckets: prometheus.DefBuckets, + }, []string{"status_code"}), + } + // Pre-initialize common status codes so they appear in Prometheus before the first request + for _, statusCode := range []string{"204", "400", "405", "500"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + return m +} + +// Describe implements prometheus.Collector. +func (m *QuotaAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *QuotaAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api/quota_test.go b/internal/scheduling/reservations/commitments/api/quota_test.go new file mode 100644 index 000000000..218bc0815 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/quota_test.go @@ -0,0 +1,354 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// newTestScheme returns a scheme with v1alpha1 types registered. +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + return scheme +} + +// marshalQuotaReq marshals a ServiceQuotaRequest, failing the test on error. 
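+// (t.Helper() attributes any failure inside this helper to the calling test line.)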
+func marshalQuotaReq(t *testing.T, req liquid.ServiceQuotaRequest) []byte { + t.Helper() + body, err := json.Marshal(req) + if err != nil { + t.Fatalf("failed to marshal request: %v", err) + } + return body +} + +func TestHandleQuota_ErrorCases(t *testing.T) { + tests := []struct { + name string + method string + path string + body []byte + metadata *liquid.ProjectMetadata + enableQuota *bool // nil = default (enabled) + expectedStatus int + }{ + { + name: "MethodNotAllowed_GET", + method: http.MethodGet, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "MethodNotAllowed_POST", + method: http.MethodPost, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "DisabledAPI", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(`{"resources":{}}`), + enableQuota: boolPtr(false), + expectedStatus: http.StatusServiceUnavailable, + }, + { + name: "InvalidBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte("{invalid"), + expectedStatus: http.StatusBadRequest, + }, + { + name: "EmptyBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(""), + expectedStatus: http.StatusBadRequest, + }, + { + name: "UUIDMismatch", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + metadata: &liquid.ProjectMetadata{ + UUID: "different-uuid", + Name: "my-project", + Domain: liquid.DomainMetadata{UUID: "domain-123", Name: "my-domain"}, + }, + expectedStatus: http.StatusBadRequest, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := newTestScheme(t) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + var httpAPI *HTTPAPI + if tc.enableQuota != nil && !*tc.enableQuota { + config := commitments.DefaultConfig() + config.EnableQuotaAPI = false + httpAPI = NewAPIWithConfig(k8sClient, config, nil) + } else { + httpAPI = NewAPI(k8sClient) + } + + // Build body: use provided bytes or construct from metadata + var bodyReader *bytes.Reader + switch { + case tc.body != nil: + bodyReader = bytes.NewReader(tc.body) + case tc.metadata != nil: + quotaReq := liquid.ServiceQuotaRequest{ + Resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 100}, + }, + } + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + bodyReader = bytes.NewReader(marshalQuotaReq(t, quotaReq)) + default: + bodyReader = bytes.NewReader([]byte{}) + } + + req := httptest.NewRequest(tc.method, tc.path, bodyReader) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != tc.expectedStatus { + t.Errorf("expected status %d, got %d", tc.expectedStatus, resp.StatusCode) + } + }) + } +} + +func TestHandleQuota_CreateAndUpdate(t *testing.T) { + tests := []struct { + name string + // existing is a pre-existing CRD to seed (nil = create, non-nil = update) + existing *v1alpha1.ProjectQuota + projectID string + resources map[liquid.ResourceName]liquid.ResourceQuotaRequest + metadata *liquid.ProjectMetadata + expectQuota map[string]int64 // resource name → expected total quota + expectPerAZ map[string]map[string]int64 // resource name → az → expected quota + expectName string + expectDomain string + expectDomName string + }{ + { + name: "Create_WithPerAZ", + projectID: 
"project-abc-123", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 100, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 60}, + "az-b": {Quota: 40}, + }, + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 100}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 60, "az-b": 40}, + }, + }, + { + name: "Create_EmptyResources", + projectID: "project-empty", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, + expectQuota: map[string]int64{}, + }, + { + name: "Create_WithMetadata", + projectID: "project-meta-test", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 50}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-meta-test", + Name: "my-project-name", + Domain: liquid.DomainMetadata{ + UUID: "domain-uuid-456", + Name: "my-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 50}, + expectName: "my-project-name", + expectDomain: "domain-uuid-456", + expectDomName: "my-domain-name", + }, + { + name: "Update_QuotaValues", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-xyz", + DomainID: "original-domain", + DomainName: "original-domain-name", + ProjectName: "original-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + "hw_version_hana_1_ram": {Quota: 50, PerAZ: map[string]int64{"az-a": 50}}, + }, + }, + }, + projectID: "project-xyz", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 200, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 120}, + "az-b": {Quota: 80}, + }, + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 200}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 120, "az-b": 80}, + }, + // Metadata should be preserved when not provided in update + expectDomain: "original-domain", + expectDomName: "original-domain-name", + expectName: "original-project-name", + }, + { + name: "Update_WithNewMetadata", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-update-meta", + DomainID: "old-domain", + DomainName: "old-domain-name", + ProjectName: "old-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + "hw_version_hana_1_ram": {Quota: 10}, + }, + }, + }, + projectID: "project-update-meta", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 99}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-update-meta", + Name: "new-project-name", + Domain: liquid.DomainMetadata{ + UUID: "new-domain", + Name: "new-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 99}, + expectName: "new-project-name", + expectDomain: "new-domain", + expectDomName: "new-domain-name", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := newTestScheme(t) + builder := fake.NewClientBuilder().WithScheme(scheme) + + if tc.existing != nil { + tc.existing.Name = projectQuotaCRDName(tc.projectID) + builder = builder.WithObjects(tc.existing) + } + k8sClient := builder.Build() + httpAPI := NewAPI(k8sClient) + + quotaReq := liquid.ServiceQuotaRequest{ + Resources: tc.resources, + } + if tc.metadata != nil { + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + } + body := marshalQuotaReq(t, quotaReq) + + path := 
"/commitments/v1/projects/" + tc.projectID + "/quota" + req := httptest.NewRequest(http.MethodPut, path, bytes.NewReader(body)) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("expected status %d (No Content), got %d", http.StatusNoContent, resp.StatusCode) + } + + // Verify the ProjectQuota CRD + var pq v1alpha1.ProjectQuota + crdName := projectQuotaCRDName(tc.projectID) + if err := k8sClient.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + t.Fatalf("failed to get ProjectQuota CRD %q: %v", crdName, err) + } + + if pq.Spec.ProjectID != tc.projectID { + t.Errorf("expected ProjectID %q, got %q", tc.projectID, pq.Spec.ProjectID) + } + + // Verify quota totals + for resName, expectedTotal := range tc.expectQuota { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec", resName) + continue + } + if actual.Quota != expectedTotal { + t.Errorf("resource %q: expected quota %d, got %d", resName, expectedTotal, actual.Quota) + } + } + + // Verify per-AZ quotas + for resName, azMap := range tc.expectPerAZ { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec for per-AZ check", resName) + continue + } + for az, expectedAZ := range azMap { + if actual.PerAZ[az] != expectedAZ { + t.Errorf("resource %q AZ %q: expected %d, got %d", resName, az, expectedAZ, actual.PerAZ[az]) + } + } + } + + // Verify metadata + if tc.expectName != "" && pq.Spec.ProjectName != tc.expectName { + t.Errorf("expected ProjectName %q, got %q", tc.expectName, pq.Spec.ProjectName) + } + if tc.expectDomain != "" && pq.Spec.DomainID != tc.expectDomain { + t.Errorf("expected DomainID %q, got %q", tc.expectDomain, pq.Spec.DomainID) + } + if tc.expectDomName != "" && pq.Spec.DomainName != tc.expectDomName { + t.Errorf("expected DomainName %q, got %q", tc.expectDomName, pq.Spec.DomainName) + } + }) + } +} + +func boolPtr(b bool) *bool { + return &b +} diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 888d37018..36c3ec00b 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -57,6 +57,11 @@ type Config struct { // When false, the endpoint will return HTTP 503 Service Unavailable. // This can be used as an emergency switch if the capacity reporting is causing issues. EnableReportCapacityAPI bool `json:"committedResourceEnableReportCapacityAPI"` + + // EnableQuotaAPI controls whether the quota API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + // This can be used as an emergency switch if quota persistence is causing issues. + EnableQuotaAPI bool `json:"committedResourceEnableQuotaAPI"` } // ApplyDefaults fills in any unset values with defaults. 
@@ -103,5 +108,6 @@ func DefaultConfig() Config { EnableChangeCommitmentsAPI: true, EnableReportUsageAPI: true, EnableReportCapacityAPI: true, + EnableQuotaAPI: true, } } diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index d634fc2a0..14dbfa482 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -471,22 +471,33 @@ func (c *UsageCalculator) buildUsageResponse( } // Build ResourceUsageReport for all flavor groups (not just those with fixed ratio) - for flavorGroupName := range flavorGroups { + for flavorGroupName, groupData := range flavorGroups { // All flavor groups are included in usage reporting. // === 1. RAM Resource === ramResourceName := liquid.ResourceName(ResourceNameRAM(flavorGroupName)) ramPerAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) + // For AZSeparatedTopology resources (fixed-ratio groups), per-AZ Quota must be non-null. + // Use -1 ("infinite quota") as default until actual quota is read from ProjectQuota CRD. + ramHasAZQuota := FlavorGroupAcceptsCommitments(&groupData) for _, az := range allAZs { - ramPerAZ[az] = &liquid.AZResourceUsageReport{ + report := &liquid.AZResourceUsageReport{ Usage: 0, Subresources: []liquid.Subresource{}, } + if ramHasAZQuota { + report.Quota = Some(int64(-1)) // infinite — will be overridden by ProjectQuota CRD + } + ramPerAZ[az] = report } if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := ramPerAZ[az]; !known { - ramPerAZ[az] = &liquid.AZResourceUsageReport{} + report := &liquid.AZResourceUsageReport{} + if ramHasAZQuota { + report.Quota = Some(int64(-1)) + } + ramPerAZ[az] = report } ramPerAZ[az].Usage = data.ramUsage ramPerAZ[az].PhysicalUsage = Some(data.ramUsage) // No overcommit for RAM From 661917648516c55af15c95167dd0aa2f4af69f27 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Wed, 29 Apr 2026 16:47:33 +0200 Subject: [PATCH 2/2] WIP: add quota controller --- .claude/settings.local.json | 8 + api/v1alpha1/project_quota_types.go | 7 + api/v1alpha1/zz_generated.deepcopy.go | 7 + .../crds/cortex.cloud_projectquotas.yaml | 20 + internal/scheduling/external/nova.go | 17 + .../reservations/failover/integration_test.go | 15 + .../reservations/failover/vm_source.go | 65 + .../reservations/failover/vm_source_test.go | 4 + .../scheduling/reservations/quota/config.go | 44 + .../scheduling/reservations/quota/context.go | 27 + .../reservations/quota/controller.go | 950 +++++++++++++ .../reservations/quota/controller_test.go | 598 ++++++++ .../reservations/quota/integration_test.go | 1232 +++++++++++++++++ .../scheduling/reservations/quota/metrics.go | 98 ++ 14 files changed, 3092 insertions(+) create mode 100644 .claude/settings.local.json create mode 100644 internal/scheduling/reservations/quota/config.go create mode 100644 internal/scheduling/reservations/quota/context.go create mode 100644 internal/scheduling/reservations/quota/controller.go create mode 100644 internal/scheduling/reservations/quota/controller_test.go create mode 100644 internal/scheduling/reservations/quota/integration_test.go create mode 100644 internal/scheduling/reservations/quota/metrics.go diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000..36798fdd7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,8 @@ +{ + "permissions": { + "allow": [ + "Read(//root/**)", + 
"Bash(go doc:*)" + ] + } +} diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go index cf61585c6..715b6e728 100644 --- a/api/v1alpha1/project_quota_types.go +++ b/api/v1alpha1/project_quota_types.go @@ -69,7 +69,14 @@ type ProjectQuotaSpec struct { // Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport type ProjectQuotaStatus struct { + // TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + // Persisted by the quota controller; updated by full reconcile and HV instance diffs. + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + TotalUsage map[string]ResourceQuotaUsage `json:"totalUsage,omitempty"` + // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // Derived as TotalUsage - CRUsage (clamped >= 0). // Key: liquid.ResourceName // +kubebuilder:validation:Optional PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 1a4bc222a..873beb73c 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1504,6 +1504,13 @@ func (in *ProjectQuotaSpec) DeepCopy() *ProjectQuotaSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { *out = *in + if in.TotalUsage != nil { + in, out := &in.TotalUsage, &out.TotalUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } if in.PaygUsage != nil { in, out := &in.PaygUsage, &out.PaygUsage *out = make(map[string]ResourceQuotaUsage, len(*in)) diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml index 07e39aaa0..7bebbdb6c 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -200,6 +200,26 @@ spec: type: object description: |- PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + Derived as TotalUsage - CRUsage (clamped >= 0). + Key: liquid.ResourceName + type: object + totalUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. + Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + Persisted by the quota controller; updated by full reconcile and HV instance diffs. 
Key: liquid.ResourceName type: object type: object diff --git a/internal/scheduling/external/nova.go b/internal/scheduling/external/nova.go index b59a37d5b..741c5659c 100644 --- a/internal/scheduling/external/nova.go +++ b/internal/scheduling/external/nova.go @@ -17,6 +17,9 @@ type NovaReaderInterface interface { GetAllFlavors(ctx context.Context) ([]nova.Flavor, error) GetServerByID(ctx context.Context, serverID string) (*nova.Server, error) GetFlavorByName(ctx context.Context, flavorName string) (*nova.Flavor, error) + // GetDeletedServerByID returns a deleted server by its ID from the deleted_servers table. + // Returns nil, nil if the server is not found in the deleted_servers table. + GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) } // NovaReader provides read access to Nova data stored in the database. @@ -107,3 +110,17 @@ func (r *NovaReader) GetFlavorByName(ctx context.Context, flavorName string) (*n } return &flavors[0], nil } + +// GetDeletedServerByID returns a deleted Nova server by its ID from the deleted_servers table. +// Returns nil, nil if the server is not found in the deleted_servers table. +func (r *NovaReader) GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) { + var servers []nova.DeletedServer + query := "SELECT * FROM " + nova.DeletedServer{}.TableName() + " WHERE id = $1" + if err := r.Select(ctx, &servers, query, serverID); err != nil { + return nil, fmt.Errorf("failed to query deleted server by ID: %w", err) + } + if len(servers) == 0 { + return nil, nil + } + return &servers[0], nil +} diff --git a/internal/scheduling/reservations/failover/integration_test.go b/internal/scheduling/reservations/failover/integration_test.go index 66d5733bb..df1354be6 100644 --- a/internal/scheduling/reservations/failover/integration_test.go +++ b/internal/scheduling/reservations/failover/integration_test.go @@ -1068,6 +1068,21 @@ func (s *MockVMSource) GetVM(_ context.Context, vmUUID string) (*VM, error) { return nil, nil } +// IsServerActive returns true if the server is found in the mock VMs. +func (s *MockVMSource) IsServerActive(_ context.Context, vmUUID string) (bool, error) { + for i := range s.VMs { + if s.VMs[i].UUID == vmUUID { + return true, nil + } + } + return false, nil +} + +// GetDeletedVMInfo returns nil, nil (no deleted VMs in mock). +func (s *MockVMSource) GetDeletedVMInfo(_ context.Context, _ string) (*DeletedVMInfo, error) { + return nil, nil +} + // newIntegrationTestEnv creates a complete test environment with HTTP server and VMSource. func newIntegrationTestEnv(t *testing.T, vms []VM, hypervisors []*hv1.Hypervisor, reservations []*v1alpha1.Reservation) *IntegrationTestEnv { t.Helper() diff --git a/internal/scheduling/reservations/failover/vm_source.go b/internal/scheduling/reservations/failover/vm_source.go index 4d5c3f210..bcf935798 100644 --- a/internal/scheduling/reservations/failover/vm_source.go +++ b/internal/scheduling/reservations/failover/vm_source.go @@ -26,6 +26,9 @@ type VM struct { // AvailabilityZone is the availability zone where the VM is located. // This is used to ensure failover reservations are created in the same AZ. AvailabilityZone string + // CreatedAt is the ISO 8601 timestamp when the VM was created in Nova. + // Used by the quota controller to distinguish new VMs from migrations. + CreatedAt string // Resources contains the VM's resource allocations (e.g., "memory", "vcpus"). 
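	// (The quota controller presumably aggregates these quantities per project
	// and AZ when recomputing TotalUsage.)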
Resources map[string]resource.Quantity
 	// FlavorExtraSpecs contains the flavor's extra specifications (e.g., traits, capabilities).
@@ -46,6 +49,22 @@ type VMSource interface {
 	// GetVM returns a specific VM by UUID.
 	// Returns nil, nil if the VM is not found (not an error, just doesn't exist).
 	GetVM(ctx context.Context, vmUUID string) (*VM, error)
+	// IsServerActive returns true if the server exists in the servers table with a status other than DELETED.
+	// Returns false if not found or DELETED. Used by the quota controller to tell deleted from migrated instances.
+	IsServerActive(ctx context.Context, vmUUID string) (bool, error)
+	// GetDeletedVMInfo returns metadata about a deleted VM (from deleted_servers table),
+	// including resolved flavor resources. Returns nil, nil if not found.
+	// Used by quota controller for incremental usage decrements.
+	GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error)
+}
+
+// DeletedVMInfo contains the metadata needed to compute resource decrements for a deleted VM.
+type DeletedVMInfo struct {
+	ProjectID        string
+	AvailabilityZone string
+	FlavorName       string
+	RAMMiB           uint64
+	VCPUs            uint64
 }
 
 // DBVMSource implements VMSource by reading directly from the database.
@@ -122,6 +141,7 @@ func (s *DBVMSource) ListVMs(ctx context.Context) ([]VM, error) {
 			ProjectID:         server.TenantID,
 			CurrentHypervisor: server.OSEXTSRVATTRHost,
 			AvailabilityZone:  server.OSEXTAvailabilityZone,
+			CreatedAt:         server.Created,
 			Resources:         resources,
 			FlavorExtraSpecs:  extraSpecs,
 		})
@@ -208,6 +228,7 @@ func (s *DBVMSource) GetVM(ctx context.Context, vmUUID string) (*VM, error) {
 		ProjectID:         server.TenantID,
 		CurrentHypervisor: server.OSEXTSRVATTRHost,
 		AvailabilityZone:  server.OSEXTAvailabilityZone,
+		CreatedAt:         server.Created,
 		Resources:         resources,
 		FlavorExtraSpecs:  extraSpecs,
 	}, nil
@@ -397,6 +418,50 @@ func filterVMsOnKnownHypervisors(vms []VM, hypervisorList *hv1.HypervisorList) [
 	return result
 }
 
+// IsServerActive returns true if the server exists in the servers table and is not DELETED.
+// VMs in any other status (ACTIVE, SHUTOFF, MIGRATING, ERROR, etc.) still consume resources
+// and should NOT be decremented from quota usage.
+// Used by the quota controller to distinguish deleted VMs from migrated/existing ones.
+func (s *DBVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) {
+	server, err := s.NovaReader.GetServerByID(ctx, vmUUID)
+	if err != nil {
+		return false, fmt.Errorf("failed to check server existence: %w", err)
+	}
+	if server == nil {
+		return false, nil
+	}
+	return server.Status != "DELETED", nil
+}
+
+// GetDeletedVMInfo returns metadata about a deleted VM from the deleted_servers table,
+// including resolved flavor resources. Returns nil, nil if the VM is not found in deleted_servers.
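+// Callers derive commitment units from the returned RAMMiB and VCPUs; see
+// accumulateRemovedVM in the quota controller, which uses this for decrements.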
+func (s *DBVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error) { + deletedServer, err := s.NovaReader.GetDeletedServerByID(ctx, vmUUID) + if err != nil { + return nil, fmt.Errorf("failed to get deleted server: %w", err) + } + if deletedServer == nil { + return nil, nil + } + + // Resolve the flavor to get RAM/VCPUs + flavor, err := s.NovaReader.GetFlavorByName(ctx, deletedServer.FlavorName) + if err != nil { + return nil, fmt.Errorf("failed to get flavor for deleted server: %w", err) + } + if flavor == nil { + return nil, fmt.Errorf("flavor %q not found for deleted server %s", deletedServer.FlavorName, vmUUID) + } + + return &DeletedVMInfo{ + ProjectID: deletedServer.TenantID, + AvailabilityZone: deletedServer.OSEXTAvailabilityZone, + FlavorName: deletedServer.FlavorName, + RAMMiB: flavor.RAM, + VCPUs: flavor.VCPUs, + }, nil +} + // warnUnknownVMsOnHypervisors logs a warning for VMs that are on hypervisors but not in the ListVMs (i.e. nova) result. // This can indicate a data sync issue between the hypervisor operator and the VM datasource. func warnUnknownVMsOnHypervisors(hypervisors *hv1.HypervisorList, vms []VM) { diff --git a/internal/scheduling/reservations/failover/vm_source_test.go b/internal/scheduling/reservations/failover/vm_source_test.go index 0b30af0e5..a710c5658 100644 --- a/internal/scheduling/reservations/failover/vm_source_test.go +++ b/internal/scheduling/reservations/failover/vm_source_test.go @@ -399,3 +399,7 @@ func (m *mockNovaReader) GetFlavorByName(ctx context.Context, flavorName string) } return nil, nil } + +func (m *mockNovaReader) GetDeletedServerByID(_ context.Context, _ string) (*nova.DeletedServer, error) { + return nil, nil +} diff --git a/internal/scheduling/reservations/quota/config.go b/internal/scheduling/reservations/quota/config.go new file mode 100644 index 000000000..b7314f595 --- /dev/null +++ b/internal/scheduling/reservations/quota/config.go @@ -0,0 +1,44 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// QuotaControllerConfig defines the configuration for the quota controller. +type QuotaControllerConfig struct { + // FullReconcileInterval is the periodic full reconcile interval. + // Full reconcile re-reads all VMs from Postgres and recomputes all usage. Default: 5m. + FullReconcileInterval metav1.Duration `json:"fullReconcileInterval"` + + // CRStateFilter defines which CommittedResource states to include + // when summing cr_actual_usage. Default: ["confirmed", "guaranteed"] + CRStateFilter []v1alpha1.CommitmentStatus `json:"crStateFilter"` +} + +// ApplyDefaults fills in any unset values with defaults. +func (c *QuotaControllerConfig) ApplyDefaults() { + defaults := DefaultQuotaControllerConfig() + if c.FullReconcileInterval.Duration == 0 { + c.FullReconcileInterval = defaults.FullReconcileInterval + } + if len(c.CRStateFilter) == 0 { + c.CRStateFilter = defaults.CRStateFilter + } +} + +// DefaultQuotaControllerConfig returns a default configuration. 
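+//
+// Illustrative only (values mirror the literals below):
+//
+//	cfg := DefaultQuotaControllerConfig()
+//	// cfg.FullReconcileInterval.Duration == 5 * time.Minute
+//	// cfg.CRStateFilter == [confirmed, guaranteed]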
+func DefaultQuotaControllerConfig() QuotaControllerConfig { + return QuotaControllerConfig{ + FullReconcileInterval: metav1.Duration{Duration: 5 * time.Minute}, + CRStateFilter: []v1alpha1.CommitmentStatus{ + v1alpha1.CommitmentStatusConfirmed, + v1alpha1.CommitmentStatusGuaranteed, + }, + } +} diff --git a/internal/scheduling/reservations/quota/context.go b/internal/scheduling/reservations/quota/context.go new file mode 100644 index 000000000..8352a1934 --- /dev/null +++ b/internal/scheduling/reservations/quota/context.go @@ -0,0 +1,27 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + "github.com/google/uuid" +) + +// WithNewGlobalRequestID creates a new context with a quota-prefixed global request ID. +func WithNewGlobalRequestID(ctx context.Context) context.Context { + return reservations.WithGlobalRequestID(ctx, "quota-"+uuid.New().String()) +} + +// LoggerFromContext returns a logger with greq and req values from the context. +// This creates a child logger with the request tracking values pre-attached, +// so you don't need to repeat them in every log call. +func LoggerFromContext(ctx context.Context) logr.Logger { + return log.WithValues( + "greq", reservations.GlobalRequestIDFromContext(ctx), + "req", reservations.RequestIDFromContext(ctx), + ) +} diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go new file mode 100644 index 000000000..052b2b685 --- /dev/null +++ b/internal/scheduling/reservations/quota/controller.go @@ -0,0 +1,950 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "fmt" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +var log = ctrl.Log.WithName("quota-controller").WithValues("module", "quota") + +// QuotaController manages quota usage tracking for projects. +// It provides three reconciliation modes: +// - Periodic full reconcile: recomputes all TotalUsage from Postgres +// - Incremental HV diff: delta-updates TotalUsage on HV instance changes +// - PaygUsage-only recompute: triggered by CR or ProjectQuota spec changes +type QuotaController struct { + client.Client + VMSource failover.VMSource + Config QuotaControllerConfig + Metrics *QuotaMetrics +} + +// NewQuotaController creates a new QuotaController. 
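+//
+// A minimal wiring sketch; mgr, vmSource, cfg, and metrics are assumed to be
+// built elsewhere, so treat this as illustrative rather than canonical:
+//
+//	qc := NewQuotaController(mgr.GetClient(), vmSource, cfg, metrics)
+//	_ = qc.SetupWithManager(mgr) // watch-based PaygUsage recompute
+//	_ = qc.SetupHVWatcher(mgr)   // incremental HV instance diffs
+//	_ = mgr.Add(qc)              // periodic full reconcile via Start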
+func NewQuotaController( + c client.Client, + vmSource failover.VMSource, + config QuotaControllerConfig, + metrics *QuotaMetrics, +) *QuotaController { + + return &QuotaController{ + Client: c, + VMSource: vmSource, + Config: config, + Metrics: metrics, + } +} + +// ============================================================================ +// Periodic Full Reconciliation +// ============================================================================ + +// ReconcilePeriodic performs a full reconcile of all project quota usage. +// It reads all VMs from Postgres, computes TotalUsage per project/AZ/resource, +// then derives PaygUsage = TotalUsage - CRUsage for each existing ProjectQuota CRD. +func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { + ctx = WithNewGlobalRequestID(ctx) + startTime := time.Now() + logger := LoggerFromContext(ctx).WithValues("mode", "full-reconcile") + logger.Info("starting full quota reconcile") + + // Fetch flavor groups from Knowledge CRD + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build flavorName → flavorGroup lookup + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Fetch all VMs using VMSource (reads from Postgres via DBVMSource) + vms, err := c.VMSource.ListVMs(ctx) + if err != nil { + logger.Error(err, "failed to list VMs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list VMs: %w", err) + } + + // Compute totalUsage per project/AZ/resource + totalUsageByProject := c.computeTotalUsage(vms, flavorToGroup, flavorGroups) + + // List all existing ProjectQuota CRDs + var pqList v1alpha1.ProjectQuotaList + if err := c.List(ctx, &pqList); err != nil { + logger.Error(err, "failed to list ProjectQuota CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list ProjectQuota CRDs: %w", err) + } + + // List all CommittedResource CRDs and pre-group by project ID + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list CommittedResource CRDs: %w", err) + } + crsByProject := groupCRsByProject(crList.Items) + + // For each ProjectQuota CRD, write TotalUsage + PaygUsage + var updated, skipped int + for i := range pqList.Items { + pq := &pqList.Items[i] + projectID := pq.Spec.ProjectID + + // Get totalUsage for this project (may be empty if project has no VMs) + projectTotalUsage := totalUsageByProject[projectID] + + // Compute CRUsage for this project (using pre-grouped CRs) + crUsage := c.computeCRUsage(crsByProject[projectID]) + + // Derive PaygUsage = TotalUsage - CRUsage (clamp >= 0) + paygUsage := derivePaygUsage(projectTotalUsage, crUsage) + + // Write status with conflict retry + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, projectTotalUsage, paygUsage); err != nil { + logger.Error(err, "failed to update ProjectQuota status", "project", projectID) + skipped++ + continue + } + + // Record metrics + c.recordUsageMetrics(projectID, projectTotalUsage, paygUsage, crUsage) + updated++ + } + + duration := time.Since(startTime) + c.Metrics.RecordReconcileDuration(duration.Seconds()) + c.Metrics.RecordReconcileResult(true) 
+ logger.Info("full quota reconcile completed", + "duration", duration.Round(time.Millisecond), + "totalVMs", len(vms), + "projectQuotas", len(pqList.Items), + "updated", updated, + "skipped", skipped) + + return nil +} + +// ============================================================================ +// Watch-based Reconciliation (PaygUsage-only recompute) +// ============================================================================ + +// Reconcile handles watch-based reconciliation for a single ProjectQuota. +// Triggered by: CR Status.UsedAmount changes or ProjectQuota spec changes. +// It reads the persisted TotalUsage, re-lists CRs, and recomputes PaygUsage. +func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("projectQuota", req.Name, "mode", "payg-recompute") + logger.V(1).Info("reconciling ProjectQuota (PaygUsage recompute)") + + // Fetch the ProjectQuota + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, req.NamespacedName, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + logger.V(1).Info("ProjectQuota not found, likely deleted") + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + projectID := pq.Spec.ProjectID + ctx = reservations.WithRequestID(ctx, projectID) + + // Read persisted TotalUsage (already computed by full reconcile or incremental) + totalUsage := pq.Status.TotalUsage + if totalUsage == nil { + // No TotalUsage yet — full reconcile hasn't run. Skip. + logger.V(1).Info("no TotalUsage persisted yet, skipping PaygUsage recompute") + return ctrl.Result{}, nil + } + + // List CRs for this project (from local cache) + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + return ctrl.Result{}, err + } + crsByProject := groupCRsByProject(crList.Items) + + // Compute CRUsage + crUsage := c.computeCRUsage(crsByProject[projectID]) + + // Derive PaygUsage + paygUsage := derivePaygUsage(totalUsage, crUsage) + + // Write updated PaygUsage with conflict retry (keep TotalUsage unchanged) + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage); err != nil { + logger.Error(err, "failed to update ProjectQuota status") + return ctrl.Result{}, err + } + + // Record metrics + c.recordUsageMetrics(projectID, totalUsage, paygUsage, crUsage) + + logger.V(1).Info("PaygUsage recomputed", "project", projectID) + return ctrl.Result{}, nil +} + +// ============================================================================ +// Incremental Update (HV Instance Diff) +// ============================================================================ + +// usageDelta tracks resource deltas for a single project during incremental reconciliation. 
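+//
+// Increments and decrements are accumulated separately and only merged when
+// applyDeltaAndUpdateStatus folds them into TotalUsage, where decrements are
+// clamped at zero per resource/AZ (see decrementUsage).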
+type usageDelta struct { + // increments[resourceName][az] = amount to add + increments map[string]map[string]int64 + // decrements[resourceName][az] = amount to subtract + decrements map[string]map[string]int64 +} + +func newUsageDelta() *usageDelta { + return &usageDelta{ + increments: make(map[string]map[string]int64), + decrements: make(map[string]map[string]int64), + } +} + +func (d *usageDelta) addIncrement(resourceName, az string, amount int64) { + if d.increments[resourceName] == nil { + d.increments[resourceName] = make(map[string]int64) + } + d.increments[resourceName][az] += amount +} + +func (d *usageDelta) addDecrement(resourceName, az string, amount int64) { + if d.decrements[resourceName] == nil { + d.decrements[resourceName] = make(map[string]int64) + } + d.decrements[resourceName][az] += amount +} + +// ReconcileHVDiff handles incremental updates when HV instance lists change. +// It diffs old vs new instances to delta-update TotalUsage for affected projects. +// Deltas are batched per project and applied in a single status update per project +// to avoid race conditions from multiple updates. +func (c *QuotaController) ReconcileHVDiff(ctx context.Context, oldHV, newHV *hv1.Hypervisor) error { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("hypervisor", newHV.Name, "mode", "incremental") + + // Diff old vs new instances + oldInstances := make(map[string]bool) + for _, inst := range oldHV.Status.Instances { + if inst.Active { + oldInstances[inst.ID] = true + } + } + newInstances := make(map[string]bool) + for _, inst := range newHV.Status.Instances { + if inst.Active { + newInstances[inst.ID] = true + } + } + + // Find added and removed UUIDs + var added, removed []string + for id := range newInstances { + if !oldInstances[id] { + added = append(added, id) + } + } + for id := range oldInstances { + if !newInstances[id] { + removed = append(removed, id) + } + } + + if len(added) == 0 && len(removed) == 0 { + return nil + } + + logger.V(1).Info("HV instance diff detected", "added", len(added), "removed", len(removed)) + + // Get flavor groups for mapping + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups for incremental update") + return err + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Accumulate deltas per project (batched to avoid per-VM persist race) + projectDeltas := make(map[string]*usageDelta) + + // Process added instances + for _, vmUUID := range added { + c.accumulateAddedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Process removed instances + for _, vmUUID := range removed { + c.accumulateRemovedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Apply batched deltas and recompute PaygUsage for affected projects + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CRs for PaygUsage recompute") + return err + } + crsByProject := groupCRsByProject(crList.Items) + + for projectID, delta := range projectDeltas { + if err := c.applyDeltaAndUpdateStatus(ctx, projectID, delta, crsByProject[projectID]); err != nil { + logger.Error(err, "failed to apply delta for project", "project", projectID) + // Continue with other projects + } + } + + return nil +} + +// accumulateAddedVM looks up a VM and accumulates its resource contribution as a 
delta. +// It checks whether the VM is truly new (created after last full reconcile) vs a migration +// (already counted in TotalUsage). Only new VMs get incremented. +func (c *QuotaController) accumulateAddedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + vm, err := c.VMSource.GetVM(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get VM for increment") + return + } + if vm == nil { + return // VM not found in DB, skip + } + + // Check if this VM was already counted in the last full reconcile. + // If the VM was created BEFORE the last full reconcile, it's a migration + // (already in TotalUsage) and we should NOT increment again. + if !c.isVMNewSinceLastReconcile(ctx, vm) { + logger.V(1).Info("VM already counted (created before last reconcile), skipping increment", + "vmCreatedAt", vm.CreatedAt, "project", vm.ProjectID) + return + } + + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + delta := projectDeltas[vm.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[vm.ProjectID] = delta + } + + delta.addIncrement(commitments.ResourceNameRAM(groupName), vm.AvailabilityZone, ramUnits) + delta.addIncrement(commitments.ResourceNameCores(groupName), vm.AvailabilityZone, coresAmount) +} + +// isVMNewSinceLastReconcile checks if a VM was created after the last full reconcile. +// Returns true if the VM is new and should be incrementally added to TotalUsage. +// Returns false if the VM already existed at the last full reconcile (migration, not new). +// +// NOTE: There is a known timing gap -- the postgres servers table is only refreshed every +// N minutes by the datasource poller. A VM that was created shortly BEFORE the last reconcile +// might not have been visible in postgres yet (sync delay), so the full reconcile may have +// missed it. In that case we would also skip the increment here (CreatedAt <= LastReconcileAt) +// and the VM would only be counted on the NEXT full reconcile cycle. This is acceptable for +// now and will be resolved when we move to a CRD-based VM source with real-time events. +func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *failover.VM) bool { + if vm.CreatedAt == "" { + // No creation time available -- be conservative, skip increment. + // The next full reconcile will pick it up. 
+ return false + } + + // Look up the ProjectQuota for this VM's project + crdName := "quota-" + vm.ProjectID + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + // If we can't find the ProjectQuota, skip (full reconcile will handle it) + return false + } + + if pq.Status.LastReconcileAt == nil { + // No full reconcile has run yet -- skip incremental updates + return false + } + + // Parse the VM's creation time and compare with last reconcile + vmCreatedAt, err := time.Parse("2006-01-02T15:04:05Z", vm.CreatedAt) + if err != nil { + // Try alternative format with timezone offset + vmCreatedAt, err = time.Parse(time.RFC3339, vm.CreatedAt) + if err != nil { + // Cannot parse -- be conservative, skip + return false + } + } + + return vmCreatedAt.After(pq.Status.LastReconcileAt.Time) +} + +// accumulateRemovedVM looks up a deleted VM and accumulates its resource contribution as a decrement. +func (c *QuotaController) accumulateRemovedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + // Check if the VM still exists in the servers table (migrated away = still running) + active, err := c.VMSource.IsServerActive(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to check server for decrement") + return + } + if active { + // VM still exists (either ACTIVE on another HV, or in non-ACTIVE state). + // Don't decrement — the full reconcile handles these correctly. + return + } + + // Not found in servers table — check deleted_servers + info, err := c.VMSource.GetDeletedVMInfo(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get deleted VM info for decrement") + return + } + if info == nil { + // Not found anywhere — cannot determine what to decrement + logger.V(1).Info("removed VM not found in servers or deleted_servers") + return + } + + groupName, ok := flavorToGroup[info.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + // Compute commitment units from the resolved flavor resources + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits := int64(info.RAMMiB) / unitSizeMiB //nolint:gosec // safe + coresAmount := int64(info.VCPUs) //nolint:gosec // safe + + delta := projectDeltas[info.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[info.ProjectID] = delta + } + + delta.addDecrement(commitments.ResourceNameRAM(groupName), info.AvailabilityZone, ramUnits) + delta.addDecrement(commitments.ResourceNameCores(groupName), info.AvailabilityZone, coresAmount) +} + +// applyDeltaAndUpdateStatus fetches the ProjectQuota, applies the batched delta to TotalUsage, +// recomputes PaygUsage, and persists with conflict retry. 
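+// The "quota-"+projectID CRD name follows the convention also used by
+// mapCRToProjectQuota and isVMNewSinceLastReconcile.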
+func (c *QuotaController) applyDeltaAndUpdateStatus( + ctx context.Context, + projectID string, + delta *usageDelta, + projectCRs []v1alpha1.CommittedResource, +) error { + + crdName := "quota-" + projectID + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + return nil // PQ deleted, nothing to do + } + return err + } + + if pq.Status.TotalUsage == nil { + pq.Status.TotalUsage = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Apply increments + for resourceName, azAmounts := range delta.increments { + for az, amount := range azAmounts { + incrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Apply decrements + for resourceName, azAmounts := range delta.decrements { + for az, amount := range azAmounts { + decrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Recompute PaygUsage + crUsage := c.computeCRUsage(projectCRs) + paygUsage := derivePaygUsage(pq.Status.TotalUsage, crUsage) + + pq.Status.PaygUsage = paygUsage + now := metav1.Now() + pq.Status.LastReconcileAt = &now + + if err := c.Status().Update(ctx, &pq); err != nil { + return err + } + + c.recordUsageMetrics(projectID, pq.Status.TotalUsage, paygUsage, crUsage) + return nil + }) +} + +// ============================================================================ +// Manager Setup +// ============================================================================ + +// SetupWithManager sets up the watch-based reconciler for PaygUsage recomputes. +func (c *QuotaController) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-controller"). + // Watch ProjectQuota for spec changes (Limes pushes quota) + For(&v1alpha1.ProjectQuota{}). + // Watch CommittedResource for status changes (UsedAmount updates) + Watches( + &v1alpha1.CommittedResource{}, + handler.EnqueueRequestsFromMapFunc(c.mapCRToProjectQuota), + builder.WithPredicates(crUsedAmountChangePredicate()), + ). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(c) +} + +// SetupHVWatcher sets up a separate controller to watch HV CRD changes +// for incremental TotalUsage updates. +func (c *QuotaController) SetupHVWatcher(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-hv-watcher"). + WatchesRawSource(source.Kind( + mgr.GetCache(), + &hv1.Hypervisor{}, + &hvInstanceDiffHandler{controller: c}, + hvInstanceChangePredicate(), + )). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(reconcile.Func(func(_ context.Context, _ ctrl.Request) (ctrl.Result, error) { + // The actual work is done in the event handler + return ctrl.Result{}, nil + })) +} + +// Start implements manager.Runnable for the periodic reconciliation loop. +// It does not block manager startup — the first reconcile fires after a short +// initial delay to allow cache sync. 
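+// A single reset timer is used instead of a ticker so that a slow reconcile
+// delays the next run rather than overlapping with it.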
+func (c *QuotaController) Start(ctx context.Context) error { + log.Info("starting quota controller (periodic)", + "fullReconcileInterval", c.Config.FullReconcileInterval.Duration, + "crStateFilter", c.Config.CRStateFilter) + + // Use a short initial delay to allow cache sync before first reconcile + initialDelay := 5 * time.Second + timer := time.NewTimer(initialDelay) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + log.Info("stopping quota controller") + return nil + case <-timer.C: + if err := c.ReconcilePeriodic(ctx); err != nil { + log.Error(err, "periodic full reconcile failed") + } + timer.Reset(c.Config.FullReconcileInterval.Duration) + } + } +} + +// ============================================================================ +// Internal Helpers +// ============================================================================ + +// computeTotalUsage aggregates VM resources by project/AZ/resource. +// +// The RAM calculation converts server RAM into LIQUID commitment units: +// - Each flavor group has a "smallest flavor" defining the unit size (e.g., 32768 MiB) +// - A VM's RAM usage in units = VM_RAM_MiB / unit_size_MiB +// - Example: a 64 GiB VM in a group with 32 GiB smallest flavor = 2 units +// +// This matches the unit system used by LIQUID for commitment tracking. +// The per-AZ breakdown allows Limes to enforce AZ-level quota limits. +func (c *QuotaController) computeTotalUsage( + vms []failover.VM, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, +) map[string]map[string]v1alpha1.ResourceQuotaUsage { + // result[projectID][resourceName] = ResourceQuotaUsage{PerAZ: {az: amount}} + result := make(map[string]map[string]v1alpha1.ResourceQuotaUsage) + + for _, vm := range vms { + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + continue // Flavor not in any tracked group + } + fg, ok := flavorGroups[groupName] + if !ok { + continue + } + if fg.SmallestFlavor.MemoryMB == 0 { + continue // Invalid group config + } + + ramResourceName := commitments.ResourceNameRAM(groupName) + coresResourceName := commitments.ResourceNameCores(groupName) + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // safe + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + if _, ok := result[vm.ProjectID]; !ok { + result[vm.ProjectID] = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Accumulate RAM usage for this project + AZ + ramUsage := result[vm.ProjectID][ramResourceName] + if ramUsage.PerAZ == nil { + ramUsage.PerAZ = make(map[string]int64) + } + ramUsage.PerAZ[vm.AvailabilityZone] += ramUnits + result[vm.ProjectID][ramResourceName] = ramUsage + + // Accumulate cores usage for this project + AZ + coresUsage := result[vm.ProjectID][coresResourceName] + if coresUsage.PerAZ == nil { + coresUsage.PerAZ = make(map[string]int64) + } + coresUsage.PerAZ[vm.AvailabilityZone] += coresAmount + result[vm.ProjectID][coresResourceName] = coresUsage + } + + return result +} + +// groupCRsByProject groups CommittedResources by project ID for efficient lookup. +func groupCRsByProject(crs []v1alpha1.CommittedResource) map[string][]v1alpha1.CommittedResource { + result := make(map[string][]v1alpha1.CommittedResource) + for i := range crs { + projectID := crs[i].Spec.ProjectID + result[projectID] = append(result[projectID], crs[i]) + } + return result +} + +// computeCRUsage computes the committed resource usage from a pre-filtered slice of CRs for one project. 
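+//
+// Worked example (mirrors TestComputeCRUsage): confirmed and guaranteed memory
+// CRs of 5 and 3 units in az-1 for flavor group "hana_v2" yield
+// result["hw_version_hana_v2_ram"].PerAZ["az-1"] == 8; pending CRs are dropped
+// by the configured state filter.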
+func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource) map[string]v1alpha1.ResourceQuotaUsage { + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for i := range crs { + cr := &crs[i] + + // Filter: only matching states + if !c.isCRStateIncluded(cr.Spec.State) { + continue + } + + // Get UsedAmount from status + if cr.Status.UsedAmount == nil { + continue + } + usedAmount := cr.Status.UsedAmount.Value() + if usedAmount <= 0 { + continue + } + + // Map ResourceType to resource name + var resourceName string + switch cr.Spec.ResourceType { + case v1alpha1.CommittedResourceTypeMemory: + resourceName = commitments.ResourceNameRAM(cr.Spec.FlavorGroupName) + case v1alpha1.CommittedResourceTypeCores: + resourceName = commitments.ResourceNameCores(cr.Spec.FlavorGroupName) + default: + continue + } + + // Accumulate per AZ + usage := result[resourceName] + if usage.PerAZ == nil { + usage.PerAZ = make(map[string]int64) + } + usage.PerAZ[cr.Spec.AvailabilityZone] += usedAmount + result[resourceName] = usage + } + + return result +} + +// isCRStateIncluded checks if a commitment state is in the configured filter. +func (c *QuotaController) isCRStateIncluded(state v1alpha1.CommitmentStatus) bool { + for _, s := range c.Config.CRStateFilter { + if s == state { + return true + } + } + return false +} + +// derivePaygUsage computes PaygUsage = TotalUsage - CRUsage (clamped >= 0). +func derivePaygUsage( + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + crUsage map[string]v1alpha1.ResourceQuotaUsage, +) map[string]v1alpha1.ResourceQuotaUsage { + + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for resourceName, total := range totalUsage { + payg := v1alpha1.ResourceQuotaUsage{ + PerAZ: make(map[string]int64), + } + for az, totalAmount := range total.PerAZ { + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + if azAmount, ok := cr.PerAZ[az]; ok { + crAmount = azAmount + } + } + paygAmount := totalAmount - crAmount + if paygAmount < 0 { + paygAmount = 0 // Clamp >= 0 + } + payg.PerAZ[az] = paygAmount + } + result[resourceName] = payg + } + + return result +} + +// updateProjectQuotaStatusWithRetry writes TotalUsage + PaygUsage + LastReconcileAt +// with retry-on-conflict to handle concurrent updates. +func (c *QuotaController) updateProjectQuotaStatusWithRetry( + ctx context.Context, + pqName string, + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + paygUsage map[string]v1alpha1.ResourceQuotaUsage, +) error { + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: pqName}, &pq); err != nil { + return err + } + + pq.Status.TotalUsage = totalUsage + pq.Status.PaygUsage = paygUsage + now := metav1.Now() + pq.Status.LastReconcileAt = &now + + return c.Status().Update(ctx, &pq) + }) +} + +// vmResourceUnits computes RAM commitment units and cores from a VM's resources. +// RAM is converted from bytes (resource.Quantity) to MiB, then divided by unitSizeMiB +// (the smallest flavor's memory in MiB for the flavor group) to get commitment units. 
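+//
+// Worked example: a 64 GiB VM (68719476736 bytes) in a group whose smallest
+// flavor has 32768 MiB memory yields ramUnits = 65536/32768 = 2; cores is read
+// directly from "vcpus". The integer division floors partial units.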
+func vmResourceUnits(resources map[string]resource.Quantity, unitSizeMiB int64) (ramUnits, cores int64) { + memQty := resources["memory"] + serverRAMMiB := memQty.Value() / (1024 * 1024) // bytes to MiB + ramUnits = serverRAMMiB / unitSizeMiB // commitment units + vcpuQty := resources["vcpus"] + cores = vcpuQty.Value() + return ramUnits, cores +} + +// buildFlavorToGroupMap builds a flavorName → flavorGroupName lookup from flavor groups. +func buildFlavorToGroupMap(flavorGroups map[string]compute.FlavorGroupFeature) map[string]string { + result := make(map[string]string) + for groupName, group := range flavorGroups { + for _, flavor := range group.Flavors { + result[flavor.Name] = groupName + } + } + return result +} + +// incrementUsage increments a usage value in the map. +func incrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + u.PerAZ = make(map[string]int64) + } + u.PerAZ[az] += amount + usage[resourceName] = u +} + +// decrementUsage decrements a usage value in the map (clamp >= 0). +func decrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + return + } + u.PerAZ[az] -= amount + if u.PerAZ[az] < 0 { + u.PerAZ[az] = 0 + } + usage[resourceName] = u +} + +// recordUsageMetrics emits Prometheus metrics for all resources in a project. +func (c *QuotaController) recordUsageMetrics( + projectID string, + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + paygUsage map[string]v1alpha1.ResourceQuotaUsage, + crUsage map[string]v1alpha1.ResourceQuotaUsage, +) { + + for resourceName, total := range totalUsage { + for az, totalAmount := range total.PerAZ { + paygAmount := int64(0) + if payg, ok := paygUsage[resourceName]; ok { + paygAmount = payg.PerAZ[az] + } + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + crAmount = cr.PerAZ[az] + } + c.Metrics.RecordUsage(projectID, az, resourceName, totalAmount, paygAmount, crAmount) + } + } +} + +// ============================================================================ +// Predicates & Event Handlers +// ============================================================================ + +// mapCRToProjectQuota maps a CommittedResource change to the affected ProjectQuota reconcile request. +func (c *QuotaController) mapCRToProjectQuota(_ context.Context, obj client.Object) []reconcile.Request { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + return nil + } + // Map to the ProjectQuota for this project + crdName := "quota-" + cr.Spec.ProjectID + return []reconcile.Request{ + {NamespacedName: client.ObjectKey{Name: crdName}}, + } +} + +// crUsedAmountChangePredicate triggers only when Status.UsedAmount changes on a CommittedResource. 
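+// Creates are ignored (the periodic full reconcile covers new CRs); deletes do
+// trigger a recompute, since removing a commitment changes the derived PaygUsage.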
+func crUsedAmountChangePredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(_ event.CreateEvent) bool { return false }, + UpdateFunc: func(e event.UpdateEvent) bool { + oldCR, ok1 := e.ObjectOld.(*v1alpha1.CommittedResource) + newCR, ok2 := e.ObjectNew.(*v1alpha1.CommittedResource) + if !ok1 || !ok2 { + return false + } + // Trigger if UsedAmount changed + oldUsed := "" + newUsed := "" + if oldCR.Status.UsedAmount != nil { + oldUsed = oldCR.Status.UsedAmount.String() + } + if newCR.Status.UsedAmount != nil { + newUsed = newCR.Status.UsedAmount.String() + } + return oldUsed != newUsed + }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return false }, + } +} + +// hvInstanceChangePredicate always returns true for updates. +// ReconcileHVDiff performs its own set-diff and exits early if there are no +// actual additions/removals. This ensures instance swaps (same count, different IDs) +// are not missed. +func hvInstanceChangePredicate() predicate.TypedPredicate[*hv1.Hypervisor] { + return predicate.TypedFuncs[*hv1.Hypervisor]{ + CreateFunc: func(_ event.TypedCreateEvent[*hv1.Hypervisor]) bool { return true }, + UpdateFunc: func(_ event.TypedUpdateEvent[*hv1.Hypervisor]) bool { + return true + }, + DeleteFunc: func(_ event.TypedDeleteEvent[*hv1.Hypervisor]) bool { return true }, + GenericFunc: func(_ event.TypedGenericEvent[*hv1.Hypervisor]) bool { return false }, + } +} + +// hvInstanceDiffHandler handles HV instance diff events by calling ReconcileHVDiff. +type hvInstanceDiffHandler struct { + controller *QuotaController +} + +func (h *hvInstanceDiffHandler) Create(_ context.Context, _ event.TypedCreateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // On create, no diff needed (full reconcile will catch up) +} + +func (h *hvInstanceDiffHandler) Update(ctx context.Context, e event.TypedUpdateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + if err := h.controller.ReconcileHVDiff(ctx, e.ObjectOld, e.ObjectNew); err != nil { + log.Error(err, "failed to process HV instance diff", "hypervisor", e.ObjectNew.Name) + } +} + +func (h *hvInstanceDiffHandler) Delete(_ context.Context, _ event.TypedDeleteEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // On delete, full reconcile will correct +} + +func (h *hvInstanceDiffHandler) Generic(_ context.Context, _ event.TypedGenericEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // No-op +} diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go new file mode 100644 index 000000000..4005af326 --- /dev/null +++ b/internal/scheduling/reservations/quota/controller_test.go @@ -0,0 +1,598 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestComputeTotalUsage(t *testing.T) { + ctrl := &QuotaController{Config: 
DefaultQuotaControllerConfig()} + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small", MemoryMB: 32768}, + {Name: "m1.hana_v2.large", MemoryMB: 65536}, + }, + }, + "general": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 4096}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small", MemoryMB: 4096}, + }, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + vms := []failover.VM{ + { + UUID: "vm-1", + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB in bytes + "vcpus": resource.MustParse("8"), + }, + }, + { + UUID: "vm-2", + FlavorName: "m1.hana_v2.large", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("68719476736"), // 65536 MiB in bytes + "vcpus": resource.MustParse("16"), + }, + }, + { + UUID: "vm-3", + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-2", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + { + UUID: "vm-4", + FlavorName: "m1.general.small", + ProjectID: "project-b", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB in bytes + "vcpus": resource.MustParse("2"), + }, + }, + { + UUID: "vm-5", + FlavorName: "unknown-flavor", + ProjectID: "project-c", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, + } + + result := ctrl.computeTotalUsage(vms, flavorToGroup, flavorGroups) + + // project-a: hana_v2 in az-1: 32768+65536 = 98304 MiB / 32768 = 3 units RAM, 8+16=24 cores + // project-a: hana_v2 in az-2: 32768 MiB / 32768 = 1 unit RAM, 8 cores + projectA := result["project-a"] + if projectA == nil { + t.Fatal("expected project-a in results") + } + + ramUsage := projectA["hw_version_hana_v2_ram"] + if ramUsage.PerAZ["az-1"] != 3 { + t.Errorf("expected project-a az-1 hana_v2_ram = 3, got %d", ramUsage.PerAZ["az-1"]) + } + if ramUsage.PerAZ["az-2"] != 1 { + t.Errorf("expected project-a az-2 hana_v2_ram = 1, got %d", ramUsage.PerAZ["az-2"]) + } + + coresUsage := projectA["hw_version_hana_v2_cores"] + if coresUsage.PerAZ["az-1"] != 24 { + t.Errorf("expected project-a az-1 hana_v2_cores = 24, got %d", coresUsage.PerAZ["az-1"]) + } + if coresUsage.PerAZ["az-2"] != 8 { + t.Errorf("expected project-a az-2 hana_v2_cores = 8, got %d", coresUsage.PerAZ["az-2"]) + } + + // project-b: general in az-1: 4096/4096=1 unit RAM, 2 cores + projectB := result["project-b"] + if projectB == nil { + t.Fatal("expected project-b in results") + } + if projectB["hw_version_general_ram"].PerAZ["az-1"] != 1 { + t.Errorf("expected project-b az-1 general_ram = 1, got %d", projectB["hw_version_general_ram"].PerAZ["az-1"]) + } + if projectB["hw_version_general_cores"].PerAZ["az-1"] != 2 { + t.Errorf("expected project-b az-1 general_cores = 2, got %d", projectB["hw_version_general_cores"].PerAZ["az-1"]) + } + + // project-c: unknown flavor → not in results + if _, exists := result["project-c"]; exists { + t.Error("expected project-c to NOT be in results (unknown flavor)") + } +} + +func TestComputeCRUsage(t *testing.T) { + 
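+	// Covers the state filter (pending excluded) and relies on
+	// groupCRsByProject to drop the project-b CR before the call.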
ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + usedAmount5 := resource.MustParse("5") + usedAmount3 := resource.MustParse("3") + usedAmount2 := resource.MustParse("2") + + allCRs := []v1alpha1.CommittedResource{ + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedAmount: &usedAmount5, + }, + }, + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusGuaranteed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedAmount: &usedAmount3, + }, + }, + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeCores, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedAmount: &usedAmount2, + }, + }, + // Different project — should be excluded by groupCRsByProject + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-b", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedAmount: &usedAmount5, + }, + }, + // Pending state — should be excluded by state filter + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-2", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusPending, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedAmount: &usedAmount2, + }, + }, + } + + // Pre-group and pass only project-a's CRs + crsByProject := groupCRsByProject(allCRs) + result := ctrl.computeCRUsage(crsByProject["project-a"]) + + // Should include confirmed + guaranteed for project-a only + ramUsage := result["hw_version_hana_v2_ram"] + if ramUsage.PerAZ["az-1"] != 8 { // 5 + 3 + t.Errorf("expected cr ram usage az-1 = 8, got %d", ramUsage.PerAZ["az-1"]) + } + + coresUsage := result["hw_version_hana_v2_cores"] + if coresUsage.PerAZ["az-1"] != 2 { + t.Errorf("expected cr cores usage az-1 = 2, got %d", coresUsage.PerAZ["az-1"]) + } + + // az-2 should NOT be included (pending state) + if ramUsage.PerAZ["az-2"] != 0 { + t.Errorf("expected cr ram usage az-2 = 0 (pending excluded), got %d", ramUsage.PerAZ["az-2"]) + } +} + +func TestDerivePaygUsage(t *testing.T) { + tests := []struct { + name string + totalUsage map[string]v1alpha1.ResourceQuotaUsage + crUsage map[string]v1alpha1.ResourceQuotaUsage + expected map[string]map[string]int64 // resourceName -> az -> amount + }{ + { + name: "basic subtraction", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10, "az-2": 5}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3}}, + }, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 7, "az-2": 5}, + }, + }, + { + name: "clamp to zero", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + 
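+				// CR usage (10) exceeds total usage (2); payg clamps at 0.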
"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10}}, + }, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 0}, + }, + }, + { + name: "no CR usage", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{}, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 5}, + }, + }, + { + name: "empty total usage", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{}, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + }, + expected: map[string]map[string]int64{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := derivePaygUsage(tt.totalUsage, tt.crUsage) + + for resourceName, expectedAZ := range tt.expected { + resUsage, ok := result[resourceName] + if !ok { + t.Errorf("expected resource %s in result", resourceName) + continue + } + for az, expectedAmount := range expectedAZ { + if resUsage.PerAZ[az] != expectedAmount { + t.Errorf("resource=%s az=%s: expected %d, got %d", + resourceName, az, expectedAmount, resUsage.PerAZ[az]) + } + } + } + + // Check no extra resources in result + for resourceName := range result { + if _, ok := tt.expected[resourceName]; !ok { + t.Errorf("unexpected resource %s in result", resourceName) + } + } + }) + } +} + +func TestBuildFlavorToGroupMap(t *testing.T) { + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small"}, + {Name: "m1.hana_v2.large"}, + }, + }, + "general": { + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small"}, + }, + }, + } + + result := buildFlavorToGroupMap(flavorGroups) + + if result["m1.hana_v2.small"] != "hana_v2" { + t.Errorf("expected hana_v2 for m1.hana_v2.small, got %s", result["m1.hana_v2.small"]) + } + if result["m1.hana_v2.large"] != "hana_v2" { + t.Errorf("expected hana_v2 for m1.hana_v2.large, got %s", result["m1.hana_v2.large"]) + } + if result["m1.general.small"] != "general" { + t.Errorf("expected general for m1.general.small, got %s", result["m1.general.small"]) + } + if _, exists := result["unknown"]; exists { + t.Error("expected unknown flavor not to be in map") + } +} + +func TestIncrementDecrementUsage(t *testing.T) { + usage := make(map[string]v1alpha1.ResourceQuotaUsage) + + // Increment from empty + incrementUsage(usage, "res1", "az-1", 5) + if usage["res1"].PerAZ["az-1"] != 5 { + t.Errorf("expected 5 after increment, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Increment again + incrementUsage(usage, "res1", "az-1", 3) + if usage["res1"].PerAZ["az-1"] != 8 { + t.Errorf("expected 8 after second increment, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement + decrementUsage(usage, "res1", "az-1", 2) + if usage["res1"].PerAZ["az-1"] != 6 { + t.Errorf("expected 6 after decrement, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement below zero → clamp to 0 + decrementUsage(usage, "res1", "az-1", 100) + if usage["res1"].PerAZ["az-1"] != 0 { + t.Errorf("expected 0 after over-decrement, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement non-existent resource (no-op) + decrementUsage(usage, "res2", "az-1", 5) + // Should not panic, and res2 should not exist + if _, exists := usage["res2"]; exists { + if usage["res2"].PerAZ != nil { + t.Error("expected res2 to not have PerAZ after decrement on non-existent") + } + } +} + +func 
TestIsCRStateIncluded(t *testing.T) { + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusConfirmed) { + t.Error("expected confirmed to be included") + } + if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusGuaranteed) { + t.Error("expected guaranteed to be included") + } + if ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusPending) { + t.Error("expected pending to NOT be included") + } +} + +func TestGroupCRsByProject(t *testing.T) { + crs := []v1alpha1.CommittedResource{ + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p2"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p3"}}, + } + + grouped := groupCRsByProject(crs) + + if len(grouped["p1"]) != 2 { + t.Errorf("expected 2 CRs for p1, got %d", len(grouped["p1"])) + } + if len(grouped["p2"]) != 1 { + t.Errorf("expected 1 CR for p2, got %d", len(grouped["p2"])) + } + if len(grouped["p3"]) != 1 { + t.Errorf("expected 1 CR for p3, got %d", len(grouped["p3"])) + } + if len(grouped["nonexistent"]) != 0 { + t.Error("expected 0 CRs for nonexistent project") + } +} + +func TestUsageDelta(t *testing.T) { + delta := newUsageDelta() + + delta.addIncrement("res1", "az-1", 5) + delta.addIncrement("res1", "az-1", 3) + delta.addIncrement("res1", "az-2", 2) + delta.addDecrement("res1", "az-1", 1) + + if delta.increments["res1"]["az-1"] != 8 { + t.Errorf("expected increment res1/az-1 = 8, got %d", delta.increments["res1"]["az-1"]) + } + if delta.increments["res1"]["az-2"] != 2 { + t.Errorf("expected increment res1/az-2 = 2, got %d", delta.increments["res1"]["az-2"]) + } + if delta.decrements["res1"]["az-1"] != 1 { + t.Errorf("expected decrement res1/az-1 = 1, got %d", delta.decrements["res1"]["az-1"]) + } +} + +func TestReconcile_NilTotalUsage(t *testing.T) { + // When TotalUsage is nil, Reconcile should skip and return no error. + // This validates the early-return branch logic used in Reconcile(). 
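+	// The helpers are exercised directly; no fake client is needed here.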
+ ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + // computeCRUsage on nil slice should return empty map (no panic) + result := ctrl.computeCRUsage(nil) + if len(result) != 0 { + t.Errorf("expected empty result for nil CRs, got %d entries", len(result)) + } + + // derivePaygUsage on nil totalUsage should return empty map + payg := derivePaygUsage(nil, result) + if len(payg) != 0 { + t.Errorf("expected empty payg for nil totalUsage, got %d entries", len(payg)) + } +} + +func TestAccumulateAddedVM_UnknownFlavor(t *testing.T) { + // Verifies that accumulateAddedVM gracefully handles a VM with an unknown flavor + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}}, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + projectDeltas := make(map[string]*usageDelta) + + // Use a mock VMSource that returns a VM with unknown flavor + ctrl.VMSource = &mockVMSource{ + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + return &failover.VM{ + UUID: vmUUID, + FlavorName: "unknown-flavor", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, nil + }, + } + + ctrl.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas) + + // Should not have added any delta (unknown flavor) + if len(projectDeltas) != 0 { + t.Errorf("expected no deltas for unknown flavor, got %d projects", len(projectDeltas)) + } +} + +func TestAccumulateAddedVM_KnownFlavor(t *testing.T) { + // Set up a fake client with a ProjectQuota that has LastReconcileAt in the past. + // The VM's CreatedAt must be AFTER LastReconcileAt for it to be considered new. + lastReconcile := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)) + vmCreatedAt := "2026-01-02T00:00:00Z" // After lastReconcile + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a"}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: "project-a"}, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: &lastReconcile, + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(pq). + WithStatusSubresource(&v1alpha1.ProjectQuota{}). 
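+		// Status subresource registration mirrors the real API server setup;
+		// assumed necessary if tests later drive Status().Update on the fake.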
+ Build() + + qc := &QuotaController{ + Client: k8sClient, + Config: DefaultQuotaControllerConfig(), + } + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}}, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + projectDeltas := make(map[string]*usageDelta) + + qc.VMSource = &mockVMSource{ + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + return &failover.VM{ + UUID: vmUUID, + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-1", + CreatedAt: vmCreatedAt, + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, nil + }, + } + + qc.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas) + + delta, ok := projectDeltas["project-a"] + if !ok { + t.Fatal("expected delta for project-a") + } + + // 32768 MiB / 32768 = 1 unit RAM + if delta.increments["hw_version_hana_v2_ram"]["az-1"] != 1 { + t.Errorf("expected ram increment = 1, got %d", delta.increments["hw_version_hana_v2_ram"]["az-1"]) + } + if delta.increments["hw_version_hana_v2_cores"]["az-1"] != 8 { + t.Errorf("expected cores increment = 8, got %d", delta.increments["hw_version_hana_v2_cores"]["az-1"]) + } +} + +// mockVMSource is a test helper for VMSource. +type mockVMSource struct { + listVMs func(ctx context.Context) ([]failover.VM, error) + getVM func(ctx context.Context, vmUUID string) (*failover.VM, error) + isServerActive func(ctx context.Context, vmUUID string) (bool, error) + getDeletedVM func(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error) +} + +func (m *mockVMSource) ListVMs(ctx context.Context) ([]failover.VM, error) { + if m.listVMs != nil { + return m.listVMs(ctx) + } + return nil, nil +} + +func (m *mockVMSource) GetVM(ctx context.Context, vmUUID string) (*failover.VM, error) { + if m.getVM != nil { + return m.getVM(ctx, vmUUID) + } + return nil, nil +} + +func (m *mockVMSource) ListVMsOnHypervisors(_ context.Context, _ *hv1.HypervisorList, _ bool) ([]failover.VM, error) { + return nil, nil +} + +func (m *mockVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) { + if m.isServerActive != nil { + return m.isServerActive(ctx, vmUUID) + } + return false, nil +} + +func (m *mockVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error) { + if m.getDeletedVM != nil { + return m.getDeletedVM(ctx, vmUUID) + } + return nil, nil +} diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go new file mode 100644 index 000000000..36977203e --- /dev/null +++ b/internal/scheduling/reservations/quota/integration_test.go @@ -0,0 +1,1232 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + 
"sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// ============================================================================ +// Integration Tests +// ============================================================================ + +func TestIntegration(t *testing.T) { + lastReconcileTime := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)) + + tests := []IntegrationTestCase{ + { + Name: "full reconcile - basic usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // project-a: hana_v2 az-1: (32768+65536)/32768 = 3 RAM units, 8+16=24 cores + // project-a: hana_v2 az-2: 32768/32768 = 1 RAM unit, 8 cores + // project-a: general az-1: 4096/4096 = 1 RAM unit, 2 cores + // project-b: general az-1: 4096/4096 = 1 RAM unit, 2 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + // No CRs -> PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile - with CRs reduces PaygUsage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // 2 units of hana_v2 RAM committed in az-1 for project-a + makeCR("cr-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(2)), + // 10 cores committed in az-1 for project-a + makeCR("cr-2", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeCores, v1alpha1.CommitmentStatusConfirmed, int64Ptr(10)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + // PaygUsage = TotalUsage - CRUsage + // hana_v2 RAM: 3-2=1 in az-1, 1-0=1 in az-2 + // hana_v2 Cores: 24-10=14 in az-1, 8-0=8 in az-2 + // general: no CRs so PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 1, "az-2": 1}}, + 
"hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 14, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - new VM after last reconcile", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile to establish baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a NEW VM (created after last reconcile) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new"), // new instance + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // far future, always AFTER last reconcile + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-new is created AFTER last reconcile, so it gets incremented + // +1 RAM unit (32768/32768), +8 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - migration skipped (VM created before last reconcile)", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds vm-1 (which was created BEFORE last reconcile = migration) + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{}), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-1"), // migrated here, created before reconcile + }), + // Should NOT increment -- vm-1 CreatedAt is 2025-12-01 which is before reconcile time + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": 
{PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - deleted VM decrements usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + // vm-del is not in VMs (deleted), but has info in DeletedVMs + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // not active (truly deleted) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile (vm-del not in VMs so not counted) + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff removes vm-del (was on HV before, now gone) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-del"), // was here + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + // vm-del gone + }), + // vm-del: IsServerActive=false, deleted info found + // Decrement: -1 RAM unit, -8 cores in az-1 + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 16, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - migrated VM not decremented", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ActiveVMs: map[string]bool{ + "vm-1": true, // still active (migrated to another HV) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // HV reports vm-1 removed (migrated away) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + // vm-1 gone from this HV + }), + // vm-1: IsServerActive=true, so NOT decremented + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "CR update triggers PaygUsage recompute", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: 
[]*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + makeCR("cr-ram-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(1)), + }, + Actions: []TestAction{ + // Step 1: full reconcile with initial CR (UsedAmount=1) + { + Type: "full_reconcile", + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, // 3-1=2 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: CR UsedAmount increases to 3 -> PaygUsage should drop + { + Type: "cr_update", + CRName: "cr-ram-1", + UsedAmount: 3, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 0, "az-2": 1}}, // 3-3=0 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "unknown flavor VMs are skipped", + FlavorGroups: testFlavorGroups, + VMs: []failover.VM{ + { + UUID: "vm-unknown", FlavorName: "nonexistent-flavor", + ProjectID: "project-x", AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-x", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // No usage for project-x (unknown flavor skipped) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + }, + }, + }, + { + Name: "multiple full reconciles are idempotent", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Second full reconcile - same result + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "pending CRs are 
excluded from PaygUsage deduction", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // Pending CR should NOT reduce PaygUsage + makeCR("cr-pending", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusPending, int64Ptr(5)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // PaygUsage == TotalUsage because pending CRs are excluded + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile corrects incremental drift", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes correct baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a short-lived "phantom" VM (created after reconcile, + // but deleted before the next full reconcile runs). The incremental path + // bumps TotalUsage by +1 RAM / +8 cores. + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-phantom"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // after last reconcile + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // TotalUsage now has phantom's contribution (drift) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // 24+8 drift + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: full reconcile re-scans all VMs. Reset VM list to baseline + // (vm-phantom is gone). This corrects the drift back to the ground truth. 
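+			// Correction math, as a sketch (one hana_v2 RAM unit = 32768 MiB
+			// per testFlavorGroups):
+			//   drifted totals after step 2:  az-1 RAM = 4, az-1 cores = 32
+			//   ground truth from the rescan: az-1 RAM = 3, az-1 cores = 24
+			// The full reconcile overwrites the drifted totals with the
+			// rescanned values, which is what makes it self-correcting.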
+ { + Type: "full_reconcile", + OverrideVMs: baseVMsPtr(), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // corrected + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "complex multi-project scenario with adds, removes, and reconcile corrections", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // truly deleted + "vm-1": true, // still active (for migration scenario) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes baseline for both projects + // project-a hana_v2: az-1=3 RAM / 24 cores, az-2=1 RAM / 8 cores; general: az-1=1 RAM / 2 cores + // project-b general: az-1=1 RAM / 2 cores + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a genuine new VM to project-a (hana_v2 small, az-1) + // +1 RAM unit, +8 cores + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: HV diff adds a phantom VM to project-b (general, az-1) + // This is a short-lived VM that will disappear -- DRIFT for project-b + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + }), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + activeInstance("vm-phantom-b"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom-b", FlavorName: "m1.general.small", + ProjectID: "project-b", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + 
"memory": resource.MustParse("4294967296"), // 4096 MiB = 1 RAM unit + "vcpus": resource.MustParse("2"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 2}}, // 1+1 drift + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 4}}, // 2+2 drift + }, + }, + }, + // Step 4: HV diff removes vm-del from project-a (truly deleted) + // -1 RAM unit, -8 cores in az-1 + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + activeInstance("vm-del"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // 32-8=24 + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 5: full reconcile with OverrideVMs that includes vm-new-a + // (vm-new-a is now "real" and appears in the VM list). + // This reconcile: + // - project-a: FIXES drift -- truth is 4 (vm-new-a in list), delta said 3 + // - project-b: FIXES drift -- truth is 1, delta said 2 (phantom gone) + { + Type: "full_reconcile", + OverrideVMs: &[]failover.VM{ + // testVMs + vm-new-a + testVMs[0], testVMs[1], testVMs[2], testVMs[3], testVMs[4], + { + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + }, + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // corrected up + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // corrected up + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, // corrected down + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, // corrected down + }, + }, + }, + // Step 6: another HV diff removes vm-1 from a HV (migration, not deletion). + // vm-1 is still active (ActiveVMs["vm-1"]=true), so NOT decremented. 
+ { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-1 migrated, NOT decremented -- totals unchanged + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 7: final full reconcile confirms everything matches (no drift). + // This is the "reconcile that matches the deltas" -- nothing to fix. + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + } + + _ = lastReconcileTime // referenced by test data (VM CreatedAt values) + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + env := newIntegrationTestEnv(t, tc) + + for i, action := range tc.Actions { + t.Logf(" action %d: %s", i+1, action.Type) + env.executeAction(action) + } + }) + } +} + +// ============================================================================ +// Test Data +// ============================================================================ + +var testFlavorGroups = map[string]compute.FlavorGroupFeature{ + "hana_v2": { + Name: "hana_v2", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + {Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + }, + }, + "general": { + Name: "general", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + }, + }, +} + +// Standard VM set for most tests. +// project-a has VMs in BOTH flavor groups (hana_v2 and general). +// project-b has only general VMs. 
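+// Per-project baseline these VMs produce (RAM units divide the VM's MiB by
+// the group's smallest-flavor MemoryMB, per the step comments above):
+//   project-a hana_v2: az-1 (32768+65536)/32768 = 3 RAM units, 8+16 = 24 cores
+//                      az-2 32768/32768 = 1 RAM unit, 8 cores
+//   project-a general: az-1 4096/4096 = 1 RAM unit, 2 cores
+//   project-b general: az-1 4096/4096 = 1 RAM unit, 2 cores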
+var testVMs = []failover.VM{ + // vm-1: hana_v2, 1 RAM unit (32768/32768), 8 cores + { + UUID: "vm-1", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, + // vm-2: hana_v2, 2 RAM units (65536/32768), 16 cores + { + UUID: "vm-2", FlavorName: "m1.hana_v2.large", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("68719476736"), // 65536 MiB + "vcpus": resource.MustParse("16"), + }, + }, + // vm-3: hana_v2, 1 RAM unit (32768/32768), 8 cores + { + UUID: "vm-3", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-2", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, + // vm-4: general, 1 RAM unit (4096/4096), 2 cores + { + UUID: "vm-4", FlavorName: "m1.general.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB + "vcpus": resource.MustParse("2"), + }, + }, + // vm-5: general, 1 RAM unit (4096/4096), 2 cores + { + UUID: "vm-5", FlavorName: "m1.general.small", + ProjectID: "project-b", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB + "vcpus": resource.MustParse("2"), + }, + }, +} + +// ============================================================================ +// Integration Test Framework +// ============================================================================ + +// TestAction defines a single step in an integration test scenario. +type TestAction struct { + // Type of action to perform. + // "full_reconcile" - run ReconcilePeriodic + // "hv_diff" - run ReconcileHVDiff with OldHV/NewHV + // "cr_update" - update a CR's UsedAmount, then run Reconcile (watch-triggered) + Type string + + // For hv_diff actions: + OldHV *hv1.Hypervisor + NewHV *hv1.Hypervisor + + // OverrideVMs, when non-nil, replaces the VMSource (ListVMs + GetVM) for + // THIS action and all subsequent actions. Use to simulate VMs appearing or + // disappearing between steps. To "undo" a temporary VM, set OverrideVMs + // again in a later action without that VM. + OverrideVMs *[]failover.VM + + // For cr_update actions: + CRName string + UsedAmount int64 + + // Optional: verify state AFTER this action completes. + // Keys are project IDs. If nil, no verification for this step. + ExpectedTotalUsage map[string]map[string]v1alpha1.ResourceQuotaUsage + ExpectedPaygUsage map[string]map[string]v1alpha1.ResourceQuotaUsage +} + +// IntegrationTestCase defines a complete integration test scenario. 
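+// A case seeds the fake client (Knowledge, ProjectQuotas, CommittedResources)
+// and the mock VMSource once, then drives the controller through Actions in
+// order, verifying the optional usage snapshots after each step.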
+type IntegrationTestCase struct { + Name string + + // Initial state seeded into the fake client and mock VMSource + VMs []failover.VM + DeletedVMs map[string]*failover.DeletedVMInfo // UUID -> deleted VM info + ActiveVMs map[string]bool // UUID -> IsServerActive response + + FlavorGroups map[string]compute.FlavorGroupFeature + ProjectQuotas []*v1alpha1.ProjectQuota + CommittedResources []*v1alpha1.CommittedResource + + // Ordered actions with per-step verification + Actions []TestAction +} + +// integrationTestEnv holds the test environment for a single test case. +type integrationTestEnv struct { + t *testing.T + client client.Client + controller *QuotaController + vmSource *mockVMSource +} + +func newIntegrationTestEnv(t *testing.T, tc IntegrationTestCase) *integrationTestEnv { + t.Helper() + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add v1alpha1 to scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + + // Build initial objects list + var objects []client.Object + + // Create Knowledge CRD with flavor groups + knowledgeCRD := buildKnowledgeCRD(t, tc.FlavorGroups) + objects = append(objects, knowledgeCRD) + + // Add ProjectQuotas + for _, pq := range tc.ProjectQuotas { + objects = append(objects, pq) + } + + // Add CommittedResources + for _, cr := range tc.CommittedResources { + objects = append(objects, cr) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource( + &v1alpha1.ProjectQuota{}, + &v1alpha1.CommittedResource{}, + &v1alpha1.Knowledge{}, + ). + Build() + + // Build mock VMSource + vmSrc := &mockVMSource{ + listVMs: func(_ context.Context) ([]failover.VM, error) { + return tc.VMs, nil + }, + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range tc.VMs { + if tc.VMs[i].UUID == vmUUID { + return &tc.VMs[i], nil + } + } + return nil, nil + }, + isServerActive: func(_ context.Context, vmUUID string) (bool, error) { + if tc.ActiveVMs != nil { + if active, ok := tc.ActiveVMs[vmUUID]; ok { + return active, nil + } + } + return false, nil + }, + getDeletedVM: func(_ context.Context, vmUUID string) (*failover.DeletedVMInfo, error) { + if tc.DeletedVMs != nil { + if info, ok := tc.DeletedVMs[vmUUID]; ok { + return info, nil + } + } + return nil, nil + }, + } + + controller := &QuotaController{ + Client: k8sClient, + VMSource: vmSrc, + Config: DefaultQuotaControllerConfig(), + Metrics: NewQuotaMetrics(nil), // no-op metrics + } + + return &integrationTestEnv{ + t: t, + client: k8sClient, + controller: controller, + vmSource: vmSrc, + } +} + +func (env *integrationTestEnv) verifyTotalUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.TotalUsage == nil { + return // both nil, ok + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.TotalUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected TotalUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: 
TotalUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + // Check no unexpected resources + for resourceName := range pq.Status.TotalUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected TotalUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) verifyPaygUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.PaygUsage == nil { + return + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.PaygUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected PaygUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: PaygUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + for resourceName := range pq.Status.PaygUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected PaygUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) executeAction(action TestAction) { + env.t.Helper() + ctx := context.Background() + + // Apply OverrideVMs if set (persists for all subsequent actions) + if action.OverrideVMs != nil { + vms := *action.OverrideVMs + env.vmSource.listVMs = func(_ context.Context) ([]failover.VM, error) { + return vms, nil + } + env.vmSource.getVM = func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range vms { + if vms[i].UUID == vmUUID { + return &vms[i], nil + } + } + return nil, nil + } + } + + switch action.Type { + case "full_reconcile": + if err := env.controller.ReconcilePeriodic(ctx); err != nil { + env.t.Fatalf("ReconcilePeriodic failed: %v", err) + } + + case "hv_diff": + if err := env.controller.ReconcileHVDiff(ctx, action.OldHV, action.NewHV); err != nil { + env.t.Fatalf("ReconcileHVDiff failed: %v", err) + } + + case "cr_update": + // Fetch the CR, update UsedAmount, then call Reconcile + var cr v1alpha1.CommittedResource + if err := env.client.Get(ctx, client.ObjectKey{Name: action.CRName}, &cr); err != nil { + env.t.Fatalf("failed to get CR %s: %v", action.CRName, err) + } + usedQty := resource.NewQuantity(action.UsedAmount, resource.DecimalSI) + cr.Status.UsedAmount = usedQty + if err := env.client.Status().Update(ctx, &cr); err != nil { + env.t.Fatalf("failed to update CR %s status: %v", action.CRName, err) + } + + // Simulate watch trigger: call Reconcile for the affected project + pqName := "quota-" + cr.Spec.ProjectID + _, err := env.controller.Reconcile(ctx, reconcileRequest(pqName)) + if err != nil { + env.t.Fatalf("Reconcile failed after CR update: %v", err) + } + + default: + env.t.Fatalf("unknown action type: %s", action.Type) + } + + // Verify expected state after this action + if action.ExpectedTotalUsage != nil { + for projectID, expected := range action.ExpectedTotalUsage { + env.verifyTotalUsage(projectID, expected) + } + } + if action.ExpectedPaygUsage != nil { + for projectID, expected := range action.ExpectedPaygUsage { + env.verifyPaygUsage(projectID, expected) + } + } +} + +// 
============================================================================ +// Helpers +// ============================================================================ + +func buildKnowledgeCRD(t *testing.T, flavorGroups map[string]compute.FlavorGroupFeature) *v1alpha1.Knowledge { + t.Helper() + + // Convert map to slice for BoxFeatureList + var features []compute.FlavorGroupFeature + for _, fg := range flavorGroups { + features = append(features, fg) + } + + raw, err := boxFlavorGroupFeatures(features) + if err != nil { + t.Fatalf("failed to box flavor group features: %v", err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: "nova", + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: raw, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "Ready", + }, + }, + }, + } +} + +func boxFlavorGroupFeatures(features []compute.FlavorGroupFeature) (runtime.RawExtension, error) { + rawSerialized := struct { + Features []compute.FlavorGroupFeature `json:"features"` + }{ + Features: features, + } + data, err := json.Marshal(rawSerialized) + if err != nil { + return runtime.RawExtension{}, err + } + return runtime.RawExtension{Raw: data}, nil +} + +func reconcileRequest(name string) ctrl.Request { + return ctrl.Request{NamespacedName: client.ObjectKey{Name: name}} +} + +func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { + return &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: projectID, DomainID: "domain-1"}, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: lastReconcileAt, + }, + } +} + +func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: name + "-uuid", + FlavorGroupName: flavorGroup, + ResourceType: resourceType, + AvailabilityZone: az, + ProjectID: projectID, + DomainID: "domain-1", + Amount: resource.MustParse("10"), + State: state, + }, + } + if usedAmount != nil { + qty := resource.NewQuantity(*usedAmount, resource.DecimalSI) + cr.Status.UsedAmount = qty + } + return cr +} + +func int64Ptr(v int64) *int64 { return &v } + +// withExtraVMs returns a pointer to testVMs + additional VMs. +// Used with OverrideVMs to add VMs to the "world" for an action. +func withExtraVMs(extra ...failover.VM) *[]failover.VM { + vms := append(append([]failover.VM{}, testVMs...), extra...) + return &vms +} + +// baseVMsPtr returns a pointer to a copy of testVMs (resets to baseline). +func baseVMsPtr() *[]failover.VM { + vms := append([]failover.VM{}, testVMs...) 
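+	// The append into an empty slice is a defensive copy: returning &testVMs
+	// directly would let later actions mutate the shared package-level data
+	// through the element pointers that getVM hands out.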
+ return &vms +} + +func makeHV(name string, instances []hv1.Instance) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: hv1.HypervisorStatus{ + Instances: instances, + }, + } +} + +func activeInstance(id string) hv1.Instance { + return hv1.Instance{ID: id, Active: true} +} diff --git a/internal/scheduling/reservations/quota/metrics.go b/internal/scheduling/reservations/quota/metrics.go new file mode 100644 index 000000000..7263ab1fd --- /dev/null +++ b/internal/scheduling/reservations/quota/metrics.go @@ -0,0 +1,98 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// QuotaMetrics holds Prometheus metrics for the quota controller. +type QuotaMetrics struct { + totalUsageGauge *prometheus.GaugeVec + paygUsageGauge *prometheus.GaugeVec + crUsageGauge *prometheus.GaugeVec + reconcileDuration prometheus.Histogram + reconcileResultVec *prometheus.CounterVec +} + +// NewQuotaMetrics creates a new QuotaMetrics instance and registers with the given registerer. +func NewQuotaMetrics(reg prometheus.Registerer) *QuotaMetrics { + m := &QuotaMetrics{ + totalUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_total_usage", + Help: "Total resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + paygUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_payg_usage", + Help: "Pay-as-you-go usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + crUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_cr_usage", + Help: "Committed resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + reconcileDuration: prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "cortex_quota_reconcile_duration_seconds", + Help: "Duration of quota controller full reconcile", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), + }, + ), + reconcileResultVec: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "cortex_quota_reconcile_total", + Help: "Total number of periodic reconcile attempts by result (success/failure)", + }, + []string{"result"}, + ), + } + + if reg != nil { + reg.MustRegister(m.totalUsageGauge) + reg.MustRegister(m.paygUsageGauge) + reg.MustRegister(m.crUsageGauge) + reg.MustRegister(m.reconcileDuration) + reg.MustRegister(m.reconcileResultVec) + } + + return m +} + +// RecordUsage records usage metrics for a project/AZ/resource. +func (m *QuotaMetrics) RecordUsage(projectID, az, resource string, totalUsage, paygUsage, crUsage int64) { + if m == nil { + return + } + m.totalUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(totalUsage)) + m.paygUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(paygUsage)) + m.crUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(crUsage)) +} + +// RecordReconcileDuration records the duration of a full reconcile. +func (m *QuotaMetrics) RecordReconcileDuration(seconds float64) { + if m == nil { + return + } + m.reconcileDuration.Observe(seconds) +} + +// RecordReconcileResult increments the success or failure counter for periodic reconciles. 
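+//
+// A plausible call site (not part of this patch) wraps a periodic run:
+//
+//	err := qc.ReconcilePeriodic(ctx)
+//	qc.Metrics.RecordReconcileResult(err == nil)
+//
+// The nil-receiver guard below keeps this safe when metrics are not set up.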
+func (m *QuotaMetrics) RecordReconcileResult(success bool) { + if m == nil { + return + } + result := "failure" + if success { + result = "success" + } + m.reconcileResultVec.WithLabelValues(result).Inc() +}
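+
+// Histogram note: ExponentialBuckets(0.1, 2, 10) spans 0.1s up to
+// 0.1*2^9 = 51.2s; reconciles slower than that land in the implicit
+// +Inf bucket.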