From 7b0485741e440fb215e4105e90d78cb13a820939 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Wed, 22 Apr 2026 16:43:27 +0200 Subject: [PATCH 01/15] initial idea --- api/v1alpha1/flavor_group_capacity_types.go | 113 ++++ api/v1alpha1/zz_generated.deepcopy.go | 97 ++++ cmd/manager/main.go | 24 + .../reservations/capacity/config.go | 49 ++ .../reservations/capacity/controller.go | 258 +++++++++ .../reservations/capacity/controller_test.go | 499 ++++++++++++++++++ .../reservations/capacity/metrics.go | 104 ++++ 7 files changed, 1144 insertions(+) create mode 100644 api/v1alpha1/flavor_group_capacity_types.go create mode 100644 internal/scheduling/reservations/capacity/config.go create mode 100644 internal/scheduling/reservations/capacity/controller.go create mode 100644 internal/scheduling/reservations/capacity/controller_test.go create mode 100644 internal/scheduling/reservations/capacity/metrics.go diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go new file mode 100644 index 000000000..7911e1d09 --- /dev/null +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -0,0 +1,113 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // FlavorGroupCapacityConditionFresh indicates the status data is up-to-date. + FlavorGroupCapacityConditionFresh = "Fresh" +) + +// FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. +type FlavorGroupCapacitySpec struct { + // FlavorGroup is the name of the flavor group (e.g. "2101"). + // +kubebuilder:validation:Required + FlavorGroup string `json:"flavorGroup"` + + // AvailabilityZone is the OpenStack AZ this capacity data covers (e.g. "qa-de-1a"). + // +kubebuilder:validation:Required + AvailabilityZone string `json:"availabilityZone"` +} + +// FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. +type FlavorGroupCapacityStatus struct { + // TotalCapacity is the total schedulable slots in an empty-datacenter scenario. + // Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across + // all hosts eligible for this flavor group (empty-state scheduler probe). + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + TotalCapacity int64 `json:"totalCapacity,omitempty"` + + // TotalHosts is the number of hosts eligible for this flavor group in the empty-state probe. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + TotalHosts int64 `json:"totalHosts,omitempty"` + + // TotalPlaceable is the schedulable slots remaining given current VM allocations. + // Computed from the current-state scheduler probe. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + TotalPlaceable int64 `json:"totalPlaceable,omitempty"` + + // PlaceableHosts is the number of hosts still able to accept a new smallest-flavor VM. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + PlaceableHosts int64 `json:"placeableHosts,omitempty"` + + // TotalInstances is the total number of VM instances running on hypervisors in this AZ, + // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + TotalInstances int64 `json:"totalInstances,omitempty"` + + // CommittedCapacity is the sum of AcceptedAmount across Ready=True CommittedResource CRDs. + // TODO(BLI #337): populate once CommittedResource CRD exists. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + CommittedCapacity int64 `json:"committedCapacity,omitempty"` + + // LastReconcileAt is the timestamp of the last successful reconcile. + // +kubebuilder:validation:Optional + LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"` + + // Conditions represent the current state of the FlavorGroupCapacity. + // The Fresh condition indicates whether the status data is up-to-date. + // +kubebuilder:validation:Optional + // +patchStrategy=merge + // +patchMergeKey=type + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup" +// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" +// +kubebuilder:printcolumn:name="TotalCapacity",type="integer",JSONPath=".status.totalCapacity" +// +kubebuilder:printcolumn:name="TotalPlaceable",type="integer",JSONPath=".status.totalPlaceable" +// +kubebuilder:printcolumn:name="TotalHosts",type="integer",JSONPath=".status.totalHosts" +// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" +// +kubebuilder:printcolumn:name="Fresh",type="string",JSONPath=".status.conditions[?(@.type=='Fresh')].status" + +// FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ. +// One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval. +// The capacity API reads these CRDs instead of probing the scheduler on each request. +type FlavorGroupCapacity struct { + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // +required + Spec FlavorGroupCapacitySpec `json:"spec"` + + // +optional + Status FlavorGroupCapacityStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// FlavorGroupCapacityList contains a list of FlavorGroupCapacity. +type FlavorGroupCapacityList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []FlavorGroupCapacity `json:"items"` +} + +func init() { + SchemeBuilder.Register(&FlavorGroupCapacity{}, &FlavorGroupCapacityList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 778c91710..5322a7d1f 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -587,6 +587,103 @@ func (in *FailoverReservationStatus) DeepCopy() *FailoverReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacity) DeepCopyInto(out *FlavorGroupCapacity) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacity. +func (in *FlavorGroupCapacity) DeepCopy() *FlavorGroupCapacity { + if in == nil { + return nil + } + out := new(FlavorGroupCapacity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlavorGroupCapacity) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacityList) DeepCopyInto(out *FlavorGroupCapacityList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]FlavorGroupCapacity, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacityList. +func (in *FlavorGroupCapacityList) DeepCopy() *FlavorGroupCapacityList { + if in == nil { + return nil + } + out := new(FlavorGroupCapacityList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlavorGroupCapacityList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacitySpec) DeepCopyInto(out *FlavorGroupCapacitySpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacitySpec. +func (in *FlavorGroupCapacitySpec) DeepCopy() *FlavorGroupCapacitySpec { + if in == nil { + return nil + } + out := new(FlavorGroupCapacitySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus) { + *out = *in + in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacityStatus. +func (in *FlavorGroupCapacityStatus) DeepCopy() *FlavorGroupCapacityStatus { + if in == nil { + return nil + } + out := new(FlavorGroupCapacityStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FilterSpec) DeepCopyInto(out *FilterSpec) { *out = *in diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 4c390f5a8..abe42317c 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -56,6 +56,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/pods" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/capacity" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" commitmentsapi "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments/api" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" @@ -677,6 +678,29 @@ func main() { "maxVMsToProcess", failoverConfig.MaxVMsToProcess, "vmSelectionRotationInterval", failoverConfig.VMSelectionRotationInterval) } + if slices.Contains(mainConfig.EnabledControllers, "capacity-controller") { + setupLog.Info("enabling controller", "controller", "capacity-controller") + capacityConfig := conf.GetConfigOrDie[capacity.Config]() + capacityConfig.ApplyDefaults() + + capacityMonitor := capacity.NewMonitor(multiclusterClient) + if err := metrics.Registry.Register(&capacityMonitor); err != nil { + setupLog.Error(err, "failed to register capacity monitor metrics, continuing without metrics") + } + + capacityController := capacity.NewController(multiclusterClient, capacityConfig) + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + return capacityController.Start(ctx) + })); err != nil { + setupLog.Error(err, "unable to add capacity controller to manager") + os.Exit(1) + } + setupLog.Info("capacity-controller registered", + "schedulerURL", capacityConfig.SchedulerURL, + "reconcileInterval", capacityConfig.ReconcileInterval, + "totalPipeline", capacityConfig.TotalPipeline, + "placeablePipeline", capacityConfig.PlaceablePipeline) + } // +kubebuilder:scaffold:builder diff --git a/internal/scheduling/reservations/capacity/config.go b/internal/scheduling/reservations/capacity/config.go new file mode 100644 index 000000000..2940f32e8 --- /dev/null +++ b/internal/scheduling/reservations/capacity/config.go @@ -0,0 +1,49 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import "time" + +// Config holds configuration for the capacity controller. +type Config struct { + // ReconcileInterval is how often the controller probes the scheduler and updates CRDs. + ReconcileInterval time.Duration `json:"capacityReconcileInterval"` + + // TotalPipeline is the scheduler pipeline used for the empty-state probe. + // This pipeline should ignore current VM allocations (e.g. kvm-report-capacity). + TotalPipeline string `json:"capacityTotalPipeline"` + + // PlaceablePipeline is the scheduler pipeline used for the current-state probe. + // This pipeline considers current VM allocations to determine remaining placement capacity. + PlaceablePipeline string `json:"capacityPlaceablePipeline"` + + // SchedulerURL is the endpoint of the nova external scheduler. + SchedulerURL string `json:"schedulerURL"` +} + +// ApplyDefaults fills in any unset values with defaults. +func (c *Config) ApplyDefaults() { + defaults := DefaultConfig() + if c.ReconcileInterval == 0 { + c.ReconcileInterval = defaults.ReconcileInterval + } + if c.TotalPipeline == "" { + c.TotalPipeline = defaults.TotalPipeline + } + if c.PlaceablePipeline == "" { + c.PlaceablePipeline = defaults.PlaceablePipeline + } + if c.SchedulerURL == "" { + c.SchedulerURL = defaults.SchedulerURL + } +} + +func DefaultConfig() Config { + return Config{ + ReconcileInterval: 5 * time.Minute, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose-load-balancing", + SchedulerURL: "http://localhost:8080/scheduler/nova/external", + } +} diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go new file mode 100644 index 000000000..4ff2eadc4 --- /dev/null +++ b/internal/scheduling/reservations/capacity/controller.go @@ -0,0 +1,258 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/google/uuid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +var log = ctrl.Log.WithName("capacity-controller").WithValues("module", "capacity") + +// Controller reconciles FlavorGroupCapacity CRDs on a fixed interval. +// For each (flavor group × AZ) pair it runs two scheduler probes and updates the CRD status. +type Controller struct { + client client.Client + schedulerClient *reservations.SchedulerClient + config Config +} + +func NewController(c client.Client, config Config) *Controller { + return &Controller{ + client: c, + schedulerClient: reservations.NewSchedulerClient(config.SchedulerURL), + config: config, + } +} + +// Start runs the periodic reconcile loop. Implements manager.Runnable. +func (c *Controller) Start(ctx context.Context) error { + timer := time.NewTimer(0) // fire immediately on start + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-timer.C: + if err := c.reconcileAll(ctx); err != nil { + log.Error(err, "reconcile cycle failed") + } + timer.Reset(c.config.ReconcileInterval) + } + } +} + +// reconcileAll iterates all flavor groups × AZs and upserts FlavorGroupCapacity CRDs. +func (c *Controller) reconcileAll(ctx context.Context) error { + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + return fmt.Errorf("failed to get flavor groups: %w", err) + } + + var hvList hv1.HypervisorList + if err := c.client.List(ctx, &hvList); err != nil { + return fmt.Errorf("failed to list hypervisors: %w", err) + } + + hvByName := make(map[string]hv1.Hypervisor, len(hvList.Items)) + for _, hv := range hvList.Items { + hvByName[hv.Name] = hv + } + + azs := availabilityZones(hvList.Items) + + for groupName, groupData := range flavorGroups { + for _, az := range azs { + if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, hvList.Items); err != nil { + log.Error(err, "failed to reconcile flavor group capacity", + "flavorGroup", groupName, "az", az) + // Continue with other pairs rather than aborting the whole cycle. + } + } + } + return nil +} + +// reconcileOne updates the FlavorGroupCapacity CRD for one (group × AZ) pair. +func (c *Controller) reconcileOne( + ctx context.Context, + groupName string, + groupData compute.FlavorGroupFeature, + az string, + hvByName map[string]hv1.Hypervisor, + allHVs []hv1.Hypervisor, +) error { + smallestFlavor := groupData.SmallestFlavor + smallestFlavorBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + if smallestFlavorBytes <= 0 { + return fmt.Errorf("smallest flavor %q has invalid memory %d MB", smallestFlavor.Name, smallestFlavor.MemoryMB) + } + + // Empty-state probe: scheduler ignores all current VM allocations. + totalCapacity, totalHosts, totalErr := c.probeScheduler(ctx, smallestFlavor, az, c.config.TotalPipeline, hvByName, smallestFlavorBytes) + + // Current-state probe: scheduler considers current VM allocations. + totalPlaceable, placeableHosts, placeableErr := c.probeScheduler(ctx, smallestFlavor, az, c.config.PlaceablePipeline, hvByName, smallestFlavorBytes) + + // Count total instances on hypervisors in this AZ. + totalInstances := countInstancesInAZ(allHVs, az) + + // TODO(BLI #337): populate CommittedCapacity from Ready=True CommittedResource CRDs. + var committedCapacity int64 + + crdName := crdNameFor(groupName, az) + fresh := totalErr == nil && placeableErr == nil + + var existing v1alpha1.FlavorGroupCapacity + err := c.client.Get(ctx, types.NamespacedName{Name: crdName}, &existing) + if apierrors.IsNotFound(err) { + existing = v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: crdName}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: groupName, + AvailabilityZone: az, + }, + } + if createErr := c.client.Create(ctx, &existing); createErr != nil { + return fmt.Errorf("failed to create FlavorGroupCapacity %s: %w", crdName, createErr) + } + } else if err != nil { + return fmt.Errorf("failed to get FlavorGroupCapacity %s: %w", crdName, err) + } + + patch := client.MergeFrom(existing.DeepCopy()) + existing.Status.TotalCapacity = totalCapacity + existing.Status.TotalHosts = totalHosts + existing.Status.TotalPlaceable = totalPlaceable + existing.Status.PlaceableHosts = placeableHosts + existing.Status.TotalInstances = totalInstances + existing.Status.CommittedCapacity = committedCapacity + existing.Status.LastReconcileAt = metav1.Now() + + freshCondition := metav1.Condition{ + Type: v1alpha1.FlavorGroupCapacityConditionFresh, + ObservedGeneration: existing.Generation, + } + if fresh { + freshCondition.Status = metav1.ConditionTrue + freshCondition.Reason = "ReconcileSucceeded" + freshCondition.Message = "capacity data is up-to-date" + } else { + freshCondition.Status = metav1.ConditionFalse + freshCondition.Reason = "ReconcileFailed" + if totalErr != nil { + freshCondition.Message = fmt.Sprintf("empty-state probe failed: %v", totalErr) + } else { + freshCondition.Message = fmt.Sprintf("current-state probe failed: %v", placeableErr) + } + } + meta.SetStatusCondition(&existing.Status.Conditions, freshCondition) + + if patchErr := c.client.Status().Patch(ctx, &existing, patch); patchErr != nil { + return fmt.Errorf("failed to patch FlavorGroupCapacity %s status: %w", crdName, patchErr) + } + return nil +} + +// probeScheduler calls the scheduler with the given pipeline and returns capacity + host count. +func (c *Controller) probeScheduler( + ctx context.Context, + flavor compute.FlavorInGroup, + az, pipeline string, + hvByName map[string]hv1.Hypervisor, + smallestFlavorBytes int64, +) (capacity int64, hosts int64, err error) { + resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ + InstanceUUID: uuid.New().String(), + ProjectID: "cortex-capacity-probe", + FlavorName: flavor.Name, + MemoryMB: flavor.MemoryMB, + VCPUs: flavor.VCPUs, + FlavorExtraSpecs: flavor.ExtraSpecs, + AvailabilityZone: az, + Pipeline: pipeline, + }) + if err != nil { + return 0, 0, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) + } + + hosts = int64(len(resp.Hosts)) //nolint:gosec + for _, hostName := range resp.Hosts { + hv, ok := hvByName[hostName] + if !ok { + continue + } + effectiveCap := hv.Status.EffectiveCapacity + if effectiveCap == nil { + effectiveCap = hv.Status.Capacity + } + if effectiveCap == nil { + continue + } + memCap, ok := effectiveCap[hv1.ResourceMemory] + if !ok { + continue + } + if capBytes := memCap.Value(); capBytes > 0 { + capacity += capBytes / smallestFlavorBytes + } + } + return capacity, hosts, nil +} + +// availabilityZones returns a sorted, deduplicated list of AZs from Hypervisor CRD labels. +func availabilityZones(hvs []hv1.Hypervisor) []string { + azSet := make(map[string]struct{}) + for _, hv := range hvs { + if az, ok := hv.Labels["topology.kubernetes.io/zone"]; ok && az != "" { + azSet[az] = struct{}{} + } + } + azs := make([]string, 0, len(azSet)) + for az := range azSet { + azs = append(azs, az) + } + sort.Strings(azs) + return azs +} + +// countInstancesInAZ counts total VM instances across all hypervisors in the given AZ. +func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { + var total int64 + for _, hv := range hvs { + if hv.Labels["topology.kubernetes.io/zone"] != az { + continue + } + total += int64(len(hv.Status.Instances)) //nolint:gosec + } + return total +} + +// crdNameFor produces a valid DNS subdomain name for a (flavorGroup, az) pair. +// Underscores and dots are replaced with dashes; the result is lowercased. +func crdNameFor(flavorGroup, az string) string { + combined := flavorGroup + "-" + az + combined = strings.ToLower(combined) + combined = strings.ReplaceAll(combined, "_", "-") + combined = strings.ReplaceAll(combined, ".", "-") + return combined +} diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go new file mode 100644 index 000000000..3f3ceb6f0 --- /dev/null +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -0,0 +1,499 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sort" + "testing" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +// newTestScheme returns a runtime.Scheme with all required types registered. +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := v1alpha1.AddToScheme(s); err != nil { + t.Fatalf("failed to add v1alpha1 scheme: %v", err) + } + if err := hv1.AddToScheme(s); err != nil { + t.Fatalf("failed to add hypervisor scheme: %v", err) + } + return s +} + +// newFlavorGroupKnowledge creates a ready Knowledge CRD with a single flavor group. +func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB uint64) *v1alpha1.Knowledge { + t.Helper() + features := []compute.FlavorGroupFeature{ + { + Name: groupName, + SmallestFlavor: compute.FlavorInGroup{ + Name: groupName + "-small", + MemoryMB: smallestMemoryMB, + VCPUs: 2, + ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, + }, + }, + } + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("failed to box features: %v", err) + } + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: raw, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "ExtractorSucceeded", + }, + }, + }, + } +} + +// newHypervisor creates a Hypervisor CRD with a topology AZ label and effective capacity. +func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *hv1.Hypervisor { + hv := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{"topology.kubernetes.io/zone": az}, + }, + } + if memoryBytes > 0 { + qty := resource.NewQuantity(memoryBytes, resource.BinarySI) + hv.Status.EffectiveCapacity = map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *qty, + } + } + for _, id := range instanceIDs { + hv.Status.Instances = append(hv.Status.Instances, hv1.Instance{ID: id}) + } + return hv +} + +// newMockSchedulerServer creates an httptest server that always returns the given host list. +func newMockSchedulerServer(t *testing.T, hosts []string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + resp := schedulerapi.ExternalSchedulerResponse{Hosts: hosts} + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("mock scheduler: failed to encode response: %v", err) + } + })) +} + +// --- unit tests for pure helper functions --- + +func TestCrdNameFor(t *testing.T) { + tests := []struct { + group, az, want string + }{ + {"2101", "qa-de-1a", "2101-qa-de-1a"}, + {"My_Group", "eu.west.1", "my-group-eu-west-1"}, + {"G", "AZ_1", "g-az-1"}, + } + for _, tt := range tests { + got := crdNameFor(tt.group, tt.az) + if got != tt.want { + t.Errorf("crdNameFor(%q, %q) = %q, want %q", tt.group, tt.az, got, tt.want) + } + } +} + +func TestAvailabilityZones(t *testing.T) { + hvs := []hv1.Hypervisor{ + *newHypervisor("h1", "az-a", 0), + *newHypervisor("h2", "az-b", 0), + *newHypervisor("h3", "az-a", 0), // duplicate + {ObjectMeta: metav1.ObjectMeta{Name: "h4"}}, // no label + } + got := availabilityZones(hvs) + want := []string{"az-a", "az-b"} + if len(got) != len(want) { + t.Fatalf("availabilityZones() = %v, want %v", got, want) + } + sort.Strings(got) + for i := range want { + if got[i] != want[i] { + t.Errorf("availabilityZones()[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +func TestCountInstancesInAZ(t *testing.T) { + hvs := []hv1.Hypervisor{ + *newHypervisor("h1", "az-a", 0, "vm1", "vm2"), + *newHypervisor("h2", "az-a", 0, "vm3"), + *newHypervisor("h3", "az-b", 0, "vm4"), + } + if got := countInstancesInAZ(hvs, "az-a"); got != 3 { + t.Errorf("countInstancesInAZ(az-a) = %d, want 3", got) + } + if got := countInstancesInAZ(hvs, "az-b"); got != 1 { + t.Errorf("countInstancesInAZ(az-b) = %d, want 1", got) + } + if got := countInstancesInAZ(hvs, "az-c"); got != 0 { + t.Errorf("countInstancesInAZ(az-c) = %d, want 0", got) + } +} + +// --- integration-style tests for reconcileOne --- + +func TestReconcileOne_CreatesCRD(t *testing.T) { + const ( + groupName = "2101" + az = "qa-de-1a" + memMB = 4096 // 4 GiB + memBytes = int64(memMB) * 1024 * 1024 + ) + + scheme := newTestScheme(t) + hv := newHypervisor("host-1", az, memBytes, "vm1") + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). + Build() + + // Both probes return host-1 so capacity = floor(4GiB/4GiB) = 1 + schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) + defer schedulerServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: schedulerServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + } + hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("reconcileOne failed: %v", err) + } + + // Verify CRD was created with correct status + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + if crd.Status.TotalCapacity != 1 { + t.Errorf("TotalCapacity = %d, want 1", crd.Status.TotalCapacity) + } + if crd.Status.TotalHosts != 1 { + t.Errorf("TotalHosts = %d, want 1", crd.Status.TotalHosts) + } + if crd.Status.TotalInstances != 1 { + t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances) + } + if crd.Status.TotalPlaceable != 1 { + t.Errorf("TotalPlaceable = %d, want 1", crd.Status.TotalPlaceable) + } +} + +func TestReconcileOne_SetsFreshConditionFalseOnSchedulerError(t *testing.T) { + const ( + groupName = "2101" + az = "qa-de-1a" + memMB = 2048 + ) + + scheme := newTestScheme(t) + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). + Build() + + // Scheduler returns 500 to simulate error + failServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer failServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: failServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + } + + // reconcileOne returns no error itself (it continues on probe failure), but sets Fresh=False + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}); err != nil { + t.Fatalf("reconcileOne failed: %v", err) + } + + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + + var freshStatus metav1.ConditionStatus + for _, c := range crd.Status.Conditions { + if c.Type == v1alpha1.FlavorGroupCapacityConditionFresh { + freshStatus = c.Status + } + } + if freshStatus != metav1.ConditionFalse { + t.Errorf("Fresh condition = %q, want %q", freshStatus, metav1.ConditionFalse) + } +} + +func TestReconcileOne_IdempotentUpdate(t *testing.T) { + const ( + groupName = "2101" + az = "qa-de-1a" + memMB = 2048 + memBytes = int64(memMB) * 1024 * 1024 + ) + + scheme := newTestScheme(t) + hv := newHypervisor("host-1", az, memBytes) + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + crdName := crdNameFor(groupName, az) + + // Pre-create the CRD to test the update path (not create path) + existing := &v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: crdName}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: groupName, + AvailabilityZone: az, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv, existing). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). + Build() + + schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) + defer schedulerServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: schedulerServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + } + hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + + // First call + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("first reconcileOne failed: %v", err) + } + // Second call — should not error on the already-existing CRD + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("second reconcileOne failed: %v", err) + } + + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdName}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + if crd.Status.TotalCapacity != 1 { + t.Errorf("TotalCapacity = %d, want 1", crd.Status.TotalCapacity) + } +} + +func TestReconcileAll_SkipsGroupsWithNoAZs(t *testing.T) { + scheme := newTestScheme(t) + knowledge := newFlavorGroupKnowledge(t, "2101", 2048) + + // No hypervisors → no AZs → reconcileAll returns without error + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). + Build() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: "http://localhost:9999", // unreachable; not called + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + if err := ctrl.reconcileAll(context.Background()); err != nil { + t.Errorf("reconcileAll with no hypervisors returned error: %v", err) + } + + var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) + } + if len(list.Items) != 0 { + t.Errorf("expected 0 CRDs, got %d", len(list.Items)) + } +} + +func TestProbeScheduler_CapacityCalculation(t *testing.T) { + const memMB = 4096 + const memBytes = int64(memMB) * 1024 * 1024 + + scheme := newTestScheme(t) + hv1Obj := newHypervisor("host-1", "az-a", memBytes) + hv2Obj := newHypervisor("host-2", "az-a", memBytes*2) // 2x memory + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + // Scheduler returns both hosts + srv := newMockSchedulerServer(t, []string{"host-1", "host-2"}) + defer srv.Close() + + c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + hvByName := map[string]hv1.Hypervisor{ + "host-1": *hv1Obj, + "host-2": *hv2Obj, + } + flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} + + capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName, memBytes) + if err != nil { + t.Fatalf("probeScheduler failed: %v", err) + } + if hosts != 2 { + t.Errorf("hosts = %d, want 2", hosts) + } + // host-1 = 1 slot (4GiB/4GiB), host-2 = 2 slots (8GiB/4GiB) + if capacity != 3 { + t.Errorf("capacity = %d, want 3", capacity) + } +} + +func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { + scheme := newTestScheme(t) + + const memMB = 2048 + const memBytes = int64(memMB) * 1024 * 1024 + + // Two AZs, two hypervisors + hv1Obj := newHypervisor("h1", "az-a", memBytes) + hv2Obj := newHypervisor("h2", "az-b", memBytes) + knowledge := newFlavorGroupKnowledge(t, "2101", memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv1Obj, hv2Obj). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). + Build() + + srv := newMockSchedulerServer(t, []string{}) + defer srv.Close() + + c := NewController(fakeClient, Config{ + SchedulerURL: srv.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + if err := c.reconcileAll(context.Background()); err != nil { + t.Fatalf("reconcileAll failed: %v", err) + } + + // Expect one CRD per AZ for the single group + var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) + } + if len(list.Items) != 2 { + names := make([]string, len(list.Items)) + for i, item := range list.Items { + names[i] = item.Name + } + t.Errorf("expected 2 CRDs (one per AZ), got %d: %v", len(list.Items), names) + } +} + +func TestReconcileAll_FlavorGroupsKnowledgeNotReady(t *testing.T) { + scheme := newTestScheme(t) + + // Knowledge CRD exists but is not Ready + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + }, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.Knowledge{}). + Build() + + c := NewController(fakeClient, Config{ + SchedulerURL: "http://localhost:9999", + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + // Should return an error when knowledge is not ready + if err := c.reconcileAll(context.Background()); err == nil { + t.Error("reconcileAll should fail when flavor groups knowledge is not ready") + } +} + +func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { + scheme := newTestScheme(t) + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + c := NewController(fakeClient, Config{}) + + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0}, + } + err := c.reconcileOne(context.Background(), "2101", groupData, "az-a", nil, nil) + if err == nil { + t.Error("expected error for zero-memory flavor") + } +} + +// Verify that the module-level log variable from reservations package doesn't +// collide with the one in this package. +func TestPackageLogVar(t *testing.T) { + _ = reservations.NewSchedulerClient("http://localhost") +} diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go new file mode 100644 index 000000000..698d0ab9e --- /dev/null +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -0,0 +1,104 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var capacityLabels = []string{"flavor_group", "az"} + +// Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. +// It implements prometheus.Collector and reads CRD status on each Collect call. +type Monitor struct { + client client.Client + totalCapacity *prometheus.GaugeVec + totalPlaceable *prometheus.GaugeVec + totalHosts *prometheus.GaugeVec + placeableHosts *prometheus.GaugeVec + totalInstances *prometheus.GaugeVec + committedCapacity *prometheus.GaugeVec +} + +// NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. +func NewMonitor(c client.Client) Monitor { + return Monitor{ + client: c, + totalCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_total", + Help: "Total schedulable slots in an empty-datacenter scenario per flavor group and AZ.", + }, capacityLabels), + totalPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_placeable", + Help: "Schedulable slots remaining given current VM allocations per flavor group and AZ.", + }, capacityLabels), + totalHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_total", + Help: "Number of hosts eligible for this flavor group in the empty-state probe.", + }, capacityLabels), + placeableHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_placeable", + Help: "Number of hosts still able to accept a new smallest-flavor VM.", + }, capacityLabels), + totalInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_instances", + Help: "Total VM instances running on hypervisors in this AZ (not filtered by flavor group).", + }, capacityLabels), + committedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_committed", + Help: "Sum of AcceptedAmount across Ready CommittedResource CRDs for this flavor group and AZ.", + }, capacityLabels), + } +} + +// Describe implements prometheus.Collector. +func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { + m.totalCapacity.Describe(ch) + m.totalPlaceable.Describe(ch) + m.totalHosts.Describe(ch) + m.placeableHosts.Describe(ch) + m.totalInstances.Describe(ch) + m.committedCapacity.Describe(ch) +} + +// Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges. +func (m *Monitor) Collect(ch chan<- prometheus.Metric) { + var list v1alpha1.FlavorGroupCapacityList + if err := m.client.List(context.Background(), &list); err != nil { + log.Error(err, "failed to list FlavorGroupCapacity CRDs for metrics") + return + } + + // Reset all gauges so deleted CRDs don't linger. + m.totalCapacity.Reset() + m.totalPlaceable.Reset() + m.totalHosts.Reset() + m.placeableHosts.Reset() + m.totalInstances.Reset() + m.committedCapacity.Reset() + + for _, c := range list.Items { + labels := prometheus.Labels{ + "flavor_group": c.Spec.FlavorGroup, + "az": c.Spec.AvailabilityZone, + } + m.totalCapacity.With(labels).Set(float64(c.Status.TotalCapacity)) + m.totalPlaceable.With(labels).Set(float64(c.Status.TotalPlaceable)) + m.totalHosts.With(labels).Set(float64(c.Status.TotalHosts)) + m.placeableHosts.With(labels).Set(float64(c.Status.PlaceableHosts)) + m.totalInstances.With(labels).Set(float64(c.Status.TotalInstances)) + m.committedCapacity.With(labels).Set(float64(c.Status.CommittedCapacity)) + } + + m.totalCapacity.Collect(ch) + m.totalPlaceable.Collect(ch) + m.totalHosts.Collect(ch) + m.placeableHosts.Collect(ch) + m.totalInstances.Collect(ch) + m.committedCapacity.Collect(ch) +} From 3411dd01f89917bf908112a6869e678044237be8 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 14:22:18 +0200 Subject: [PATCH 02/15] helm and rbac --- helm/bundles/cortex-nova/values.yaml | 1 + .../cortex.cloud_flavorgroupcapacities.yaml | 166 ++++++++++++++++++ helm/library/cortex/templates/rbac/role.yaml | 3 + 3 files changed, 170 insertions(+) create mode 100644 helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index c40849739..f709bea5f 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -133,6 +133,7 @@ cortex-scheduling-controllers: - hypervisor-overcommit-controller - committed-resource-reservations-controller - failover-reservations-controller + - capacity-controller enabledTasks: - nova-history-cleanup-task # If true, the external scheduler API will limit the list of hosts in its diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml new file mode 100644 index 000000000..a3bc11fbe --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: flavorgroupcapacities.cortex.cloud +spec: + group: cortex.cloud + names: + kind: FlavorGroupCapacity + listKind: FlavorGroupCapacityList + plural: flavorgroupcapacities + singular: flavorgroupcapacity + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.flavorGroup + name: FlavorGroup + type: string + - jsonPath: .spec.availabilityZone + name: AZ + type: string + - jsonPath: .status.totalCapacity + name: TotalCapacity + type: integer + - jsonPath: .status.totalPlaceable + name: TotalPlaceable + type: integer + - jsonPath: .status.totalHosts + name: TotalHosts + type: integer + - jsonPath: .status.lastReconcileAt + name: LastReconcile + type: date + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ. + One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval. + The capacity API reads these CRDs instead of probing the scheduler on each request. + properties: + apiVersion: + description: APIVersion defines the versioned schema of this representation of an object. + type: string + kind: + description: Kind is a string value representing the REST resource this object represents. + type: string + metadata: + type: object + spec: + description: FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. + properties: + availabilityZone: + description: AvailabilityZone is the OpenStack AZ this capacity data covers (e.g. "qa-de-1a"). + type: string + flavorGroup: + description: FlavorGroup is the name of the flavor group (e.g. "2101"). + type: string + required: + - availabilityZone + - flavorGroup + type: object + status: + description: FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. + properties: + committedCapacity: + description: CommittedCapacity is the sum of AcceptedAmount across Ready=True CommittedResource CRDs. + format: int64 + minimum: 0 + type: integer + conditions: + description: |- + Conditions represent the current state of the FlavorGroupCapacity. + The Ready condition indicates whether the status data is up-to-date. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition transitioned from one status to another. + format: date-time + type: string + message: + description: message is a human readable message indicating details about the transition. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation that the condition was set based upon. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating the reason for the condition's last transition. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + lastReconcileAt: + description: LastReconcileAt is the timestamp of the last successful reconcile. + format: date-time + type: string + placeableHosts: + description: PlaceableHosts is the number of hosts still able to accept a new smallest-flavor VM. + format: int64 + minimum: 0 + type: integer + totalCapacity: + description: |- + TotalCapacity is the total schedulable slots in an empty-datacenter scenario. + Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across + all hosts eligible for this flavor group (empty-state scheduler probe). + format: int64 + minimum: 0 + type: integer + totalHosts: + description: TotalHosts is the number of hosts eligible for this flavor group in the empty-state probe. + format: int64 + minimum: 0 + type: integer + totalInstances: + description: |- + TotalInstances is the total number of VM instances running on hypervisors in this AZ, + derived from Hypervisor CRD Status.Instances (not filtered by flavor group). + format: int64 + minimum: 0 + type: integer + totalPlaceable: + description: TotalPlaceable is the schedulable slots remaining given current VM allocations. + format: int64 + minimum: 0 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/helm/library/cortex/templates/rbac/role.yaml b/helm/library/cortex/templates/rbac/role.yaml index ea75c6897..6b89a2e33 100644 --- a/helm/library/cortex/templates/rbac/role.yaml +++ b/helm/library/cortex/templates/rbac/role.yaml @@ -14,6 +14,7 @@ rules: - datasources - reservations - committedresources + - flavorgroupcapacities - decisions - deschedulings - pipelines @@ -34,6 +35,7 @@ rules: - datasources/finalizers - reservations/finalizers - committedresources/finalizers + - flavorgroupcapacities/finalizers - decisions/finalizers - deschedulings/finalizers - pipelines/finalizers @@ -48,6 +50,7 @@ rules: - datasources/status - reservations/status - committedresources/status + - flavorgroupcapacities/status - decisions/status - deschedulings/status - pipelines/status From fc28ac8210a9a4be84ae27d380bae23eeae7175f Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 14:23:11 +0200 Subject: [PATCH 03/15] adjusting to CommittedResource CRD --- api/v1alpha1/flavor_group_capacity_types.go | 25 +++---- api/v1alpha1/zz_generated.deepcopy.go | 2 +- .../reservations/capacity/controller.go | 43 +++++++++++- .../reservations/capacity/controller_test.go | 68 +++++++++++++++++-- 4 files changed, 114 insertions(+), 24 deletions(-) diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index 7911e1d09..edd04ca90 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -8,8 +8,8 @@ import ( ) const ( - // FlavorGroupCapacityConditionFresh indicates the status data is up-to-date. - FlavorGroupCapacityConditionFresh = "Fresh" + // FlavorGroupCapacityConditionReady indicates the status data is up-to-date. + FlavorGroupCapacityConditionReady = "Ready" ) // FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. @@ -29,46 +29,36 @@ type FlavorGroupCapacityStatus struct { // Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across // all hosts eligible for this flavor group (empty-state scheduler probe). // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 TotalCapacity int64 `json:"totalCapacity,omitempty"` // TotalHosts is the number of hosts eligible for this flavor group in the empty-state probe. // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 TotalHosts int64 `json:"totalHosts,omitempty"` // TotalPlaceable is the schedulable slots remaining given current VM allocations. // Computed from the current-state scheduler probe. // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 TotalPlaceable int64 `json:"totalPlaceable,omitempty"` // PlaceableHosts is the number of hosts still able to accept a new smallest-flavor VM. // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 PlaceableHosts int64 `json:"placeableHosts,omitempty"` // TotalInstances is the total number of VM instances running on hypervisors in this AZ, // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 TotalInstances int64 `json:"totalInstances,omitempty"` // CommittedCapacity is the sum of AcceptedAmount across Ready=True CommittedResource CRDs. - // TODO(BLI #337): populate once CommittedResource CRD exists. // +kubebuilder:validation:Optional - // +kubebuilder:validation:Minimum=0 CommittedCapacity int64 `json:"committedCapacity,omitempty"` // LastReconcileAt is the timestamp of the last successful reconcile. // +kubebuilder:validation:Optional LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"` - // Conditions represent the current state of the FlavorGroupCapacity. - // The Fresh condition indicates whether the status data is up-to-date. + // The current status conditions of the FlavorGroupCapacity. // +kubebuilder:validation:Optional - // +patchStrategy=merge - // +patchMergeKey=type Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` } @@ -81,7 +71,7 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:printcolumn:name="TotalPlaceable",type="integer",JSONPath=".status.totalPlaceable" // +kubebuilder:printcolumn:name="TotalHosts",type="integer",JSONPath=".status.totalHosts" // +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" -// +kubebuilder:printcolumn:name="Fresh",type="string",JSONPath=".status.conditions[?(@.type=='Fresh')].status" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" // FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ. // One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval. @@ -89,14 +79,17 @@ type FlavorGroupCapacityStatus struct { type FlavorGroupCapacity struct { metav1.TypeMeta `json:",inline"` + // metadata is a standard object metadata // +optional - metav1.ObjectMeta `json:"metadata,omitempty"` + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + // spec defines the desired state of FlavorGroupCapacity // +required Spec FlavorGroupCapacitySpec `json:"spec"` + // status defines the observed state of FlavorGroupCapacity // +optional - Status FlavorGroupCapacityStatus `json:"status,omitempty"` + Status FlavorGroupCapacityStatus `json:"status,omitempty,omitzero"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index be5683460..be8791809 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -807,7 +807,7 @@ func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions - *out = make([]metav1.Condition, len(*in)) + *out = make([]v1.Condition, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 4ff2eadc4..00853904d 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -116,8 +116,11 @@ func (c *Controller) reconcileOne( // Count total instances on hypervisors in this AZ. totalInstances := countInstancesInAZ(allHVs, az) - // TODO(BLI #337): populate CommittedCapacity from Ready=True CommittedResource CRDs. - var committedCapacity int64 + committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) + if committedErr != nil { + log.Error(committedErr, "failed to sum committed capacity", "flavorGroup", groupName, "az", az) + committedCapacity = 0 + } crdName := crdNameFor(groupName, az) fresh := totalErr == nil && placeableErr == nil @@ -149,7 +152,7 @@ func (c *Controller) reconcileOne( existing.Status.LastReconcileAt = metav1.Now() freshCondition := metav1.Condition{ - Type: v1alpha1.FlavorGroupCapacityConditionFresh, + Type: v1alpha1.FlavorGroupCapacityConditionReady, ObservedGeneration: existing.Generation, } if fresh { @@ -219,6 +222,40 @@ func (c *Controller) probeScheduler( return capacity, hosts, nil } +// sumCommittedCapacity sums AcceptedAmount (or Spec.Amount as fallback) across all +// CommittedResource CRDs for the given (flavorGroup, az) pair with an active state +// (guaranteed or confirmed) and resource type memory. Returns the total in slots. +func (c *Controller) sumCommittedCapacity(ctx context.Context, groupName, az string, smallestFlavorBytes int64) (int64, error) { + var list v1alpha1.CommittedResourceList + if err := c.client.List(ctx, &list); err != nil { + return 0, fmt.Errorf("failed to list CommittedResources: %w", err) + } + + var total int64 + for _, cr := range list.Items { + if cr.Spec.FlavorGroupName != groupName { + continue + } + if cr.Spec.AvailabilityZone != az { + continue + } + if cr.Spec.ResourceType != v1alpha1.CommittedResourceTypeMemory { + continue + } + if cr.Spec.State != v1alpha1.CommitmentStatusGuaranteed && cr.Spec.State != v1alpha1.CommitmentStatusConfirmed { + continue + } + amount := cr.Spec.Amount + if cr.Status.AcceptedAmount != nil { + amount = *cr.Status.AcceptedAmount + } + if bytes := amount.Value(); bytes > 0 { + total += bytes / smallestFlavorBytes + } + } + return total, nil +} + // availabilityZones returns a sorted, deduplicated list of AZs from Hypervisor CRD labels. func availabilityZones(hvs []hv1.Hypervisor) []string { azSet := make(map[string]struct{}) diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 3f3ceb6f0..7f9ae2fb3 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -16,6 +16,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" @@ -218,7 +219,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { } } -func TestReconcileOne_SetsFreshConditionFalseOnSchedulerError(t *testing.T) { +func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { const ( groupName = "2101" az = "qa-de-1a" @@ -250,7 +251,7 @@ func TestReconcileOne_SetsFreshConditionFalseOnSchedulerError(t *testing.T) { SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, } - // reconcileOne returns no error itself (it continues on probe failure), but sets Fresh=False + // reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}); err != nil { t.Fatalf("reconcileOne failed: %v", err) } @@ -262,12 +263,12 @@ func TestReconcileOne_SetsFreshConditionFalseOnSchedulerError(t *testing.T) { var freshStatus metav1.ConditionStatus for _, c := range crd.Status.Conditions { - if c.Type == v1alpha1.FlavorGroupCapacityConditionFresh { + if c.Type == v1alpha1.FlavorGroupCapacityConditionReady { freshStatus = c.Status } } if freshStatus != metav1.ConditionFalse { - t.Errorf("Fresh condition = %q, want %q", freshStatus, metav1.ConditionFalse) + t.Errorf("Ready condition = %q, want %q", freshStatus, metav1.ConditionFalse) } } @@ -497,3 +498,62 @@ func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { func TestPackageLogVar(t *testing.T) { _ = reservations.NewSchedulerClient("http://localhost") } + +func TestSumCommittedCapacity(t *testing.T) { + const ( + groupName = "2101" + az = "qa-de-1a" + memMB = 4096 + memBytes = int64(memMB) * 1024 * 1024 + ) + + newCR := func(name, group, zone string, state v1alpha1.CommitmentStatus, resType v1alpha1.CommittedResourceType, amount string, acceptedAmount string) *v1alpha1.CommittedResource { + qty := resource.MustParse(amount) + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + FlavorGroupName: group, + AvailabilityZone: zone, + State: state, + ResourceType: resType, + Amount: qty, + }, + } + if acceptedAmount != "" { + accepted := resource.MustParse(acceptedAmount) + cr.Status.AcceptedAmount = &accepted + } + return cr + } + + scheme := newTestScheme(t) + objects := []client.Object{ + // Should count: confirmed, memory, right group+AZ, AcceptedAmount set + newCR("cr1", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "8Gi", "8Gi"), + // Should count: guaranteed, memory, right group+AZ, no AcceptedAmount → falls back to Spec.Amount + newCR("cr2", groupName, az, v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong state + newCR("cr3", groupName, az, v1alpha1.CommitmentStatusPlanned, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong resource type + newCR("cr4", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeCores, "4Gi", ""), + // Should NOT count: wrong AZ + newCR("cr5", groupName, "other-az", v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong flavor group + newCR("cr6", "other-group", az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + Build() + + c := NewController(fakeClient, Config{}) + // smallestFlavorBytes = 4GiB → cr1 = 8GiB/4GiB = 2 slots, cr2 = 4GiB/4GiB = 1 slot → total = 3 + got, err := c.sumCommittedCapacity(context.Background(), groupName, az, memBytes) + if err != nil { + t.Fatalf("sumCommittedCapacity failed: %v", err) + } + if got != 3 { + t.Errorf("sumCommittedCapacity = %d, want 3", got) + } +} From 742d604e463f784e9732f586a17db8608f0fd3f1 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 15:34:28 +0200 Subject: [PATCH 04/15] fix --- internal/scheduling/reservations/capacity/controller.go | 2 +- internal/scheduling/reservations/capacity/controller_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 00853904d..e5e8e352e 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -183,7 +183,7 @@ func (c *Controller) probeScheduler( az, pipeline string, hvByName map[string]hv1.Hypervisor, smallestFlavorBytes int64, -) (capacity int64, hosts int64, err error) { +) (capacity, hosts int64, err error) { resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ InstanceUUID: uuid.New().String(), ProjectID: "cortex-capacity-probe", diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 7f9ae2fb3..ea8686c7e 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -405,7 +405,7 @@ func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { // Two AZs, two hypervisors hv1Obj := newHypervisor("h1", "az-a", memBytes) hv2Obj := newHypervisor("h2", "az-b", memBytes) - knowledge := newFlavorGroupKnowledge(t, "2101", memMB) + knowledge := newFlavorGroupKnowledge(t, "2152", memMB) fakeClient := fake.NewClientBuilder(). WithScheme(scheme). From b8057cb496c5e3a42c0facd5d4bac4657b19777a Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 15:44:10 +0200 Subject: [PATCH 05/15] make crds deepcopy lint-fix --- api/v1alpha1/zz_generated.deepcopy.go | 44 ++++++------ .../cortex.cloud_flavorgroupcapacities.yaml | 72 ++++++++++++------- .../reservations/capacity/controller.go | 6 +- .../reservations/capacity/controller_test.go | 20 +++--- 4 files changed, 81 insertions(+), 61 deletions(-) diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index be8791809..f995ec9be 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -727,6 +727,28 @@ func (in *FailoverReservationStatus) DeepCopy() *FailoverReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FilterSpec) DeepCopyInto(out *FilterSpec) { + *out = *in + if in.Params != nil { + in, out := &in.Params, &out.Params + *out = make(Parameters, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FilterSpec. +func (in *FilterSpec) DeepCopy() *FilterSpec { + if in == nil { + return nil + } + out := new(FilterSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FlavorGroupCapacity) DeepCopyInto(out *FlavorGroupCapacity) { *out = *in @@ -824,28 +846,6 @@ func (in *FlavorGroupCapacityStatus) DeepCopy() *FlavorGroupCapacityStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *FilterSpec) DeepCopyInto(out *FilterSpec) { - *out = *in - if in.Params != nil { - in, out := &in.Params, &out.Params - *out = make(Parameters, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FilterSpec. -func (in *FilterSpec) DeepCopy() *FilterSpec { - if in == nil { - return nil - } - out := new(FilterSpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *History) DeepCopyInto(out *History) { *out = *in diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index a3bc11fbe..efa690a12 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -45,18 +45,28 @@ spec: The capacity API reads these CRDs instead of probing the scheduler on each request. properties: apiVersion: - description: APIVersion defines the versioned schema of this representation of an object. + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: Kind is a string value representing the REST resource this object represents. + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object spec: - description: FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. + description: spec defines the desired state of FlavorGroupCapacity properties: availabilityZone: - description: AvailabilityZone is the OpenStack AZ this capacity data covers (e.g. "qa-de-1a"). + description: AvailabilityZone is the OpenStack AZ this capacity data + covers (e.g. "qa-de-1a"). type: string flavorGroup: description: FlavorGroup is the name of the flavor group (e.g. "2101"). @@ -66,35 +76,46 @@ spec: - flavorGroup type: object status: - description: FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. + description: status defines the observed state of FlavorGroupCapacity properties: committedCapacity: - description: CommittedCapacity is the sum of AcceptedAmount across Ready=True CommittedResource CRDs. + description: CommittedCapacity is the sum of AcceptedAmount across + Ready=True CommittedResource CRDs. format: int64 - minimum: 0 type: integer conditions: - description: |- - Conditions represent the current state of the FlavorGroupCapacity. - The Ready condition indicates whether the status data is up-to-date. + description: The current status conditions of the FlavorGroupCapacity. items: - description: Condition contains details for one aspect of the current state of this API Resource. + description: Condition contains details for one aspect of the current + state of this API Resource. properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition transitioned from one status to another. + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. format: date-time type: string message: - description: message is a human readable message indicating details about the transition. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string observedGeneration: - description: observedGeneration represents the .metadata.generation that the condition was set based upon. + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. format: int64 minimum: 0 type: integer reason: - description: reason contains a programmatic identifier indicating the reason for the condition's last transition. + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. maxLength: 1024 minLength: 1 pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ @@ -119,17 +140,15 @@ spec: - type type: object type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map lastReconcileAt: - description: LastReconcileAt is the timestamp of the last successful reconcile. + description: LastReconcileAt is the timestamp of the last successful + reconcile. format: date-time type: string placeableHosts: - description: PlaceableHosts is the number of hosts still able to accept a new smallest-flavor VM. + description: PlaceableHosts is the number of hosts still able to accept + a new smallest-flavor VM. format: int64 - minimum: 0 type: integer totalCapacity: description: |- @@ -137,24 +156,23 @@ spec: Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across all hosts eligible for this flavor group (empty-state scheduler probe). format: int64 - minimum: 0 type: integer totalHosts: - description: TotalHosts is the number of hosts eligible for this flavor group in the empty-state probe. + description: TotalHosts is the number of hosts eligible for this flavor + group in the empty-state probe. format: int64 - minimum: 0 type: integer totalInstances: description: |- TotalInstances is the total number of VM instances running on hypervisors in this AZ, derived from Hypervisor CRD Status.Instances (not filtered by flavor group). format: int64 - minimum: 0 type: integer totalPlaceable: - description: TotalPlaceable is the schedulable slots remaining given current VM allocations. + description: |- + TotalPlaceable is the schedulable slots remaining given current VM allocations. + Computed from the current-state scheduler probe. format: int64 - minimum: 0 type: integer type: object required: diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index e5e8e352e..c6496227a 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -101,6 +101,7 @@ func (c *Controller) reconcileOne( hvByName map[string]hv1.Hypervisor, allHVs []hv1.Hypervisor, ) error { + smallestFlavor := groupData.SmallestFlavor smallestFlavorBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec if smallestFlavorBytes <= 0 { @@ -184,6 +185,7 @@ func (c *Controller) probeScheduler( hvByName map[string]hv1.Hypervisor, smallestFlavorBytes int64, ) (capacity, hosts int64, err error) { + resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ InstanceUUID: uuid.New().String(), ProjectID: "cortex-capacity-probe", @@ -198,7 +200,7 @@ func (c *Controller) probeScheduler( return 0, 0, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) } - hosts = int64(len(resp.Hosts)) //nolint:gosec + hosts = int64(len(resp.Hosts)) for _, hostName := range resp.Hosts { hv, ok := hvByName[hostName] if !ok { @@ -279,7 +281,7 @@ func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { if hv.Labels["topology.kubernetes.io/zone"] != az { continue } - total += int64(len(hv.Status.Instances)) //nolint:gosec + total += int64(len(hv.Status.Instances)) } return total } diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index ea8686c7e..c75b3e5c7 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -128,8 +128,8 @@ func TestAvailabilityZones(t *testing.T) { hvs := []hv1.Hypervisor{ *newHypervisor("h1", "az-a", 0), *newHypervisor("h2", "az-b", 0), - *newHypervisor("h3", "az-a", 0), // duplicate - {ObjectMeta: metav1.ObjectMeta{Name: "h4"}}, // no label + *newHypervisor("h3", "az-a", 0), // duplicate + {ObjectMeta: metav1.ObjectMeta{Name: "h4"}}, // no label } got := availabilityZones(hvs) want := []string{"az-a", "az-b"} @@ -165,10 +165,10 @@ func TestCountInstancesInAZ(t *testing.T) { func TestReconcileOne_CreatesCRD(t *testing.T) { const ( - groupName = "2101" - az = "qa-de-1a" - memMB = 4096 // 4 GiB - memBytes = int64(memMB) * 1024 * 1024 + groupName = "2101" + az = "qa-de-1a" + memMB = 4096 // 4 GiB + memBytes = int64(memMB) * 1024 * 1024 ) scheme := newTestScheme(t) @@ -501,10 +501,10 @@ func TestPackageLogVar(t *testing.T) { func TestSumCommittedCapacity(t *testing.T) { const ( - groupName = "2101" - az = "qa-de-1a" - memMB = 4096 - memBytes = int64(memMB) * 1024 * 1024 + groupName = "2101" + az = "qa-de-1a" + memMB = 4096 + memBytes = int64(memMB) * 1024 * 1024 ) newCR := func(name, group, zone string, state v1alpha1.CommitmentStatus, resType v1alpha1.CommittedResourceType, amount string, acceptedAmount string) *v1alpha1.CommittedResource { From 19f3e9dc4467bb9c20b113041055fd75f4b3b966 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 15:57:36 +0200 Subject: [PATCH 06/15] small fix --- api/v1alpha1/flavor_group_capacity_types.go | 2 +- .../crds/cortex.cloud_flavorgroupcapacities.yaml | 2 +- .../reservations/capacity/controller_test.go | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index edd04ca90..0ad70f701 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -14,7 +14,7 @@ const ( // FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. type FlavorGroupCapacitySpec struct { - // FlavorGroup is the name of the flavor group (e.g. "2101"). + // FlavorGroup is the name of the flavor group (e.g. "hana-v2"). // +kubebuilder:validation:Required FlavorGroup string `json:"flavorGroup"` diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index efa690a12..b41418895 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -69,7 +69,7 @@ spec: covers (e.g. "qa-de-1a"). type: string flavorGroup: - description: FlavorGroup is the name of the flavor group (e.g. "2101"). + description: FlavorGroup is the name of the flavor group (e.g. "hana-v2"). type: string required: - availabilityZone diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index c75b3e5c7..8d453ec33 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -165,7 +165,7 @@ func TestCountInstancesInAZ(t *testing.T) { func TestReconcileOne_CreatesCRD(t *testing.T) { const ( - groupName = "2101" + groupName = "hana-v2" az = "qa-de-1a" memMB = 4096 // 4 GiB memBytes = int64(memMB) * 1024 * 1024 @@ -221,7 +221,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { const ( - groupName = "2101" + groupName = "hana-v2" az = "qa-de-1a" memMB = 2048 ) @@ -274,7 +274,7 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { func TestReconcileOne_IdempotentUpdate(t *testing.T) { const ( - groupName = "2101" + groupName = "hana-v2" az = "qa-de-1a" memMB = 2048 memBytes = int64(memMB) * 1024 * 1024 @@ -334,7 +334,7 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { func TestReconcileAll_SkipsGroupsWithNoAZs(t *testing.T) { scheme := newTestScheme(t) - knowledge := newFlavorGroupKnowledge(t, "2101", 2048) + knowledge := newFlavorGroupKnowledge(t, "hana-v2", 2048) // No hypervisors → no AZs → reconcileAll returns without error fakeClient := fake.NewClientBuilder(). @@ -487,7 +487,7 @@ func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { groupData := compute.FlavorGroupFeature{ SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0}, } - err := c.reconcileOne(context.Background(), "2101", groupData, "az-a", nil, nil) + err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil) if err == nil { t.Error("expected error for zero-memory flavor") } @@ -501,7 +501,7 @@ func TestPackageLogVar(t *testing.T) { func TestSumCommittedCapacity(t *testing.T) { const ( - groupName = "2101" + groupName = "hana-v2" az = "qa-de-1a" memMB = 4096 memBytes = int64(memMB) * 1024 * 1024 From 6436aa51d04da0a0bb0bb94ce07f32a0603c3822 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 28 Apr 2026 16:37:28 +0200 Subject: [PATCH 07/15] pull over pipeline changes from old branch --- .../cortex-nova/templates/pipelines_kvm.yaml | 40 +++++++++++++++++++ helm/bundles/cortex-nova/values.yaml | 6 +++ .../filters/filter_has_enough_capacity.go | 28 ++++++++----- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 561d9fc3c..8078c069b 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -557,4 +557,44 @@ spec: VM is allocated get a higher weight, encouraging placement on pre-reserved failover capacity. For non-evacuation requests, this weigher has no effect. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: kvm-report-capacity +spec: + schedulingDomain: nova + description: | + This pipeline is used by the capacity controller to determine the + theoretical maximum capacity of each flavor group per availability zone, + as if all hosts were completely empty. It ignores current VM allocations + and all reservation blockings so that only raw hardware capacity is + considered. + type: filter-weigher + createDecisions: false + # Fetch all placement candidates, ignoring nova's preselection. + ignorePreselection: true + filters: + - name: filter_correct_az + description: | + Restricts host candidates to the requested availability zone. + - name: filter_has_enough_capacity + description: | + Filters hosts that cannot fit the flavor based on raw hardware capacity. + VM allocations and all reservation types are ignored to represent an + empty datacenter scenario. + params: + - {key: ignoreAllocations, boolValue: true} + - {key: ignoredReservationTypes, stringListValue: ["CommittedResourceReservation", "FailoverReservation"]} + - name: filter_has_requested_traits + description: | + Ensures hosts have the hardware traits required by the flavor. + - name: filter_capabilities + description: | + Ensures hosts meet the compute capabilities required by the flavor + extra specs (e.g., architecture, maxphysaddr bits). + - name: filter_status_conditions + description: | + Excludes hosts that are not ready or are disabled. + weighers: [] {{- end }} diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index f709bea5f..e08f3f1d4 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -134,6 +134,12 @@ cortex-scheduling-controllers: - committed-resource-reservations-controller - failover-reservations-controller - capacity-controller + # Pipeline used for the empty-state capacity probe (ignores allocations and reservations). + capacityTotalPipeline: "kvm-report-capacity" + # Pipeline used for the current-state capacity probe (considers current VM allocations). + capacityPlaceablePipeline: "kvm-general-purpose-load-balancing" + # How often the capacity controller re-runs its scheduler probes. + capacityReconcileInterval: 5m enabledTasks: - nova-history-cleanup-task # If true, the external scheduler API will limit the list of hosts in its diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index e6956609a..2ceb4944f 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -26,6 +26,10 @@ type FilterHasEnoughCapacityOpts struct { // When a reservation type is in this list, its capacity is not blocked. // Default: empty (all reservation types are considered) IgnoredReservationTypes []v1alpha1.ReservationType `json:"ignoredReservationTypes,omitempty"` + + // IgnoreAllocations skips subtracting current VM allocations from host capacity. + // When true, only raw hardware capacity is considered (empty datacenter scenario). + IgnoreAllocations bool `json:"ignoreAllocations,omitempty"` } func (FilterHasEnoughCapacityOpts) Validate() error { return nil } @@ -71,18 +75,20 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity } - // Subtract allocated resources. - for resourceName, allocated := range hv.Status.Allocation { - free, ok := freeResourcesByHost[hv.Name][resourceName] - if !ok { - traceLog.Error( - "hypervisor with allocation for unknown resource", - "host", hv.Name, "resource", resourceName, - ) - continue + // Subtract allocated resources (skip when ignoring allocations for empty-datacenter capacity queries). + if !s.Options.IgnoreAllocations { + for resourceName, allocated := range hv.Status.Allocation { + free, ok := freeResourcesByHost[hv.Name][resourceName] + if !ok { + traceLog.Error( + "hypervisor with allocation for unknown resource", + "host", hv.Name, "resource", resourceName, + ) + continue + } + free.Sub(allocated) + freeResourcesByHost[hv.Name][resourceName] = free } - free.Sub(allocated) - freeResourcesByHost[hv.Name][resourceName] = free } } From 393a6c309c4e7d4e0099bb424f678805bad1d526 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 10:27:59 +0200 Subject: [PATCH 08/15] fix --- .../reservations/capacity/controller.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index c6496227a..24767065b 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -144,12 +144,16 @@ func (c *Controller) reconcileOne( } patch := client.MergeFrom(existing.DeepCopy()) - existing.Status.TotalCapacity = totalCapacity - existing.Status.TotalHosts = totalHosts - existing.Status.TotalPlaceable = totalPlaceable - existing.Status.PlaceableHosts = placeableHosts - existing.Status.TotalInstances = totalInstances - existing.Status.CommittedCapacity = committedCapacity + if totalErr == nil { + existing.Status.TotalCapacity = totalCapacity + existing.Status.TotalHosts = totalHosts + existing.Status.TotalInstances = totalInstances + existing.Status.CommittedCapacity = committedCapacity + } + if placeableErr == nil { + existing.Status.TotalPlaceable = totalPlaceable + existing.Status.PlaceableHosts = placeableHosts + } existing.Status.LastReconcileAt = metav1.Now() freshCondition := metav1.Condition{ From f63f317aa951666fcd1e17c3de7565076d6eca8d Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 11:35:23 +0200 Subject: [PATCH 09/15] extending flavor group crd to every flavor --- api/v1alpha1/flavor_group_capacity_types.go | 46 ++++--- api/v1alpha1/zz_generated.deepcopy.go | 20 +++ .../cortex.cloud_flavorgroupcapacities.yaml | 72 ++++++----- .../reservations/capacity/controller.go | 119 +++++++++++------- .../reservations/capacity/controller_test.go | 100 +++++++++++---- .../reservations/capacity/metrics.go | 87 +++++++------ 6 files changed, 281 insertions(+), 163 deletions(-) diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index 0ad70f701..a7339dce2 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -23,36 +23,44 @@ type FlavorGroupCapacitySpec struct { AvailabilityZone string `json:"availabilityZone"` } -// FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. -type FlavorGroupCapacityStatus struct { - // TotalCapacity is the total schedulable slots in an empty-datacenter scenario. - // Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across - // all hosts eligible for this flavor group (empty-state scheduler probe). +// FlavorCapacityStatus holds per-flavor capacity numbers for one (flavor group × AZ) pair. +type FlavorCapacityStatus struct { + // FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). + FlavorName string `json:"flavorName"` + + // PlaceableHosts is the number of hosts that can still fit this flavor given current allocations. // +kubebuilder:validation:Optional - TotalCapacity int64 `json:"totalCapacity,omitempty"` + PlaceableHosts int64 `json:"placeableHosts,omitempty"` - // TotalHosts is the number of hosts eligible for this flavor group in the empty-state probe. + // PlaceableVMs is the number of VM slots remaining for this flavor given current allocations. // +kubebuilder:validation:Optional - TotalHosts int64 `json:"totalHosts,omitempty"` + PlaceableVMs int64 `json:"placeableVms,omitempty"` - // TotalPlaceable is the schedulable slots remaining given current VM allocations. - // Computed from the current-state scheduler probe. + // TotalCapacityHosts is the number of eligible hosts in an empty-datacenter scenario. // +kubebuilder:validation:Optional - TotalPlaceable int64 `json:"totalPlaceable,omitempty"` + TotalCapacityHosts int64 `json:"totalCapacityHosts,omitempty"` - // PlaceableHosts is the number of hosts still able to accept a new smallest-flavor VM. + // TotalCapacityVMSlots is the maximum number of VM slots in an empty-datacenter scenario. // +kubebuilder:validation:Optional - PlaceableHosts int64 `json:"placeableHosts,omitempty"` + TotalCapacityVMSlots int64 `json:"totalCapacityVmSlots,omitempty"` +} - // TotalInstances is the total number of VM instances running on hypervisors in this AZ, - // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). +// FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. +type FlavorGroupCapacityStatus struct { + // Flavors holds per-flavor capacity data for all flavors in the group. // +kubebuilder:validation:Optional - TotalInstances int64 `json:"totalInstances,omitempty"` + Flavors []FlavorCapacityStatus `json:"flavors,omitempty"` - // CommittedCapacity is the sum of AcceptedAmount across Ready=True CommittedResource CRDs. + // CommittedCapacity is the sum of AcceptedAmount across active CommittedResource CRDs, + // expressed in multiples of the smallest flavor's memory. // +kubebuilder:validation:Optional CommittedCapacity int64 `json:"committedCapacity,omitempty"` + // TotalInstances is the total number of VM instances running on hypervisors in this AZ, + // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). + // +kubebuilder:validation:Optional + TotalInstances int64 `json:"totalInstances,omitempty"` + // LastReconcileAt is the timestamp of the last successful reconcile. // +kubebuilder:validation:Optional LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"` @@ -67,9 +75,7 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:resource:scope=Cluster // +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup" // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" -// +kubebuilder:printcolumn:name="TotalCapacity",type="integer",JSONPath=".status.totalCapacity" -// +kubebuilder:printcolumn:name="TotalPlaceable",type="integer",JSONPath=".status.totalPlaceable" -// +kubebuilder:printcolumn:name="TotalHosts",type="integer",JSONPath=".status.totalHosts" +// +kubebuilder:printcolumn:name="TotalInstances",type="integer",JSONPath=".status.totalInstances" // +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index f995ec9be..e75332b77 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -749,6 +749,21 @@ func (in *FilterSpec) DeepCopy() *FilterSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorCapacityStatus) DeepCopyInto(out *FlavorCapacityStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorCapacityStatus. +func (in *FlavorCapacityStatus) DeepCopy() *FlavorCapacityStatus { + if in == nil { + return nil + } + out := new(FlavorCapacityStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FlavorGroupCapacity) DeepCopyInto(out *FlavorGroupCapacity) { *out = *in @@ -826,6 +841,11 @@ func (in *FlavorGroupCapacitySpec) DeepCopy() *FlavorGroupCapacitySpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus) { *out = *in + if in.Flavors != nil { + in, out := &in.Flavors, &out.Flavors + *out = make([]FlavorCapacityStatus, len(*in)) + copy(*out, *in) + } in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index b41418895..5f475689e 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -21,14 +21,8 @@ spec: - jsonPath: .spec.availabilityZone name: AZ type: string - - jsonPath: .status.totalCapacity - name: TotalCapacity - type: integer - - jsonPath: .status.totalPlaceable - name: TotalPlaceable - type: integer - - jsonPath: .status.totalHosts - name: TotalHosts + - jsonPath: .status.totalInstances + name: TotalInstances type: integer - jsonPath: .status.lastReconcileAt name: LastReconcile @@ -79,8 +73,9 @@ spec: description: status defines the observed state of FlavorGroupCapacity properties: committedCapacity: - description: CommittedCapacity is the sum of AcceptedAmount across - Ready=True CommittedResource CRDs. + description: |- + CommittedCapacity is the sum of AcceptedAmount across active CommittedResource CRDs, + expressed in multiples of the smallest flavor's memory. format: int64 type: integer conditions: @@ -140,40 +135,51 @@ spec: - type type: object type: array + flavors: + description: Flavors holds per-flavor capacity data for all flavors + in the group. + items: + description: FlavorCapacityStatus holds per-flavor capacity numbers + for one (flavor group × AZ) pair. + properties: + flavorName: + description: FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). + type: string + placeableHosts: + description: PlaceableHosts is the number of hosts that can + still fit this flavor given current allocations. + format: int64 + type: integer + placeableVms: + description: PlaceableVMs is the number of VM slots remaining + for this flavor given current allocations. + format: int64 + type: integer + totalCapacityHosts: + description: TotalCapacityHosts is the number of eligible hosts + in an empty-datacenter scenario. + format: int64 + type: integer + totalCapacityVmSlots: + description: TotalCapacityVMSlots is the maximum number of VM + slots in an empty-datacenter scenario. + format: int64 + type: integer + required: + - flavorName + type: object + type: array lastReconcileAt: description: LastReconcileAt is the timestamp of the last successful reconcile. format: date-time type: string - placeableHosts: - description: PlaceableHosts is the number of hosts still able to accept - a new smallest-flavor VM. - format: int64 - type: integer - totalCapacity: - description: |- - TotalCapacity is the total schedulable slots in an empty-datacenter scenario. - Computed as sum of floor(EffectiveCapacity.Memory / smallestFlavorMemory) across - all hosts eligible for this flavor group (empty-state scheduler probe). - format: int64 - type: integer - totalHosts: - description: TotalHosts is the number of hosts eligible for this flavor - group in the empty-state probe. - format: int64 - type: integer totalInstances: description: |- TotalInstances is the total number of VM instances running on hypervisors in this AZ, derived from Hypervisor CRD Status.Instances (not filtered by flavor group). format: int64 type: integer - totalPlaceable: - description: |- - TotalPlaceable is the schedulable slots remaining given current VM allocations. - Computed from the current-state scheduler probe. - format: int64 - type: integer type: object required: - spec diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 24767065b..8522f0036 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -6,6 +6,7 @@ package capacity import ( "context" "fmt" + "hash/fnv" "sort" "strings" "time" @@ -27,7 +28,7 @@ import ( var log = ctrl.Log.WithName("capacity-controller").WithValues("module", "capacity") // Controller reconciles FlavorGroupCapacity CRDs on a fixed interval. -// For each (flavor group × AZ) pair it runs two scheduler probes and updates the CRD status. +// For each (flavor group × AZ) pair it probes all flavors in the group and updates the CRD status. type Controller struct { client client.Client schedulerClient *reservations.SchedulerClient @@ -102,29 +103,13 @@ func (c *Controller) reconcileOne( allHVs []hv1.Hypervisor, ) error { - smallestFlavor := groupData.SmallestFlavor - smallestFlavorBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + smallestFlavorBytes := int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec if smallestFlavorBytes <= 0 { - return fmt.Errorf("smallest flavor %q has invalid memory %d MB", smallestFlavor.Name, smallestFlavor.MemoryMB) - } - - // Empty-state probe: scheduler ignores all current VM allocations. - totalCapacity, totalHosts, totalErr := c.probeScheduler(ctx, smallestFlavor, az, c.config.TotalPipeline, hvByName, smallestFlavorBytes) - - // Current-state probe: scheduler considers current VM allocations. - totalPlaceable, placeableHosts, placeableErr := c.probeScheduler(ctx, smallestFlavor, az, c.config.PlaceablePipeline, hvByName, smallestFlavorBytes) - - // Count total instances on hypervisors in this AZ. - totalInstances := countInstancesInAZ(allHVs, az) - - committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) - if committedErr != nil { - log.Error(committedErr, "failed to sum committed capacity", "flavorGroup", groupName, "az", az) - committedCapacity = 0 + return fmt.Errorf("smallest flavor %q has invalid memory %d MB", + groupData.SmallestFlavor.Name, groupData.SmallestFlavor.MemoryMB) } crdName := crdNameFor(groupName, az) - fresh := totalErr == nil && placeableErr == nil var existing v1alpha1.FlavorGroupCapacity err := c.client.Get(ctx, types.NamespacedName{Name: crdName}, &existing) @@ -143,35 +128,67 @@ func (c *Controller) reconcileOne( return fmt.Errorf("failed to get FlavorGroupCapacity %s: %w", crdName, err) } - patch := client.MergeFrom(existing.DeepCopy()) - if totalErr == nil { - existing.Status.TotalCapacity = totalCapacity - existing.Status.TotalHosts = totalHosts - existing.Status.TotalInstances = totalInstances - existing.Status.CommittedCapacity = committedCapacity + // Build a lookup of existing per-flavor data so we can preserve stale values on probe failure. + existingByName := make(map[string]v1alpha1.FlavorCapacityStatus, len(existing.Status.Flavors)) + for _, f := range existing.Status.Flavors { + existingByName[f.FlavorName] = f } - if placeableErr == nil { - existing.Status.TotalPlaceable = totalPlaceable - existing.Status.PlaceableHosts = placeableHosts + + // Probe all flavors in the group. Sort for stable CRD output. + flavors := make([]compute.FlavorInGroup, len(groupData.Flavors)) + copy(flavors, groupData.Flavors) + sort.Slice(flavors, func(i, j int) bool { return flavors[i].Name < flavors[j].Name }) + + allFresh := true + newFlavors := make([]v1alpha1.FlavorCapacityStatus, 0, len(flavors)) + for _, flavor := range flavors { + cur := existingByName[flavor.Name] + cur.FlavorName = flavor.Name + + totalVMSlots, totalHosts, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName) + placeableVMs, placeableHosts, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName) + + if totalErr != nil { + allFresh = false + } else { + cur.TotalCapacityVMSlots = totalVMSlots + cur.TotalCapacityHosts = totalHosts + } + if placeableErr != nil { + allFresh = false + } else { + cur.PlaceableVMs = placeableVMs + cur.PlaceableHosts = placeableHosts + } + newFlavors = append(newFlavors, cur) + } + + // Count total instances and committed capacity (always available regardless of probe results). + totalInstances := countInstancesInAZ(allHVs, az) + committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) + if committedErr != nil { + log.Error(committedErr, "failed to sum committed capacity", "flavorGroup", groupName, "az", az) + committedCapacity = 0 } + + patch := client.MergeFrom(existing.DeepCopy()) + existing.Status.Flavors = newFlavors + existing.Status.TotalInstances = totalInstances + existing.Status.CommittedCapacity = committedCapacity existing.Status.LastReconcileAt = metav1.Now() freshCondition := metav1.Condition{ Type: v1alpha1.FlavorGroupCapacityConditionReady, ObservedGeneration: existing.Generation, } - if fresh { + if allFresh { freshCondition.Status = metav1.ConditionTrue freshCondition.Reason = "ReconcileSucceeded" freshCondition.Message = "capacity data is up-to-date" } else { freshCondition.Status = metav1.ConditionFalse freshCondition.Reason = "ReconcileFailed" - if totalErr != nil { - freshCondition.Message = fmt.Sprintf("empty-state probe failed: %v", totalErr) - } else { - freshCondition.Message = fmt.Sprintf("current-state probe failed: %v", placeableErr) - } + freshCondition.Message = "one or more flavor probes failed" } meta.SetStatusCondition(&existing.Status.Conditions, freshCondition) @@ -181,15 +198,20 @@ func (c *Controller) reconcileOne( return nil } -// probeScheduler calls the scheduler with the given pipeline and returns capacity + host count. +// probeScheduler calls the scheduler with the given pipeline and returns VM slots + host count. +// Capacity is computed as sum of floor(hostMemory / flavorMemory) across returned hosts. func (c *Controller) probeScheduler( ctx context.Context, flavor compute.FlavorInGroup, az, pipeline string, hvByName map[string]hv1.Hypervisor, - smallestFlavorBytes int64, ) (capacity, hosts int64, err error) { + flavorBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec + if flavorBytes <= 0 { + return 0, 0, fmt.Errorf("flavor %q has invalid memory %d MB", flavor.Name, flavor.MemoryMB) + } + resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ InstanceUUID: uuid.New().String(), ProjectID: "cortex-capacity-probe", @@ -222,7 +244,7 @@ func (c *Controller) probeScheduler( continue } if capBytes := memCap.Value(); capBytes > 0 { - capacity += capBytes / smallestFlavorBytes + capacity += capBytes / flavorBytes } } return capacity, hosts, nil @@ -290,12 +312,19 @@ func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { return total } -// crdNameFor produces a valid DNS subdomain name for a (flavorGroup, az) pair. -// Underscores and dots are replaced with dashes; the result is lowercased. +// crdNameFor produces a collision-safe DNS label for a (flavorGroup, az) pair. +// A 6-hex-char FNV-1a hash of the raw inputs is appended so that pairs differing only +// by characters that sanitise identically (e.g. "." vs "-") still get unique names. func crdNameFor(flavorGroup, az string) string { - combined := flavorGroup + "-" + az - combined = strings.ToLower(combined) - combined = strings.ReplaceAll(combined, "_", "-") - combined = strings.ReplaceAll(combined, ".", "-") - return combined + h := fnv.New32a() + _, _ = h.Write([]byte(flavorGroup + "\x00" + az)) + suffix := fmt.Sprintf("%06x", h.Sum32()&0xFFFFFF) + + prefix := strings.ToLower(flavorGroup + "-" + az) + prefix = strings.ReplaceAll(prefix, "_", "-") + prefix = strings.ReplaceAll(prefix, ".", "-") + if len(prefix) > 56 { // 56 + "-" + 6 = 63 chars (DNS label limit) + prefix = prefix[:56] + } + return prefix + "-" + suffix } diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 8d453ec33..2cb15f3e7 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -8,6 +8,7 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "regexp" "sort" "testing" @@ -41,15 +42,17 @@ func newTestScheme(t *testing.T) *runtime.Scheme { // newFlavorGroupKnowledge creates a ready Knowledge CRD with a single flavor group. func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB uint64) *v1alpha1.Knowledge { t.Helper() + smallestFlavor := compute.FlavorInGroup{ + Name: groupName + "-small", + MemoryMB: smallestMemoryMB, + VCPUs: 2, + ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, + } features := []compute.FlavorGroupFeature{ { - Name: groupName, - SmallestFlavor: compute.FlavorInGroup{ - Name: groupName + "-small", - MemoryMB: smallestMemoryMB, - VCPUs: 2, - ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, - }, + Name: groupName, + SmallestFlavor: smallestFlavor, + Flavors: []compute.FlavorInGroup{smallestFlavor}, }, } raw, err := v1alpha1.BoxFeatureList(features) @@ -108,20 +111,45 @@ func newMockSchedulerServer(t *testing.T, hosts []string) *httptest.Server { // --- unit tests for pure helper functions --- +var ( + dnsLabelRE = regexp.MustCompile(`^[a-z0-9][a-z0-9-]{0,61}[a-z0-9]$`) + hashSuffixRE = regexp.MustCompile(`^[0-9a-f]{6}$`) +) + func TestCrdNameFor(t *testing.T) { tests := []struct { - group, az, want string + group, az string + wantPrefix string }{ - {"2101", "qa-de-1a", "2101-qa-de-1a"}, - {"My_Group", "eu.west.1", "my-group-eu-west-1"}, - {"G", "AZ_1", "g-az-1"}, + {"hana-v2", "qa-de-1a", "hana-v2-qa-de-1a-"}, + {"My_Group", "eu.west.1", "my-group-eu-west-1-"}, + {"G", "AZ_1", "g-az-1-"}, } for _, tt := range tests { got := crdNameFor(tt.group, tt.az) - if got != tt.want { - t.Errorf("crdNameFor(%q, %q) = %q, want %q", tt.group, tt.az, got, tt.want) + // Must be a valid DNS label (lowercase, hyphens, ≤63 chars). + if len(got) > 63 { + t.Errorf("crdNameFor(%q, %q) = %q (len=%d > 63)", tt.group, tt.az, got, len(got)) + } + if !dnsLabelRE.MatchString(got) { + t.Errorf("crdNameFor(%q, %q) = %q is not a valid DNS label", tt.group, tt.az, got) + } + // Must start with the expected sanitised prefix followed by a 6-hex-char hash suffix. + if len(got) < len(tt.wantPrefix)+6 || got[:len(tt.wantPrefix)] != tt.wantPrefix { + t.Errorf("crdNameFor(%q, %q) = %q, want prefix %q + 6 hex chars", tt.group, tt.az, got, tt.wantPrefix) + } + hashPart := got[len(tt.wantPrefix):] + if !hashSuffixRE.MatchString(hashPart) { + t.Errorf("crdNameFor(%q, %q) hash suffix %q is not 6 hex chars", tt.group, tt.az, hashPart) } } + + // Inputs that differ only by "." vs "-" must produce different CRD names. + dotName := crdNameFor("hana.v2", "qa-de-1a") + dashName := crdNameFor("hana-v2", "qa-de-1a") + if dotName == dashName { + t.Errorf("crdNameFor collision: hana.v2 and hana-v2 both produced %q", dotName) + } } func TestAvailabilityZones(t *testing.T) { @@ -191,8 +219,10 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { PlaceablePipeline: "kvm-general-purpose", }) + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} groupData := compute.FlavorGroupFeature{ - SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, } hvByName := map[string]hv1.Hypervisor{"host-1": *hv} @@ -200,23 +230,32 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { t.Fatalf("reconcileOne failed: %v", err) } - // Verify CRD was created with correct status var crd v1alpha1.FlavorGroupCapacity if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { t.Fatalf("failed to get CRD: %v", err) } - if crd.Status.TotalCapacity != 1 { - t.Errorf("TotalCapacity = %d, want 1", crd.Status.TotalCapacity) + if len(crd.Status.Flavors) != 1 { + t.Fatalf("len(Status.Flavors) = %d, want 1", len(crd.Status.Flavors)) + } + f := crd.Status.Flavors[0] + if f.FlavorName != groupName+"-small" { + t.Errorf("FlavorName = %q, want %q", f.FlavorName, groupName+"-small") + } + if f.TotalCapacityVMSlots != 1 { + t.Errorf("TotalCapacityVMSlots = %d, want 1", f.TotalCapacityVMSlots) + } + if f.TotalCapacityHosts != 1 { + t.Errorf("TotalCapacityHosts = %d, want 1", f.TotalCapacityHosts) } - if crd.Status.TotalHosts != 1 { - t.Errorf("TotalHosts = %d, want 1", crd.Status.TotalHosts) + if f.PlaceableVMs != 1 { + t.Errorf("PlaceableVMs = %d, want 1", f.PlaceableVMs) + } + if f.PlaceableHosts != 1 { + t.Errorf("PlaceableHosts = %d, want 1", f.PlaceableHosts) } if crd.Status.TotalInstances != 1 { t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances) } - if crd.Status.TotalPlaceable != 1 { - t.Errorf("TotalPlaceable = %d, want 1", crd.Status.TotalPlaceable) - } } func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { @@ -247,8 +286,10 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { PlaceablePipeline: "kvm-general-purpose", }) + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} groupData := compute.FlavorGroupFeature{ - SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, } // reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False @@ -309,8 +350,10 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { PlaceablePipeline: "kvm-general-purpose", }) + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} groupData := compute.FlavorGroupFeature{ - SmallestFlavor: compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB}, + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, } hvByName := map[string]hv1.Hypervisor{"host-1": *hv} @@ -327,8 +370,11 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdName}, &crd); err != nil { t.Fatalf("failed to get CRD: %v", err) } - if crd.Status.TotalCapacity != 1 { - t.Errorf("TotalCapacity = %d, want 1", crd.Status.TotalCapacity) + if len(crd.Status.Flavors) != 1 { + t.Fatalf("len(Status.Flavors) = %d, want 1", len(crd.Status.Flavors)) + } + if crd.Status.Flavors[0].TotalCapacityVMSlots != 1 { + t.Errorf("TotalCapacityVMSlots = %d, want 1", crd.Status.Flavors[0].TotalCapacityVMSlots) } } @@ -383,7 +429,7 @@ func TestProbeScheduler_CapacityCalculation(t *testing.T) { } flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} - capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName, memBytes) + capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName) if err != nil { t.Fatalf("probeScheduler failed: %v", err) } diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index 698d0ab9e..c33426b6d 100644 --- a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -11,40 +11,43 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -var capacityLabels = []string{"flavor_group", "az"} +var ( + capacityLabels = []string{"flavor_group", "az"} + capacityFlavorLabels = []string{"flavor_group", "az", "flavor_name"} +) // Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. // It implements prometheus.Collector and reads CRD status on each Collect call. type Monitor struct { - client client.Client - totalCapacity *prometheus.GaugeVec - totalPlaceable *prometheus.GaugeVec - totalHosts *prometheus.GaugeVec - placeableHosts *prometheus.GaugeVec - totalInstances *prometheus.GaugeVec - committedCapacity *prometheus.GaugeVec + client client.Client + totalCapacityVMSlots *prometheus.GaugeVec + placeableVMs *prometheus.GaugeVec + totalCapacityHosts *prometheus.GaugeVec + placeableHosts *prometheus.GaugeVec + totalInstances *prometheus.GaugeVec + committedCapacity *prometheus.GaugeVec } // NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. func NewMonitor(c client.Client) Monitor { return Monitor{ client: c, - totalCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + totalCapacityVMSlots: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_total", - Help: "Total schedulable slots in an empty-datacenter scenario per flavor group and AZ.", - }, capacityLabels), - totalPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Help: "Total schedulable slots in an empty-datacenter scenario per flavor.", + }, capacityFlavorLabels), + placeableVMs: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_placeable", - Help: "Schedulable slots remaining given current VM allocations per flavor group and AZ.", - }, capacityLabels), - totalHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Help: "Schedulable slots remaining given current VM allocations per flavor.", + }, capacityFlavorLabels), + totalCapacityHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_hosts_total", - Help: "Number of hosts eligible for this flavor group in the empty-state probe.", - }, capacityLabels), + Help: "Number of hosts eligible for this flavor in the empty-state probe.", + }, capacityFlavorLabels), placeableHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_hosts_placeable", - Help: "Number of hosts still able to accept a new smallest-flavor VM.", - }, capacityLabels), + Help: "Number of hosts still able to accept a new VM of this flavor.", + }, capacityFlavorLabels), totalInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_instances", Help: "Total VM instances running on hypervisors in this AZ (not filtered by flavor group).", @@ -58,9 +61,9 @@ func NewMonitor(c client.Client) Monitor { // Describe implements prometheus.Collector. func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { - m.totalCapacity.Describe(ch) - m.totalPlaceable.Describe(ch) - m.totalHosts.Describe(ch) + m.totalCapacityVMSlots.Describe(ch) + m.placeableVMs.Describe(ch) + m.totalCapacityHosts.Describe(ch) m.placeableHosts.Describe(ch) m.totalInstances.Describe(ch) m.committedCapacity.Describe(ch) @@ -75,29 +78,37 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { } // Reset all gauges so deleted CRDs don't linger. - m.totalCapacity.Reset() - m.totalPlaceable.Reset() - m.totalHosts.Reset() + m.totalCapacityVMSlots.Reset() + m.placeableVMs.Reset() + m.totalCapacityHosts.Reset() m.placeableHosts.Reset() m.totalInstances.Reset() m.committedCapacity.Reset() - for _, c := range list.Items { - labels := prometheus.Labels{ - "flavor_group": c.Spec.FlavorGroup, - "az": c.Spec.AvailabilityZone, + for _, crd := range list.Items { + groupAZLabels := prometheus.Labels{ + "flavor_group": crd.Spec.FlavorGroup, + "az": crd.Spec.AvailabilityZone, + } + m.totalInstances.With(groupAZLabels).Set(float64(crd.Status.TotalInstances)) + m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) + + for _, f := range crd.Status.Flavors { + flavorLabels := prometheus.Labels{ + "flavor_group": crd.Spec.FlavorGroup, + "az": crd.Spec.AvailabilityZone, + "flavor_name": f.FlavorName, + } + m.totalCapacityVMSlots.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) + m.placeableVMs.With(flavorLabels).Set(float64(f.PlaceableVMs)) + m.totalCapacityHosts.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) + m.placeableHosts.With(flavorLabels).Set(float64(f.PlaceableHosts)) } - m.totalCapacity.With(labels).Set(float64(c.Status.TotalCapacity)) - m.totalPlaceable.With(labels).Set(float64(c.Status.TotalPlaceable)) - m.totalHosts.With(labels).Set(float64(c.Status.TotalHosts)) - m.placeableHosts.With(labels).Set(float64(c.Status.PlaceableHosts)) - m.totalInstances.With(labels).Set(float64(c.Status.TotalInstances)) - m.committedCapacity.With(labels).Set(float64(c.Status.CommittedCapacity)) } - m.totalCapacity.Collect(ch) - m.totalPlaceable.Collect(ch) - m.totalHosts.Collect(ch) + m.totalCapacityVMSlots.Collect(ch) + m.placeableVMs.Collect(ch) + m.totalCapacityHosts.Collect(ch) m.placeableHosts.Collect(ch) m.totalInstances.Collect(ch) m.committedCapacity.Collect(ch) From 9a69822895a5fa730738b16254e72c4491eee8e3 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 11:47:07 +0200 Subject: [PATCH 10/15] fix --- .../scheduling/reservations/capacity/controller.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 8522f0036..337ab1aa5 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -21,6 +21,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" ) @@ -212,6 +213,14 @@ func (c *Controller) probeScheduler( return 0, 0, fmt.Errorf("flavor %q has invalid memory %d MB", flavor.Name, flavor.MemoryMB) } + // Build EligibleHosts from all known hypervisors so that novaLimitHostsToRequest + // (which filters the response to hosts present in the request) does not zero out + // the result. The AZ filter in the pipeline handles narrowing to the correct AZ. + eligibleHosts := make([]schedulerapi.ExternalSchedulerHost, 0, len(hvByName)) + for name := range hvByName { + eligibleHosts = append(eligibleHosts, schedulerapi.ExternalSchedulerHost{ComputeHost: name}) + } + resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ InstanceUUID: uuid.New().String(), ProjectID: "cortex-capacity-probe", @@ -221,6 +230,7 @@ func (c *Controller) probeScheduler( FlavorExtraSpecs: flavor.ExtraSpecs, AvailabilityZone: az, Pipeline: pipeline, + EligibleHosts: eligibleHosts, }) if err != nil { return 0, 0, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) From 78977addf4d22bb98b865833ab5aa52250b089aa Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 13:00:25 +0200 Subject: [PATCH 11/15] fix --- .../nova/plugins/filters/filter_has_enough_capacity.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 9e5e6b16d..b97d3e0e5 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -196,6 +196,10 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa // Oversize spec-only: if a pending VM is larger than the remaining slot, block its full size. var resourcesToBlock map[hv1.ResourceName]resource.Quantity if reservation.Spec.Type == v1alpha1.ReservationTypeCommittedResource && + // When ignoring allocations (empty-datacenter scenario) VM resources are not + // deducted, so the confirmed-VM adjustment would under-block: always use the + // full slot instead. + !s.Options.IgnoreAllocations && // if the reservation is not being migrated, block only unused resources reservation.Spec.TargetHost == reservation.Status.Host && reservation.Spec.CommittedResourceReservation != nil && From c51696aee1b74222c3c25d82df90dc148b600282 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 13:12:33 +0200 Subject: [PATCH 12/15] timeout to avoid blocking of API --- internal/scheduling/reservations/capacity/metrics.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index c33426b6d..bd13ca7ca 100644 --- a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -5,6 +5,7 @@ package capacity import ( "context" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/prometheus/client_golang/prometheus" @@ -72,7 +73,9 @@ func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { // Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges. func (m *Monitor) Collect(ch chan<- prometheus.Metric) { var list v1alpha1.FlavorGroupCapacityList - if err := m.client.List(context.Background(), &list); err != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := m.client.List(ctx, &list); err != nil { log.Error(err, "failed to list FlavorGroupCapacity CRDs for metrics") return } From ff70c00a0ba53dfe188bc3429d43e199cb891701 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 14:44:56 +0200 Subject: [PATCH 13/15] fix --- helm/bundles/cortex-nova/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 9360687ea..85e995ba6 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -95,6 +95,8 @@ cortex: &cortex - cortex.cloud/v1alpha1/ReservationList - cortex.cloud/v1alpha1/CommittedResource - cortex.cloud/v1alpha1/CommittedResourceList + - cortex.cloud/v1alpha1/FlavorGroupCapacity + - cortex.cloud/v1alpha1/FlavorGroupCapacityList - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList - v1/Secret From 642f9aaa6db87e26e68c9e1f492c87aff894fcf8 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Tue, 5 May 2026 15:03:23 +0200 Subject: [PATCH 14/15] fix --- internal/scheduling/reservations/capacity/config.go | 12 ++++++++---- .../scheduling/reservations/capacity/controller.go | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/internal/scheduling/reservations/capacity/config.go b/internal/scheduling/reservations/capacity/config.go index 2940f32e8..dc134e887 100644 --- a/internal/scheduling/reservations/capacity/config.go +++ b/internal/scheduling/reservations/capacity/config.go @@ -3,12 +3,16 @@ package capacity -import "time" +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) // Config holds configuration for the capacity controller. type Config struct { // ReconcileInterval is how often the controller probes the scheduler and updates CRDs. - ReconcileInterval time.Duration `json:"capacityReconcileInterval"` + ReconcileInterval metav1.Duration `json:"capacityReconcileInterval"` // TotalPipeline is the scheduler pipeline used for the empty-state probe. // This pipeline should ignore current VM allocations (e.g. kvm-report-capacity). @@ -25,7 +29,7 @@ type Config struct { // ApplyDefaults fills in any unset values with defaults. func (c *Config) ApplyDefaults() { defaults := DefaultConfig() - if c.ReconcileInterval == 0 { + if c.ReconcileInterval.Duration == 0 { c.ReconcileInterval = defaults.ReconcileInterval } if c.TotalPipeline == "" { @@ -41,7 +45,7 @@ func (c *Config) ApplyDefaults() { func DefaultConfig() Config { return Config{ - ReconcileInterval: 5 * time.Minute, + ReconcileInterval: metav1.Duration{Duration: 5 * time.Minute}, TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose-load-balancing", SchedulerURL: "http://localhost:8080/scheduler/nova/external", diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 337ab1aa5..b37597911 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -57,7 +57,7 @@ func (c *Controller) Start(ctx context.Context) error { if err := c.reconcileAll(ctx); err != nil { log.Error(err, "reconcile cycle failed") } - timer.Reset(c.config.ReconcileInterval) + timer.Reset(c.config.ReconcileInterval.Duration) } } } From 9d9a0bc54246021d661e104f385e375dfa567877 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Wed, 6 May 2026 16:21:30 +0200 Subject: [PATCH 15/15] lint --- internal/scheduling/reservations/capacity/controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index b37597911..7a013a0a0 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -20,8 +20,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "github.com/cobaltcore-dev/cortex/api/v1alpha1" schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" )