From a4c68d3217c6e22988fd737e1c988be80d25a735 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:27:34 -0400 Subject: [PATCH 01/24] Fix strings.SplitN with n=1 being a no-op SplitN(s, ".", 1) returns the original string unsplit since n=1 returns a single-element slice containing the whole string. Changed to n=2 to correctly extract the short hostname (everything before the first "."). This fixes hypervisor hostname comparisons that were incorrectly comparing full FQDNs against short hostnames, which could block onboarding or cause false "could not find exact match" errors. --- internal/controller/onboarding_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/controller/onboarding_controller.go b/internal/controller/onboarding_controller.go index 0e466e6..38a0004 100644 --- a/internal/controller/onboarding_controller.go +++ b/internal/controller/onboarding_controller.go @@ -415,7 +415,7 @@ func (r *OnboardingController) ensureNovaProperties(ctx context.Context, hv *kvm return errRequeue } - shortHypervisorAddress := strings.SplitN(hypervisorAddress, ".", 1)[0] + shortHypervisorAddress := strings.SplitN(hypervisorAddress, ".", 2)[0] hypervisorQuery := hypervisors.ListOpts{HypervisorHostnamePattern: &shortHypervisorAddress} hypervisorPages, err := hypervisors.List(r.computeClient, hypervisorQuery).AllPages(ctx) @@ -438,7 +438,7 @@ func (r *OnboardingController) ensureNovaProperties(ctx context.Context, hv *kvm var found = false var myHypervisor hypervisors.Hypervisor for _, h := range hs { - short := strings.SplitN(h.HypervisorHostname, ".", 1)[0] + short := strings.SplitN(h.HypervisorHostname, ".", 2)[0] if short == shortHypervisorAddress { myHypervisor = h found = true From 066f6889e5337762cb3a49e45d23b89a82cac37c Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:27:58 -0400 Subject: [PATCH 02/24] Move flag validation after flag.Parse() The validation checks for certificate-issuer-name and certificate-namespace were running before flag.Parse(), meaning user-provided values were not yet parsed. This made the validation ineffective - it would always pass since the variables had non-empty defaults. Moving the validation after flag.Parse() ensures that if a user passes an empty value (e.g., --certificate-issuer-name=""), it will be caught. --- cmd/main.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index fecb731..44ecc72 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -101,16 +101,6 @@ func main() { flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer", "Name of the certificate issuer.") - if certificateIssuerName == "" { - setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name") - os.Exit(1) - } - - if certificateNamespace == "" { - setupLog.Error(errors.New("certificate-namespace cannot be empty"), "invalid certificate namespace") - os.Exit(1) - } - opts := ctrlzap.Options{ Development: true, TimeEncoder: zapcore.ISO8601TimeEncoder, @@ -121,6 +111,16 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + if certificateIssuerName == "" { + setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name") + os.Exit(1) + } + + if certificateNamespace == "" { + setupLog.Error(errors.New("certificate-namespace cannot be empty"), "invalid certificate namespace") + os.Exit(1) + } + if version { os.Exit(0) } From f55f1466e531d6a1aab97ac9d1b5be3cb85fe1ec Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:28:25 -0400 Subject: [PATCH 03/24] Add 30-second timeout to HA service HTTP client The HA service HTTP call was using the default HTTP client with no timeout, which could hang indefinitely and block the reconciliation goroutine. Added a 30-second timeout to prevent this. --- internal/controller/utils.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 84e4272..9d19921 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -26,6 +26,7 @@ import ( "os" "slices" "strings" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -58,8 +59,9 @@ func updateInstanceHA(hypervisor *kvmv1.Hypervisor, data string, acceptedCodes [ } url := InstanceHaUrl(region, zone, hostname) + client := &http.Client{Timeout: 30 * time.Second} // G107: Potential HTTP request made with variable url - resp, err := http.Post(url, "application/json", bytes.NewBuffer([]byte(data))) //nolint:gosec,bodyclose + resp, err := client.Post(url, "application/json", bytes.NewBuffer([]byte(data))) //nolint:gosec,bodyclose if err != nil { return fmt.Errorf("failed to send request to ha service due to %w", err) } From 449eb7bab2473ad8e73b72b9c8b34deba192c906 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:29:26 -0400 Subject: [PATCH 04/24] Add retry-on-conflict for eviction status updates Unlike the Hypervisor status patches which use PatchHypervisorStatusWithRetry with exponential backoff retries, the Eviction status patches were using a direct optimistic-lock patch with no retry. Under contention this would fail with a conflict and require a full re-reconciliation. Added retry logic using the same StatusPatchBackoff configuration used by the Hypervisor controller. Also added PatchEvictionStatusWithRetry helper for future use. --- internal/controller/eviction_controller.go | 8 ++++++-- internal/utils/status_patch.go | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index d2e3898..b8fce0b 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -32,12 +32,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" logger "sigs.k8s.io/controller-runtime/pkg/log" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack" + "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils" ) // EvictionReconciler reconciles a Eviction object @@ -170,8 +172,10 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1. } func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction, base *kvmv1.Eviction) error { - return r.Status().Patch(ctx, eviction, client.MergeFromWithOptions(base, - client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName)) + return retry.RetryOnConflict(utils.StatusPatchBackoff, func() error { + return r.Status().Patch(ctx, eviction, client.MergeFromWithOptions(base, + client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName)) + }) } func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv1.Eviction, hv *kvmv1.Hypervisor) (ctrl.Result, error) { diff --git a/internal/utils/status_patch.go b/internal/utils/status_patch.go index e353f87..d1554c8 100644 --- a/internal/utils/status_patch.go +++ b/internal/utils/status_patch.go @@ -53,3 +53,19 @@ func PatchHypervisorStatusWithRetry(ctx context.Context, c k8sclient.Client, nam k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(fieldOwner)) }) } + +// PatchEvictionStatusWithRetry patches eviction status with retry on conflict. +// The updateFn receives a fresh copy of the eviction and should apply status changes to it. +// It re-fetches the resource before each retry attempt to get the latest resourceVersion. +func PatchEvictionStatusWithRetry(ctx context.Context, c k8sclient.Client, name, fieldOwner string, updateFn func(*kvmv1.Eviction)) error { + return retry.RetryOnConflict(StatusPatchBackoff, func() error { + eviction := &kvmv1.Eviction{} + if err := c.Get(ctx, k8sclient.ObjectKey{Name: name}, eviction); err != nil { + return err + } + base := eviction.DeepCopy() + updateFn(eviction) + return c.Status().Patch(ctx, eviction, k8sclient.MergeFromWithOptions(base, + k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(fieldOwner)) + }) +} From 2ecb9cfe28688486507ebf433ff45057146f3f44 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:29:47 -0400 Subject: [PATCH 05/24] Copy transferLabels slice before appending to avoid mutation The transferLabels slice is a package-level variable that was being mutated during SetupWithManager by appending custom label selector keys. While this is called only once during initialization, it could cause data races if tests run in parallel with -race flag. Now we copy the slice before appending to avoid mutating the original. --- internal/controller/hypervisor_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index 41213ad..cd2011c 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -195,6 +195,8 @@ func (hv *HypervisorController) SetupWithManager(mgr ctrl.Manager) error { return fmt.Errorf("failed to parse global label selector: %w", err) } + // Copy transferLabels before appending to avoid mutating the package-level slice + transferLabels = append([]string{}, transferLabels...) for _, requirement := range requirements { transferLabels = append(transferLabels, requirement.Key()) } From 6221b31beceea5504a2b6040a29b40973ac1db66 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:30:04 -0400 Subject: [PATCH 06/24] Fix PeriodSeconds(0) on startup probe PeriodSeconds of 0 is technically invalid in the Kubernetes API - the minimum is 1 second. Kubernetes would silently default it to 10 seconds, which may not be the intended behavior. Changed to explicit 1 second. --- internal/controller/gardener_node_lifecycle_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/gardener_node_lifecycle_controller.go b/internal/controller/gardener_node_lifecycle_controller.go index a59ed9b..9fb7d23 100644 --- a/internal/controller/gardener_node_lifecycle_controller.go +++ b/internal/controller/gardener_node_lifecycle_controller.go @@ -209,7 +209,7 @@ func (r *GardenerNodeLifecycleController) ensureSignallingDeployment(ctx context WithStartupProbe(corev1ac.Probe(). WithExec(corev1ac.ExecAction().WithCommand(command)). WithInitialDelaySeconds(0). - WithPeriodSeconds(0). + WithPeriodSeconds(1). WithFailureThreshold(1). WithSuccessThreshold(1)))))) From f191cada6a2172abdd0f48c669afe13d9183c589 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:30:20 -0400 Subject: [PATCH 07/24] Fix missing URL path separator in HA service URL If KVM_HA_SERVICE_URL doesn't end with a "/", the URL would be malformed (e.g., "https://exampleapi/hypervisors/host" instead of "https://example/api/hypervisors/host"). Now we ensure a trailing slash exists before appending the path. --- internal/controller/utils.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 9d19921..495931f 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -38,6 +38,9 @@ import ( func InstanceHaUrl(region, zone, hostname string) string { if haURL, found := os.LookupEnv("KVM_HA_SERVICE_URL"); found { + if !strings.HasSuffix(haURL, "/") { + haURL += "/" + } return haURL + "api/hypervisors/" + hostname } return fmt.Sprintf("https://kvm-ha-service-%v.%v.cloud.sap/api/hypervisors/%v", zone, region, hostname) From 27e1b11dce60ae6cab3b2a8deed69036075efc67 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:30:35 -0400 Subject: [PATCH 08/24] Fix wrong error message (image vs network) The error message said "could not list networks" but it was about finding the test image. Changed to "could not find test image". --- internal/controller/onboarding_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/onboarding_controller.go b/internal/controller/onboarding_controller.go index 38a0004..7e9fed7 100644 --- a/internal/controller/onboarding_controller.go +++ b/internal/controller/onboarding_controller.go @@ -503,7 +503,7 @@ func (r *OnboardingController) createOrGetTestServer(ctx context.Context, zone, imageRef, err := r.findTestImage(ctx) if err != nil { - return nil, fmt.Errorf("could not list networks: %w", err) + return nil, fmt.Errorf("could not find test image: %w", err) } networkRef, err := r.findTestNetwork(ctx) From b56c13c6fb3486ebdcdd9e0f111e890c5b9bd8cb Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:30:48 -0400 Subject: [PATCH 09/24] Fix missing "not" in error message Changed "could create certificate" to "could not create certificate". Also added missing colon before the error wrapping placeholder. --- internal/controller/node_certificate_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/node_certificate_controller.go b/internal/controller/node_certificate_controller.go index a822ef2..b8da375 100644 --- a/internal/controller/node_certificate_controller.go +++ b/internal/controller/node_certificate_controller.go @@ -171,7 +171,7 @@ func (r *NodeCertificateController) Reconcile(ctx context.Context, req ctrl.Requ }) if err != nil { - return ctrl.Result{}, fmt.Errorf("could create certificate %w", err) + return ctrl.Result{}, fmt.Errorf("could not create certificate: %w", err) } return ctrl.Result{}, nil From 0b87b7b12b1ac18a237369165d6649189c33c172 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:32:15 -0400 Subject: [PATCH 10/24] Consolidate duplicate difference functions into utils package Both internal/controller/utils.go and internal/openstack/aggregates.go had their own implementations of a set-difference function. The controller version used generics while the openstack version was string-specific. Moved the generic version to internal/utils/slices.go so it can be shared by both packages without circular imports. --- internal/controller/traits_controller.go | 4 +-- internal/controller/utils.go | 12 --------- internal/controller/utils_test.go | 4 ++- internal/openstack/aggregates.go | 16 +++--------- internal/utils/slices.go | 32 ++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 internal/utils/slices.go diff --git a/internal/controller/traits_controller.go b/internal/controller/traits_controller.go index 1191636..f3220a2 100644 --- a/internal/controller/traits_controller.go +++ b/internal/controller/traits_controller.go @@ -98,8 +98,8 @@ func (tc *TraitsController) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } - toAdd := Difference(customTraitsApplied, hv.Spec.CustomTraits) - toRemove := Difference(hv.Spec.CustomTraits, customTraitsApplied) + toAdd := utils.Difference(customTraitsApplied, hv.Spec.CustomTraits) + toRemove := utils.Difference(hv.Spec.CustomTraits, customTraitsApplied) // fetch current traits, to ensure we don't add duplicates current, err := resourceproviders.GetTraits(ctx, tc.serviceClient, hv.Status.HypervisorID).Extract() diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 495931f..c871cc9 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -119,18 +119,6 @@ func HasStatusCondition(conditions []metav1.Condition, conditionType string) boo return false } -// returns all elements in b not in a -func Difference[S ~[]E, E comparable](s1, s2 S) S { - diff := make(S, 0) - for item := range slices.Values(s2) { - if !slices.Contains(s1, item) { - diff = append(diff, item) - } - } - - return diff -} - // returns a OwnerReference for the given object and groupversionkind info as returned by apiutil.GVKForObject func OwnerReference(obj metav1.Object, gvk *schema.GroupVersionKind) *v1ac.OwnerReferenceApplyConfiguration { return v1ac.OwnerReference(). diff --git a/internal/controller/utils_test.go b/internal/controller/utils_test.go index 78ddabc..709d5a1 100644 --- a/internal/controller/utils_test.go +++ b/internal/controller/utils_test.go @@ -20,6 +20,8 @@ package controller import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + + "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils" ) var _ = Describe("Utils", func() { @@ -31,7 +33,7 @@ var _ = Describe("Utils", func() { By("returning all elements in b not in a") expectedDifference := []string{"e", "f"} - difference := Difference(sliceA, sliceB) + difference := utils.Difference(sliceA, sliceB) Expect(difference).To(Equal(expectedDifference)) }) }) diff --git a/internal/openstack/aggregates.go b/internal/openstack/aggregates.go index 079a26c..1515f48 100644 --- a/internal/openstack/aggregates.go +++ b/internal/openstack/aggregates.go @@ -26,6 +26,7 @@ import ( "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils" ) // ApplyAggregates ensures a host is in exactly the specified aggregates. @@ -75,8 +76,8 @@ func ApplyAggregates(ctx context.Context, serviceClient *gophercloud.ServiceClie } } - toAdd := difference(currentAggregates, desiredAggregates) - toRemove := difference(desiredAggregates, currentAggregates) + toAdd := utils.Difference(currentAggregates, desiredAggregates) + toRemove := utils.Difference(desiredAggregates, currentAggregates) // Verify all desired aggregates exist var missingAggregates []string @@ -135,14 +136,3 @@ func ApplyAggregates(ctx context.Context, serviceClient *gophercloud.ServiceClie return result, nil } - -// difference returns all elements in s2 that are not in s1 -func difference(s1, s2 []string) []string { - diff := make([]string, 0) - for _, item := range s2 { - if !slices.Contains(s1, item) { - diff = append(diff, item) - } - } - return diff -} diff --git a/internal/utils/slices.go b/internal/utils/slices.go new file mode 100644 index 0000000..8ab2997 --- /dev/null +++ b/internal/utils/slices.go @@ -0,0 +1,32 @@ +/* +SPDX-FileCopyrightText: Copyright 2024 SAP SE or an SAP affiliate company and cobaltcore-dev contributors +SPDX-License-Identifier: Apache-2.0 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import "slices" + +// Difference returns all elements in s2 that are not in s1. +func Difference[S ~[]E, E comparable](s1, s2 S) S { + diff := make(S, 0) + for item := range slices.Values(s2) { + if !slices.Contains(s1, item) { + diff = append(diff, item) + } + } + + return diff +} From 9ac818c11760e83b183fa412f53ad2220a978085 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:32:45 -0400 Subject: [PATCH 11/24] Remove dead code - unused logger in SetupWithManager functions Several SetupWithManager functions were fetching a logger from context and then immediately discarding it with `_`. Removed these unnecessary lines. --- internal/controller/aggregates_controller.go | 1 - internal/controller/hypervisor_instance_ha_controller.go | 3 --- internal/controller/hypervisor_maintenance_controller.go | 1 - internal/controller/traits_controller.go | 1 - 4 files changed, 6 deletions(-) diff --git a/internal/controller/aggregates_controller.go b/internal/controller/aggregates_controller.go index 0072ce6..af92345 100644 --- a/internal/controller/aggregates_controller.go +++ b/internal/controller/aggregates_controller.go @@ -221,7 +221,6 @@ func slicesEqualUnordered(a, b []string) bool { // SetupWithManager sets up the controller with the Manager. func (ac *AggregatesController) SetupWithManager(mgr ctrl.Manager) error { ctx := context.Background() - _ = logger.FromContext(ctx) var err error if ac.computeClient, err = openstack.GetServiceClient(ctx, "compute", nil); err != nil { diff --git a/internal/controller/hypervisor_instance_ha_controller.go b/internal/controller/hypervisor_instance_ha_controller.go index 831f713..9b622de 100644 --- a/internal/controller/hypervisor_instance_ha_controller.go +++ b/internal/controller/hypervisor_instance_ha_controller.go @@ -153,9 +153,6 @@ func (r *HypervisorInstanceHaController) Reconcile(ctx context.Context, req ctrl // SetupWithManager sets up the controller with the Manager. func (r *HypervisorInstanceHaController) SetupWithManager(mgr ctrl.Manager) error { - ctx := context.Background() - _ = logger.FromContext(ctx) - return ctrl.NewControllerManagedBy(mgr). Named(HypervisorInstanceHaControllerName). For(&kvmv1.Hypervisor{}). // trigger the r.Reconcile whenever a hypervisor is created/updated/deleted. diff --git a/internal/controller/hypervisor_maintenance_controller.go b/internal/controller/hypervisor_maintenance_controller.go index 5a4e55f..95fe6c8 100644 --- a/internal/controller/hypervisor_maintenance_controller.go +++ b/internal/controller/hypervisor_maintenance_controller.go @@ -248,7 +248,6 @@ func (hec *HypervisorMaintenanceController) ensureEviction(ctx context.Context, // SetupWithManager sets up the controller with the Manager. func (hec *HypervisorMaintenanceController) SetupWithManager(mgr ctrl.Manager) error { ctx := context.Background() - _ = logger.FromContext(ctx) var err error if hec.computeClient, err = openstack.GetServiceClient(ctx, "compute", nil); err != nil { diff --git a/internal/controller/traits_controller.go b/internal/controller/traits_controller.go index f3220a2..4721806 100644 --- a/internal/controller/traits_controller.go +++ b/internal/controller/traits_controller.go @@ -180,7 +180,6 @@ func getTraitCondition(err error, msg string) metav1.Condition { // SetupWithManager sets up the controller with the Manager. func (tc *TraitsController) SetupWithManager(mgr ctrl.Manager) error { ctx := context.Background() - _ = logger.FromContext(ctx) var err error if tc.serviceClient, err = openstack.GetServiceClient(ctx, "placement", nil); err != nil { From d0dbe39782ba5c21c7fd0668760a7d38626f62ff Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:38:58 -0400 Subject: [PATCH 12/24] Remove unused logger imports in aggregates and traits controllers --- internal/controller/aggregates_controller.go | 1 - internal/controller/traits_controller.go | 1 - 2 files changed, 2 deletions(-) diff --git a/internal/controller/aggregates_controller.go b/internal/controller/aggregates_controller.go index af92345..39fe73a 100644 --- a/internal/controller/aggregates_controller.go +++ b/internal/controller/aggregates_controller.go @@ -29,7 +29,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" - logger "sigs.k8s.io/controller-runtime/pkg/log" "github.com/gophercloud/gophercloud/v2" diff --git a/internal/controller/traits_controller.go b/internal/controller/traits_controller.go index 4721806..2c52a23 100644 --- a/internal/controller/traits_controller.go +++ b/internal/controller/traits_controller.go @@ -29,7 +29,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" - logger "sigs.k8s.io/controller-runtime/pkg/log" "github.com/gophercloud/gophercloud/v2" "github.com/gophercloud/gophercloud/v2/openstack/placement/v1/resourceproviders" From e44bcc2bc17bfaa1e0c95f6d36e796826cd20f1b Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:39:05 -0400 Subject: [PATCH 13/24] Fix eviction updateStatus to re-fetch resource on conflict retry The updateStatus method used a stale base object across retry attempts, meaning retries would always fail with the same conflict. Now re-fetches the latest eviction on each retry attempt, matching the pattern used by PatchHypervisorStatusWithRetry. --- internal/controller/eviction_controller.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index b8fce0b..2736e64 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -172,8 +172,17 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1. } func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction, base *kvmv1.Eviction) error { + // Capture the desired status to re-apply on each retry attempt + desiredStatus := eviction.Status.DeepCopy() return retry.RetryOnConflict(utils.StatusPatchBackoff, func() error { - return r.Status().Patch(ctx, eviction, client.MergeFromWithOptions(base, + // Re-fetch the latest version to get the current resourceVersion + freshEviction := &kvmv1.Eviction{} + if err := r.Get(ctx, client.ObjectKeyFromObject(eviction), freshEviction); err != nil { + return err + } + freshBase := freshEviction.DeepCopy() + freshEviction.Status = *desiredStatus + return r.Status().Patch(ctx, freshEviction, client.MergeFromWithOptions(freshBase, client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName)) }) } From dcf6719d79f59feffb3c753e059df8f47c1d02a4 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:40:00 -0400 Subject: [PATCH 14/24] Fix evictNext: add explicit return after deleting branch and fix error typo - Add explicit return after the 'deleting' branch so control flow does not fall through to the final return statement. Previously it worked by coincidence (err was always nil), but was fragile for future changes. - Fix typo: 'could update status' -> 'could not update status' - Replace dangling err reference at final return with explicit nil - Simplify if/else blocks by removing unnecessary else after return --- internal/controller/eviction_controller.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index 2736e64..274ada4 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -363,35 +363,32 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic Message: fmt.Sprintf("Live migration of terminating instance %s skipped", vm.ID), Reason: kvmv1.ConditionReasonFailed, }) - if err2 := r.updateStatus(ctx, eviction, base); err2 != nil { - return ctrl.Result{}, fmt.Errorf("could update status due to %w", err2) + if err := r.updateStatus(ctx, eviction, base); err != nil { + return ctrl.Result{}, fmt.Errorf("could not update status due to %w", err) } + return ctrl.Result{RequeueAfter: defaultPollTime}, nil } else if vm.Status == "ACTIVE" || vm.PowerState == 1 { log.Info("trigger live-migration") - err := r.liveMigrate(ctx, vm.ID, eviction) - if err != nil { + if err := r.liveMigrate(ctx, vm.ID, eviction); err != nil { if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { return ctrl.Result{}, err2 - } else { - return ctrl.Result{RequeueAfter: shortRetryTime}, nil } + return ctrl.Result{RequeueAfter: shortRetryTime}, nil } } else { log.Info("trigger cold-migration") - err := r.coldMigrate(ctx, vm.ID, eviction) - if err != nil { + if err := r.coldMigrate(ctx, vm.ID, eviction); err != nil { if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { return ctrl.Result{}, err2 - } else { - return ctrl.Result{RequeueAfter: shortRetryTime}, nil } + return ctrl.Result{RequeueAfter: shortRetryTime}, nil } } // Triggered a migration, give it a generous time to start, so we do not // see the old state because the migration didn't start log.Info("poll") - return ctrl.Result{RequeueAfter: defaultPollTime}, err + return ctrl.Result{RequeueAfter: defaultPollTime}, nil } func (r *EvictionReconciler) liveMigrate(ctx context.Context, uuid string, eviction *kvmv1.Eviction) error { From cc0bc4be49054f6183017c13b793dffb98b882c9 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:40:31 -0400 Subject: [PATCH 15/24] Return early when Hypervisor not found in GardenerNodeLifecycleController Previously, when the Hypervisor was not found, the controller continued with a zero-value struct and relied on LifecycleEnabled defaulting to false to exit early. Now explicitly returns when the Hypervisor is not found, making the intent clear and avoiding fragile zero-value reliance. --- internal/controller/gardener_node_lifecycle_controller.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/controller/gardener_node_lifecycle_controller.go b/internal/controller/gardener_node_lifecycle_controller.go index 9fb7d23..9966651 100644 --- a/internal/controller/gardener_node_lifecycle_controller.go +++ b/internal/controller/gardener_node_lifecycle_controller.go @@ -72,8 +72,12 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr } hv := &kvmv1.Hypervisor{} - if err := r.Get(ctx, k8sclient.ObjectKey{Name: req.Name}, hv); k8sclient.IgnoreNotFound(err) != nil { - return ctrl.Result{}, err + if err := r.Get(ctx, k8sclient.ObjectKey{Name: req.Name}, hv); err != nil { + if k8sclient.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + // Hypervisor not found, nothing to do + return ctrl.Result{}, nil } if !hv.Spec.LifecycleEnabled { From 3be6976d967485d7b8ee9dea666005595b65aed2 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:41:23 -0400 Subject: [PATCH 16/24] Fix --version flag to print version info and check it before validation The --version flag previously printed nothing and was checked after required flag validation, so running with --version and missing flags would error instead of printing the version. --- cmd/main.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 44ecc72..a5e383f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -111,6 +111,13 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + if version { + fmt.Printf("%s %s (%s/%s) %s\n", + bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH, + bininfo.CommitOr("edge")) + os.Exit(0) + } + if certificateIssuerName == "" { setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name") os.Exit(1) @@ -121,10 +128,6 @@ func main() { os.Exit(1) } - if version { - os.Exit(0) - } - ctrl.SetLogger(ctrlzap.New(ctrlzap.UseFlagOptions(&opts))) // if the enable-http2 flag is false (the default), http/2 should be disabled From 284b489b78f5d05964a4a1fc6077b0546bb12105 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:42:43 -0400 Subject: [PATCH 17/24] Remove dead code: unused functions, constants, and types Remove the following unused symbols: - ErrRetry variable (utils.go) - HasStatusCondition function (utils.go) - IsNodeConditionTrue function (utils.go) - labelEvictionRequired constant (constants.go) - labelEvictionApproved constant (constants.go) - PatchEvictionStatusWithRetry function (status_patch.go) --- internal/controller/constants.go | 6 ++---- internal/controller/utils.go | 17 ----------------- internal/utils/status_patch.go | 16 ---------------- 3 files changed, 2 insertions(+), 37 deletions(-) diff --git a/internal/controller/constants.go b/internal/controller/constants.go index 5151bd2..f4119dc 100644 --- a/internal/controller/constants.go +++ b/internal/controller/constants.go @@ -19,8 +19,6 @@ package controller // This should contain constants shared between controllers const ( - labelEvictionRequired = "cloud.sap/hypervisor-eviction-required" - labelEvictionApproved = "cloud.sap/hypervisor-eviction-succeeded" - labelHypervisor = "nova.openstack.cloud.sap/virt-driver" - testAggregateName = "tenant_filter_tests" + labelHypervisor = "nova.openstack.cloud.sap/virt-driver" + testAggregateName = "tenant_filter_tests" ) diff --git a/internal/controller/utils.go b/internal/controller/utils.go index c871cc9..ba6de2d 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -19,7 +19,6 @@ package controller import ( "bytes" - "errors" "fmt" "io" "net/http" @@ -85,11 +84,6 @@ func disableInstanceHA(hypervisor *kvmv1.Hypervisor) error { return updateInstanceHA(hypervisor, `{"enabled": false}`, []int{http.StatusOK, http.StatusNotFound}) } -// IsNodeConditionTrue returns true when the conditionType is present and set to `corev1.ConditionTrue` -func IsNodeConditionTrue(conditions []corev1.NodeCondition, conditionType corev1.NodeConditionType) bool { - return IsNodeConditionPresentAndEqual(conditions, conditionType, corev1.ConditionTrue) -} - // IsNodeConditionPresentAndEqual returns true when conditionType is present and equal to status. func IsNodeConditionPresentAndEqual(conditions []corev1.NodeCondition, conditionType corev1.NodeConditionType, status corev1.ConditionStatus) bool { for _, condition := range conditions { @@ -110,15 +104,6 @@ func FindNodeStatusCondition(conditions []corev1.NodeCondition, conditionType co return nil } -func HasStatusCondition(conditions []metav1.Condition, conditionType string) bool { - for _, condition := range conditions { - if condition.Type == conditionType { - return true - } - } - return false -} - // returns a OwnerReference for the given object and groupversionkind info as returned by apiutil.GVKForObject func OwnerReference(obj metav1.Object, gvk *schema.GroupVersionKind) *v1ac.OwnerReferenceApplyConfiguration { return v1ac.OwnerReference(). @@ -128,8 +113,6 @@ func OwnerReference(obj metav1.Object, gvk *schema.GroupVersionKind) *v1ac.Owner WithUID(obj.GetUID()) } -var ErrRetry = errors.New("ErrRetry") - // returns if any ManagedField of the object has been modified by kubectl func HasKubectlManagedFields(object *metav1.ObjectMeta) bool { for _, field := range object.GetManagedFields() { diff --git a/internal/utils/status_patch.go b/internal/utils/status_patch.go index d1554c8..e353f87 100644 --- a/internal/utils/status_patch.go +++ b/internal/utils/status_patch.go @@ -53,19 +53,3 @@ func PatchHypervisorStatusWithRetry(ctx context.Context, c k8sclient.Client, nam k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(fieldOwner)) }) } - -// PatchEvictionStatusWithRetry patches eviction status with retry on conflict. -// The updateFn receives a fresh copy of the eviction and should apply status changes to it. -// It re-fetches the resource before each retry attempt to get the latest resourceVersion. -func PatchEvictionStatusWithRetry(ctx context.Context, c k8sclient.Client, name, fieldOwner string, updateFn func(*kvmv1.Eviction)) error { - return retry.RetryOnConflict(StatusPatchBackoff, func() error { - eviction := &kvmv1.Eviction{} - if err := c.Get(ctx, k8sclient.ObjectKey{Name: name}, eviction); err != nil { - return err - } - base := eviction.DeepCopy() - updateFn(eviction) - return c.Status().Patch(ctx, eviction, k8sclient.MergeFromWithOptions(base, - k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(fieldOwner)) - }) -} From 4c735a0482287510281303f15355173ffad2b388 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:43:20 -0400 Subject: [PATCH 18/24] Fix typo: SanitzeReconcileErrorEncoder -> SanitizeReconcileErrorEncoder --- cmd/main.go | 2 +- internal/logger/logger.go | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index a5e383f..e0482b4 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -104,7 +104,7 @@ func main() { opts := ctrlzap.Options{ Development: true, TimeEncoder: zapcore.ISO8601TimeEncoder, - Encoder: logger.NewSanitzeReconcileErrorEncoder(zap.NewDevelopmentEncoderConfig()), + Encoder: logger.NewSanitizeReconcileErrorEncoder(zap.NewDevelopmentEncoderConfig()), StacktraceLevel: zap.DPanicLevel, } diff --git a/internal/logger/logger.go b/internal/logger/logger.go index 26e78b5..c2bd79c 100644 --- a/internal/logger/logger.go +++ b/internal/logger/logger.go @@ -22,16 +22,16 @@ import ( "go.uber.org/zap/zapcore" ) -func NewSanitzeReconcileErrorEncoder(cfg zapcore.EncoderConfig) zapcore.Encoder { - return &SanitzeReconcileErrorEncoder{zapcore.NewConsoleEncoder(cfg), cfg} +func NewSanitizeReconcileErrorEncoder(cfg zapcore.EncoderConfig) zapcore.Encoder { + return &SanitizeReconcileErrorEncoder{zapcore.NewConsoleEncoder(cfg), cfg} } -type SanitzeReconcileErrorEncoder struct { +type SanitizeReconcileErrorEncoder struct { zapcore.Encoder cfg zapcore.EncoderConfig } -func (e *SanitzeReconcileErrorEncoder) EncodeEntry(entry zapcore.Entry, fields []zapcore.Field) (*buffer.Buffer, error) { +func (e *SanitizeReconcileErrorEncoder) EncodeEntry(entry zapcore.Entry, fields []zapcore.Field) (*buffer.Buffer, error) { if entry.Message == "Reconcile error" { // Downgrade the log level to debug to avoid log spam entry.Level = zapcore.WarnLevel @@ -40,8 +40,8 @@ func (e *SanitzeReconcileErrorEncoder) EncodeEntry(entry zapcore.Entry, fields [ return e.Encoder.EncodeEntry(entry, fields) } -func (e *SanitzeReconcileErrorEncoder) Clone() zapcore.Encoder { - return &SanitzeReconcileErrorEncoder{ +func (e *SanitizeReconcileErrorEncoder) Clone() zapcore.Encoder { + return &SanitizeReconcileErrorEncoder{ Encoder: e.Encoder.Clone(), } } From 601ec02124dbf65e83be2fe9005496dcbe38f589 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:44:31 -0400 Subject: [PATCH 19/24] Fix Clone() to preserve cfg field in SanitizeReconcileErrorEncoder The Clone() method was not copying the cfg field, violating the Clone contract by returning an encoder with a zero-valued EncoderConfig. --- internal/logger/logger.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/logger/logger.go b/internal/logger/logger.go index c2bd79c..51371ba 100644 --- a/internal/logger/logger.go +++ b/internal/logger/logger.go @@ -43,5 +43,6 @@ func (e *SanitizeReconcileErrorEncoder) EncodeEntry(entry zapcore.Entry, fields func (e *SanitizeReconcileErrorEncoder) Clone() zapcore.Encoder { return &SanitizeReconcileErrorEncoder{ Encoder: e.Encoder.Clone(), + cfg: e.cfg, } } From 4518318f330beae6a0bb2af0dcd9cab8a921a419 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:44:53 -0400 Subject: [PATCH 20/24] Fix FindNodeStatusCondition to return pointer to slice element, not copy Previously returned a pointer to the range loop variable (a copy), meaning mutations through the pointer would not affect the original slice. Now indexes directly into the slice to return a stable pointer. --- internal/controller/utils.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/controller/utils.go b/internal/controller/utils.go index ba6de2d..3de5604 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -95,10 +95,11 @@ func IsNodeConditionPresentAndEqual(conditions []corev1.NodeCondition, condition } // FindNodeStatusCondition returns the condition of the given type if it exists. +// Note: Returns a pointer into the original slice element, not a copy. func FindNodeStatusCondition(conditions []corev1.NodeCondition, conditionType corev1.NodeConditionType) *corev1.NodeCondition { - for _, condition := range conditions { - if condition.Type == conditionType { - return &condition + for i := range conditions { + if conditions[i].Type == conditionType { + return &conditions[i] } } return nil From 179d588669548da5a07f36c26717a83b691dead1 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:45:36 -0400 Subject: [PATCH 21/24] Narrow RBAC annotations on status subresources to get;update;patch Status subresources don't support create, delete, list, or watch verbs. Remove over-broad permissions from hypervisor/status RBAC annotations across all controllers. --- internal/controller/aggregates_controller.go | 2 +- internal/controller/hypervisor_controller.go | 2 +- internal/controller/hypervisor_maintenance_controller.go | 2 +- internal/controller/hypervisor_taint_controller.go | 2 +- internal/controller/traits_controller.go | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/controller/aggregates_controller.go b/internal/controller/aggregates_controller.go index 39fe73a..7490a4a 100644 --- a/internal/controller/aggregates_controller.go +++ b/internal/controller/aggregates_controller.go @@ -48,7 +48,7 @@ type AggregatesController struct { } // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch -// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch func (ac *AggregatesController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { hv := &kvmv1.Hypervisor{} diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index cd2011c..bd79725 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -68,7 +68,7 @@ type HypervisorController struct { // +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := logger.FromContext(ctx).WithName(req.Name) diff --git a/internal/controller/hypervisor_maintenance_controller.go b/internal/controller/hypervisor_maintenance_controller.go index 95fe6c8..c35b54e 100644 --- a/internal/controller/hypervisor_maintenance_controller.go +++ b/internal/controller/hypervisor_maintenance_controller.go @@ -53,7 +53,7 @@ type HypervisorMaintenanceController struct { } // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch -// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=evictions,verbs=get;list;watch;create;update;patch;delete func (hec *HypervisorMaintenanceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { hv := &kvmv1.Hypervisor{} diff --git a/internal/controller/hypervisor_taint_controller.go b/internal/controller/hypervisor_taint_controller.go index ad15da4..7822b34 100644 --- a/internal/controller/hypervisor_taint_controller.go +++ b/internal/controller/hypervisor_taint_controller.go @@ -43,7 +43,7 @@ type HypervisorTaintController struct { } // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch -// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch +// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch func (r *HypervisorTaintController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { hypervisor := &kvmv1.Hypervisor{ diff --git a/internal/controller/traits_controller.go b/internal/controller/traits_controller.go index 2c52a23..6da55b8 100644 --- a/internal/controller/traits_controller.go +++ b/internal/controller/traits_controller.go @@ -50,7 +50,7 @@ type TraitsController struct { } // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch -// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch func (tc *TraitsController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { hv := &kvmv1.Hypervisor{} From 2fc05c6cc42001c2ff2a6979a2cce6c2991c8641 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Mon, 27 Apr 2026 23:49:00 -0400 Subject: [PATCH 22/24] Remove unused base parameter from eviction updateStatus and handleNotFound After the updateStatus refactor to re-fetch resources on retry, the base parameter is no longer used. Remove it along with the now-unnecessary DeepCopy calls at each call site. Also remove unused gosec nolint directive in utils.go. --- internal/controller/eviction_controller.go | 39 +++++++++------------- internal/controller/utils.go | 2 +- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index 274ada4..733a69e 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -118,7 +118,6 @@ func (r *EvictionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c statusCondition := meta.FindStatusCondition(eviction.Status.Conditions, kvmv1.ConditionTypeEvicting) if statusCondition == nil { - base := eviction.DeepCopy() meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ Type: kvmv1.ConditionTypeEvicting, Status: metav1.ConditionTrue, @@ -126,7 +125,7 @@ func (r *EvictionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c Reason: kvmv1.ConditionReasonRunning, }) - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } switch statusCondition.Status { @@ -158,7 +157,6 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1. return r.evictNext(ctx, eviction) } - base := eviction.DeepCopy() meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ Type: kvmv1.ConditionTypeEvicting, Status: metav1.ConditionFalse, @@ -168,10 +166,10 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1. eviction.Status.OutstandingRamMb = 0 logger.FromContext(ctx).Info("succeeded") - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } -func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction, base *kvmv1.Eviction) error { +func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction *kvmv1.Eviction) error { // Capture the desired status to re-apply on each retry attempt desiredStatus := eviction.Status.DeepCopy() return retry.RetryOnConflict(utils.StatusPatchBackoff, func() error { @@ -188,7 +186,6 @@ func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction, base *k } func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv1.Eviction, hv *kvmv1.Hypervisor) (ctrl.Result, error) { - base := eviction.DeepCopy() expectHypervisor := hv.Status.HypervisorID != "" && hv.Status.ServiceID != "" // The hypervisor has been registered // If the hypervisor should exist, then we need to ensure it is disabled before we start evicting @@ -200,7 +197,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv Message: "hypervisor not disabled", Reason: kvmv1.ConditionReasonFailed, }) { - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } return ctrl.Result{RequeueAfter: defaultPollTime}, nil // Wait for hypervisor to be disabled } @@ -221,7 +218,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv Message: fmt.Sprintf("failed to get hypervisor %v", err), Reason: kvmv1.ConditionReasonFailed, }) - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } else { // That is (likely) an eviction for a node that never registered // so we are good to go @@ -234,7 +231,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv }) eviction.Status.OutstandingRamMb = 0 logger.FromContext(ctx).Info(msg) - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } } @@ -255,18 +252,15 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv Message: "Preflight checks passed, hypervisor is disabled and ready for eviction", Reason: kvmv1.ConditionReasonSucceeded, }) - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } // Tries to handle the NotFound-error by updating the status -func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction, base *kvmv1.Eviction, err error) error { +func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction *kvmv1.Eviction, err error) error { if !gophercloud.ResponseCodeIs(err, http.StatusNotFound) { return err } logger.FromContext(ctx).Info("Instance is gone") - if base == nil { - base = eviction.DeepCopy() - } var uuid string eviction.Status.OutstandingInstances, uuid = popInstance(eviction.Status.OutstandingInstances) if uuid == "" { @@ -278,11 +272,10 @@ func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction, base Message: fmt.Sprintf("Instance %s is gone", uuid), Reason: kvmv1.ConditionReasonSucceeded, }) - return r.updateStatus(ctx, eviction, base) + return r.updateStatus(ctx, eviction) } func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Eviction) (ctrl.Result, error) { - base := eviction.DeepCopy() uuid := peekInstance(eviction.Status.OutstandingInstances) if uuid == "" { return ctrl.Result{}, nil @@ -294,7 +287,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic vm, err := res.Extract() if err != nil { - if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { + if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil { return ctrl.Result{}, err2 } else { return ctrl.Result{RequeueAfter: shortRetryTime}, nil @@ -322,7 +315,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic }) return ctrl.Result{}, errors.Join(fmt.Errorf("error migrating instance %v", uuid), - r.updateStatus(ctx, eviction, base)) + r.updateStatus(ctx, eviction)) } currentHypervisor, _, _ := strings.Cut(vm.HypervisorHostname, ".") @@ -339,7 +332,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic // So, it is already off this one, do we need to verify it? if vm.Status == "VERIFY_RESIZE" { err := servers.ConfirmResize(ctx, r.computeClient, vm.ID).ExtractErr() - if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { + if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil { // Retry confirm in next reconciliation return ctrl.Result{}, err2 } else { @@ -350,7 +343,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic // All done eviction.Status.OutstandingInstances, _ = popInstance(eviction.Status.OutstandingInstances) - return ctrl.Result{}, r.updateStatus(ctx, eviction, base) + return ctrl.Result{}, r.updateStatus(ctx, eviction) } if vm.TaskState == "deleting" { //nolint:gocritic @@ -363,14 +356,14 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic Message: fmt.Sprintf("Live migration of terminating instance %s skipped", vm.ID), Reason: kvmv1.ConditionReasonFailed, }) - if err := r.updateStatus(ctx, eviction, base); err != nil { + if err := r.updateStatus(ctx, eviction); err != nil { return ctrl.Result{}, fmt.Errorf("could not update status due to %w", err) } return ctrl.Result{RequeueAfter: defaultPollTime}, nil } else if vm.Status == "ACTIVE" || vm.PowerState == 1 { log.Info("trigger live-migration") if err := r.liveMigrate(ctx, vm.ID, eviction); err != nil { - if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { + if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil { return ctrl.Result{}, err2 } return ctrl.Result{RequeueAfter: shortRetryTime}, nil @@ -378,7 +371,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic } else { log.Info("trigger cold-migration") if err := r.coldMigrate(ctx, vm.ID, eviction); err != nil { - if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil { + if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil { return ctrl.Result{}, err2 } return ctrl.Result{RequeueAfter: shortRetryTime}, nil diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 3de5604..c1ca4b4 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -63,7 +63,7 @@ func updateInstanceHA(hypervisor *kvmv1.Hypervisor, data string, acceptedCodes [ url := InstanceHaUrl(region, zone, hostname) client := &http.Client{Timeout: 30 * time.Second} // G107: Potential HTTP request made with variable url - resp, err := client.Post(url, "application/json", bytes.NewBuffer([]byte(data))) //nolint:gosec,bodyclose + resp, err := client.Post(url, "application/json", bytes.NewBuffer([]byte(data))) //nolint:bodyclose if err != nil { return fmt.Errorf("failed to send request to ha service due to %w", err) } From 31ed80b9c7286338f8bd0eadf74c42574e916bee Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Wed, 29 Apr 2026 22:28:04 -0400 Subject: [PATCH 23/24] Fix transferLabels to rebuild from immutable defaults on each setup Prevents accumulation of stale label selector keys when SetupWithManager is called multiple times (e.g., in tests with multiple managers). --- config/rbac/role.yaml | 11 ++++++++++- internal/controller/hypervisor_controller.go | 8 +++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 528d2cb..27fd80c 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -48,7 +48,6 @@ rules: resources: - evictions - hypervisors - - hypervisors/status verbs: - create - delete @@ -71,6 +70,16 @@ rules: - get - patch - update +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors/status + verbs: + - get + - list + - patch + - update + - watch - apiGroups: - policy resources: diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index bd79725..69ba861 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -48,7 +48,7 @@ const ( HypervisorControllerName = "hypervisor" ) -var transferLabels = []string{ +var defaultTransferLabels = []string{ corev1.LabelHostname, "kubernetes.metal.cloud.sap/bb", "kubernetes.metal.cloud.sap/cluster", @@ -60,6 +60,8 @@ var transferLabels = []string{ corev1.LabelTopologyZone, } +var transferLabels = append([]string{}, defaultTransferLabels...) + type HypervisorController struct { k8sclient.Client Scheme *runtime.Scheme @@ -195,8 +197,8 @@ func (hv *HypervisorController) SetupWithManager(mgr ctrl.Manager) error { return fmt.Errorf("failed to parse global label selector: %w", err) } - // Copy transferLabels before appending to avoid mutating the package-level slice - transferLabels = append([]string{}, transferLabels...) + // Rebuild from immutable defaults to avoid accumulating state across repeated calls + transferLabels = append([]string{}, defaultTransferLabels...) for _, requirement := range requirements { transferLabels = append(transferLabels, requirement.Key()) } From 82ce512f69bd4cb7298f36649ddbd85511aa30e9 Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Fri, 1 May 2026 09:35:56 -0400 Subject: [PATCH 24/24] Fix updateStatus stale snapshot and deduplicate transferLabels keys Apply desired status fields individually onto the freshly-fetched object in the retry closure instead of overwriting the entire status, preventing concurrent condition changes from being reverted. Add deduplication when appending label selector keys to transferLabels. --- internal/controller/eviction_controller.go | 10 +++++++--- internal/controller/hypervisor_controller.go | 5 ++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index 733a69e..ae502fa 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -170,16 +170,20 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1. } func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction *kvmv1.Eviction) error { - // Capture the desired status to re-apply on each retry attempt desiredStatus := eviction.Status.DeepCopy() return retry.RetryOnConflict(utils.StatusPatchBackoff, func() error { - // Re-fetch the latest version to get the current resourceVersion freshEviction := &kvmv1.Eviction{} if err := r.Get(ctx, client.ObjectKeyFromObject(eviction), freshEviction); err != nil { return err } freshBase := freshEviction.DeepCopy() - freshEviction.Status = *desiredStatus + // Apply desired conditions and scalar fields onto the fresh status + for _, c := range desiredStatus.Conditions { + meta.SetStatusCondition(&freshEviction.Status.Conditions, c) + } + freshEviction.Status.OutstandingInstances = desiredStatus.OutstandingInstances + freshEviction.Status.OutstandingRamMb = desiredStatus.OutstandingRamMb + freshEviction.Status.HypervisorServiceId = desiredStatus.HypervisorServiceId return r.Status().Patch(ctx, freshEviction, client.MergeFromWithOptions(freshBase, client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName)) }) diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index 69ba861..138aa0b 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -200,7 +200,10 @@ func (hv *HypervisorController) SetupWithManager(mgr ctrl.Manager) error { // Rebuild from immutable defaults to avoid accumulating state across repeated calls transferLabels = append([]string{}, defaultTransferLabels...) for _, requirement := range requirements { - transferLabels = append(transferLabels, requirement.Key()) + key := requirement.Key() + if !slices.Contains(transferLabels, key) { + transferLabels = append(transferLabels, key) + } } }