Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a4c68d3
Fix strings.SplitN with n=1 being a no-op
notandy Apr 28, 2026
066f688
Move flag validation after flag.Parse()
notandy Apr 28, 2026
f55f146
Add 30-second timeout to HA service HTTP client
notandy Apr 28, 2026
449eb7b
Add retry-on-conflict for eviction status updates
notandy Apr 28, 2026
2ecb9cf
Copy transferLabels slice before appending to avoid mutation
notandy Apr 28, 2026
6221b31
Fix PeriodSeconds(0) on startup probe
notandy Apr 28, 2026
f191cad
Fix missing URL path separator in HA service URL
notandy Apr 28, 2026
27e1b11
Fix wrong error message (image vs network)
notandy Apr 28, 2026
b56c13c
Fix missing "not" in error message
notandy Apr 28, 2026
0b87b7b
Consolidate duplicate difference functions into utils package
notandy Apr 28, 2026
9ac818c
Remove dead code - unused logger in SetupWithManager functions
notandy Apr 28, 2026
d0dbe39
Remove unused logger imports in aggregates and traits controllers
notandy Apr 28, 2026
e44bcc2
Fix eviction updateStatus to re-fetch resource on conflict retry
notandy Apr 28, 2026
dcf6719
Fix evictNext: add explicit return after deleting branch and fix erro…
notandy Apr 28, 2026
cc0bc4b
Return early when Hypervisor not found in GardenerNodeLifecycleContro…
notandy Apr 28, 2026
3be6976
Fix --version flag to print version info and check it before validation
notandy Apr 28, 2026
284b489
Remove dead code: unused functions, constants, and types
notandy Apr 28, 2026
4c735a0
Fix typo: SanitzeReconcileErrorEncoder -> SanitizeReconcileErrorEncoder
notandy Apr 28, 2026
601ec02
Fix Clone() to preserve cfg field in SanitizeReconcileErrorEncoder
notandy Apr 28, 2026
4518318
Fix FindNodeStatusCondition to return pointer to slice element, not copy
notandy Apr 28, 2026
179d588
Narrow RBAC annotations on status subresources to get;update;patch
notandy Apr 28, 2026
2fc05c6
Remove unused base parameter from eviction updateStatus and handleNot…
notandy Apr 28, 2026
31ed80b
Fix transferLabels to rebuild from immutable defaults on each setup
notandy Apr 30, 2026
82ce512
Fix updateStatus stale snapshot and deduplicate transferLabels keys
notandy May 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,30 +101,33 @@ func main() {
flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer",
"Name of the certificate issuer.")

if certificateIssuerName == "" {
setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name")
os.Exit(1)
}

if certificateNamespace == "" {
setupLog.Error(errors.New("certificate-namespace cannot be empty"), "invalid certificate namespace")
os.Exit(1)
}

opts := ctrlzap.Options{
Development: true,
TimeEncoder: zapcore.ISO8601TimeEncoder,
Encoder: logger.NewSanitzeReconcileErrorEncoder(zap.NewDevelopmentEncoderConfig()),
Encoder: logger.NewSanitizeReconcileErrorEncoder(zap.NewDevelopmentEncoderConfig()),
StacktraceLevel: zap.DPanicLevel,
}

opts.BindFlags(flag.CommandLine)
flag.Parse()

if version {
fmt.Printf("%s %s (%s/%s) %s\n",
bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH,
bininfo.CommitOr("edge"))
os.Exit(0)
}

if certificateIssuerName == "" {
setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name")
os.Exit(1)
}

if certificateNamespace == "" {
setupLog.Error(errors.New("certificate-namespace cannot be empty"), "invalid certificate namespace")
os.Exit(1)
}

ctrl.SetLogger(ctrlzap.New(ctrlzap.UseFlagOptions(&opts)))

// if the enable-http2 flag is false (the default), http/2 should be disabled
Expand Down
11 changes: 10 additions & 1 deletion config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ rules:
resources:
- evictions
- hypervisors
- hypervisors/status
verbs:
- create
- delete
Expand All @@ -71,6 +70,16 @@ rules:
- get
- patch
- update
- apiGroups:
- kvm.cloud.sap
resources:
- hypervisors/status
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- policy
resources:
Expand Down
4 changes: 1 addition & 3 deletions internal/controller/aggregates_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
logger "sigs.k8s.io/controller-runtime/pkg/log"

"github.com/gophercloud/gophercloud/v2"

Expand All @@ -49,7 +48,7 @@ type AggregatesController struct {
}

// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch

func (ac *AggregatesController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
hv := &kvmv1.Hypervisor{}
Expand Down Expand Up @@ -221,7 +220,6 @@ func slicesEqualUnordered(a, b []string) bool {
// SetupWithManager sets up the controller with the Manager.
func (ac *AggregatesController) SetupWithManager(mgr ctrl.Manager) error {
ctx := context.Background()
_ = logger.FromContext(ctx)

var err error
if ac.computeClient, err = openstack.GetServiceClient(ctx, "compute", nil); err != nil {
Expand Down
6 changes: 2 additions & 4 deletions internal/controller/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ package controller

// This should contain constants shared between controllers
const (
labelEvictionRequired = "cloud.sap/hypervisor-eviction-required"
labelEvictionApproved = "cloud.sap/hypervisor-eviction-succeeded"
labelHypervisor = "nova.openstack.cloud.sap/virt-driver"
testAggregateName = "tenant_filter_tests"
labelHypervisor = "nova.openstack.cloud.sap/virt-driver"
testAggregateName = "tenant_filter_tests"
)
77 changes: 42 additions & 35 deletions internal/controller/eviction_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logger "sigs.k8s.io/controller-runtime/pkg/log"

kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack"
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils"
)

// EvictionReconciler reconciles a Eviction object
Expand Down Expand Up @@ -116,15 +118,14 @@ func (r *EvictionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c

statusCondition := meta.FindStatusCondition(eviction.Status.Conditions, kvmv1.ConditionTypeEvicting)
if statusCondition == nil {
base := eviction.DeepCopy()
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
Type: kvmv1.ConditionTypeEvicting,
Status: metav1.ConditionTrue,
Message: "Running",
Reason: kvmv1.ConditionReasonRunning,
})

return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}

switch statusCondition.Status {
Expand Down Expand Up @@ -156,7 +157,6 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1.
return r.evictNext(ctx, eviction)
}

base := eviction.DeepCopy()
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
Type: kvmv1.ConditionTypeEvicting,
Status: metav1.ConditionFalse,
Expand All @@ -166,16 +166,30 @@ func (r *EvictionReconciler) handleRunning(ctx context.Context, eviction *kvmv1.

eviction.Status.OutstandingRamMb = 0
logger.FromContext(ctx).Info("succeeded")
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}

func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction, base *kvmv1.Eviction) error {
return r.Status().Patch(ctx, eviction, client.MergeFromWithOptions(base,
client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName))
func (r *EvictionReconciler) updateStatus(ctx context.Context, eviction *kvmv1.Eviction) error {
desiredStatus := eviction.Status.DeepCopy()
return retry.RetryOnConflict(utils.StatusPatchBackoff, func() error {
freshEviction := &kvmv1.Eviction{}
if err := r.Get(ctx, client.ObjectKeyFromObject(eviction), freshEviction); err != nil {
return err
}
freshBase := freshEviction.DeepCopy()
// Apply desired conditions and scalar fields onto the fresh status
for _, c := range desiredStatus.Conditions {
meta.SetStatusCondition(&freshEviction.Status.Conditions, c)
}
freshEviction.Status.OutstandingInstances = desiredStatus.OutstandingInstances
freshEviction.Status.OutstandingRamMb = desiredStatus.OutstandingRamMb
freshEviction.Status.HypervisorServiceId = desiredStatus.HypervisorServiceId
return r.Status().Patch(ctx, freshEviction, client.MergeFromWithOptions(freshBase,
client.MergeFromWithOptimisticLock{}), client.FieldOwner(EvictionControllerName))
})
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv1.Eviction, hv *kvmv1.Hypervisor) (ctrl.Result, error) {
base := eviction.DeepCopy()
expectHypervisor := hv.Status.HypervisorID != "" && hv.Status.ServiceID != "" // The hypervisor has been registered

// If the hypervisor should exist, then we need to ensure it is disabled before we start evicting
Expand All @@ -187,7 +201,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv
Message: "hypervisor not disabled",
Reason: kvmv1.ConditionReasonFailed,
}) {
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}
return ctrl.Result{RequeueAfter: defaultPollTime}, nil // Wait for hypervisor to be disabled
}
Expand All @@ -208,7 +222,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv
Message: fmt.Sprintf("failed to get hypervisor %v", err),
Reason: kvmv1.ConditionReasonFailed,
})
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
} else {
// That is (likely) an eviction for a node that never registered
// so we are good to go
Expand All @@ -221,7 +235,7 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv
})
eviction.Status.OutstandingRamMb = 0
logger.FromContext(ctx).Info(msg)
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}
}

Expand All @@ -242,18 +256,15 @@ func (r *EvictionReconciler) handlePreflight(ctx context.Context, eviction *kvmv
Message: "Preflight checks passed, hypervisor is disabled and ready for eviction",
Reason: kvmv1.ConditionReasonSucceeded,
})
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}

// Tries to handle the NotFound-error by updating the status
func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction, base *kvmv1.Eviction, err error) error {
func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction *kvmv1.Eviction, err error) error {
if !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
return err
}
logger.FromContext(ctx).Info("Instance is gone")
if base == nil {
base = eviction.DeepCopy()
}
var uuid string
eviction.Status.OutstandingInstances, uuid = popInstance(eviction.Status.OutstandingInstances)
if uuid == "" {
Expand All @@ -265,11 +276,10 @@ func (r *EvictionReconciler) handleNotFound(ctx context.Context, eviction, base
Message: fmt.Sprintf("Instance %s is gone", uuid),
Reason: kvmv1.ConditionReasonSucceeded,
})
return r.updateStatus(ctx, eviction, base)
return r.updateStatus(ctx, eviction)
}

func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Eviction) (ctrl.Result, error) {
base := eviction.DeepCopy()
uuid := peekInstance(eviction.Status.OutstandingInstances)
if uuid == "" {
return ctrl.Result{}, nil
Expand All @@ -281,7 +291,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic
vm, err := res.Extract()

if err != nil {
if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil {
if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil {
return ctrl.Result{}, err2
} else {
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
Expand Down Expand Up @@ -309,7 +319,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic
})

return ctrl.Result{}, errors.Join(fmt.Errorf("error migrating instance %v", uuid),
r.updateStatus(ctx, eviction, base))
r.updateStatus(ctx, eviction))
}

currentHypervisor, _, _ := strings.Cut(vm.HypervisorHostname, ".")
Expand All @@ -326,7 +336,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic
// So, it is already off this one, do we need to verify it?
if vm.Status == "VERIFY_RESIZE" {
err := servers.ConfirmResize(ctx, r.computeClient, vm.ID).ExtractErr()
if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil {
if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil {
// Retry confirm in next reconciliation
return ctrl.Result{}, err2
} else {
Expand All @@ -337,7 +347,7 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic

// All done
eviction.Status.OutstandingInstances, _ = popInstance(eviction.Status.OutstandingInstances)
return ctrl.Result{}, r.updateStatus(ctx, eviction, base)
return ctrl.Result{}, r.updateStatus(ctx, eviction)
}

if vm.TaskState == "deleting" { //nolint:gocritic
Expand All @@ -350,35 +360,32 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic
Message: fmt.Sprintf("Live migration of terminating instance %s skipped", vm.ID),
Reason: kvmv1.ConditionReasonFailed,
})
if err2 := r.updateStatus(ctx, eviction, base); err2 != nil {
return ctrl.Result{}, fmt.Errorf("could update status due to %w", err2)
if err := r.updateStatus(ctx, eviction); err != nil {
return ctrl.Result{}, fmt.Errorf("could not update status due to %w", err)
}
return ctrl.Result{RequeueAfter: defaultPollTime}, nil
} else if vm.Status == "ACTIVE" || vm.PowerState == 1 {
log.Info("trigger live-migration")
err := r.liveMigrate(ctx, vm.ID, eviction)
if err != nil {
if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil {
if err := r.liveMigrate(ctx, vm.ID, eviction); err != nil {
if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil {
return ctrl.Result{}, err2
} else {
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
}
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
}
} else {
log.Info("trigger cold-migration")
err := r.coldMigrate(ctx, vm.ID, eviction)
if err != nil {
if err2 := r.handleNotFound(ctx, eviction, base, err); err2 != nil {
if err := r.coldMigrate(ctx, vm.ID, eviction); err != nil {
if err2 := r.handleNotFound(ctx, eviction, err); err2 != nil {
return ctrl.Result{}, err2
} else {
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
}
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
}
}

// Triggered a migration, give it a generous time to start, so we do not
// see the old state because the migration didn't start
log.Info("poll")
return ctrl.Result{RequeueAfter: defaultPollTime}, err
return ctrl.Result{RequeueAfter: defaultPollTime}, nil
}

func (r *EvictionReconciler) liveMigrate(ctx context.Context, uuid string, eviction *kvmv1.Eviction) error {
Expand Down
10 changes: 7 additions & 3 deletions internal/controller/gardener_node_lifecycle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr
}

hv := &kvmv1.Hypervisor{}
if err := r.Get(ctx, k8sclient.ObjectKey{Name: req.Name}, hv); k8sclient.IgnoreNotFound(err) != nil {
return ctrl.Result{}, err
if err := r.Get(ctx, k8sclient.ObjectKey{Name: req.Name}, hv); err != nil {
if k8sclient.IgnoreNotFound(err) != nil {
return ctrl.Result{}, err
}
// Hypervisor not found, nothing to do
return ctrl.Result{}, nil
}

if !hv.Spec.LifecycleEnabled {
Expand Down Expand Up @@ -209,7 +213,7 @@ func (r *GardenerNodeLifecycleController) ensureSignallingDeployment(ctx context
WithStartupProbe(corev1ac.Probe().
WithExec(corev1ac.ExecAction().WithCommand(command)).
WithInitialDelaySeconds(0).
WithPeriodSeconds(0).
WithPeriodSeconds(1).
WithFailureThreshold(1).
WithSuccessThreshold(1))))))

Expand Down
13 changes: 10 additions & 3 deletions internal/controller/hypervisor_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const (
HypervisorControllerName = "hypervisor"
)

var transferLabels = []string{
var defaultTransferLabels = []string{
corev1.LabelHostname,
"kubernetes.metal.cloud.sap/bb",
"kubernetes.metal.cloud.sap/cluster",
Expand All @@ -60,6 +60,8 @@ var transferLabels = []string{
corev1.LabelTopologyZone,
}

var transferLabels = append([]string{}, defaultTransferLabels...)

type HypervisorController struct {
k8sclient.Client
Scheme *runtime.Scheme
Expand All @@ -68,7 +70,7 @@ type HypervisorController struct {
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch

func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := logger.FromContext(ctx).WithName(req.Name)
Expand Down Expand Up @@ -195,8 +197,13 @@ func (hv *HypervisorController) SetupWithManager(mgr ctrl.Manager) error {
return fmt.Errorf("failed to parse global label selector: %w", err)
}

// Rebuild from immutable defaults to avoid accumulating state across repeated calls
transferLabels = append([]string{}, defaultTransferLabels...)
for _, requirement := range requirements {
transferLabels = append(transferLabels, requirement.Key())
key := requirement.Key()
if !slices.Contains(transferLabels, key) {
transferLabels = append(transferLabels, key)
}
}
}

Expand Down
Loading
Loading