From 51131ceed66aa76d33e6a56d6757c4d9b6bc8919 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 6 Mar 2026 10:09:24 -0800 Subject: [PATCH] fix: how pod cleanup is handled --- .../internal/controller/pod_controller.go | 13 ++++ .../internal/controller/skyhook_controller.go | 14 ---- .../controller/skyhook_controller_test.go | 64 +++++++++++++++++++ 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/operator/internal/controller/pod_controller.go b/operator/internal/controller/pod_controller.go index f36c62ae..bf33bee4 100644 --- a/operator/internal/controller/pod_controller.go +++ b/operator/internal/controller/pod_controller.go @@ -155,6 +155,19 @@ func (r *SkyhookReconciler) UpdateNodeState(ctx context.Context, pod *corev1.Pod return false, fmt.Errorf("error creating node wrapper: %w", err) } + // If the node has no nodeState annotation, the annotation was intentionally deleted + // (e.g., manual node reset). The skyhook controller always creates the annotation + // before creating pods, so a missing annotation means external intervention. + // Don't recreate it - let the skyhook controller handle the reset via + // ValidateRunningPackages which will clean up stale pods. + initialState, err := skyhookNode.State() + if err != nil { + return false, fmt.Errorf("error reading node state: %w", err) + } + if initialState == nil { + return false, nil + } + updated := false if state == v1alpha1.StateComplete { updated, err = r.HandleCompletePod(ctx, skyhookNode, packagePtr, containerName) diff --git a/operator/internal/controller/skyhook_controller.go b/operator/internal/controller/skyhook_controller.go index 60dad57a..8c7ed829 100644 --- a/operator/internal/controller/skyhook_controller.go +++ b/operator/internal/controller/skyhook_controller.go @@ -2199,20 +2199,6 @@ func (r *SkyhookReconciler) ApplyPackage(ctx context.Context, logger logr.Logger } } - // if stage != v1alpha1.StageApply { - // // If a node gets rest by a user, the about method will return the wrong node state. Above sources it from the skyhook status. - // // check if the node has nothing, reset it then apply the package. - // nodeState, err := skyhookNode.State() - // if err != nil { - // return fmt.Errorf("error getting node state: %w", err) - // } - - // _, found := nodeState[_package.GetUniqueName()] - // if !found { - // stage = v1alpha1.StageApply - // } - // } - nextStage := skyhookNode.NextStage(_package) if nextStage != nil { stage = *nextStage diff --git a/operator/internal/controller/skyhook_controller_test.go b/operator/internal/controller/skyhook_controller_test.go index fabb7758..734c63bc 100644 --- a/operator/internal/controller/skyhook_controller_test.go +++ b/operator/internal/controller/skyhook_controller_test.go @@ -1578,6 +1578,70 @@ var _ = Describe("Resource Comparison", func() { }) }) +var _ = Describe("pod controller tests", func() { + It("should not recreate annotation when nodeState is missing (node reset)", func() { + nodeName := "test-node-reset" + + // Create a node WITHOUT any skyhook nodeState annotation + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + } + Expect(k8sClient.Create(ctx, node)).To(Succeed()) + defer func() { + _ = k8sClient.Delete(ctx, node) + }() + + // Build the package annotation for the pod + pkg := &PackageSkyhook{ + PackageRef: v1alpha1.PackageRef{ + Name: "bb", + Version: "1.2", + }, + Skyhook: "test-skyhook-reset", + Stage: v1alpha1.StageConfig, + Image: "ghcr.io/nvidia/skyhook/agentless", + } + pkgData, err := json.Marshal(pkg) + Expect(err).ToNot(HaveOccurred()) + + // Create a pod that looks like a completed skyhook pod on the node + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-reset-pod", + Namespace: opts.Namespace, + Labels: map[string]string{ + fmt.Sprintf("%s/name", v1alpha1.METADATA_PREFIX): "test-skyhook-reset", + }, + Annotations: map[string]string{ + fmt.Sprintf("%s/package", v1alpha1.METADATA_PREFIX): string(pkgData), + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{{Name: "test", Image: "busybox"}}, + }, + } + Expect(k8sClient.Create(ctx, pod)).To(Succeed()) + defer func() { + _ = k8sClient.Delete(ctx, pod) + }() + + // Call UpdateNodeState - should skip because nodeState annotation is missing + requeue, err := operator.UpdateNodeState(ctx, pod, v1alpha1.StateErroring, "bb-config", 0) + Expect(err).ToNot(HaveOccurred()) + Expect(requeue).To(BeFalse()) + + // Verify the node still has no nodeState annotation + var updatedNode corev1.Node + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, &updatedNode)).To(Succeed()) + annotationKey := fmt.Sprintf("%s/nodeState_test-skyhook-reset", v1alpha1.METADATA_PREFIX) + _, hasAnnotation := updatedNode.Annotations[annotationKey] + Expect(hasAnnotation).To(BeFalse(), "nodeState annotation should not be recreated after reset") + }) +}) + func TestGenerateValidPodNames(t *testing.T) { g := NewWithT(t)