diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index a3e8aac..437f1a3 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -147,12 +147,13 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context "rule", rule.Name, "ruleResourceVersion", rule.ResourceVersion) - if err := r.evaluateRuleForNode(ctx, rule, node); err != nil { - log.Error(err, "Failed to evaluate rule for node", + evalErr := r.evaluateRuleForNode(ctx, rule, node) + if evalErr != nil { + log.Error(evalErr, "Failed to evaluate rule for node", "node", node.Name, "rule", rule.Name) // Continue with other rules even if one fails - r.recordNodeFailure(rule, node.Name, "EvaluationError", err.Error()) - errs = append(errs, err) + r.recordNodeFailure(rule, node.Name, "EvaluationError", evalErr.Error()) + errs = append(errs, evalErr) } // Persist the rule status @@ -169,28 +170,32 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context patch := client.MergeFrom(latestRule.DeepCopy()) - // update only this specific node evaluation status - currEval := readinessv1alpha1.NodeEvaluation{} - for _, eval := range rule.Status.NodeEvaluations { - if eval.NodeName == node.Name { - currEval = eval - break + // Upsert the node's evaluation only after a successful evaluation. + // On the failure path evaluateRuleForNode returns before recording a + // fresh NodeEvaluation, so this must leave any existing persisted + // evaluation untouched and only persist FailedNodes below. + if evalErr == nil { + var currEval *readinessv1alpha1.NodeEvaluation + for i := range rule.Status.NodeEvaluations { + if rule.Status.NodeEvaluations[i].NodeName == node.Name { + currEval = &rule.Status.NodeEvaluations[i] + break + } } - } - - found := false - for i := range latestRule.Status.NodeEvaluations { - if latestRule.Status.NodeEvaluations[i].NodeName == node.Name { - latestRule.Status.NodeEvaluations[i] = currEval - found = true - break + found := false + for i := range latestRule.Status.NodeEvaluations { + if latestRule.Status.NodeEvaluations[i].NodeName == node.Name { + latestRule.Status.NodeEvaluations[i] = *currEval + found = true + break + } + } + if !found { + latestRule.Status.NodeEvaluations = append( + latestRule.Status.NodeEvaluations, + *currEval, + ) } - } - if !found { - latestRule.Status.NodeEvaluations = append( - latestRule.Status.NodeEvaluations, - currEval, - ) } // handle status.FailedNodes for this node diff --git a/internal/controller/node_controller_test.go b/internal/controller/node_controller_test.go index 0036873..9961937 100644 --- a/internal/controller/node_controller_test.go +++ b/internal/controller/node_controller_test.go @@ -942,7 +942,6 @@ var _ = Describe("Node Controller", func() { Context("when rule status patch fails during node reconciliation", func() { It("should return an error when rule status patch fails", func() { ctx := context.Background() - testScheme := runtime.NewScheme() Expect(corev1.AddToScheme(testScheme)).To(Succeed()) Expect(nodereadinessiov1alpha1.AddToScheme(testScheme)).To(Succeed()) @@ -1007,4 +1006,224 @@ var _ = Describe("Node Controller", func() { Expect(err.Error()).To(ContainSubstring("status patch failed")) }) }) + + Context("status updates when evaluateRuleForNode fails", func() { + It("records the failure without writing an empty NodeEvaluation", func() { + ctx := context.Background() + testScheme := runtime.NewScheme() + Expect(corev1.AddToScheme(testScheme)).To(Succeed()) + Expect(nodereadinessiov1alpha1.AddToScheme(testScheme)).To(Succeed()) + + const ( + targetNode = "fail-eval-node" + untouchedNode = "untouched-node" + targetRule = "fail-eval-rule" + targetTaint = "readiness.k8s.io/fail-eval-taint" + ) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: targetNode, + Labels: map[string]string{"readiness-test": "fail-eval"}, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: "Ready", Status: corev1.ConditionFalse}, + }, + }, + } + + // Pre-existing evaluation for an unrelated node — must survive the + // failed reconcile of targetNode. + existingEval := nodereadinessiov1alpha1.NodeEvaluation{ + NodeName: untouchedNode, + TaintStatus: nodereadinessiov1alpha1.TaintStatusPresent, + LastEvaluationTime: metav1.Now(), + } + + rule := &nodereadinessiov1alpha1.NodeReadinessRule{ + ObjectMeta: metav1.ObjectMeta{Name: targetRule}, + Spec: nodereadinessiov1alpha1.NodeReadinessRuleSpec{ + Conditions: []nodereadinessiov1alpha1.ConditionRequirement{ + {Type: "Ready", RequiredStatus: corev1.ConditionTrue}, + }, + Taint: corev1.Taint{ + Key: targetTaint, + Effect: corev1.TaintEffectNoSchedule, + }, + NodeSelector: metav1.LabelSelector{ + MatchLabels: map[string]string{"readiness-test": "fail-eval"}, + }, + EnforcementMode: nodereadinessiov1alpha1.EnforcementModeContinuous, + }, + Status: nodereadinessiov1alpha1.NodeReadinessRuleStatus{ + NodeEvaluations: []nodereadinessiov1alpha1.NodeEvaluation{existingEval}, + }, + } + + // Fail Patch on the target node so addTaintBySpec returns a + // non-conflict error and evaluateRuleForNode exits before + // updateNodeEvaluationStatus runs. Rule status patches go through. + fc := fakeclient.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(node, rule). + WithStatusSubresource(rule). + WithInterceptorFuncs(interceptor.Funcs{ + Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + if _, ok := obj.(*corev1.Node); ok && obj.GetName() == targetNode { + return apierrors.NewForbidden(corev1.Resource("nodes"), obj.GetName(), nil) + } + return c.Patch(ctx, obj, patch, opts...) + }, + }). + Build() + + controller := &RuleReadinessController{ + Client: fc, + Scheme: testScheme, + clientset: fake.NewSimpleClientset(), + ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), + EventRecorder: record.NewFakeRecorder(10), + } + controller.updateRuleCache(ctx, rule) + + reconciler := &NodeReconciler{ + Client: fc, + Scheme: testScheme, + Controller: controller, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: targetNode}, + }) + Expect(err).To(HaveOccurred(), + "Reconcile propagates the evaluation error so controller-runtime requeues") + + persisted := &nodereadinessiov1alpha1.NodeReadinessRule{} + Expect(fc.Get(ctx, types.NamespacedName{Name: targetRule}, persisted)).To(Succeed()) + + // FailedNodes update must land — that's how operators see the failure. + Expect(persisted.Status.FailedNodes).To(HaveLen(1)) + Expect(persisted.Status.FailedNodes[0].NodeName).To(Equal(targetNode)) + Expect(persisted.Status.FailedNodes[0].Reason).To(Equal("EvaluationError")) + + // No empty NodeEvaluation slipped in, and the unrelated entry + // survived untouched. + for _, eval := range persisted.Status.NodeEvaluations { + Expect(eval.NodeName).NotTo(BeEmpty(), + "empty NodeEvaluation would be rejected by CRD validation against a real API server") + Expect(eval.NodeName).NotTo(Equal(targetNode), + "failed evaluation must not write a NodeEvaluation for the target node") + } + Expect(persisted.Status.NodeEvaluations).To(ContainElement( + HaveField("NodeName", untouchedNode))) + }) + + It("preserves the existing target NodeEvaluation when recording a failure", func() { + ctx := context.Background() + testScheme := runtime.NewScheme() + Expect(corev1.AddToScheme(testScheme)).To(Succeed()) + Expect(nodereadinessiov1alpha1.AddToScheme(testScheme)).To(Succeed()) + + const ( + targetNode = "stale-eval-node" + targetRule = "stale-eval-rule" + targetTaint = "readiness.k8s.io/stale-eval-taint" + ) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: targetNode, + Labels: map[string]string{"readiness-test": "stale-eval"}, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: "Ready", Status: corev1.ConditionFalse}, + }, + }, + } + + persistedEval := nodereadinessiov1alpha1.NodeEvaluation{ + NodeName: targetNode, + ConditionResults: []nodereadinessiov1alpha1.ConditionEvaluationResult{ + { + Type: "Ready", + CurrentStatus: corev1.ConditionTrue, + RequiredStatus: corev1.ConditionTrue, + }, + }, + TaintStatus: nodereadinessiov1alpha1.TaintStatusAbsent, + LastEvaluationTime: metav1.Now(), + } + staleCachedEval := persistedEval + staleCachedEval.TaintStatus = nodereadinessiov1alpha1.TaintStatusPresent + + rule := &nodereadinessiov1alpha1.NodeReadinessRule{ + ObjectMeta: metav1.ObjectMeta{Name: targetRule}, + Spec: nodereadinessiov1alpha1.NodeReadinessRuleSpec{ + Conditions: []nodereadinessiov1alpha1.ConditionRequirement{ + {Type: "Ready", RequiredStatus: corev1.ConditionTrue}, + }, + Taint: corev1.Taint{ + Key: targetTaint, + Effect: corev1.TaintEffectNoSchedule, + }, + NodeSelector: metav1.LabelSelector{ + MatchLabels: map[string]string{"readiness-test": "stale-eval"}, + }, + EnforcementMode: nodereadinessiov1alpha1.EnforcementModeContinuous, + }, + Status: nodereadinessiov1alpha1.NodeReadinessRuleStatus{ + NodeEvaluations: []nodereadinessiov1alpha1.NodeEvaluation{persistedEval}, + }, + } + cachedRule := rule.DeepCopy() + cachedRule.Status.NodeEvaluations = []nodereadinessiov1alpha1.NodeEvaluation{staleCachedEval} + + fc := fakeclient.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(node, rule). + WithStatusSubresource(rule). + WithInterceptorFuncs(interceptor.Funcs{ + Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + if _, ok := obj.(*corev1.Node); ok && obj.GetName() == targetNode { + return apierrors.NewForbidden(corev1.Resource("nodes"), obj.GetName(), nil) + } + return c.Patch(ctx, obj, patch, opts...) + }, + }). + Build() + + controller := &RuleReadinessController{ + Client: fc, + Scheme: testScheme, + clientset: fake.NewSimpleClientset(), + ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), + EventRecorder: record.NewFakeRecorder(10), + } + controller.updateRuleCache(ctx, cachedRule) + + reconciler := &NodeReconciler{ + Client: fc, + Scheme: testScheme, + Controller: controller, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: targetNode}, + }) + Expect(err).To(HaveOccurred(), + "Reconcile propagates the evaluation error so controller-runtime requeues") + + persisted := &nodereadinessiov1alpha1.NodeReadinessRule{} + Expect(fc.Get(ctx, types.NamespacedName{Name: targetRule}, persisted)).To(Succeed()) + + Expect(persisted.Status.FailedNodes).To(HaveLen(1)) + Expect(persisted.Status.FailedNodes[0].NodeName).To(Equal(targetNode)) + Expect(persisted.Status.NodeEvaluations).To(HaveLen(1)) + Expect(persisted.Status.NodeEvaluations[0].NodeName).To(Equal(targetNode)) + Expect(persisted.Status.NodeEvaluations[0].TaintStatus).To(Equal( + nodereadinessiov1alpha1.TaintStatusAbsent)) + }) + }) })