From 841658875a8ec37c437e0b3a64581d92f774d09b Mon Sep 17 00:00:00 2001 From: Kayd-06 Date: Tue, 5 May 2026 01:51:58 +0530 Subject: [PATCH] fix: data race in rule cache and stale failedNodes on recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix data race in getApplicableRulesForNode: the function was returning direct pointers into the ruleCache, which were then mutated by evaluateRuleForNode outside of the ruleCacheMutex. With both RuleReconciler and NodeReconciler running concurrently this could cause a concurrent slice-append panic or silent data corruption. Return a DeepCopy of each cached rule so callers work on an isolated copy. - Fix stale FailedNodes not cleared on successful evaluation: when a node's evaluation errored, a NodeFailure entry was added via recordNodeFailure. If the node later recovered, the failure entry was never removed – nodes would be permanently stuck in FailedNodes even after recovery. Add clearNodeFailure and call it in both processAllNodesForRule and processNodeAgainstAllRules whenever evaluateRuleForNode succeeds. Signed-off-by: Kayd-06 --- internal/controller/node_controller.go | 2 ++ .../controller/nodereadinessrule_controller.go | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index fc04d27..1d5d83a 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -148,6 +148,8 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context "node", node.Name, "rule", rule.Name) // Continue with other rules even if one fails r.recordNodeFailure(rule, node.Name, "EvaluationError", err.Error()) + } else { + r.clearNodeFailure(rule, node.Name) } // Persist the rule status diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 99347e9..2020d4f 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -277,6 +277,8 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru log.Error(err, "Failed to evaluate node for rule", "rule", rule.Name, "node", node.Name) r.recordNodeFailure(rule, node.Name, "EvaluationError", err.Error()) metrics.Failures.WithLabelValues(rule.Name, "EvaluationError").Inc() + } else { + r.clearNodeFailure(rule, node.Name) } } } @@ -422,7 +424,7 @@ func (r *RuleReadinessController) getApplicableRulesForNode(ctx context.Context, for _, rule := range r.ruleCache { if r.ruleAppliesTo(ctx, rule, node) { - applicableRules = append(applicableRules, rule) + applicableRules = append(applicableRules, rule.DeepCopy()) } } @@ -706,3 +708,14 @@ func (r *RuleReadinessController) getPreviousNodeEvaluation(rule *readinessv1alp } return nil } + +// clearNodeFailure removes any failure record for a specific node from the rule status. +func (r *RuleReadinessController) clearNodeFailure(rule *readinessv1alpha1.NodeReadinessRule, nodeName string) { + var failedNodes []readinessv1alpha1.NodeFailure + for _, failure := range rule.Status.FailedNodes { + if failure.NodeName != nodeName { + failedNodes = append(failedNodes, failure) + } + } + rule.Status.FailedNodes = failedNodes +}