From 261f9443e0fee650a339834ccac478ed22054cd3 Mon Sep 17 00:00:00 2001 From: Sahitya Chandra Date: Sun, 10 May 2026 21:49:45 +0530 Subject: [PATCH] fix: cleanup taints after node selector mismatch Signed-off-by: Sahitya Chandra --- internal/controller/node_controller.go | 79 ++++++++++++++++++- internal/controller/node_controller_test.go | 54 +++++++++++++ .../nodereadinessrule_controller.go | 32 ++++++-- 3 files changed, 155 insertions(+), 10 deletions(-) diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index a3e8aac..3aea2c5 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -109,12 +109,13 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context, node *corev1.Node) error { log := ctrl.LoggerFrom(ctx) - // Get all known (cached) applicable rules for this node - applicableRules := r.getApplicableRulesForNode(ctx, node) + // Get all known cached rules that either currently match this node or + // previously recorded status for it and may need cleanup. + rules := r.getRulesForNodeReconcile(ctx, node) var errs []error - log.Info("Processing node against rules", "node", node.Name, "ruleCount", len(applicableRules)) + log.Info("Processing node against rules", "node", node.Name, "ruleCount", len(rules)) - for _, rule := range applicableRules { + for _, rule := range rules { log.V(4).Info("Processing rule from cache", "node", node.Name, "rule", rule.Name, @@ -128,6 +129,15 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context continue } + if !r.ruleAppliesTo(ctx, rule, node) { + if err := r.cleanupNodeForUnmatchedRule(ctx, rule, node); err != nil { + log.Error(err, "Failed to cleanup node for unmatched rule", + "node", node.Name, "rule", rule.Name) + errs = append(errs, err) + } + continue + } + // Skip if bootstrap-only and already completed if r.isBootstrapCompleted(ctx, node.Name, rule.Name) && rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly { log.Info("Skipping bootstrap-only rule - already completed", @@ -228,6 +238,67 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context return errors.Join(errs...) } +func (r *RuleReadinessController) cleanupNodeForUnmatchedRule(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule, node *corev1.Node) error { + log := ctrl.LoggerFrom(ctx) + if !ruleHasNodeStatus(rule, node.Name) { + return nil + } + + if r.hasTaintBySpec(node, rule.Spec.Taint) { + log.Info("Removing taint from node that no longer matches rule selector", + "node", node.Name, + "rule", rule.Name, + "taint", rule.Spec.Taint.Key) + if err := r.removeTaintBySpec(ctx, node, rule.Spec.Taint, rule.Name); err != nil { + metrics.Failures.WithLabelValues(rule.Name, "RemoveTaintError").Inc() + return fmt.Errorf("failed to remove taint after selector mismatch: %w", err) + } + metrics.TaintOperations.WithLabelValues(rule.Name, "remove").Inc() + } + + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + latestRule := &readinessv1alpha1.NodeReadinessRule{} + if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, latestRule); err != nil { + return err + } + + patch := client.MergeFrom(latestRule.DeepCopy()) + removeNodeFromRuleStatus(latestRule, node.Name) + return r.Status().Patch(ctx, latestRule, patch) + }); err != nil { + return err + } + + removeNodeFromRuleStatus(rule, node.Name) + return nil +} + +func removeNodeFromRuleStatus(rule *readinessv1alpha1.NodeReadinessRule, nodeName string) { + var appliedNodes []string + for _, appliedNode := range rule.Status.AppliedNodes { + if appliedNode != nodeName { + appliedNodes = append(appliedNodes, appliedNode) + } + } + rule.Status.AppliedNodes = appliedNodes + + var nodeEvaluations []readinessv1alpha1.NodeEvaluation + for _, evaluation := range rule.Status.NodeEvaluations { + if evaluation.NodeName != nodeName { + nodeEvaluations = append(nodeEvaluations, evaluation) + } + } + rule.Status.NodeEvaluations = nodeEvaluations + + var failedNodes []readinessv1alpha1.NodeFailure + for _, failure := range rule.Status.FailedNodes { + if failure.NodeName != nodeName { + failedNodes = append(failedNodes, failure) + } + } + rule.Status.FailedNodes = failedNodes +} + // getConditionStatus gets the status of a condition on a node. func (r *RuleReadinessController) getConditionStatus(node *corev1.Node, conditionType string) corev1.ConditionStatus { for _, condition := range node.Status.Conditions { diff --git a/internal/controller/node_controller_test.go b/internal/controller/node_controller_test.go index 0036873..610cff8 100644 --- a/internal/controller/node_controller_test.go +++ b/internal/controller/node_controller_test.go @@ -344,6 +344,60 @@ var _ = Describe("Node Controller", func() { return false }, time.Second*5).Should(BeTrue()) }) + + It("should remove a managed taint when node labels stop matching the rule selector", func() { + _, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) + Expect(err).NotTo(HaveOccurred()) + + By("verifying the rule recorded this node as evaluated") + Eventually(func() bool { + updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: ruleName}, updatedRule); err != nil { + return false + } + for _, evaluation := range updatedRule.Status.NodeEvaluations { + if evaluation.NodeName == nodeName { + return true + } + } + return false + }, time.Second*5).Should(BeTrue()) + + By("changing the node labels so the rule no longer applies") + updatedNode := &corev1.Node{} + Expect(k8sClient.Get(ctx, namespacedName, updatedNode)).To(Succeed()) + updatedNode.Labels = map[string]string{"env": "other"} + Expect(k8sClient.Update(ctx, updatedNode)).To(Succeed()) + + _, err = nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) + Expect(err).NotTo(HaveOccurred()) + + By("verifying the taint managed by the rule is removed") + Eventually(func() bool { + recheckedNode := &corev1.Node{} + _ = k8sClient.Get(ctx, namespacedName, recheckedNode) + for _, taint := range recheckedNode.Spec.Taints { + if taint.Key == taintKey { + return true + } + } + return false + }, time.Second*5).Should(BeFalse()) + + By("verifying node-specific rule status is removed") + Eventually(func() bool { + updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: ruleName}, updatedRule); err != nil { + return false + } + for _, evaluation := range updatedRule.Status.NodeEvaluations { + if evaluation.NodeName == nodeName { + return false + } + } + return true + }, time.Second*5).Should(BeTrue()) + }) }) When("a rule's node selector does not match", func() { diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 09ab6e1..9954482 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -417,20 +417,40 @@ func (r *RuleReadinessController) updateNodeEvaluationStatus( nodeEval.LastEvaluationTime = metav1.Now() } -// getApplicableRulesForNode returns all rules applicable to a node. -func (r *RuleReadinessController) getApplicableRulesForNode(ctx context.Context, node *corev1.Node) []*readinessv1alpha1.NodeReadinessRule { +// getRulesForNodeReconcile returns rules applicable to a node plus rules that +// previously recorded this node and may need cleanup after label changes. +func (r *RuleReadinessController) getRulesForNodeReconcile(ctx context.Context, node *corev1.Node) []*readinessv1alpha1.NodeReadinessRule { r.ruleCacheMutex.RLock() defer r.ruleCacheMutex.RUnlock() - var applicableRules []*readinessv1alpha1.NodeReadinessRule + var rules []*readinessv1alpha1.NodeReadinessRule for _, rule := range r.ruleCache { - if r.ruleAppliesTo(ctx, rule, node) { - applicableRules = append(applicableRules, rule) + if r.ruleAppliesTo(ctx, rule, node) || ruleHasNodeStatus(rule, node.Name) { + rules = append(rules, rule) } } - return applicableRules + return rules +} + +func ruleHasNodeStatus(rule *readinessv1alpha1.NodeReadinessRule, nodeName string) bool { + for _, appliedNode := range rule.Status.AppliedNodes { + if appliedNode == nodeName { + return true + } + } + for _, evaluation := range rule.Status.NodeEvaluations { + if evaluation.NodeName == nodeName { + return true + } + } + for _, failure := range rule.Status.FailedNodes { + if failure.NodeName == nodeName { + return true + } + } + return false } // ruleAppliesTo checks if a rule applies to a node.