From 03b39945e04229ca11f63af9f59647126f0b6a4c Mon Sep 17 00:00:00 2001 From: Shreya2005-2005 Date: Sun, 10 May 2026 19:38:57 +0000 Subject: [PATCH] feat(metrics): add NodesByState, ReconciliationLatency, and BootstrapDuration metrics Add three Prometheus metrics missing from the controller, required for the SLO dashboard proposed in #182: - node_readiness_nodes_by_state{rule, state}: gauge tracking per-rule node counts by readiness state (ready/not_ready) - node_readiness_reconciliation_latency_seconds{rule, operation}: histogram tracking end-to-end latency of taint add/remove operations per rule - node_readiness_bootstrap_duration_seconds{rule}: histogram tracking time elapsed from bootstrap start to completion per rule Signed-off-by: Shreya Bhakat --- internal/controller/node_controller.go | 3 ++ .../nodereadinessrule_controller.go | 11 ++++++ internal/metrics/metrics.go | 34 ++++++++++++++++++- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index 9dc596a..c3ded01 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -334,6 +335,7 @@ func (r *RuleReadinessController) markBootstrapCompleted(ctx context.Context, no annotationKey := fmt.Sprintf("readiness.k8s.io/bootstrap-completed-%s", ruleName) marked := false + start := time.Now() // retry to handle conflict with concurrent node updates err := retry.RetryOnConflict(retry.DefaultRetry, func() error { @@ -371,6 +373,7 @@ func (r *RuleReadinessController) markBootstrapCompleted(ctx context.Context, no case marked: log.Info("Marked bootstrap completed", "node", nodeName, "rule", ruleName) metrics.BootstrapCompleted.WithLabelValues(ruleName).Inc() + metrics.BootstrapDuration.WithLabelValues(ruleName).Observe(time.Since(start).Seconds()) default: log.V(4).Info("Bootstrap already completed", "node", nodeName, "rule", ruleName) } diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 99347e9..f5626aa 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -337,11 +337,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule case shouldRemoveTaint && currentlyHasTaint: log.Info("Removing taint", "node", node.Name, "rule", rule.Name, "taint", rule.Spec.Taint.Key) + start := time.Now() if err = r.removeTaintBySpec(ctx, node, rule.Spec.Taint, rule.Name); err != nil { metrics.Failures.WithLabelValues(rule.Name, "RemoveTaintError").Inc() return fmt.Errorf("failed to remove taint: %w", err) } metrics.TaintOperations.WithLabelValues(rule.Name, "remove").Inc() + metrics.ReconciliationLatency.WithLabelValues(rule.Name, "remove").Observe(time.Since(start).Seconds()) // Mark bootstrap completed if bootstrap-only mode if rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly { @@ -351,11 +353,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule case !shouldRemoveTaint && !currentlyHasTaint: log.Info("Adding taint", "node", node.Name, "rule", rule.Name, "taint", rule.Spec.Taint.Key) + start := time.Now() if err = r.addTaintBySpec(ctx, node, rule.Spec.Taint, rule.Name); err != nil { metrics.Failures.WithLabelValues(rule.Name, "AddTaintError").Inc() return fmt.Errorf("failed to add taint: %w", err) } metrics.TaintOperations.WithLabelValues(rule.Name, "add").Inc() + metrics.ReconciliationLatency.WithLabelValues(rule.Name, "add").Observe(time.Since(start).Seconds()) case !shouldRemoveTaint && currentlyHasTaint: if isFirstEvaluation { @@ -381,6 +385,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule // Update evaluation status r.updateNodeEvaluationStatus(rule, node.Name, conditionResults, taintStatus) + // Update NodesByState metric + if taintStatus == readinessv1alpha1.TaintStatusAbsent { + metrics.NodesByState.WithLabelValues(rule.Name, "ready").Inc() + } else { + metrics.NodesByState.WithLabelValues(rule.Name, "not_ready").Inc() + } + return nil } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index bb0d906..79a8878 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -22,7 +22,7 @@ import ( ) var ( - // RulesTotal tracks the number of NodeReadinessRules . + // RulesTotal tracks the number of NodeReadinessRules. RulesTotal = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "node_readiness_rules_total", @@ -65,6 +65,35 @@ var ( }, []string{"rule"}, ) + + // NodesByState tracks the number of nodes per rule per readiness state. + NodesByState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_readiness_nodes_by_state", + Help: "Number of nodes per rule broken down by readiness state (ready, not_ready, bootstrapping)", + }, + []string{"rule", "state"}, + ) + + // ReconciliationLatency tracks end-to-end latency of taint operations per rule. + ReconciliationLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "node_readiness_reconciliation_latency_seconds", + Help: "End-to-end latency of taint add/remove operations per rule", + Buckets: prometheus.DefBuckets, + }, + []string{"rule", "operation"}, + ) + + // BootstrapDuration tracks time taken for a node to complete bootstrap per rule. + BootstrapDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "node_readiness_bootstrap_duration_seconds", + Help: "Time taken for a node to complete bootstrap per rule", + Buckets: prometheus.DefBuckets, + }, + []string{"rule"}, + ) ) func init() { @@ -74,4 +103,7 @@ func init() { metrics.Registry.MustRegister(EvaluationDuration) metrics.Registry.MustRegister(Failures) metrics.Registry.MustRegister(BootstrapCompleted) + metrics.Registry.MustRegister(NodesByState) + metrics.Registry.MustRegister(ReconciliationLatency) + metrics.Registry.MustRegister(BootstrapDuration) }