Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions internal/controller/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -334,6 +335,7 @@ func (r *RuleReadinessController) markBootstrapCompleted(ctx context.Context, no

annotationKey := fmt.Sprintf("readiness.k8s.io/bootstrap-completed-%s", ruleName)
marked := false
start := time.Now()

// retry to handle conflict with concurrent node updates
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
Expand Down Expand Up @@ -371,6 +373,7 @@ func (r *RuleReadinessController) markBootstrapCompleted(ctx context.Context, no
case marked:
log.Info("Marked bootstrap completed", "node", nodeName, "rule", ruleName)
metrics.BootstrapCompleted.WithLabelValues(ruleName).Inc()
metrics.BootstrapDuration.WithLabelValues(ruleName).Observe(time.Since(start).Seconds())
default:
log.V(4).Info("Bootstrap already completed", "node", nodeName, "rule", ruleName)
}
Expand Down
11 changes: 11 additions & 0 deletions internal/controller/nodereadinessrule_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,11 +337,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
case shouldRemoveTaint && currentlyHasTaint:
log.Info("Removing taint", "node", node.Name, "rule", rule.Name, "taint", rule.Spec.Taint.Key)

start := time.Now()
if err = r.removeTaintBySpec(ctx, node, rule.Spec.Taint, rule.Name); err != nil {
metrics.Failures.WithLabelValues(rule.Name, "RemoveTaintError").Inc()
return fmt.Errorf("failed to remove taint: %w", err)
}
metrics.TaintOperations.WithLabelValues(rule.Name, "remove").Inc()
metrics.ReconciliationLatency.WithLabelValues(rule.Name, "remove").Observe(time.Since(start).Seconds())

// Mark bootstrap completed if bootstrap-only mode
if rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly {
Expand All @@ -351,11 +353,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
case !shouldRemoveTaint && !currentlyHasTaint:
log.Info("Adding taint", "node", node.Name, "rule", rule.Name, "taint", rule.Spec.Taint.Key)

start := time.Now()
if err = r.addTaintBySpec(ctx, node, rule.Spec.Taint, rule.Name); err != nil {
metrics.Failures.WithLabelValues(rule.Name, "AddTaintError").Inc()
return fmt.Errorf("failed to add taint: %w", err)
}
metrics.TaintOperations.WithLabelValues(rule.Name, "add").Inc()
metrics.ReconciliationLatency.WithLabelValues(rule.Name, "add").Observe(time.Since(start).Seconds())

case !shouldRemoveTaint && currentlyHasTaint:
if isFirstEvaluation {
Expand All @@ -381,6 +385,13 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
// Update evaluation status
r.updateNodeEvaluationStatus(rule, node.Name, conditionResults, taintStatus)

// Update NodesByState metric
if taintStatus == readinessv1alpha1.TaintStatusAbsent {
metrics.NodesByState.WithLabelValues(rule.Name, "ready").Inc()
} else {
metrics.NodesByState.WithLabelValues(rule.Name, "not_ready").Inc()
}

return nil
}

Expand Down
34 changes: 33 additions & 1 deletion internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (
)

var (
// RulesTotal tracks the number of NodeReadinessRules .
// RulesTotal tracks the number of NodeReadinessRules.
RulesTotal = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "node_readiness_rules_total",
Expand Down Expand Up @@ -65,6 +65,35 @@ var (
},
[]string{"rule"},
)

// NodesByState is a gauge vector labelled by rule name and readiness state,
// exported as node_readiness_nodes_by_state.
//
// NOTE(review): the only call site visible in this change (the tail of
// evaluateRuleForNode) calls Inc() with state "ready" or "not_ready" after each
// evaluation and never decrements, resets, or Set()s the series. As written the
// value therefore grows with every evaluation rather than reflecting the
// current number of nodes in that state. The "bootstrapping" state named in
// Help is not emitted by any visible hunk. TODO confirm, then either Set() the
// gauge from a recomputed per-rule count or adjust the Help text.
NodesByState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_readiness_nodes_by_state",
Help: "Number of nodes per rule broken down by readiness state (ready, not_ready, bootstrapping)",
},
[]string{"rule", "state"},
)

// ReconciliationLatency is a histogram vector labelled by rule name and
// operation, exported as node_readiness_reconciliation_latency_seconds and
// bucketed with prometheus.DefBuckets.
//
// NOTE(review): the callers visible in this change start the timer immediately
// before removeTaintBySpec / addTaintBySpec and call Observe only after that
// call returns without error, using the operation values "remove" and "add".
// Samples therefore cover a single successful taint call, not the whole
// reconcile, and failed attempts are not recorded. Confirm that "end-to-end" in
// Help is the intended wording.
ReconciliationLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "node_readiness_reconciliation_latency_seconds",
Help: "End-to-end latency of taint add/remove operations per rule",
Buckets: prometheus.DefBuckets,
},
[]string{"rule", "operation"},
)

// BootstrapDuration is a histogram vector labelled by rule name, exported as
// node_readiness_bootstrap_duration_seconds and bucketed with
// prometheus.DefBuckets.
//
// NOTE(review): the caller visible in this change (markBootstrapCompleted)
// takes its start time just before the conflict-retry loop that writes the
// bootstrap-completed annotation, and observes time.Since(start) only when the
// annotation was newly marked. The recorded value is therefore the duration of
// the marking call, not the time the node took to bootstrap. If node bootstrap
// time is what is wanted, the start point needs to come from elsewhere, and the
// default buckets (presumably topping out at 10s; confirm) may be too
// fine-grained for it.
BootstrapDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "node_readiness_bootstrap_duration_seconds",
Help: "Time taken for a node to complete bootstrap per rule",
Buckets: prometheus.DefBuckets,
},
[]string{"rule"},
)
)

func init() {
Expand All @@ -74,4 +103,7 @@ func init() {
metrics.Registry.MustRegister(EvaluationDuration)
metrics.Registry.MustRegister(Failures)
metrics.Registry.MustRegister(BootstrapCompleted)
metrics.Registry.MustRegister(NodesByState)
metrics.Registry.MustRegister(ReconciliationLatency)
metrics.Registry.MustRegister(BootstrapDuration)
}