From 3eeece0df49b0d0429ee8bdf5d0faa9823c1efdb Mon Sep 17 00:00:00 2001 From: Karthik Bhat Date: Mon, 9 Mar 2026 12:34:47 +0530 Subject: [PATCH] Add scale tests with monitoring --- cmd/readiness-condition-reporter/main_test.go | 1 - docs/book/src/operations/monitoring.md | 177 ++++++- hack/test-workloads/scale/Makefile | 283 ++++++++++ hack/test-workloads/scale/README.md | 489 ++++++++++++++++++ .../scale/cleanup-kwok-nodes-rules.sh | 51 ++ .../scale/grafana-dashboard.json | 406 +++++++++++++++ hack/test-workloads/scale/kind-config.yaml | 7 + hack/test-workloads/scale/scale-test.sh | 162 ++++++ hack/test-workloads/scale/servicemonitor.yaml | 21 + hack/test-workloads/scale/setup-monitoring.sh | 230 ++++++++ internal/controller/node_controller.go | 13 +- internal/controller/node_controller_test.go | 314 ++++------- .../nodereadinessrule_controller.go | 112 +++- internal/metrics/metrics.go | 72 ++- 14 files changed, 2091 insertions(+), 247 deletions(-) create mode 100644 hack/test-workloads/scale/Makefile create mode 100644 hack/test-workloads/scale/README.md create mode 100755 hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh create mode 100644 hack/test-workloads/scale/grafana-dashboard.json create mode 100644 hack/test-workloads/scale/kind-config.yaml create mode 100755 hack/test-workloads/scale/scale-test.sh create mode 100644 hack/test-workloads/scale/servicemonitor.yaml create mode 100755 hack/test-workloads/scale/setup-monitoring.sh diff --git a/cmd/readiness-condition-reporter/main_test.go b/cmd/readiness-condition-reporter/main_test.go index 51e42e9..2324a12 100644 --- a/cmd/readiness-condition-reporter/main_test.go +++ b/cmd/readiness-condition-reporter/main_test.go @@ -178,7 +178,6 @@ func TestUpdateNodeCondition(t *testing.T) { if foundCondition == nil { t.Fatal("Condition not found") } - if foundCondition.Status != tt.wantStatus { t.Errorf("Condition status = %v, want %v", foundCondition.Status, tt.wantStatus) } diff --git 
a/docs/book/src/operations/monitoring.md b/docs/book/src/operations/monitoring.md index 04608df..f49cc72 100644 --- a/docs/book/src/operations/monitoring.md +++ b/docs/book/src/operations/monitoring.md @@ -1,16 +1,48 @@ # Monitoring -Node Readiness Controller exposes Prometheus-compatible metrics. This page describes the Prometheus metrics exposed by Node Readiness Controller for monitoring rule evaluation, taint operations, failures, and bootstrap progress. +The Node Readiness Controller exposes Prometheus-compatible metrics. This page documents the metrics currently registered by the controller and how they can be used for monitoring rule evaluation, taint operations, failures, bootstrap progress, and rule health. ## Metrics Endpoint -The controller serves metrics on `/metrics` only when metrics are explicitly enabled. Depending on the installation, the endpoint is served either over HTTP or over HTTPS. See [Installation](../user-guide/installation.md) for deployment details. +The controller serves metrics on `/metrics` only when metrics are explicitly enabled. -## Supported Metrics +Depending on the installation, the endpoint is exposed as: + + - HTTP on port `8080` when the standard Prometheus component is enabled. + - HTTPS on port `8443` when the Prometheus TLS component is enabled. + +See [Installation](../user-guide/installation.md) for deployment details. + +## Metric Lifecycle Management + +When a `NodeReadinessRule` is deleted, the controller automatically cleans up the associated rule-labeled Prometheus series. This prevents stale metrics from remaining visible in dashboards and alerts.
+ +**Metrics cleaned up on rule deletion:** + + - `node_readiness_taint_operations_total{rule="..."}` + - `node_readiness_evaluation_duration_seconds{rule="..."}` + - `node_readiness_failures_total{rule="..."}` + - `node_readiness_bootstrap_completed_total{rule="..."}` + - `node_readiness_reconciliation_latency_seconds{rule="..."}` + - `node_readiness_bootstrap_duration_seconds{rule="..."}` + - `node_readiness_nodes_by_state{rule="..."}` + - `node_readiness_rule_last_reconciliation_timestamp_seconds{rule="..."}` + +This ensures that: + + - Deleted rules do not continue to appear in dashboards with stale values. + - Memory usage does not grow unbounded from removed rules. + - Metric cardinality stays bounded to the rules that currently exist. + +**Note:** The global `node_readiness_rules_total` gauge is updated separately. Rule-labeled metrics are explicitly deleted during rule cleanup. + +----- + +## Core Metrics ### `node_readiness_rules_total` -Number of `NodeReadinessRule` objects tracked by the controller. +Number of `NodeReadinessRule` objects currently tracked by the controller. | Property | Value | | --- | --- | @@ -25,24 +57,17 @@ Total number of taint operations performed by the controller. | Property | Value | | --- | --- | | Type | `counter` | -| Labels | `rule`, `operation` | +| Labels | `rule`, `operation` (`add`, `remove`) | | Recorded when | The controller successfully adds or removes a taint | -#### Labels - -| Label | Description | Values | -| --- | --- | --- | -| `rule` | `NodeReadinessRule` name | Any rule name | -| `operation` | Taint operation performed by the controller | `add`, `remove` | - ### `node_readiness_evaluation_duration_seconds` -Duration of rule evaluations. +Duration of the controller's internal rule evaluations.
| Property | Value | | --- | --- | | Type | `histogram` | -| Labels | none | +| Labels | `rule` | | Buckets | Prometheus default histogram buckets | | Recorded when | The controller evaluates a rule against a node | @@ -53,15 +78,8 @@ Total number of failure events recorded by the controller. | Property | Value | | --- | --- | | Type | `counter` | -| Labels | `rule`, `reason` | -| Recorded when | The controller records an evaluation failure or taint add/remove failure | - -#### Labels - -| Label | Description | Values | -| --- | --- | --- | -| `rule` | `NodeReadinessRule` name | Any rule name | -| `reason` | Failure label recorded by the controller | `EvaluationError`, `AddTaintError`, `RemoveTaintError` | +| Labels | `rule`, `reason` (`EvaluationError`, `AddTaintError`, `RemoveTaintError`) | +| Recorded when | The controller encounters an error evaluating or patching a node | ### `node_readiness_bootstrap_completed_total` @@ -73,8 +91,113 @@ Total number of nodes that have completed bootstrap. | Labels | `rule` | | Recorded when | The controller marks bootstrap as completed for a node under a bootstrap-only rule | -#### Labels +----- + +## Extended Health and SLI Metrics + +### `node_readiness_reconciliation_latency_seconds` + +End-to-end latency from node condition change to taint operation completion. + +| Property | Value | +| --- | --- | +| Type | `histogram` | +| Labels | `rule`, `operation` (`add_taint`, `remove_taint`) | +| Buckets | `0.01`, `0.05`, `0.1`, `0.5`, `1`, `2`, `5`, `10`, `30`, `60`, `120`, `300` seconds | +| Recorded when | A taint operation completes | + +**Use case:** Measure how quickly the controller responds to node condition changes in the cluster. + +### `node_readiness_bootstrap_duration_seconds` + +Time from node creation to bootstrap completion for bootstrap-only rules. 
+ +| Property | Value | +| --- | --- | +| Type | `histogram` | +| Labels | `rule` | +| Buckets | `1`, `5`, `10`, `30`, `60`, `120`, `300`, `600`, `1200` seconds | +| Recorded when | Bootstrap completion is observed for a node under a bootstrap-only rule | + +**Use case:** Measure the actual time nodes take to become fully provisioned and bootstrap-complete. + +### `node_readiness_nodes_by_state` + +Number of nodes in each readiness state per rule. + +| Property | Value | +| --- | --- | +| Type | `gauge` | +| Labels | `rule`, `state` (`ready`, `not_ready`, `bootstrapping`) | +| Recorded when | A rule reconciliation completes | + +**Use case:** Track aggregate node health without introducing per-node metric cardinality, keeping controller memory footprint lean. + +### `node_readiness_rule_last_reconciliation_timestamp_seconds` + +Unix timestamp of the last reconciliation for a rule. + +| Property | Value | +| --- | --- | +| Type | `gauge` | +| Labels | `rule` | +| Recorded when | A rule reconciliation loop successfully completes | + +**Use case:** Detect rules that may be stuck or not reconciling frequently enough. + +----- + +## Example Queries & SLOs + +### Latency Monitoring & SLOs + +**Objective:** 95% of internal evaluations complete within 50 milliseconds (0.05s). + +```promql +# Percentage of evaluations completing within 50ms +sum(rate(node_readiness_evaluation_duration_seconds_bucket{le="0.05"}[5m])) / +sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100 +``` + +```promql +# P95 End-to-End Reconciliation Latency across all rules +histogram_quantile(0.95, + sum by (le) ( + rate(node_readiness_reconciliation_latency_seconds_bucket[5m]) + ) +) +``` + +### Freshness Monitoring & SLOs + +**Objective:** All rules reconcile within the last 2 minutes. 
+ +```promql +# Alert if any rule has not reconciled in the last 120 seconds +(time() - node_readiness_rule_last_reconciliation_timestamp_seconds) > 120 +``` + +### Availability Monitoring & SLOs + +**Objective:** 99.9% of targeted nodes are ready. + +```promql +# Percentage of ready nodes globally +100 * sum(node_readiness_nodes_by_state{state="ready"}) / sum(node_readiness_nodes_by_state) + +# Percentage of ready nodes per rule +100 * sum by (rule) (node_readiness_nodes_by_state{state="ready"}) / sum by (rule) (node_readiness_nodes_by_state) +``` + +## Monitoring and Scale Testing + +For an end-to-end monitoring setup with Prometheus and Grafana during scale tests, see the [scale testing guide](../../../../hack/test-workloads/scale/README.md). + +## Alerting Recommendations + +Typical alerts to consider: -| Label | Description | Values | -| --- | --- | --- | -| `rule` | `NodeReadinessRule` name | Any rule name | + - **High latency:** P95 reconciliation latency above 10s for 5 minutes. + - **Stale reconciliations:** Any rule with no reconciliation for more than 5 minutes. + - **High failure rate:** Sustained increase in `node_readiness_failures_total`. + - **Low availability:** Ready-node percentage below your target threshold for a sustained period. \ No newline at end of file diff --git a/hack/test-workloads/scale/Makefile b/hack/test-workloads/scale/Makefile new file mode 100644 index 0000000..767b3e2 --- /dev/null +++ b/hack/test-workloads/scale/Makefile @@ -0,0 +1,283 @@ +# Copyright The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Scale Testing Makefile for Node Readiness Controller +# This Makefile provides targets for setting up and running scale tests +# with monitoring using Podman and Kind. + +.DEFAULT_GOAL := help + +# Directories +SCALE_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +PROJECT_ROOT := $(shell cd $(SCALE_DIR)/../../.. && pwd) + +# Configuration +CLUSTER_NAME ?= nrr-test +CONTAINER_TOOL ?= podman +IMG_PREFIX ?= controller +IMG_TAG ?= latest +KIND_CONFIG ?= $(SCALE_DIR)/kind-config.yaml + +# Tools +KUBECTL ?= kubectl +KIND ?= kind +HELM ?= helm + +# Namespace for controller +CONTROLLER_NAMESPACE ?= nrr-system + +##@ General + +.PHONY: help +help: ## Display this help + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ Setup + +.PHONY: setup +setup: create-cluster install-controller install-monitoring ## Complete setup: cluster + controller + monitoring + @echo "" + @echo "✅ Setup complete!" + @echo "" + @echo "Next steps:" + @echo " 1. Run scale test: make test NODE_COUNT=1000 RULE_COUNT=3" + @echo " 2. View Grafana: http://localhost:3000 (credentials shown above)" + @echo " 3.
Import dashboard: $(SCALE_DIR)/grafana-dashboard.json" + +.PHONY: create-cluster +create-cluster: ## Create Kind cluster with scale test configuration + @echo "==> Creating Kind cluster: $(CLUSTER_NAME)" + @if $(KIND) get clusters | grep -q "^$(CLUSTER_NAME)$$"; then \ + echo "Cluster $(CLUSTER_NAME) already exists"; \ + else \ + $(KIND) create cluster --config $(KIND_CONFIG) --name $(CLUSTER_NAME); \ + echo "✓ Cluster created"; \ + fi + +.PHONY: install-controller +install-controller: ## Build and install NRR controller with Podman or Docker + @echo "==> Installing NRR Controller" + @echo "Step 1: Installing CRDs..." + @cd $(PROJECT_ROOT) && $(MAKE) install + @echo "" + @echo "Step 2: Building controller image with $(CONTAINER_TOOL)..." +ifeq ($(CONTAINER_TOOL),podman) + @cd $(PROJECT_ROOT) && $(MAKE) podman-build IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) +else + @cd $(PROJECT_ROOT) && $(MAKE) docker-build IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) +endif + @echo "" + @echo "Step 3: Loading image into Kind cluster..." + @cd $(PROJECT_ROOT) && $(MAKE) kind-load CONTAINER_TOOL=$(CONTAINER_TOOL) IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) KIND_CLUSTER=$(CLUSTER_NAME) + @echo "" + @echo "Step 4: Deploying controller with metrics enabled..." +ifeq ($(CONTAINER_TOOL),podman) + @cd $(PROJECT_ROOT) && $(MAKE) deploy IMG_PREFIX=localhost/$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) ENABLE_METRICS=true +else + @cd $(PROJECT_ROOT) && $(MAKE) deploy IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) ENABLE_METRICS=true +endif + @echo "" + @echo "Step 5: Waiting for controller to be ready..." 
+ @for i in 1 2 3 4 5 6 7 8 9 10; do \ + echo "Checking pod status (attempt $$i/10)..."; \ + $(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) -o wide 2>/dev/null || true; \ + if $(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q "True"; then \ + echo "✓ Controller is running!"; \ + break; \ + fi; \ + [ $$i -lt 10 ] && sleep 10; \ + done + @echo "✓ Controller installed" + +.PHONY: install-monitoring +install-monitoring: ## Install Prometheus and Grafana monitoring stack + @echo "==> Installing Monitoring Stack" + @echo "Step 1: Adding Prometheus Helm repository..." + @$(HELM) repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true + @$(HELM) repo update + @echo "" + @echo "Step 2: Creating monitoring namespace..." + @$(KUBECTL) create namespace monitoring --dry-run=client -o yaml | $(KUBECTL) apply -f - + @echo "" + @echo "Step 3: Installing or updating kube-prometheus-stack..." + @$(HELM) upgrade --install prom-stack prometheus-community/kube-prometheus-stack \ + --namespace monitoring \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.scrapeInterval=5s \ + --set nodeExporter.enabled=false \ + --set grafana.adminPassword=admin \ + --wait --timeout 5m + @echo "✓ Prometheus stack installed" + @echo "" + @echo "Step 4: Creating ServiceMonitor for NRR metrics..." + @$(KUBECTL) apply -f $(SCALE_DIR)/servicemonitor.yaml + @echo "✓ ServiceMonitor created" + @echo "" + @echo "Step 5: Setting up port forwarding..." 
+ @$(MAKE) port-forward + @echo "" + @echo "✓ Monitoring stack installed" + @echo "" + @echo "📊 Access URLs:" + @echo " Grafana: http://localhost:3000" + @echo " Prometheus: http://localhost:9090" + @echo "" + @echo "🔐 Grafana Credentials:" + @echo " Username: admin" + @echo " Password: admin" + +.PHONY: port-forward +port-forward: ## Setup port forwarding for Grafana and Prometheus + @echo "Setting up port forwarding..." + @pkill -f "port-forward.*grafana" 2>/dev/null || true + @pkill -f "port-forward.*prometheus" 2>/dev/null || true + @$(KUBECTL) port-forward -n monitoring svc/prom-stack-grafana 3000:80 > /dev/null 2>&1 & + @$(KUBECTL) port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 > /dev/null 2>&1 & + @sleep 2 + @echo "✓ Port forwarding active (Grafana: 3000, Prometheus: 9090)" + +##@ Testing + +.PHONY: test +test: ## Run scale test (usage: make test NODE_COUNT=1000 RULE_COUNT=3) + @echo "==> Running Scale Test" + @if [ -z "$(NODE_COUNT)" ]; then \ + echo "Usage: make test NODE_COUNT=<count> [RULE_COUNT=<count>]"; \ + echo "Example: make test NODE_COUNT=1000 RULE_COUNT=3"; \ + exit 1; \ + fi + @$(SCALE_DIR)/scale-test.sh $(NODE_COUNT) $(if $(RULE_COUNT),$(RULE_COUNT),1) + +.PHONY: test-quick +test-quick: ## Quick test with 100 nodes and 1 rule + @$(MAKE) test NODE_COUNT=100 RULE_COUNT=1 + +.PHONY: test-medium +test-medium: ## Medium test with 500 nodes and 2 rules + @$(MAKE) test NODE_COUNT=500 RULE_COUNT=2 + +.PHONY: test-large +test-large: ## Large test with 1000 nodes and 3 rules + @$(MAKE) test NODE_COUNT=1000 RULE_COUNT=3 + +.PHONY: test-xlarge +test-xlarge: ## Extra large test with 5000 nodes and 5 rules + @$(MAKE) test NODE_COUNT=5000 RULE_COUNT=5 + +##@ Monitoring + +.PHONY: dashboard +dashboard: ## Open Grafana dashboard in browser + @echo "Opening Grafana dashboard..."
+ @echo "URL: http://localhost:3000" + @echo "Username: admin" + @echo "Password: admin" + @echo "" + @echo "Import dashboard from: $(SCALE_DIR)/grafana-dashboard.json" + @open http://localhost:3000 2>/dev/null || xdg-open http://localhost:3000 2>/dev/null || echo "Please open http://localhost:3000 manually" + +.PHONY: prometheus +prometheus: ## Open Prometheus UI in browser + @echo "Opening Prometheus UI..." + @open http://localhost:9090 2>/dev/null || xdg-open http://localhost:9090 2>/dev/null || echo "Please open http://localhost:9090 manually" + +.PHONY: logs +logs: ## Show controller logs + @$(KUBECTL) logs -n $(CONTROLLER_NAMESPACE) -l control-plane=controller-manager --tail=100 -f + +.PHONY: metrics +metrics: ## Show current Prometheus-formatted metrics from the controller Service + @echo "Fetching Prometheus metrics from controller Service..." + @$(KUBECTL) get --raw "/api/v1/namespaces/$(CONTROLLER_NAMESPACE)/services/http:metrics-service:8080/proxy/metrics" 2>/dev/null || \ + echo "Unable to fetch /metrics via metrics-service. Ensure the controller is deployed with ENABLE_METRICS=true and the Service exists." 
+ +##@ Cleanup + +.PHONY: clean-test +clean-test: ## Clean up test resources (nodes and rules) + @echo "==> Cleaning up test resources" + @$(SCALE_DIR)/cleanup-kwok-nodes-rules.sh + @echo "✓ Test resources cleaned" + +.PHONY: clean-monitoring +clean-monitoring: ## Uninstall monitoring stack + @echo "==> Uninstalling monitoring stack" + @pkill -f "port-forward.*grafana" 2>/dev/null || true + @pkill -f "port-forward.*prometheus" 2>/dev/null || true + @$(HELM) uninstall prom-stack -n monitoring 2>/dev/null || true + @$(KUBECTL) delete namespace monitoring --ignore-not-found=true + @echo "✓ Monitoring stack removed" + +.PHONY: clean-controller +clean-controller: ## Uninstall controller + @echo "==> Uninstalling controller" + @cd $(PROJECT_ROOT) && $(MAKE) undeploy ENABLE_METRICS=true 2>/dev/null || true + @cd $(PROJECT_ROOT) && $(MAKE) uninstall 2>/dev/null || true + @echo "✓ Controller uninstalled" + +.PHONY: clean-cluster +clean-cluster: ## Delete Kind cluster + @echo "==> Deleting Kind cluster: $(CLUSTER_NAME)" + @$(KIND) delete cluster --name $(CLUSTER_NAME) + @echo "✓ Cluster deleted" + +.PHONY: clean +clean: clean-test clean-monitoring clean-controller clean-cluster ## Complete cleanup: remove everything + @echo "" + @echo "✅ Complete cleanup finished" + +##@ Utilities + +.PHONY: status +status: ## Show status of all components + @echo "==> Cluster Status" + @echo "Cluster: $(CLUSTER_NAME)" + @$(KIND) get clusters | grep "^$(CLUSTER_NAME)$$" && echo "✓ Cluster exists" || echo "✗ Cluster not found" + @echo "" + @echo "==> Controller Status" + @$(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) 2>/dev/null || echo "✗ Controller not deployed" + @echo "" + @echo "==> Monitoring Status" + @$(KUBECTL) get pods -n monitoring 2>/dev/null || echo "✗ Monitoring not deployed" + @echo "" + @echo "==> Test Resources" + @echo "NodeReadinessRules:" + @$(KUBECTL) get nodereadinessrules 2>/dev/null || echo " None" + @echo "KWOK Nodes:" + @$(KUBECTL) get nodes -l 
kwok.x-k8s.io/node=fake --no-headers 2>/dev/null | wc -l | xargs echo " Count:" || echo " 0" + @echo "" + @echo "==> Port Forwarding" + @pgrep -f "port-forward.*grafana" > /dev/null && echo "✓ Grafana port-forward active (3000)" || echo "✗ Grafana port-forward not active" + @pgrep -f "port-forward.*prometheus" > /dev/null && echo "✓ Prometheus port-forward active (9090)" || echo "✗ Prometheus port-forward not active" + +.PHONY: verify +verify: ## Verify all prerequisites are installed + @echo "==> Verifying Prerequisites" + @command -v $(KIND) >/dev/null 2>&1 && echo "✓ kind installed" || echo "✗ kind not found" + @command -v $(KUBECTL) >/dev/null 2>&1 && echo "✓ kubectl installed" || echo "✗ kubectl not found" + @command -v $(HELM) >/dev/null 2>&1 && echo "✓ helm installed" || echo "✗ helm not found" + @command -v $(CONTAINER_TOOL) >/dev/null 2>&1 && echo "✓ $(CONTAINER_TOOL) installed" || echo "✗ $(CONTAINER_TOOL) not found" + @command -v jq >/dev/null 2>&1 && echo "✓ jq installed" || echo "✗ jq not found" + @command -v bc >/dev/null 2>&1 && echo "✓ bc installed" || echo "✗ bc not found" + +.PHONY: info +info: ## Show configuration information + @echo "==> Configuration" + @echo "Project Root: $(PROJECT_ROOT)" + @echo "Scale Directory: $(SCALE_DIR)" + @echo "Cluster Name: $(CLUSTER_NAME)" + @echo "Container Tool: $(CONTAINER_TOOL)" + @echo "Image: $(IMG_PREFIX):$(IMG_TAG)" + @echo "Kind Config: $(KIND_CONFIG)" + @echo "Controller NS: $(CONTROLLER_NAMESPACE)" diff --git a/hack/test-workloads/scale/README.md b/hack/test-workloads/scale/README.md new file mode 100644 index 0000000..ee7b4a4 --- /dev/null +++ b/hack/test-workloads/scale/README.md @@ -0,0 +1,489 @@ +# Node Readiness Controller - Scale Testing Guide + +This guide explains how to run scale tests for Node Readiness Controller (NRR) with Prometheus and Grafana, and how to interpret the metrics that are currently emitted by the controller. 
+ +## Table of Contents + +- [Overview](#overview) +- [Metrics Available During Scale Tests](#metrics-available-during-scale-tests) +- [Prerequisites](#prerequisites) +- [Quick Start](#quick-start) +- [Detailed Setup](#detailed-setup) +- [Import the Grafana Dashboard](#import-the-grafana-dashboard) +- [Run Scale Tests](#run-scale-tests) +- [Monitoring Queries](#monitoring-queries) +- [Interpreting Results](#interpreting-results) +- [Troubleshooting](#troubleshooting) +- [Cleanup](#cleanup) +- [Advanced Usage](#advanced-usage) +- [Additional Resources](#additional-resources) + +## Overview + +The scale test framework allows you to: + + - Test NRR with multiple nodes using [KWOK](https://kwok.sigs.k8s.io/) (fake nodes). + - Measure taint addition and removal throughput. + - Observe evaluation logic performance in Grafana. + - Monitor controller resource usage (memory and CPU). + - Inspect `controller-runtime` metrics. + - Monitor NRR rule-level health and aggregate readiness metrics. + +## Metrics Available During Scale Tests + +During scale tests you will see both: + + - **NRR custom metrics** exposed by the controller. + - **controller-runtime / process metrics** exposed by the manager and scraped by Prometheus. + +### NRR Custom Metrics (The "Mega-Scale" Stack) + +NRR uses an **Aggregate-First** telemetry strategy. This means we track the state of the cluster without introducing per-node metric labels, ensuring the controller remains lean as the cluster grows. 
+ +The following controller metrics are currently registered: + + - `node_readiness_rules_total` + - `node_readiness_taint_operations_total{rule, operation}` + - `node_readiness_evaluation_duration_seconds{rule}` + - `node_readiness_failures_total{rule, reason}` + - `node_readiness_bootstrap_completed_total{rule}` + - `node_readiness_reconciliation_latency_seconds{rule, operation}` + - `node_readiness_bootstrap_duration_seconds{rule}` + - `node_readiness_nodes_by_state{rule, state}` + - `node_readiness_rule_last_reconciliation_timestamp_seconds{rule}` + +## Prerequisites + +Ensure the following tools are installed: + +```bash +kind version +kubectl version --client +helm version +podman --version # or docker --version +jq --version +bc --version +``` + +Container runtime support: + + - **Podman** is the default in the Makefile + - **Docker** is also supported via `CONTAINER_TOOL=docker` + +## Quick Start + +### Option 1: Makefile + +```bash +cd hack/test-workloads/scale + +# Full setup: cluster + controller + monitoring +make setup + +# Run a test +make test NODE_COUNT=1000 RULE_COUNT=3 + +# Open Grafana +make dashboard + +# Open Prometheus +make prometheus + +# Inspect the controller's /metrics endpoint output +make metrics +``` + +Using Docker instead of Podman: + +```bash +cd hack/test-workloads/scale +make setup CONTAINER_TOOL=docker +make test NODE_COUNT=1000 RULE_COUNT=3 CONTAINER_TOOL=docker +``` + +### Option 2: Script + +```bash +cd hack/test-workloads/scale + +# Setup monitoring stack +./setup-monitoring.sh + +# Run scale test in another terminal +./scale-test.sh 1000 3 +``` + +## Detailed Setup + +### Container runtime configuration + +The Makefile supports both Podman and Docker. 
+ +#### Podman + +```bash +make setup +make test NODE_COUNT=1000 RULE_COUNT=3 +``` + +#### Docker + +```bash +make setup CONTAINER_TOOL=docker +make test NODE_COUNT=1000 RULE_COUNT=3 CONTAINER_TOOL=docker +``` + +#### Show current configuration + +```bash +make info +``` + +### Available Make targets + +```bash +make help +make verify +make info +make status +``` + +Key targets: + +| Target | Description | +| --- | --- | +| `make setup` | Create cluster, install controller, install monitoring | +| `make test NODE_COUNT=1000 RULE_COUNT=3` | Run scale test | +| `make test-quick` | 100 nodes, 1 rule | +| `make test-medium` | 500 nodes, 2 rules | +| `make test-large` | 1000 nodes, 3 rules | +| `make test-xlarge` | 5000 nodes, 5 rules | +| `make dashboard` | Open Grafana | +| `make prometheus` | Open Prometheus | +| `make metrics` | Print the controller `/metrics` output via the Kubernetes Service proxy | +| `make logs` | Follow controller logs | +| `make status` | Show status of cluster, controller, monitoring, and port-forwarding | +| `make clean` | Remove everything | + +### What `make setup` does + +`make setup` runs: + +1. `create-cluster` +2. `install-controller` +3. `install-monitoring` + +Controller installation enables the metrics endpoint and deploys the controller into the `nrr-system` namespace. + +Monitoring installation: + +- installs or updates `kube-prometheus-stack` +- creates the `monitoring` namespace +- applies `servicemonitor.yaml` +- configures Prometheus with a `5s` scrape interval for the stack +- disables `nodeExporter` in this scale-test setup +- starts local port-forwards for Grafana and Prometheus + +### Metrics scraping configuration + +The scale setup uses `hack/test-workloads/scale/servicemonitor.yaml`. 
+ +Current behavior: + + - scrapes the controller Service in namespace `nrr-system` + - matches Service labels: + - `control-plane: controller-manager` + - `app.kubernetes.io/name: nrrcontroller` + - scrapes endpoint: + - port: `http` + - scheme: `http` + - interval: `5s` + +This matches the scale-test setup, which deploys the controller with metrics enabled over HTTP. + +## Import the Grafana Dashboard + +1. Open Grafana at `http://localhost:3000` +2. Login with: + - username: `admin` + - password: `admin` when using the Makefile setup + - password from script output when using `setup-monitoring.sh` +3. Import `hack/test-workloads/scale/grafana-dashboard.json` +4. Select Prometheus as the datasource + +The dashboard JSON in this directory is the source of truth for the available panels. + +Current dashboard highlights: + +- **NRR Ready Nodes (%)**: percentage of nodes currently in NRR `ready` state +- **SLI: Fast Evaluations (% under 50ms)**: percentage of evaluations completing within 50ms +- **Bootstrap Completions**: total number of completed bootstrap events +- **Nodes by Readiness State**: aggregate counts for `ready`, `not_ready`, and `bootstrapping` +- **Nodes by Rule and State**: readiness-state breakdown per rule +- **Reconciliation Latency (P50/P95/P99)**: latency percentiles broken out by operation label +- **Evaluation Rate by Rule**: how actively each rule is being evaluated +- **Taint Operations (Throughput)**: add/remove operation rate +- **Failures & Errors**: failure rate by reason +- **Rule Reconciliation Age**: time since each rule last reconciled +- **Workqueue Depth (Backlog)**: controller backlog indicator +- **Controller Memory Usage** and **Controller CPU Usage** with both container-level and process-level visibility where available +- **Bootstrap Duration by Rule (P95)**: bootstrap latency broken out per rule +- **Bootstrap Duration Rate / Samples**: indicates whether bootstrap duration histograms currently have sample volume +- **Total 
Taint Operations**: cumulative add/remove operations over the selected time range + +## Run Scale Tests + +### Using the Makefile + +```bash +make test NODE_COUNT=1000 RULE_COUNT=3 + +make test-quick +make test-medium +make test-large +make test-xlarge +``` + +### Using the script directly + +```bash +./scale-test.sh <node_count> <rule_count> + +./scale-test.sh 100 1 +./scale-test.sh 1000 3 +./scale-test.sh 5000 5 +``` + +### What the test does + +The test workflow is: + +1. clean up old test artifacts +2. create one or more `NodeReadinessRule` objects +3. create fake KWOK nodes +4. wait for NRR to apply taints +5. patch node conditions so rules become satisfied +6. wait for NRR to remove taints +7. print timing and throughput results + +## Monitoring Queries + +Use these in Prometheus while running scale tests to validate controller performance. + +### Evaluation Performance + +Measures the percentage of evaluations completing within 50ms. + +```promql +sum(rate(node_readiness_evaluation_duration_seconds_bucket{le="0.05"}[5m])) / +sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100 +``` + +### End-to-End Reconciliation Latency (P99) + +How long does it take NRR to react to a condition change in the cluster? + +```promql +histogram_quantile(0.99, + sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])) +) +``` + +### Cluster Readiness Overview + +Safely aggregate node health without cardinality explosions. + +```promql +sum by (state) (node_readiness_nodes_by_state) +``` + +### Controller Freshness (Is it stuck?)
+ +```promql +# Alert if any rule has not reconciled in the last 120 seconds +(time() - node_readiness_rule_last_reconciliation_timestamp_seconds) > 120 +``` + +### Failure Rate + +```promql +sum by (reason) (rate(node_readiness_failures_total[5m])) +``` + +### Controller Resource Usage + +```promql +process_resident_memory_bytes +rate(process_cpu_seconds_total[5m]) +``` + +### Controller-runtime Metrics + +```promql +sum(rate(controller_runtime_reconcile_total[5m])) +sum(rate(controller_runtime_reconcile_errors_total[5m])) +workqueue_depth +``` + +## Interpreting Results + +### Good signals during a healthy scale run + +- **Stable memory profile:** controller memory should stay relatively stable for a given test size. +- **Evaluation performance:** the fast-evaluations panel tracks the percentage of evaluations completing within 50ms. +- **Throughput spikes:** taint operations should spike during node creation and condition patching, then fall back down. +- **Clean node transitions:** `node_readiness_nodes_by_state` should move from `not_ready` or `bootstrapping` toward `ready`. +- **Per-rule visibility:** `Nodes by Rule and State` and `Evaluation Rate by Rule` should make it obvious if one rule is lagging behind the others. +- **Bootstrap duration:** bootstrap duration panels reflect the end-to-end time for nodes to reach bootstrap completion. +- **Bootstrap completion growth:** the bootstrap completions stat should rise as nodes complete bootstrap-only workflows. +- **Low failure rate:** `node_readiness_failures_total` should remain low or flat in healthy runs. + +### Important note on ready percentage + +`NRR Ready Nodes (%)` is based on **NRR aggregate state**, not the Kubernetes `Ready=True` node condition. + +During scale tests, it is normal for this panel to stay low or at `0%` during the taint-add phase because the test intentionally creates nodes before satisfying the custom readiness conditions. 
It should increase after the condition patching phase completes. + +### Signals to investigate + +- **Rising workqueue depth:** indicates the controller cannot keep up with node events. +- **High sustained latency percentiles:** suggests API pressure or reconciliation bottlenecks. +- **Memory growth across repeated runs:** may indicate a leak or excessive retained state. +- **Bootstrap duration getting worse with scale:** suggests the controller or API server is struggling to complete bootstrap-only workflows promptly. +- **Rule lag continuously increasing:** investigate reconcile health if `node_readiness_rule_last_reconciliation_timestamp_seconds` stops advancing while work remains. + +### Example validation checklist + +After a `make test-large` run: + + - verify taint operations occurred: `sum(node_readiness_taint_operations_total)` + - verify evaluations occurred: `sum(node_readiness_evaluation_duration_seconds_count)` + - verify no unexpected sustained failures: `sum(rate(node_readiness_failures_total[5m]))` + - verify aggregate node state moved as expected: `node_readiness_nodes_by_state` + +## Troubleshooting + +### Metrics are missing in Prometheus + +Check that the controller is running: + +```bash +kubectl get pods -n nrr-system +kubectl logs -n nrr-system -l control-plane=controller-manager --tail=100 +``` + +Check that the ServiceMonitor exists: + +```bash +kubectl get servicemonitor -n monitoring +kubectl get servicemonitor -n monitoring node-readiness-controller-monitor -o yaml +``` + +Check Prometheus targets: + +```bash +make prometheus +``` + +Then inspect `http://localhost:9090/targets`. + +### Metrics endpoint not reachable + +The scale setup scrapes the HTTP metrics endpoint through the ServiceMonitor. 
Verify the Service exists and exposes port `http`: + +```bash +kubectl get svc -n nrr-system +kubectl get svc -n nrr-system metrics-service -o yaml +``` + +### Dashboard shows no data + + - ensure the Grafana time range includes the test interval + - verify the Prometheus datasource is healthy + - confirm the imported dashboard uses the Prometheus datasource + - query the metrics directly in Prometheus first + +### Scale test hangs + +```bash +kubectl logs -n nrr-system -l control-plane=controller-manager -f +kubectl get nodes -l kwok.x-k8s.io/node=fake --watch +kubectl get nodereadinessrules +``` + +### Port forwarding fails + +```bash +lsof -i :3000 +lsof -i :9090 +``` + +Then restart: + +```bash +kubectl port-forward -n monitoring svc/prom-stack-grafana 3000:80 & +kubectl port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 & +``` + +### Podman image build fails + +```bash +cd ../../../ +make podman-build +podman images | grep controller +``` + +## Cleanup + +### Makefile targets + +```bash +make clean-test +make clean-monitoring +make clean-controller +make clean-cluster +make clean +``` + +### Manual cleanup + +```bash +./cleanup-kwok-nodes-rules.sh + +kubectl delete nodereadinessrules -l scale-test=true +kubectl delete nodes -l kwok.x-k8s.io/node=fake + +pkill -f "port-forward.*grafana" +pkill -f "port-forward.*prometheus" + +kind delete cluster --name nrr-test +helm uninstall prom-stack -n monitoring +``` + +## Advanced Usage + +### View logs, metrics, and component status + +```bash +make logs +make metrics +make status +``` + +### Inspect the current setup + +```bash +make verify +make info +``` + +## Additional Resources + +- [Monitoring Operations Guide](../../../docs/book/src/operations/monitoring.md) +- [Main Project README](../../../README.md) +- [Architecture Draft](../../../docs/architecture.draft.md) +- [API Reference](../../../docs/book/src/reference/api-spec.md) + +----- + +Happy testing\! 
\ No newline at end of file diff --git a/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh b/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh new file mode 100755 index 0000000..9b3908d --- /dev/null +++ b/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +# Copyright The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Cleanup script for KWOK nodes and NodeReadinessRules + +set -euo pipefail + +echo "=== Cleanup Script ===" +echo "" + +# Delete all KWOK nodes +echo "🧹 Deleting all KWOK nodes..." +NODE_COUNT=$(kubectl get nodes -l kwok.x-k8s.io/node=fake --no-headers 2>/dev/null | wc -l | tr -d ' ') + +if [ "$NODE_COUNT" -eq 0 ]; then + echo " No KWOK nodes found." +else + echo " Found $NODE_COUNT KWOK nodes. Deleting..." + kubectl delete nodes -l kwok.x-k8s.io/node=fake --grace-period=0 --force + echo " ✓ All KWOK nodes deleted" +fi + +echo "" + +# Delete all NodeReadinessRules +echo "🧹 Deleting all NodeReadinessRules..." +NRR_COUNT=$(kubectl get nodereadinessrules --no-headers 2>/dev/null | wc -l | tr -d ' ') + +if [ "$NRR_COUNT" -eq 0 ]; then + echo " No NodeReadinessRules found." +else + echo " Found $NRR_COUNT NodeReadinessRule(s). Deleting..." 
+ kubectl delete nodereadinessrules --all + echo " ✓ All NodeReadinessRules deleted" +fi + +echo "" +echo "=== Cleanup Complete ===" diff --git a/hack/test-workloads/scale/grafana-dashboard.json b/hack/test-workloads/scale/grafana-dashboard.json new file mode 100644 index 0000000..ef88cf3 --- /dev/null +++ b/hack/test-workloads/scale/grafana-dashboard.json @@ -0,0 +1,406 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "panels": [ + { + "title": "NRR Ready Nodes (%)", + "type": "stat", + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 0}, + "id": 1, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "100 * sum(node_readiness_nodes_by_state{state=\"ready\"}) / clamp_min(sum(node_readiness_nodes_by_state), 1)", + "legendFormat": "Ready %" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "value" + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 50}, + {"color": "green", "value": 95} + ] + } + } + } + }, + { + "title": "SLI: Fast Evaluations (% under 50ms)", + "type": "stat", + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 0}, + "id": 2, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(rate(node_readiness_evaluation_duration_seconds_bucket{le=\"0.05\"}[5m])) / sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100", + "legendFormat": "Evaluations < 50ms" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "value" + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": 
"absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 95}, + {"color": "green", "value": 99} + ] + } + } + } + }, + { + "title": "Bootstrap Completions", + "type": "stat", + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 0}, + "id": 3, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(node_readiness_bootstrap_completed_total)", + "legendFormat": "Completed" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "value" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "green", "value": 1} + ] + } + } + } + }, + { + "title": "Nodes by Readiness State", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 6}, + "id": 4, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by (state) (node_readiness_nodes_by_state)", + "legendFormat": "{{state}}" + } + ], + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"} + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "stacking": {"group": "A", "mode": "normal"} + } + } + } + }, + { + "title": "Nodes by Rule and State", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6}, + "id": 5, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by (rule, state) (node_readiness_nodes_by_state)", + "legendFormat": "{{rule}} / {{state}}" + } + ], + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"} + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "stacking": {"group": "A", "mode": "normal"} + } + } + } + }, + { + "title": "Reconciliation Latency (P50/P95/P99)", + "type": "timeseries", + "gridPos": {"h": 8, 
"w": 12, "x": 0, "y": 14}, + "id": 6, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))", + "legendFormat": "P50 {{operation}}" + }, + { + "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))", + "legendFormat": "P95 {{operation}}" + }, + { + "expr": "histogram_quantile(0.99, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))", + "legendFormat": "P99 {{operation}}" + } + ], + "options": { + "legend": {"displayMode": "list", "placement": "bottom"} + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": {"drawStyle": "line", "fillOpacity": 10, "lineWidth": 2} + } + } + }, + { + "title": "Evaluation Rate by Rule", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14}, + "id": 7, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by (rule) (rate(node_readiness_evaluation_duration_seconds_count[5m]))", + "legendFormat": "{{rule}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": {"drawStyle": "line", "fillOpacity": 15, "lineWidth": 2} + } + } + }, + { + "title": "Taint Operations (Throughput)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 22}, + "id": 8, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by (operation) (rate(node_readiness_taint_operations_total[1m]))", + "legendFormat": "{{operation}} rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": {"drawStyle": "bars", "fillOpacity": 80} + } + } + }, + { + "title": "Failures & Errors", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 22}, + "id": 9, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by 
(reason) (rate(node_readiness_failures_total[1m])) or vector(0)", + "legendFormat": "{{reason}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": {"drawStyle": "line", "fillOpacity": 20, "lineWidth": 2} + } + } + }, + { + "title": "Rule Reconciliation Age", + "type": "timeseries", + "gridPos": {"h": 7, "w": 8, "x": 0, "y": 30}, + "id": 10, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "time() - node_readiness_rule_last_reconciliation_timestamp_seconds", + "legendFormat": "{{rule}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": {"drawStyle": "line", "fillOpacity": 20} + } + } + }, + { + "title": "Workqueue Depth (Backlog)", + "type": "timeseries", + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 30}, + "id": 11, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(workqueue_depth)", + "legendFormat": "Queue Depth" + } + ], + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "line", "fillOpacity": 30} + } + } + }, + { + "title": "Controller Memory Usage", + "type": "timeseries", + "gridPos": {"h": 7, "w": 8, "x": 16, "y": 30}, + "id": 12, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{namespace=\"nrr-system\", container=\"manager\"})", + "legendFormat": "Container Working Set" + }, + { + "expr": "process_resident_memory_bytes", + "legendFormat": "Process RSS" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": {"drawStyle": "line", "fillOpacity": 20} + } + } + }, + { + "title": "Controller CPU Usage", + "type": "timeseries", + "gridPos": {"h": 7, "w": 12, "x": 0, "y": 37}, + "id": 13, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"nrr-system\", container=\"manager\"}[1m]))", + "legendFormat": "Container CPU" + }, + { + "expr": 
"rate(process_cpu_seconds_total[1m])", + "legendFormat": "Process CPU" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": {"drawStyle": "line", "fillOpacity": 20} + } + } + }, + { + "title": "Bootstrap Duration by Rule (P95)", + "type": "timeseries", + "gridPos": {"h": 7, "w": 12, "x": 12, "y": 37}, + "id": 14, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, rule) (rate(node_readiness_bootstrap_duration_seconds_bucket[5m])))", + "legendFormat": "P95 {{rule}}" + } + ], + "options": { + "legend": {"displayMode": "list", "placement": "bottom"} + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": {"drawStyle": "line", "fillOpacity": 15, "lineWidth": 2} + } + } + }, + { + "title": "Bootstrap Duration Rate / Samples", + "type": "timeseries", + "gridPos": {"h": 7, "w": 12, "x": 0, "y": 44}, + "id": 15, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(rate(node_readiness_bootstrap_duration_seconds_count[5m]))", + "legendFormat": "bootstrap samples/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": {"drawStyle": "line", "fillOpacity": 20} + } + } + }, + { + "title": "Total Taint Operations", + "type": "timeseries", + "gridPos": {"h": 7, "w": 12, "x": 12, "y": 44}, + "id": 16, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(node_readiness_taint_operations_total{operation=\"add\"})", + "legendFormat": "adds" + }, + { + "expr": "sum(node_readiness_taint_operations_total{operation=\"remove\"})", + "legendFormat": "removes" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": {"drawStyle": "line", "fillOpacity": 20} + } + } + } + ], + "refresh": "5s", + "schemaVersion": 38, + "tags": ["nrr", "kubernetes", "scale-test"], + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {"refresh_intervals": ["5s", "10s", "30s", 
"1m"]}, + "title": "Node Readiness Controller - Production Scale", + "uid": "nrr-scale-dashboard", + "version": 1 +} \ No newline at end of file diff --git a/hack/test-workloads/scale/kind-config.yaml b/hack/test-workloads/scale/kind-config.yaml new file mode 100644 index 0000000..e3e8c17 --- /dev/null +++ b/hack/test-workloads/scale/kind-config.yaml @@ -0,0 +1,7 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: nrr-test +nodes: +- role: control-plane +- role: worker +- role: worker diff --git a/hack/test-workloads/scale/scale-test.sh b/hack/test-workloads/scale/scale-test.sh new file mode 100755 index 0000000..1d7f9bd --- /dev/null +++ b/hack/test-workloads/scale/scale-test.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash + +# Copyright The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +################################################################################ +# NRR SCALE TEST UTILITY - HELP GUIDE +################################################################################ +# USAGE: +# ./scale-test.sh +# +# ARGUMENTS: +# NODE_COUNT : Total fake KWOK nodes to create (Default: 10) +# RULE_COUNT : Total NodeReadinessRules to evaluate per node (Default: 1) +# +# DESCRIPTION: +# 1. Cleans up previous test artifacts (nodes and rules). +# 2. Creates N rules with unique conditions and taints. +# 3. Spawns M KWOK nodes in parallel batches. +# 4. Measures 'Taint Addition' latency (Controller reacting to new nodes). +# 5. 
Patches all nodes to satisfy all rule conditions. +# 6. Measures 'Taint Removal' latency (Controller finalizing readiness). +################################################################################ + +# Input Parameters +NODE_COUNT=${1:-10} +RULE_COUNT=${2:-1} +BATCH_SIZE=50 +BASE_RULE_NAME="kwok-network-rule" + +# Validate input +if ! [[ "$NODE_COUNT" =~ ^[0-9]+$ ]] || ! [[ "$RULE_COUNT" =~ ^[0-9]+$ ]]; then + echo "Error: Please provide valid positive numbers for node and rule counts" + echo "Example: ./scale-test.sh 1000 3" + exit 1 +fi + +echo "🚀 Starting Scale Test: $NODE_COUNT Nodes | $RULE_COUNT Rules" +echo "----------------------------------------------------------" + +# Step 0: Cleanup +echo "Step 0: Cleaning up existing resources..." +kubectl delete nodereadinessrules -l scale-test=true --ignore-not-found=true +kubectl delete nodes -l kwok.x-k8s.io/node=fake --ignore-not-found=true +sleep 2 + +# Step 1: Create Multiple Rules +echo "Step 1: Creating $RULE_COUNT rules..." +for r in $(seq 1 $RULE_COUNT); do + cat </dev/null +apiVersion: v1 +kind: Node +metadata: + name: kwok-node-$1 + labels: { kwok.x-k8s.io/node: fake } +spec: + taints: [{key: "kwok.x-k8s.io/node", value: "fake", effect: "NoSchedule"}] +status: + allocatable: {cpu: "32", memory: "256Gi", pods: "110"} + capacity: {cpu: "32", memory: "256Gi", pods: "110"} + conditions: [{type: "Ready", status: "True", reason: "KubeletReady", message: "ready", lastHeartbeatTime: "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", lastTransitionTime: "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"}] +EOF +} + +for batch_start in $(seq 1 $BATCH_SIZE $NODE_COUNT); do + batch_end=$((batch_start + BATCH_SIZE - 1)) + [ $batch_end -gt $NODE_COUNT ] && batch_end=$NODE_COUNT + for i in $(seq $batch_start $batch_end); do create_node $i & done + wait +done + +# Step 3: Wait for ALL taints +echo "Step 3: Waiting for Controller to add $((NODE_COUNT * RULE_COUNT)) total taints..." 
+while true; do + TOTAL_TAINTS=$(kubectl get nodes -l kwok.x-k8s.io/node=fake -o json | jq "[.items[].spec.taints // [] | .[] | select(.key | startswith(\"readiness.k8s.io/network-unready\"))] | length") + [ "$TOTAL_TAINTS" -eq $((NODE_COUNT * RULE_COUNT)) ] && break + echo -n "[$TOTAL_TAINTS]" && sleep 1 +done +TAINT_END_TIME=$(date +%s); TAINT_END_NANOS=$(date +%N) + +# Step 4: Patch Conditions +echo -e "\nStep 4: Satisfying conditions for all rules..." +UNTAINT_START_TIME=$(date +%s); UNTAINT_START_NANOS=$(date +%N) + +patch_node_conditions() { + PATCH_JSON="[" + for r in $(seq 1 $RULE_COUNT); do + PATCH_JSON+="{\"op\":\"add\",\"path\":\"/status/conditions/-\",\"value\":{\"type\":\"network.kubernetes.io/CNIReady-$r\",\"status\":\"True\",\"lastHeartbeatTime\":\"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\",\"lastTransitionTime\":\"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\",\"reason\":\"CNIReady\",\"message\":\"ready\"}}" + [ $r -lt $RULE_COUNT ] && PATCH_JSON+="," + done + PATCH_JSON+="]" + kubectl patch node kwok-node-$1 --subresource=status --type=json -p="$PATCH_JSON" > /dev/null 2>&1 +} + +for batch_start in $(seq 1 $BATCH_SIZE $NODE_COUNT); do + batch_end=$((batch_start + BATCH_SIZE - 1)) + [ $batch_end -gt $NODE_COUNT ] && batch_end=$NODE_COUNT + for i in $(seq $batch_start $batch_end); do patch_node_conditions $i & done + wait +done + +# Step 5: Wait for Removal +echo "Step 5: Waiting for Taint removal..." 
+while true; do + REMAINING=$(kubectl get nodes -l kwok.x-k8s.io/node=fake -o json | jq "[.items[].spec.taints // [] | .[] | select(.key | startswith(\"readiness.k8s.io/network-unready\"))] | length") + [ "$REMAINING" -eq 0 ] && break + echo -n "[$REMAINING]" && sleep 1 +done +UNTAINT_END_TIME=$(date +%s); UNTAINT_END_NANOS=$(date +%N) + +# Step 6: Final Stats +TAINT_MS=$(echo "scale=0; (($TAINT_END_TIME - $TAINT_START_TIME) * 1000) + (($TAINT_END_NANOS - $TAINT_START_NANOS) / 1000000)" | bc) +UNTAINT_MS=$(echo "scale=0; (($UNTAINT_END_TIME - $UNTAINT_START_TIME) * 1000) + (($UNTAINT_END_NANOS - $UNTAINT_START_NANOS) / 1000000)" | bc) +AVG_SIZE=$(kubectl get nodereadinessrules -l scale-test=true -o json | jq '[.items[] | tostring | length] | add / length') + +echo -e "\n\n╔════════════════════════════════════════════════════════════════╗" +echo "║ MULTI-RULE PERFORMANCE SUMMARY ║" +echo "╠════════════════════════════════════════════════════════════════╣" +printf "║ Total Nodes: %-40s║\n" "$NODE_COUNT" +printf "║ Active Rules: %-40s║\n" "$RULE_COUNT" +printf "║ Taint Add Time: %-40s║\n" "${TAINT_MS} ms" +printf "║ Taint Remove Time: %-40s║\n" "${UNTAINT_MS} ms" +echo "║ ║" +printf "║ Avg Rule Size: %-40s║\n" "${AVG_SIZE%.*} bytes" +echo "╚════════════════════════════════════════════════════════════════╝" diff --git a/hack/test-workloads/scale/servicemonitor.yaml b/hack/test-workloads/scale/servicemonitor.yaml new file mode 100644 index 0000000..610c41e --- /dev/null +++ b/hack/test-workloads/scale/servicemonitor.yaml @@ -0,0 +1,21 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: node-readiness-controller-monitor + namespace: monitoring + labels: + release: prom-stack # This matches your helm release +spec: + # This tells Prometheus to look in the nrr-system namespace + namespaceSelector: + matchNames: + - nrr-system + # This matches the labels found in your 'kubectl get svc' output + selector: + matchLabels: + control-plane: 
controller-manager + app.kubernetes.io/name: nrrcontroller + endpoints: + - port: http # Match the spec.ports[0].name from your service + scheme: http # Your service is not using HTTPS/TLS + interval: 5s # High-resolution for your scale test diff --git a/hack/test-workloads/scale/setup-monitoring.sh b/hack/test-workloads/scale/setup-monitoring.sh new file mode 100755 index 0000000..1c0f2e5 --- /dev/null +++ b/hack/test-workloads/scale/setup-monitoring.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash + +# Copyright The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Complete setup script for NRR scale testing with Prometheus and Grafana +# This script: +# 1. Creates a Kind cluster +# 2. Installs NRR using Podman +# 3. Installs Prometheus stack +# 4. Creates ServiceMonitor +# 5. Sets up port forwarding +# 6. Provides instructions for Grafana dashboard import + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +CLUSTER_NAME="${CLUSTER_NAME:-nrr-test}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." 
&& pwd)" + +echo -e "${BLUE}=========================================${NC}" +echo -e "${BLUE}NRR Scale Test Setup with Monitoring${NC}" +echo -e "${BLUE}=========================================${NC}" +echo "" + +# Function to print step headers +print_step() { + echo "" + echo -e "${GREEN}==> $1${NC}" +} + +# Function to print warnings +print_warning() { + echo -e "${YELLOW}⚠️ $1${NC}" +} + +# Function to print errors +print_error() { + echo -e "${RED}❌ $1${NC}" +} + +# Function to print success +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +# Check prerequisites +print_step "Step 1: Checking prerequisites..." + +MISSING_TOOLS=() + +if ! command -v kind &> /dev/null; then + MISSING_TOOLS+=("kind") +fi + +if ! command -v kubectl &> /dev/null; then + MISSING_TOOLS+=("kubectl") +fi + +if ! command -v helm &> /dev/null; then + MISSING_TOOLS+=("helm") +fi + +if ! command -v podman &> /dev/null; then + MISSING_TOOLS+=("podman") +fi + +if [ ${#MISSING_TOOLS[@]} -ne 0 ]; then + print_error "Missing required tools: ${MISSING_TOOLS[*]}" + echo "" + echo "Please install:" + for tool in "${MISSING_TOOLS[@]}"; do + echo " - $tool" + done + exit 1 +fi + +print_success "All prerequisites installed" + +# Create Kind cluster and install NRR +print_step "Step 2: Creating Kind cluster and installing NRR with Podman..." +cd "$PROJECT_ROOT" + +if kind get clusters | grep -q "^${CLUSTER_NAME}$"; then + print_warning "Cluster '$CLUSTER_NAME' already exists. Deleting..." + kind delete cluster --name "$CLUSTER_NAME" +fi + +# Run the podman-kind-test target +print_success "Running: make podman-kind-test" +if ! make podman-kind-test KIND_CLUSTER="$CLUSTER_NAME"; then + print_error "Failed to create cluster and install NRR" + exit 1 +fi + +print_success "NRR installed successfully" + +# Add Prometheus Helm repo +print_step "Step 3: Setting up Prometheus stack..." + +print_success "Adding Prometheus Helm repository..." 
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +# Install Prometheus stack +print_success "Installing kube-prometheus-stack..." +helm upgrade --install prom-stack prometheus-community/kube-prometheus-stack \ + --namespace monitoring \ + --create-namespace \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.scrapeInterval="5s" \ + --set nodeExporter.enabled=false \ + --wait \ + --timeout 5m + +print_success "Prometheus stack installed" + +# Wait for Prometheus pods to be ready +print_success "Waiting for Prometheus pods to be ready..." +kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=prometheus \ + -n monitoring \ + --timeout=300s + +kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=grafana \ + -n monitoring \ + --timeout=300s + +print_success "Prometheus and Grafana are ready" + +# Create ServiceMonitor +print_step "Step 4: Creating ServiceMonitor for NRR metrics..." + +if [ ! -f "$SCRIPT_DIR/servicemonitor.yaml" ]; then + print_error "ServiceMonitor file not found: $SCRIPT_DIR/servicemonitor.yaml" + exit 1 +fi + +kubectl apply -f "$SCRIPT_DIR/servicemonitor.yaml" +print_success "ServiceMonitor created" + +# Get Grafana admin password +print_step "Step 5: Retrieving Grafana credentials..." +GRAFANA_PASSWORD=$(kubectl get secret --namespace monitoring prom-stack-grafana -o jsonpath="{.data.admin-password}" | base64 --decode) + +print_success "Grafana admin password retrieved" + +# Setup port forwarding +print_step "Step 6: Setting up port forwarding..." + +# Kill any existing port forwards +pkill -f "port-forward.*grafana" 2>/dev/null || true +pkill -f "port-forward.*prometheus" 2>/dev/null || true + +# Start port forwarding in background +kubectl port-forward -n monitoring svc/prom-stack-grafana 3000:80 > /dev/null 2>&1 & +GRAFANA_PF_PID=$! 
+ +kubectl port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 > /dev/null 2>&1 & +PROMETHEUS_PF_PID=$! + +# Wait for port forwards to be ready +sleep 3 + +print_success "Port forwarding established" + +# Print final instructions +echo "" +echo -e "${BLUE}=========================================${NC}" +echo -e "${BLUE}Setup Complete!${NC}" +echo -e "${BLUE}=========================================${NC}" +echo "" +echo -e "${GREEN}📊 Access URLs:${NC}" +echo -e " Grafana: ${BLUE}http://localhost:3000${NC}" +echo -e " Prometheus: ${BLUE}http://localhost:9090${NC}" +echo "" +echo -e "${GREEN}🔐 Grafana Credentials:${NC}" +echo -e " Username: ${BLUE}admin${NC}" +echo -e " Password: ${BLUE}${GRAFANA_PASSWORD}${NC}" +echo "" +echo -e "${GREEN}📈 Import Dashboard:${NC}" +echo " 1. Open Grafana: http://localhost:3000" +echo " 2. Login with credentials above" +echo " 3. Go to: Dashboards → Import" +echo " 4. Click 'Upload JSON file'" +echo " 5. Select: $SCRIPT_DIR/grafana-dashboard.json" +echo " 6. Select Prometheus datasource" +echo " 7. 
Click 'Import'" +echo "" +echo -e "${GREEN}🚀 Run Scale Test:${NC}" +echo " cd $PROJECT_ROOT" +echo " ./hack/test-workloads/scale/scale-test.sh 1000" +echo "" +echo -e "${GREEN}🧹 Cleanup:${NC}" +echo " ./hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh" +echo " kind delete cluster --name $CLUSTER_NAME" +echo "" +echo -e "${YELLOW}⚠️ Port forwarding is running in background${NC}" +echo -e "${YELLOW} PIDs: Grafana=$GRAFANA_PF_PID, Prometheus=$PROMETHEUS_PF_PID${NC}" +echo -e "${YELLOW} To stop: kill $GRAFANA_PF_PID $PROMETHEUS_PF_PID${NC}" +echo "" +echo -e "${GREEN}Press Ctrl+C to stop port forwarding and exit${NC}" + +# Keep script running to maintain port forwards +trap "echo ''; echo 'Stopping port forwarding...'; kill $GRAFANA_PF_PID $PROMETHEUS_PF_PID 2>/dev/null; exit 0" INT TERM + +# Wait for port forward processes +wait $GRAFANA_PF_PID $PROMETHEUS_PF_PID + +# Made with Bob diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index fc04d27..7a0152f 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -128,6 +129,9 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context if r.isBootstrapCompleted(ctx, node.Name, rule.Name) && rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly { log.Info("Skipping bootstrap-only rule - already completed", "node", node.Name, "rule", rule.Name) + // Update metrics to reflect current state even when skipping + r.updateNodesByStateMetrics(ctx, rule) + metrics.RuleLastReconciliationTime.WithLabelValues(rule.Name).Set(float64(time.Now().Unix())) continue } @@ -156,8 +160,9 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context "rule", rule.Name, "resourceVersion", rule.ResourceVersion) + var latestRule *readinessv1alpha1.NodeReadinessRule err 
:= retry.RetryOnConflict(retry.DefaultRetry, func() error { - latestRule := &readinessv1alpha1.NodeReadinessRule{} + latestRule = &readinessv1alpha1.NodeReadinessRule{} if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, latestRule); err != nil { return err } @@ -212,10 +217,12 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context "resourceVersion", rule.ResourceVersion) // continue with other rules } else { + r.updateNodesByStateMetrics(ctx, latestRule) + metrics.RuleLastReconciliationTime.WithLabelValues(latestRule.Name).Set(float64(time.Now().Unix())) log.V(4).Info("Successfully persisted rule status from node reconciler", "node", node.Name, - "rule", rule.Name, - "newResourceVersion", rule.ResourceVersion) + "rule", latestRule.Name, + "newResourceVersion", latestRule.ResourceVersion) } } } diff --git a/internal/controller/node_controller_test.go b/internal/controller/node_controller_test.go index fee7160..6dc19d7 100644 --- a/internal/controller/node_controller_test.go +++ b/internal/controller/node_controller_test.go @@ -18,24 +18,21 @@ package controller import ( "context" - "sync/atomic" "time" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + dto "github.com/prometheus/client_model/go" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/client" - fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/reconcile" nodereadinessiov1alpha1 "sigs.k8s.io/node-readiness-controller/api/v1alpha1" + "sigs.k8s.io/node-readiness-controller/internal/metrics" ) var _ = Describe("Node Controller", func() { @@ -694,247 +691,144 @@ var _ = Describe("Node Controller", func() { }, time.Second*5).Should(BeTrue(), "NodeEvaluation should be updated with new condition and taint status") }) }) - - // These tests use the controller-runtime fake client (not envtest's - // k8sClient) with interceptors to simulate concurrent node modifications. - // The fake client enforces resourceVersion checks, so when - // MergeFromWithOptimisticLock is used and another write bumps the - // resourceVersion, the patch fails with a Conflict error — the same - // behavior a real API server would produce. 
- Context("optimistic locking on taint operations", func() { + Context("when updating aggregate readiness-state metrics from node reconciliation", func() { var ( - ctx context.Context - testScheme *runtime.Scheme + ctx context.Context + readinessController *RuleReadinessController + nodeReconciler *NodeReconciler + fakeClientset *fake.Clientset + node1 *corev1.Node + node2 *corev1.Node + rule *nodereadinessiov1alpha1.NodeReadinessRule ) + readGaugeValue := func(ruleName, state string) float64 { + metric := &dto.Metric{} + Expect(nodereadinessiov1alpha1.EnforcementModeContinuous).NotTo(BeEmpty()) + Expect(metrics.NodesByState.WithLabelValues(ruleName, state).Write(metric)).To(Succeed()) + return metric.GetGauge().GetValue() + } + BeforeEach(func() { ctx = context.Background() - testScheme = runtime.NewScheme() - Expect(corev1.AddToScheme(testScheme)).To(Succeed()) - }) - - It("should retry and succeed when removeTaintBySpec encounters a conflict", func() { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "ol-remove-conflict"}, - Spec: corev1.NodeSpec{ - Taints: []corev1.Taint{ - {Key: "readiness.k8s.io/test", Effect: corev1.TaintEffectNoSchedule}, - {Key: "other-controller/taint", Effect: corev1.TaintEffectNoSchedule}, - }, - }, - } - var patchCount atomic.Int32 - - // The interceptor simulates a concurrent modification: on the - // first Patch call it updates the node (bumping resourceVersion) - // before delegating to the real Patch. Because - // MergeFromWithOptimisticLock embeds the original resourceVersion, - // the fake client detects the mismatch and returns a Conflict. - // The retry logic should handle this and succeed on the second attempt. - fc := fakeclient.NewClientBuilder(). - WithScheme(testScheme). - WithObjects(node). 
- WithInterceptorFuncs(interceptor.Funcs{ - Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { - if obj.GetName() == "ol-remove-conflict" && patchCount.Add(1) == 1 { - // Simulate concurrent modification by another controller. - current := &corev1.Node{} - Expect(c.Get(ctx, types.NamespacedName{Name: obj.GetName()}, current)).To(Succeed()) - current.Spec.Taints = append(current.Spec.Taints, corev1.Taint{ - Key: "concurrent-controller/new-taint", Effect: corev1.TaintEffectNoSchedule, - }) - Expect(c.Update(ctx, current)).To(Succeed()) - } - return c.Patch(ctx, obj, patch, opts...) - }, - }). - Build() - - controller := &RuleReadinessController{ - Client: fc, - Scheme: testScheme, - clientset: fake.NewSimpleClientset(), + fakeClientset = fake.NewSimpleClientset() + readinessController = &RuleReadinessController{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + clientset: fakeClientset, ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), EventRecorder: record.NewFakeRecorder(10), } - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed()) - - err := controller.removeTaintBySpec(ctx, node, corev1.Taint{ - Key: "readiness.k8s.io/test", - Effect: corev1.TaintEffectNoSchedule, - }, "test-rule") - - // Should succeed after retry - Expect(err).NotTo(HaveOccurred()) - - // Verify the taint was removed and concurrent modification was preserved - updated := &corev1.Node{} - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed()) - Expect(updated.Spec.Taints).To(HaveLen(2)) - - // Check that our taint was removed but the others remain - taintKeys := make(map[string]bool) - for _, taint := range updated.Spec.Taints { - taintKeys[taint.Key] = true + nodeReconciler = &NodeReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + Controller: readinessController, } - Expect(taintKeys).NotTo(HaveKey("readiness.k8s.io/test")) - 
Expect(taintKeys).To(HaveKey("other-controller/taint")) - Expect(taintKeys).To(HaveKey("concurrent-controller/new-taint")) - // Verify that the patch was attempted twice (first failed, second succeeded) - Expect(patchCount.Load()).To(BeNumerically(">=", 2)) - }) + rule = &nodereadinessiov1alpha1.NodeReadinessRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aggregate-metrics-rule", + }, + Spec: nodereadinessiov1alpha1.NodeReadinessRuleSpec{ + Conditions: []nodereadinessiov1alpha1.ConditionRequirement{ + {Type: "AggregateCondition", RequiredStatus: corev1.ConditionTrue}, + }, + Taint: corev1.Taint{ + Key: "readiness.k8s.io/aggregate-test", + Effect: corev1.TaintEffectNoSchedule, + }, + NodeSelector: metav1.LabelSelector{ + MatchLabels: map[string]string{"aggregate-test": "true"}, + }, + EnforcementMode: nodereadinessiov1alpha1.EnforcementModeContinuous, + }, + } - It("should retry and succeed when addTaintBySpec encounters a conflict", func() { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "ol-add-conflict"}, + node1 = &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aggregate-node-1", + Labels: map[string]string{"aggregate-test": "true"}, + }, Spec: corev1.NodeSpec{ Taints: []corev1.Taint{ - {Key: "other-controller/taint", Effect: corev1.TaintEffectNoSchedule}, + {Key: "readiness.k8s.io/aggregate-test", Effect: corev1.TaintEffectNoSchedule}, }, }, - } - - var patchCount atomic.Int32 - - // The interceptor simulates a concurrent modification on the first - // patch attempt, which should trigger a retry that succeeds. - fc := fakeclient.NewClientBuilder(). - WithScheme(testScheme). - WithObjects(node). 
- WithInterceptorFuncs(interceptor.Funcs{ - Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { - if obj.GetName() == "ol-add-conflict" && patchCount.Add(1) == 1 { - current := &corev1.Node{} - Expect(c.Get(ctx, types.NamespacedName{Name: obj.GetName()}, current)).To(Succeed()) - current.Spec.Taints = append(current.Spec.Taints, corev1.Taint{ - Key: "concurrent-controller/new-taint", Effect: corev1.TaintEffectNoSchedule, - }) - Expect(c.Update(ctx, current)).To(Succeed()) - } - return c.Patch(ctx, obj, patch, opts...) + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: "AggregateCondition", Status: corev1.ConditionTrue}, }, - }). - Build() - - controller := &RuleReadinessController{ - Client: fc, - Scheme: testScheme, - clientset: fake.NewSimpleClientset(), - ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), - EventRecorder: record.NewFakeRecorder(10), - } - - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed()) - - err := controller.addTaintBySpec(ctx, node, corev1.Taint{ - Key: "readiness.k8s.io/test", - Effect: corev1.TaintEffectNoSchedule, - }, "test-rule") - - // Should succeed after retry - Expect(err).NotTo(HaveOccurred()) - - // Verify both taints are present (ours and the concurrent one) - updated := &corev1.Node{} - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed()) - Expect(updated.Spec.Taints).To(HaveLen(3)) - - // Check that all expected taints are present - taintKeys := make(map[string]bool) - for _, taint := range updated.Spec.Taints { - taintKeys[taint.Key] = true + }, } - Expect(taintKeys).To(HaveKey("readiness.k8s.io/test")) - Expect(taintKeys).To(HaveKey("other-controller/taint")) - Expect(taintKeys).To(HaveKey("concurrent-controller/new-taint")) - // Verify that the patch was attempted twice (first failed, second succeeded) - 
Expect(patchCount.Load()).To(BeNumerically(">=", 2)) - }) - - It("should succeed when no concurrent modification occurs", func() { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "ol-no-conflict"}, + node2 = &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aggregate-node-2", + Labels: map[string]string{"aggregate-test": "true"}, + }, Spec: corev1.NodeSpec{ Taints: []corev1.Taint{ - {Key: "readiness.k8s.io/test", Effect: corev1.TaintEffectNoSchedule}, - {Key: "other/taint", Effect: corev1.TaintEffectNoSchedule}, + {Key: "readiness.k8s.io/aggregate-test", Effect: corev1.TaintEffectNoSchedule}, + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: "AggregateCondition", Status: corev1.ConditionFalse}, }, }, } - fc := fakeclient.NewClientBuilder(). - WithScheme(testScheme). - WithObjects(node). - Build() - - controller := &RuleReadinessController{ - Client: fc, - Scheme: testScheme, - clientset: fake.NewSimpleClientset(), - ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), - EventRecorder: record.NewFakeRecorder(10), - } + metrics.CleanupRuleMetrics(rule.Name) + }) - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed()) + JustBeforeEach(func() { + Expect(k8sClient.Create(ctx, node1)).To(Succeed()) + Expect(k8sClient.Create(ctx, node2)).To(Succeed()) + Expect(k8sClient.Create(ctx, rule)).To(Succeed()) + readinessController.updateRuleCache(ctx, rule) + }) - err := controller.removeTaintBySpec(ctx, node, corev1.Taint{ - Key: "readiness.k8s.io/test", - Effect: corev1.TaintEffectNoSchedule, - }, "test-rule") - Expect(err).NotTo(HaveOccurred()) + AfterEach(func() { + metrics.CleanupRuleMetrics(rule.Name) - updated := &corev1.Node{} - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed()) - Expect(updated.Spec.Taints).To(HaveLen(1)) - Expect(updated.Spec.Taints[0].Key).To(Equal("other/taint")) - }) + _ = k8sClient.Delete(ctx, node1) + _ = 
k8sClient.Delete(ctx, node2) - It("should skip patch when removing a taint that does not exist", func() { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "ol-noop"}, - Spec: corev1.NodeSpec{ - Taints: []corev1.Taint{ - {Key: "other/taint", Effect: corev1.TaintEffectNoSchedule}, - }, - }, + updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: rule.Name}, updatedRule); err == nil { + updatedRule.Finalizers = nil + _ = k8sClient.Update(ctx, updatedRule) + _ = k8sClient.Delete(ctx, updatedRule) } - var patchCalled atomic.Bool - - fc := fakeclient.NewClientBuilder(). - WithScheme(testScheme). - WithObjects(node). - WithInterceptorFuncs(interceptor.Funcs{ - Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { - if obj.GetName() == "ol-noop" { - patchCalled.Store(true) - } - return c.Patch(ctx, obj, patch, opts...) - }, - }). - Build() + Eventually(func() bool { + err := k8sClient.Get(ctx, types.NamespacedName{Name: rule.Name}, &nodereadinessiov1alpha1.NodeReadinessRule{}) + return apierrors.IsNotFound(err) + }, time.Second*10).Should(BeTrue()) - controller := &RuleReadinessController{ - Client: fc, - Scheme: testScheme, - clientset: fake.NewSimpleClientset(), - ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), - EventRecorder: record.NewFakeRecorder(10), - } + readinessController.removeRuleFromCache(ctx, rule.Name) + }) - Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed()) + It("should refresh NodesByState using aggregate rule status during node reconciliation", func() { + _, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: node1.Name}}) + Expect(err).NotTo(HaveOccurred()) - err := controller.removeTaintBySpec(ctx, node, corev1.Taint{ - Key: "readiness.k8s.io/nonexistent", - Effect: corev1.TaintEffectNoSchedule, - }, 
"test-rule") + _, err = nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: node2.Name}}) Expect(err).NotTo(HaveOccurred()) - Expect(patchCalled.Load()).To(BeFalse(), - "Patch should not be called when taint removal is a no-op") + + Eventually(func() float64 { + return readGaugeValue(rule.Name, "ready") + }, time.Second*5).Should(Equal(float64(1))) + + Eventually(func() float64 { + return readGaugeValue(rule.Name, "not_ready") + }, time.Second*5).Should(Equal(float64(1))) + + Consistently(func() float64 { + return readGaugeValue(rule.Name, "bootstrapping") + }, time.Second).Should(Equal(float64(0))) }) }) }) diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 99347e9..1a6559d 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -46,6 +46,11 @@ import ( const ( // finalizerName is the finalizer added to NodeReadinessRule to ensure cleanup. finalizerName = "readiness.node.x-k8s.io/cleanup-taints" + + // maxLatencyRecordingWindow is the maximum time window after a condition transition + // during which we record reconciliation latency metrics. This prevents skewing metrics + // when applying new rules to old, existing nodes. + maxLatencyRecordingWindow = 5 * time.Minute ) // RuleReadinessController manages node taints based on readiness rules. 
@@ -190,6 +195,10 @@ func (r *RuleReconciler) reconcileDelete(ctx context.Context, rule *readinessv1a log.V(3).Info("Removing the rule from cache") r.Controller.removeRuleFromCache(ctx, rule.Name) + // Clean up Prometheus metrics for this rule to prevent ghost metrics + log.V(3).Info("Cleaning up Prometheus metrics for deleted rule", "rule", rule.Name) + metrics.CleanupRuleMetrics(rule.Name) + log.V(3).Info("Removing the finalizer from the rule") patch := client.MergeFrom(rule.DeepCopy()) controllerutil.RemoveFinalizer(rule, finalizerName) @@ -268,6 +277,7 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru log.Info("Processing all nodes for rule", "rule", rule.Name, "totalNodes", len(nodeList.Items)) var appliedNodes []string + for _, node := range nodeList.Items { if r.ruleAppliesTo(ctx, rule, &node) { appliedNodes = append(appliedNodes, node.Name) @@ -289,20 +299,30 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru rule.Status.DryRunResults = readinessv1alpha1.DryRunResults{} } + r.updateNodesByStateMetrics(ctx, rule) + + // Record rule-level reconciliation timestamp + metrics.RuleLastReconciliationTime.WithLabelValues(rule.Name).Set(float64(time.Now().Unix())) + log.Info("Completed processing nodes for rule", "rule", rule.Name, "processedCount", len(appliedNodes)) return nil } // evaluateRuleForNode evaluates a single rule against a single node. 
func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule, node *corev1.Node) error { - timer := prometheus.NewTimer(metrics.EvaluationDuration) - defer timer.ObserveDuration() + // Track evaluation duration per rule + evalTimer := prometheus.NewTimer(metrics.EvaluationDuration.WithLabelValues(rule.Name)) + defer evalTimer.ObserveDuration() + log := ctrl.LoggerFrom(ctx) // Evaluate all conditions (ALL logic) allConditionsSatisfied := true conditionResults := make([]readinessv1alpha1.ConditionEvaluationResult, 0, len(rule.Spec.Conditions)) + // Track the most recent condition transition time for latency calculation + var mostRecentTransitionTime time.Time + for _, condReq := range rule.Spec.Conditions { currentStatus := r.getConditionStatus(node, condReq.Type) satisfied := currentStatus == condReq.RequiredStatus @@ -317,11 +337,25 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule RequiredStatus: condReq.RequiredStatus, }) + // Track the most recent transition time across all conditions for latency calculation + for _, condition := range node.Status.Conditions { + if string(condition.Type) == condReq.Type { + if condition.LastTransitionTime.After(mostRecentTransitionTime) { + mostRecentTransitionTime = condition.LastTransitionTime.Time + } + break + } + } + log.V(1).Info("Condition evaluation", "node", node.Name, "rule", rule.Name, "conditionType", condReq.Type, "current", currentStatus, "required", condReq.RequiredStatus, - "satisfied", satisfied) + "satisfied", satisfied, "lastTransitionTime", mostRecentTransitionTime) } + // Log aggregate condition satisfaction status + log.Info("Conditions evaluated", "node", node.Name, "rule", rule.Name, + "allConditionsSatisfied", allConditionsSatisfied, "conditionCount", len(rule.Spec.Conditions)) + // Determine taint action shouldRemoveTaint := allConditionsSatisfied currentlyHasTaint := r.hasTaintBySpec(node, rule.Spec.Taint) @@ 
-341,11 +375,32 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule metrics.Failures.WithLabelValues(rule.Name, "RemoveTaintError").Inc() return fmt.Errorf("failed to remove taint: %w", err) } + + // Only record latency if the condition transitioned recently (e.g., within the last 5 minutes). + // This prevents skewing metrics when applying new rules to old, existing nodes. + if !mostRecentTransitionTime.IsZero() && time.Since(mostRecentTransitionTime) < maxLatencyRecordingWindow { + latency := time.Since(mostRecentTransitionTime).Seconds() + metrics.ReconciliationLatency.WithLabelValues(rule.Name, "remove_taint").Observe(latency) + log.V(1).Info("Taint removal latency", "node", node.Name, "rule", rule.Name, + "latency", fmt.Sprintf("%.3fs", latency), + "conditionTransitionTime", mostRecentTransitionTime.Format(time.RFC3339)) + } metrics.TaintOperations.WithLabelValues(rule.Name, "remove").Inc() // Mark bootstrap completed if bootstrap-only mode if rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly { r.markBootstrapCompleted(ctx, node.Name, rule.Name) + + // Calculate bootstrap duration from node creation to taint removal + // Use the node's creation timestamp directly. 
+ bootstrapDuration := time.Since(node.CreationTimestamp.Time).Seconds() + metrics.BootstrapDuration.WithLabelValues(rule.Name).Observe(bootstrapDuration) + + log.Info("Bootstrap completed", + "node", node.Name, + "rule", rule.Name, + "duration", fmt.Sprintf("%.2fs", bootstrapDuration), + "nodeCreated", node.CreationTimestamp.Format(time.RFC3339)) } case !shouldRemoveTaint && !currentlyHasTaint: @@ -355,6 +410,16 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule metrics.Failures.WithLabelValues(rule.Name, "AddTaintError").Inc() return fmt.Errorf("failed to add taint: %w", err) } + + // Calculate end-to-end latency from condition change to taint addition completion. + // Only record when the transition is recent (within maxLatencyRecordingWindow), so old nodes matched by a new rule do not skew the histogram + if !mostRecentTransitionTime.IsZero() && time.Since(mostRecentTransitionTime) < maxLatencyRecordingWindow { + latency := time.Since(mostRecentTransitionTime).Seconds() + metrics.ReconciliationLatency.WithLabelValues(rule.Name, "add_taint").Observe(latency) + log.V(1).Info("Taint addition latency", "node", node.Name, "rule", rule.Name, + "latency", fmt.Sprintf("%.3fs", latency), + "conditionTransitionTime", mostRecentTransitionTime.Format(time.RFC3339)) + } metrics.TaintOperations.WithLabelValues(rule.Name, "add").Inc() case !shouldRemoveTaint && currentlyHasTaint: @@ -381,6 +446,12 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule // Update evaluation status r.updateNodeEvaluationStatus(rule, node.Name, conditionResults, taintStatus) + // Log reconciliation completion with per-node details + now := time.Now() + log.Info("Node reconciliation completed", "node", node.Name, "rule", rule.Name, + "taintStatus", taintStatus, "allConditionsSatisfied", allConditionsSatisfied, + "timestamp", now.Unix()) + return nil } @@ -706,3 +777,38 @@ func (r *RuleReadinessController) getPreviousNodeEvaluation(rule *readinessv1alp } return nil } + +func (r *RuleReadinessController) updateNodesByStateMetrics(ctx context.Context, rule
*readinessv1alpha1.NodeReadinessRule) { + nodeStates := map[string]int{ + "ready": 0, + "not_ready": 0, + "bootstrapping": 0, + } + + for _, evaluation := range rule.Status.NodeEvaluations { + allConditionsSatisfied := true + for _, conditionResult := range evaluation.ConditionResults { + if conditionResult.CurrentStatus != conditionResult.RequiredStatus { + allConditionsSatisfied = false + break + } + } + + hasTaint := evaluation.TaintStatus == readinessv1alpha1.TaintStatusPresent + + switch { + case allConditionsSatisfied && !hasTaint: + nodeStates["ready"]++ + case !allConditionsSatisfied && hasTaint && + rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly && + !r.isBootstrapCompleted(ctx, evaluation.NodeName, rule.Name): + nodeStates["bootstrapping"]++ + default: + nodeStates["not_ready"]++ + } + } + + for state, count := range nodeStates { + metrics.NodesByState.WithLabelValues(rule.Name, state).Set(float64(count)) + } +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index bb0d906..7dc41c2 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -22,7 +22,7 @@ import ( ) var ( - // RulesTotal tracks the number of NodeReadinessRules . + // RulesTotal tracks the number of NodeReadinessRules. RulesTotal = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "node_readiness_rules_total", @@ -40,12 +40,13 @@ var ( ) // EvaluationDuration tracks the duration of rule evaluations. - EvaluationDuration = prometheus.NewHistogram( + EvaluationDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "node_readiness_evaluation_duration_seconds", Help: "Duration of rule evaluations", Buckets: prometheus.DefBuckets, }, + []string{"rule"}, ) // Failures tracks the number of operational failures. @@ -65,13 +66,78 @@ var ( }, []string{"rule"}, ) + + // ReconciliationLatency tracks end-to-end latency from condition change to taint operation. 
+ // This measures how quickly the controller responds to node condition changes. + // Note: The interval starts at the LastTransitionTime of the most recently changed node condition the rule requires. + ReconciliationLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "node_readiness_reconciliation_latency_seconds", + Help: "End-to-end latency from node condition change to taint operation completion", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300}, // 10ms to 5min + }, + []string{"rule", "operation"}, // operation: add_taint, remove_taint + ) + + // BootstrapDuration tracks the time from node creation to bootstrap completion (taint removal). + // This measures the end-to-end bootstrap time for nodes in bootstrap-only mode. + // The start time is the node's CreationTimestamp, read when the taint is removed. + BootstrapDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "node_readiness_bootstrap_duration_seconds", + Help: "Time from node creation to bootstrap completion (taint removal) for bootstrap-only rules", + Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600, 1200}, // 1s to 20min + }, + []string{"rule"}, + ) + + // NodesByState tracks nodes in each readiness state per rule. + // Provides a quick overview of cluster health. + NodesByState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_readiness_nodes_by_state", + Help: "Number of nodes in each readiness state per rule", + }, + []string{"rule", "state"}, // state: ready, not_ready, bootstrapping + ) + + // RuleLastReconciliationTime tracks when a rule was last reconciled. + // This provides rule-level visibility for admins to detect stuck rules.
+ RuleLastReconciliationTime = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_readiness_rule_last_reconciliation_timestamp_seconds", + Help: "Unix timestamp of the last rule reconciliation", + }, + []string{"rule"}, + ) ) func init() { - // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister(RulesTotal) metrics.Registry.MustRegister(TaintOperations) metrics.Registry.MustRegister(EvaluationDuration) metrics.Registry.MustRegister(Failures) metrics.Registry.MustRegister(BootstrapCompleted) + metrics.Registry.MustRegister(ReconciliationLatency) + metrics.Registry.MustRegister(BootstrapDuration) + metrics.Registry.MustRegister(NodesByState) + metrics.Registry.MustRegister(RuleLastReconciliationTime) +} + +// CleanupRuleMetrics deletes the series labeled with the given rule name from every rule-labeled metric vector. +// It is called when a rule is deleted so that stale "ghost" series do not linger in dashboards. +func CleanupRuleMetrics(ruleName string) { + ruleLabel := prometheus.Labels{"rule": ruleName} + + // Delete() removes the single series whose label set is exactly {"rule": ruleName}. + BootstrapCompleted.Delete(ruleLabel) + RuleLastReconciliationTime.Delete(ruleLabel) + + // DeletePartialMatch() removes every series whose "rule" label matches, whatever other labels (e.g. "state") the vector has. + TaintOperations.DeletePartialMatch(ruleLabel) + EvaluationDuration.DeletePartialMatch(ruleLabel) + Failures.DeletePartialMatch(ruleLabel) + ReconciliationLatency.DeletePartialMatch(ruleLabel) + BootstrapDuration.DeletePartialMatch(ruleLabel) + NodesByState.DeletePartialMatch(ruleLabel) }