From 3eeece0df49b0d0429ee8bdf5d0faa9823c1efdb Mon Sep 17 00:00:00 2001
From: Karthik Bhat <karthikkn1997@gmail.com>
Date: Mon, 9 Mar 2026 12:34:47 +0530
Subject: [PATCH] Add scale tests with monitoring

---
 cmd/readiness-condition-reporter/main_test.go |   1 -
 docs/book/src/operations/monitoring.md        | 177 ++++++-
 hack/test-workloads/scale/Makefile            | 283 ++++++++++
 hack/test-workloads/scale/README.md           | 489 ++++++++++++++++++
 .../scale/cleanup-kwok-nodes-rules.sh         |  51 ++
 .../scale/grafana-dashboard.json              | 406 +++++++++++++++
 hack/test-workloads/scale/kind-config.yaml    |   7 +
 hack/test-workloads/scale/scale-test.sh       | 162 ++++++
 hack/test-workloads/scale/servicemonitor.yaml |  21 +
 hack/test-workloads/scale/setup-monitoring.sh | 230 ++++++++
 internal/controller/node_controller.go        |  13 +-
 internal/controller/node_controller_test.go   | 314 ++++-------
 .../nodereadinessrule_controller.go           | 112 +++-
 internal/metrics/metrics.go                   |  72 ++-
 14 files changed, 2091 insertions(+), 247 deletions(-)
 create mode 100644 hack/test-workloads/scale/Makefile
 create mode 100644 hack/test-workloads/scale/README.md
 create mode 100755 hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh
 create mode 100644 hack/test-workloads/scale/grafana-dashboard.json
 create mode 100644 hack/test-workloads/scale/kind-config.yaml
 create mode 100755 hack/test-workloads/scale/scale-test.sh
 create mode 100644 hack/test-workloads/scale/servicemonitor.yaml
 create mode 100755 hack/test-workloads/scale/setup-monitoring.sh

diff --git a/cmd/readiness-condition-reporter/main_test.go b/cmd/readiness-condition-reporter/main_test.go
index 51e42e9..2324a12 100644
--- a/cmd/readiness-condition-reporter/main_test.go
+++ b/cmd/readiness-condition-reporter/main_test.go
@@ -178,7 +178,6 @@ func TestUpdateNodeCondition(t *testing.T) {
 			if foundCondition == nil {
 				t.Fatal("Condition not found")
 			}
-
 			if foundCondition.Status != tt.wantStatus {
 				t.Errorf("Condition status = %v, want %v", foundCondition.Status, tt.wantStatus)
 			}
diff --git a/docs/book/src/operations/monitoring.md b/docs/book/src/operations/monitoring.md
index 04608df..f49cc72 100644
--- a/docs/book/src/operations/monitoring.md
+++ b/docs/book/src/operations/monitoring.md
@@ -1,16 +1,48 @@
 # Monitoring
 
-Node Readiness Controller exposes Prometheus-compatible metrics. This page describes the Prometheus metrics exposed by Node Readiness Controller for monitoring rule evaluation, taint operations, failures, and bootstrap progress.
+The Node Readiness Controller exposes Prometheus-compatible metrics. This page documents the metrics currently registered by the controller and how they can be used for monitoring rule evaluation, taint operations, failures, bootstrap progress, and rule health.
 
 ## Metrics Endpoint
 
-The controller serves metrics on `/metrics` only when metrics are explicitly enabled. Depending on the installation, the endpoint is served either over HTTP or over HTTPS. See [Installation](../user-guide/installation.md) for deployment details.
+The controller serves metrics on `/metrics` only when metrics are explicitly enabled.
 
-## Supported Metrics
+Depending on the installation, the endpoint is exposed as:
+
+  - HTTP on port `8080` when the standard Prometheus component is enabled.
+  - HTTPS on port `8443` when the Prometheus TLS component is enabled.
+
+See [Installation](../user-guide/installation.md) for deployment details.
+
+## Metric Lifecycle Management
+
+When a `NodeReadinessRule` is deleted, the controller automatically cleans up the associated rule-labeled Prometheus series. This prevents stale metrics from remaining visible in dashboards and alerts.
+
+**Metrics cleaned up on rule deletion:**
+
+  - `node_readiness_taint_operations_total{rule="..."}`
+  - `node_readiness_evaluation_duration_seconds{rule="..."}`
+  - `node_readiness_failures_total{rule="..."}`
+  - `node_readiness_bootstrap_completed_total{rule="..."}`
+  - `node_readiness_reconciliation_latency_seconds{rule="..."}`
+  - `node_readiness_bootstrap_duration_seconds{rule="..."}`
+  - `node_readiness_nodes_by_state{rule="..."}`
+  - `node_readiness_rule_last_reconciliation_timestamp_seconds{rule="..."}`
+
+This ensures that:
+
+  - Deleted rules do not continue to appear in dashboards with stale values.
+  - Memory usage does not grow unbounded from removed rules.
+  - Metric cardinality stays proportional to the number of rules that currently exist.
+
+**Note:** The global `node_readiness_rules_total` gauge is updated separately. Rule-labeled metrics are explicitly deleted during rule cleanup.
+
+-----
+
+## Core Metrics
 
 ### `node_readiness_rules_total`
 
-Number of `NodeReadinessRule` objects tracked by the controller.
+Number of `NodeReadinessRule` objects currently tracked by the controller.
 
 | Property | Value |
 | --- | --- |
@@ -25,24 +57,17 @@ Total number of taint operations performed by the controller.
 | Property | Value |
 | --- | --- |
 | Type | `counter` |
-| Labels | `rule`, `operation` |
+| Labels | `rule`, `operation` (`add`, `remove`) |
 | Recorded when | The controller successfully adds or removes a taint |
 
-#### Labels
-
-| Label | Description | Values |
-| --- | --- | --- |
-| `rule` | `NodeReadinessRule` name | Any rule name |
-| `operation` | Taint operation performed by the controller | `add`, `remove` |
-
 ### `node_readiness_evaluation_duration_seconds`
 
-Duration of rule evaluations.
+Duration of the controller's internal rule evaluations.
 
 | Property | Value |
 | --- | --- |
 | Type | `histogram` |
-| Labels | none |
+| Labels | `rule` |
 | Buckets | Prometheus default histogram buckets |
 | Recorded when | The controller evaluates a rule against a node |
 
@@ -53,15 +78,8 @@ Total number of failure events recorded by the controller.
 | Property | Value |
 | --- | --- |
 | Type | `counter` |
-| Labels | `rule`, `reason` |
-| Recorded when | The controller records an evaluation failure or taint add/remove failure |
-
-#### Labels
-
-| Label | Description | Values |
-| --- | --- | --- |
-| `rule` | `NodeReadinessRule` name | Any rule name |
-| `reason` | Failure label recorded by the controller | `EvaluationError`, `AddTaintError`, `RemoveTaintError` |
+| Labels | `rule`, `reason` (`EvaluationError`, `AddTaintError`, `RemoveTaintError`) |
+| Recorded when | The controller encounters an error evaluating or patching a node |
 
 ### `node_readiness_bootstrap_completed_total`
 
@@ -73,8 +91,113 @@ Total number of nodes that have completed bootstrap.
 | Labels | `rule` |
 | Recorded when | The controller marks bootstrap as completed for a node under a bootstrap-only rule |
 
-#### Labels
+-----
+
+## Extended Health and SLI Metrics
+
+### `node_readiness_reconciliation_latency_seconds`
+
+End-to-end latency from node condition change to taint operation completion.
+
+| Property | Value |
+| --- | --- |
+| Type | `histogram` |
+| Labels | `rule`, `operation` (`add_taint`, `remove_taint`) |
+| Buckets | `0.01`, `0.05`, `0.1`, `0.5`, `1`, `2`, `5`, `10`, `30`, `60`, `120`, `300` seconds |
+| Recorded when | A taint operation completes |
+
+**Use case:** Measure how quickly the controller responds to node condition changes in the cluster.
+
+### `node_readiness_bootstrap_duration_seconds`
+
+Time from node creation to bootstrap completion for bootstrap-only rules.
+
+| Property | Value |
+| --- | --- |
+| Type | `histogram` |
+| Labels | `rule` |
+| Buckets | `1`, `5`, `10`, `30`, `60`, `120`, `300`, `600`, `1200` seconds |
+| Recorded when | Bootstrap completion is observed for a node under a bootstrap-only rule |
+
+**Use case:** Measure the actual time nodes take to become fully provisioned and bootstrap-complete.
+
+### `node_readiness_nodes_by_state`
+
+Number of nodes in each readiness state per rule.
+
+| Property | Value |
+| --- | --- |
+| Type | `gauge` |
+| Labels | `rule`, `state` (`ready`, `not_ready`, `bootstrapping`) |
+| Recorded when | A rule reconciliation completes |
+
+**Use case:** Track aggregate node health without introducing per-node metric cardinality, keeping controller memory footprint lean.
+
+### `node_readiness_rule_last_reconciliation_timestamp_seconds`
+
+Unix timestamp of the last reconciliation for a rule.
+
+| Property | Value |
+| --- | --- |
+| Type | `gauge` |
+| Labels | `rule` |
+| Recorded when | A rule reconciliation loop successfully completes |
+
+**Use case:** Detect rules that may be stuck or not reconciling frequently enough.
+
+-----
+
+## Example Queries & SLOs
+
+### Latency Monitoring & SLOs
+
+**Objective:** 95% of internal evaluations complete within 50 milliseconds (0.05s).
+
+```promql
+# Percentage of evaluations completing within 50ms
+sum(rate(node_readiness_evaluation_duration_seconds_bucket{le="0.05"}[5m])) /
+sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100
+```
+
+```promql
+# P95 End-to-End Reconciliation Latency across all rules
+histogram_quantile(0.95,
+  sum by (le) (
+    rate(node_readiness_reconciliation_latency_seconds_bucket[5m])
+  )
+)
+```
+
+### Freshness Monitoring & SLOs
+
+**Objective:** Every rule has reconciled within the last 2 minutes.
+
+```promql
+# Alert if any rule has not reconciled in the last 120 seconds
+(time() - node_readiness_rule_last_reconciliation_timestamp_seconds) > 120
+```
+
+### Availability Monitoring & SLOs
+
+**Objective:** 99.9% of targeted nodes are ready.
+
+```promql
+# Percentage of ready nodes globally
+100 * sum(node_readiness_nodes_by_state{state="ready"}) / sum(node_readiness_nodes_by_state)
+
+# Percentage of ready nodes per rule
+100 * sum by (rule) (node_readiness_nodes_by_state{state="ready"}) / sum by (rule) (node_readiness_nodes_by_state)
+```
+
+## Monitoring and Scale Testing
+
+For an end-to-end monitoring setup with Prometheus and Grafana during scale tests, see the [scale testing guide](../../../../hack/test-workloads/scale/README.md).
+
+## Alerting Recommendations
+
+Typical alerts to consider:
 
-| Label | Description | Values |
-| --- | --- | --- |
-| `rule` | `NodeReadinessRule` name | Any rule name |
+  - **High latency:** P95 reconciliation latency above 10s for 5 minutes.
+  - **Stale reconciliations:** Any rule with no reconciliation for more than 5 minutes.
+  - **High failure rate:** Sustained increase in `node_readiness_failures_total`.
+  - **Low availability:** Ready-node percentage below your target threshold for a sustained period.
diff --git a/hack/test-workloads/scale/Makefile b/hack/test-workloads/scale/Makefile
new file mode 100644
index 0000000..767b3e2
--- /dev/null
+++ b/hack/test-workloads/scale/Makefile
@@ -0,0 +1,283 @@
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Scale Testing Makefile for Node Readiness Controller
+# This Makefile provides targets for setting up and running scale tests
+# with monitoring using Podman and Kind.
+
+.DEFAULT_GOAL := help
+
+# Directories
+SCALE_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+PROJECT_ROOT := $(shell cd $(SCALE_DIR)/../../.. && pwd)
+
+# Configuration
+CLUSTER_NAME ?= nrr-test
+CONTAINER_TOOL ?= podman
+IMG_PREFIX ?= controller
+IMG_TAG ?= latest
+KIND_CONFIG ?= $(SCALE_DIR)/kind-config.yaml
+
+# Tools
+KUBECTL ?= kubectl
+KIND ?= kind
+HELM ?= helm
+
+# Namespace for controller
+CONTROLLER_NAMESPACE ?= nrr-system
+
+##@ General
+
+.PHONY: help
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Setup
+
+.PHONY: setup
+setup: create-cluster install-controller install-monitoring ## Complete setup: cluster + controller + monitoring
+	@echo ""
+	@echo "✅ Setup complete!"
+	@echo ""
+	@echo "Next steps:"
+	@echo "  1. Run scale test: make test NODE_COUNT=1000 RULE_COUNT=3"
+	@echo "  2. View Grafana: http://localhost:3000 (credentials shown above)"
+	@echo "  3. Import dashboard: $(SCALE_DIR)/grafana-dashboard.json"
+
+.PHONY: create-cluster
+create-cluster: ## Create Kind cluster with scale test configuration
+	@echo "==> Creating Kind cluster: $(CLUSTER_NAME)"
+	@if $(KIND) get clusters | grep -q "^$(CLUSTER_NAME)$$"; then \
+		echo "Cluster $(CLUSTER_NAME) already exists"; \
+	else \
+		$(KIND) create cluster --config $(KIND_CONFIG) --name $(CLUSTER_NAME); \
+		echo "✓ Cluster created"; \
+	fi
+
+.PHONY: install-controller
+install-controller: ## Build and install NRR controller with Podman or Docker
+	@echo "==> Installing NRR Controller"
+	@echo "Step 1: Installing CRDs..."
+	@cd $(PROJECT_ROOT) && $(MAKE) install
+	@echo ""
+	@echo "Step 2: Building controller image with $(CONTAINER_TOOL)..."
+ifeq ($(CONTAINER_TOOL),podman)
+	@cd $(PROJECT_ROOT) && $(MAKE) podman-build IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG)
+else
+	@cd $(PROJECT_ROOT) && $(MAKE) docker-build IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG)
+endif
+	@echo ""
+	@echo "Step 3: Loading image into Kind cluster..."
+	@cd $(PROJECT_ROOT) && $(MAKE) kind-load CONTAINER_TOOL=$(CONTAINER_TOOL) IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) KIND_CLUSTER=$(CLUSTER_NAME)
+	@echo ""
+	@echo "Step 4: Deploying controller with metrics enabled..."
+ifeq ($(CONTAINER_TOOL),podman)
+	@cd $(PROJECT_ROOT) && $(MAKE) deploy IMG_PREFIX=localhost/$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) ENABLE_METRICS=true
+else
+	@cd $(PROJECT_ROOT) && $(MAKE) deploy IMG_PREFIX=$(IMG_PREFIX) IMG_TAG=$(IMG_TAG) ENABLE_METRICS=true
+endif
+	@echo ""
+	@echo "Step 5: Waiting for controller to be ready..."
+# Poll up to 10 times, 10s apart; on timeout fail with a message instead of a bare "Error 1" from the loop status.
+	@ready=false; for i in 1 2 3 4 5 6 7 8 9 10; do \
+		echo "Checking pod status (attempt $$i/10)..."; \
+		$(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) -o wide 2>/dev/null || true; \
+		if $(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q "True"; then \
+			echo "✓ Controller is running!"; ready=true; break; \
+		fi; \
+		if [ $$i -lt 10 ]; then sleep 10; fi; \
+	done; if [ "$$ready" != true ]; then echo "✗ Controller in $(CONTROLLER_NAMESPACE) did not become ready after 10 attempts" >&2; exit 1; fi
+	@echo "✓ Controller installed"
+
+.PHONY: install-monitoring
+install-monitoring: ## Install Prometheus and Grafana monitoring stack
+	@echo "==> Installing Monitoring Stack"
+	@echo "Step 1: Adding Prometheus Helm repository..."
+	@$(HELM) repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
+	@$(HELM) repo update
+	@echo ""
+	@echo "Step 2: Creating monitoring namespace..."
+	@$(KUBECTL) create namespace monitoring --dry-run=client -o yaml | $(KUBECTL) apply -f -
+	@echo ""
+	@echo "Step 3: Installing or updating kube-prometheus-stack..."
+	@$(HELM) upgrade --install prom-stack prometheus-community/kube-prometheus-stack \
+		--namespace monitoring \
+		--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+		--set prometheus.prometheusSpec.scrapeInterval=5s \
+		--set nodeExporter.enabled=false \
+		--set grafana.adminPassword=admin \
+		--wait --timeout 5m
+	@echo "✓ Prometheus stack installed"
+	@echo ""
+	@echo "Step 4: Creating ServiceMonitor for NRR metrics..."
+	@$(KUBECTL) apply -f $(SCALE_DIR)/servicemonitor.yaml
+	@echo "✓ ServiceMonitor created"
+	@echo ""
+	@echo "Step 5: Setting up port forwarding..."
+	@$(MAKE) port-forward
+	@echo ""
+	@echo "✓ Monitoring stack installed"
+	@echo ""
+	@echo "📊 Access URLs:"
+	@echo "  Grafana:    http://localhost:3000"
+	@echo "  Prometheus: http://localhost:9090"
+	@echo ""
+	@echo "🔐 Grafana Credentials:"
+	@echo "  Username: admin"
+	@echo "  Password: admin"
+
+.PHONY: port-forward
+port-forward: ## Setup port forwarding for Grafana and Prometheus
+	@echo "Setting up port forwarding..."
+	@pkill -f "port-forward.*grafana" 2>/dev/null || true
+	@pkill -f "port-forward.*prometheus" 2>/dev/null || true
+	@$(KUBECTL) port-forward -n monitoring svc/prom-stack-grafana 3000:80 > /dev/null 2>&1 &
+	@$(KUBECTL) port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 > /dev/null 2>&1 &
+	@sleep 2
+	@echo "✓ Port forwarding active (Grafana: 3000, Prometheus: 9090)"
+
+##@ Testing
+
+.PHONY: test
+test: ## Run scale test (usage: make test NODE_COUNT=1000 RULE_COUNT=3)
+	@echo "==> Running Scale Test"
+	@if [ -z "$(NODE_COUNT)" ]; then \
+		echo "Usage: make test NODE_COUNT=<count> [RULE_COUNT=<count>]"; \
+		echo "Example: make test NODE_COUNT=1000 RULE_COUNT=3"; \
+		exit 1; \
+	fi
+	@$(SCALE_DIR)/scale-test.sh $(NODE_COUNT) $(if $(RULE_COUNT),$(RULE_COUNT),1)
+
+.PHONY: test-quick
+test-quick: ## Quick test with 100 nodes and 1 rule
+	@$(MAKE) test NODE_COUNT=100 RULE_COUNT=1
+
+.PHONY: test-medium
+test-medium: ## Medium test with 500 nodes and 2 rules
+	@$(MAKE) test NODE_COUNT=500 RULE_COUNT=2
+
+.PHONY: test-large
+test-large: ## Large test with 1000 nodes and 3 rules
+	@$(MAKE) test NODE_COUNT=1000 RULE_COUNT=3
+
+.PHONY: test-xlarge
+test-xlarge: ## Extra large test with 5000 nodes and 5 rules
+	@$(MAKE) test NODE_COUNT=5000 RULE_COUNT=5
+
+##@ Monitoring
+
+.PHONY: dashboard
+dashboard: ## Open Grafana dashboard in browser
+	@echo "Opening Grafana dashboard..."
+	@echo "URL: http://localhost:3000"
+	@echo "Username: admin"
+	@echo "Password: admin"
+	@echo ""
+	@echo "Import dashboard from: $(SCALE_DIR)/grafana-dashboard.json"
+	@open http://localhost:3000 2>/dev/null || xdg-open http://localhost:3000 2>/dev/null || echo "Please open http://localhost:3000 manually"
+
+.PHONY: prometheus
+prometheus: ## Open Prometheus UI in browser
+	@echo "Opening Prometheus UI..."
+	@open http://localhost:9090 2>/dev/null || xdg-open http://localhost:9090 2>/dev/null || echo "Please open http://localhost:9090 manually"
+
+.PHONY: logs
+logs: ## Show controller logs
+	@$(KUBECTL) logs -n $(CONTROLLER_NAMESPACE) -l control-plane=controller-manager --tail=100 -f
+
+.PHONY: metrics
+metrics: ## Show current Prometheus-formatted metrics from the controller Service
+	@echo "Fetching Prometheus metrics from controller Service..."
+	@$(KUBECTL) get --raw "/api/v1/namespaces/$(CONTROLLER_NAMESPACE)/services/http:metrics-service:8080/proxy/metrics" 2>/dev/null || \
+		echo "Unable to fetch /metrics via metrics-service. Ensure the controller is deployed with ENABLE_METRICS=true and the Service exists."
+
+##@ Cleanup
+
+.PHONY: clean-test
+clean-test: ## Clean up test resources (nodes and rules)
+	@echo "==> Cleaning up test resources"
+	@$(SCALE_DIR)/cleanup-kwok-nodes-rules.sh
+	@echo "✓ Test resources cleaned"
+
+.PHONY: clean-monitoring
+clean-monitoring: ## Uninstall monitoring stack
+	@echo "==> Uninstalling monitoring stack"
+	@pkill -f "port-forward.*grafana" 2>/dev/null || true
+	@pkill -f "port-forward.*prometheus" 2>/dev/null || true
+	@$(HELM) uninstall prom-stack -n monitoring 2>/dev/null || true
+	@$(KUBECTL) delete namespace monitoring --ignore-not-found=true
+	@echo "✓ Monitoring stack removed"
+
+.PHONY: clean-controller
+clean-controller: ## Uninstall controller
+	@echo "==> Uninstalling controller"
+	@cd $(PROJECT_ROOT) && $(MAKE) undeploy ENABLE_METRICS=true 2>/dev/null || true
+	@cd $(PROJECT_ROOT) && $(MAKE) uninstall 2>/dev/null || true
+	@echo "✓ Controller uninstalled"
+
+.PHONY: clean-cluster
+clean-cluster: ## Delete Kind cluster
+	@echo "==> Deleting Kind cluster: $(CLUSTER_NAME)"
+	@$(KIND) delete cluster --name $(CLUSTER_NAME)
+	@echo "✓ Cluster deleted"
+
+.PHONY: clean
+clean: clean-test clean-monitoring clean-controller clean-cluster ## Complete cleanup: remove everything
+	@echo ""
+	@echo "✅ Complete cleanup finished"
+
+##@ Utilities
+
+.PHONY: status
+status: ## Show status of all components
+	@echo "==> Cluster Status"
+	@echo "Cluster: $(CLUSTER_NAME)"
+	@$(KIND) get clusters | grep "^$(CLUSTER_NAME)$$" && echo "✓ Cluster exists" || echo "✗ Cluster not found"
+	@echo ""
+	@echo "==> Controller Status"
+	@$(KUBECTL) get pods -n $(CONTROLLER_NAMESPACE) 2>/dev/null || echo "✗ Controller not deployed"
+	@echo ""
+	@echo "==> Monitoring Status"
+	@$(KUBECTL) get pods -n monitoring 2>/dev/null || echo "✗ Monitoring not deployed"
+	@echo ""
+	@echo "==> Test Resources"
+	@echo "NodeReadinessRules:"
+	@$(KUBECTL) get nodereadinessrules 2>/dev/null || echo "  None"
+	@echo "KWOK Nodes:"
+	@$(KUBECTL) get nodes -l kwok.x-k8s.io/node=fake --no-headers 2>/dev/null | wc -l | xargs echo "  Count:" || echo "  0"
+	@echo ""
+	@echo "==> Port Forwarding"
+	@pgrep -f "port-forward.*grafana" > /dev/null && echo "✓ Grafana port-forward active (3000)" || echo "✗ Grafana port-forward not active"
+	@pgrep -f "port-forward.*prometheus" > /dev/null && echo "✓ Prometheus port-forward active (9090)" || echo "✗ Prometheus port-forward not active"
+
+.PHONY: verify
+verify: ## Verify all prerequisites are installed
+	@echo "==> Verifying Prerequisites"
+	@command -v $(KIND) >/dev/null 2>&1 && echo "✓ kind installed" || echo "✗ kind not found"
+	@command -v $(KUBECTL) >/dev/null 2>&1 && echo "✓ kubectl installed" || echo "✗ kubectl not found"
+	@command -v $(HELM) >/dev/null 2>&1 && echo "✓ helm installed" || echo "✗ helm not found"
+	@command -v $(CONTAINER_TOOL) >/dev/null 2>&1 && echo "✓ $(CONTAINER_TOOL) installed" || echo "✗ $(CONTAINER_TOOL) not found"
+	@command -v jq >/dev/null 2>&1 && echo "✓ jq installed" || echo "✗ jq not found"
+	@command -v bc >/dev/null 2>&1 && echo "✓ bc installed" || echo "✗ bc not found"
+
+.PHONY: info
+info: ## Show configuration information
+	@echo "==> Configuration"
+	@echo "Project Root:      $(PROJECT_ROOT)"
+	@echo "Scale Directory:   $(SCALE_DIR)"
+	@echo "Cluster Name:      $(CLUSTER_NAME)"
+	@echo "Container Tool:    $(CONTAINER_TOOL)"
+	@echo "Image:             $(IMG_PREFIX):$(IMG_TAG)"
+	@echo "Kind Config:       $(KIND_CONFIG)"
+	@echo "Controller NS:     $(CONTROLLER_NAMESPACE)"
diff --git a/hack/test-workloads/scale/README.md b/hack/test-workloads/scale/README.md
new file mode 100644
index 0000000..ee7b4a4
--- /dev/null
+++ b/hack/test-workloads/scale/README.md
@@ -0,0 +1,489 @@
+# Node Readiness Controller - Scale Testing Guide
+
+This guide explains how to run scale tests for Node Readiness Controller (NRR) with Prometheus and Grafana, and how to interpret the metrics that are currently emitted by the controller.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Metrics Available During Scale Tests](#metrics-available-during-scale-tests)
+- [Prerequisites](#prerequisites)
+- [Quick Start](#quick-start)
+- [Detailed Setup](#detailed-setup)
+- [Import the Grafana Dashboard](#import-the-grafana-dashboard)
+- [Run Scale Tests](#run-scale-tests)
+- [Monitoring Queries](#monitoring-queries)
+- [Interpreting Results](#interpreting-results)
+- [Troubleshooting](#troubleshooting)
+- [Cleanup](#cleanup)
+- [Advanced Usage](#advanced-usage)
+- [Additional Resources](#additional-resources)
+
+## Overview
+
+The scale test framework allows you to:
+
+  - Test NRR with multiple nodes using [KWOK](https://kwok.sigs.k8s.io/) (fake nodes).
+  - Measure taint addition and removal throughput.
+  - Observe evaluation logic performance in Grafana.
+  - Monitor controller resource usage (memory and CPU).
+  - Inspect `controller-runtime` metrics.
+  - Monitor NRR rule-level health and aggregate readiness metrics.
+
+## Metrics Available During Scale Tests
+
+During scale tests you will see both:
+
+  - **NRR custom metrics** exposed by the controller.
+  - **controller-runtime / process metrics** exposed by the manager and scraped by Prometheus.
+
+### NRR Custom Metrics (The "Mega-Scale" Stack)
+
+NRR uses an **Aggregate-First** telemetry strategy. This means we track the state of the cluster without introducing per-node metric labels, ensuring the controller remains lean as the cluster grows.
+
+The following controller metrics are currently registered:
+
+  - `node_readiness_rules_total`
+  - `node_readiness_taint_operations_total{rule, operation}`
+  - `node_readiness_evaluation_duration_seconds{rule}`
+  - `node_readiness_failures_total{rule, reason}`
+  - `node_readiness_bootstrap_completed_total{rule}`
+  - `node_readiness_reconciliation_latency_seconds{rule, operation}`
+  - `node_readiness_bootstrap_duration_seconds{rule}`
+  - `node_readiness_nodes_by_state{rule, state}`
+  - `node_readiness_rule_last_reconciliation_timestamp_seconds{rule}`
+
+## Prerequisites
+
+Ensure the following tools are installed:
+
+```bash
+kind version
+kubectl version --client
+helm version
+podman --version   # or docker --version
+jq --version
+bc --version
+```
+
+Container runtime support:
+
+  - **Podman** is the default in the Makefile
+  - **Docker** is also supported via `CONTAINER_TOOL=docker`
+
+## Quick Start
+
+### Option 1: Makefile
+
+```bash
+cd hack/test-workloads/scale
+
+# Full setup: cluster + controller + monitoring
+make setup
+
+# Run a test
+make test NODE_COUNT=1000 RULE_COUNT=3
+
+# Open Grafana
+make dashboard
+
+# Open Prometheus
+make prometheus
+
+# Inspect the controller's /metrics endpoint output
+make metrics
+```
+
+Using Docker instead of Podman:
+
+```bash
+cd hack/test-workloads/scale
+make setup CONTAINER_TOOL=docker
+make test NODE_COUNT=1000 RULE_COUNT=3 CONTAINER_TOOL=docker
+```
+
+### Option 2: Script
+
+```bash
+cd hack/test-workloads/scale
+
+# Setup monitoring stack
+./setup-monitoring.sh
+
+# Run scale test in another terminal
+./scale-test.sh 1000 3
+```
+
+## Detailed Setup
+
+### Container runtime configuration
+
+The Makefile supports both Podman and Docker.
+
+#### Podman
+
+```bash
+make setup
+make test NODE_COUNT=1000 RULE_COUNT=3
+```
+
+#### Docker
+
+```bash
+make setup CONTAINER_TOOL=docker
+make test NODE_COUNT=1000 RULE_COUNT=3 CONTAINER_TOOL=docker
+```
+
+#### Show current configuration
+
+```bash
+make info
+```
+
+### Available Make targets
+
+```bash
+make help
+make verify
+make info
+make status
+```
+
+Key targets:
+
+| Target | Description |
+| --- | --- |
+| `make setup` | Create cluster, install controller, install monitoring |
+| `make test NODE_COUNT=1000 RULE_COUNT=3` | Run scale test |
+| `make test-quick` | 100 nodes, 1 rule |
+| `make test-medium` | 500 nodes, 2 rules |
+| `make test-large` | 1000 nodes, 3 rules |
+| `make test-xlarge` | 5000 nodes, 5 rules |
+| `make dashboard` | Open Grafana |
+| `make prometheus` | Open Prometheus |
+| `make metrics` | Print the controller `/metrics` output via the Kubernetes Service proxy |
+| `make logs` | Follow controller logs |
+| `make status` | Show status of cluster, controller, monitoring, and port-forwarding |
+| `make clean` | Remove everything |
+
+### What `make setup` does
+
+`make setup` runs:
+
+1.  `create-cluster`
+2.  `install-controller`
+3.  `install-monitoring`
+
+Controller installation enables the metrics endpoint and deploys the controller into the `nrr-system` namespace.
+
+Monitoring installation:
+
+- installs or updates `kube-prometheus-stack`
+- creates the `monitoring` namespace
+- applies `servicemonitor.yaml`
+- configures Prometheus with a `5s` scrape interval for the stack
+- disables `nodeExporter` in this scale-test setup
+- starts local port-forwards for Grafana and Prometheus
+
+### Metrics scraping configuration
+
+The scale setup uses `hack/test-workloads/scale/servicemonitor.yaml`.
+
+Current behavior:
+
+  - scrapes the controller Service in namespace `nrr-system`
+  - matches Service labels:
+      - `control-plane: controller-manager`
+      - `app.kubernetes.io/name: nrrcontroller`
+  - scrapes endpoint:
+      - port: `http`
+      - scheme: `http`
+      - interval: `5s`
+
+This matches the scale-test setup, which deploys the controller with metrics enabled over HTTP.
+
+## Import the Grafana Dashboard
+
+1. Open Grafana at `http://localhost:3000`
+2. Login with:
+   - username: `admin`
+   - password: `admin` when using the Makefile setup
+   - password from script output when using `setup-monitoring.sh`
+3. Import `hack/test-workloads/scale/grafana-dashboard.json`
+4. Select Prometheus as the datasource
+
+The dashboard JSON in this directory is the source of truth for the available panels.
+
+Current dashboard highlights:
+
+- **NRR Ready Nodes (%)**: percentage of nodes currently in NRR `ready` state
+- **SLI: Fast Evaluations (% under 50ms)**: percentage of evaluations completing within 50ms
+- **Bootstrap Completions**: total number of completed bootstrap events
+- **Nodes by Readiness State**: aggregate counts for `ready`, `not_ready`, and `bootstrapping`
+- **Nodes by Rule and State**: readiness-state breakdown per rule
+- **Reconciliation Latency (P50/P95/P99)**: latency percentiles broken out by operation label
+- **Evaluation Rate by Rule**: how actively each rule is being evaluated
+- **Taint Operations (Throughput)**: add/remove operation rate
+- **Failures & Errors**: failure rate by reason
+- **Rule Reconciliation Age**: time since each rule last reconciled
+- **Workqueue Depth (Backlog)**: controller backlog indicator
+- **Controller Memory Usage** and **Controller CPU Usage** with both container-level and process-level visibility where available
+- **Bootstrap Duration by Rule (P95)**: bootstrap latency broken out per rule
+- **Bootstrap Duration Rate / Samples**: indicates whether bootstrap duration histograms currently have sample volume
+- **Total Taint Operations**: cumulative add/remove operations over the selected time range
+
+## Run Scale Tests
+
+### Using the Makefile
+
+```bash
+make test NODE_COUNT=1000 RULE_COUNT=3
+
+make test-quick
+make test-medium
+make test-large
+make test-xlarge
+```
+
+### Using the script directly
+
+```bash
+./scale-test.sh <NODE_COUNT> <RULE_COUNT>
+
+./scale-test.sh 100 1
+./scale-test.sh 1000 3
+./scale-test.sh 5000 5
+```
+
+### What the test does
+
+The test workflow is:
+
+1.  clean up old test artifacts
+2.  create one or more `NodeReadinessRule` objects
+3.  create fake KWOK nodes
+4.  wait for NRR to apply taints
+5.  patch node conditions so rules become satisfied
+6.  wait for NRR to remove taints
+7.  print timing and throughput results
+
+## Monitoring Queries
+
+Use these in Prometheus while running scale tests to validate controller performance.
+
+### Evaluation Performance
+
+Measures the percentage of evaluations completing within 50ms.
+
+```promql
+sum(rate(node_readiness_evaluation_duration_seconds_bucket{le="0.05"}[5m])) /
+sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100
+```
+
+### End-to-End Reconciliation Latency (P99)
+
+How long does it take NRR to react to a condition change in the cluster?
+
+```promql
+histogram_quantile(0.99,
+  sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m]))
+)
+```
+
+### Cluster Readiness Overview
+
+Safely aggregate node health without cardinality explosions.
+
+```promql
+sum by (state) (node_readiness_nodes_by_state)
+```
+
+### Controller Freshness (Is it stuck?)
+
+```promql
+# Alert if any rule has not reconciled in the last 120 seconds
+(time() - node_readiness_rule_last_reconciliation_timestamp_seconds) > 120
+```
+
+### Failure Rate
+
+```promql
+sum by (reason) (rate(node_readiness_failures_total[5m]))
+```
+
+### Controller Resource Usage
+
+```promql
+process_resident_memory_bytes
+rate(process_cpu_seconds_total[5m])
+```
+
+### Controller-runtime Metrics
+
+```promql
+sum(rate(controller_runtime_reconcile_total[5m]))
+sum(rate(controller_runtime_reconcile_errors_total[5m]))
+workqueue_depth
+```
+
+## Interpreting Results
+
+### Good signals during a healthy scale run
+
+- **Stable memory profile:** controller memory should stay relatively stable for a given test size.
+- **Evaluation performance:** the fast-evaluations panel tracks the percentage of evaluations completing within 50ms.
+- **Throughput spikes:** taint operations should spike during node creation and condition patching, then fall back down.
+- **Clean node transitions:** `node_readiness_nodes_by_state` should move from `not_ready` or `bootstrapping` toward `ready`.
+- **Per-rule visibility:** `Nodes by Rule and State` and `Evaluation Rate by Rule` should make it obvious if one rule is lagging behind the others.
+- **Bootstrap duration:** bootstrap duration panels reflect the end-to-end time for nodes to reach bootstrap completion.
+- **Bootstrap completion growth:** the bootstrap completions stat should rise as nodes complete bootstrap-only workflows.
+- **Low failure rate:** `node_readiness_failures_total` should remain low or flat in healthy runs.
+
+### Important note on ready percentage
+
+`NRR Ready Nodes (%)` is based on **NRR aggregate state**, not the Kubernetes `Ready=True` node condition.
+
+During scale tests, it is normal for this panel to stay low or at `0%` during the taint-add phase because the test intentionally creates nodes before satisfying the custom readiness conditions. It should increase after the condition patching phase completes.
+
+### Signals to investigate
+
+- **Rising workqueue depth:** indicates the controller cannot keep up with node events.
+- **High sustained latency percentiles:** suggests API pressure or reconciliation bottlenecks.
+- **Memory growth across repeated runs:** may indicate a leak or excessive retained state.
+- **Bootstrap duration getting worse with scale:** suggests the controller or API server is struggling to complete bootstrap-only workflows promptly.
+- **Rule lag continuously increasing:** investigate reconcile health if `node_readiness_rule_last_reconciliation_timestamp_seconds` stops advancing while work remains.
+
+### Example validation checklist
+
+After a `make test-large` run:
+
+  - verify taint operations occurred: `sum(node_readiness_taint_operations_total)`
+  - verify evaluations occurred: `sum(node_readiness_evaluation_duration_seconds_count)`
+  - verify no unexpected sustained failures: `sum(rate(node_readiness_failures_total[5m]))`
+  - verify aggregate node state moved as expected: `node_readiness_nodes_by_state`
+
+## Troubleshooting
+
+### Metrics are missing in Prometheus
+
+Check that the controller is running:
+
+```bash
+kubectl get pods -n nrr-system
+kubectl logs -n nrr-system -l control-plane=controller-manager --tail=100
+```
+
+Check that the ServiceMonitor exists:
+
+```bash
+kubectl get servicemonitor -n monitoring
+kubectl get servicemonitor -n monitoring node-readiness-controller-monitor -o yaml
+```
+
+Check Prometheus targets:
+
+```bash
+make prometheus
+```
+
+Then inspect `http://localhost:9090/targets`.
+
+### Metrics endpoint not reachable
+
+The scale setup scrapes the HTTP metrics endpoint through the ServiceMonitor. Verify the Service exists and exposes port `http`:
+
+```bash
+kubectl get svc -n nrr-system
+kubectl get svc -n nrr-system metrics-service -o yaml
+```
+
+### Dashboard shows no data
+
+  - ensure the Grafana time range includes the test interval
+  - verify the Prometheus datasource is healthy
+  - confirm the imported dashboard uses the Prometheus datasource
+  - query the metrics directly in Prometheus first
+
+### Scale test hangs
+
+```bash
+kubectl logs -n nrr-system -l control-plane=controller-manager -f
+kubectl get nodes -l kwok.x-k8s.io/node=fake --watch
+kubectl get nodereadinessrules
+```
+
+### Port forwarding fails
+
+```bash
+lsof -i :3000
+lsof -i :9090
+```
+
+Then restart:
+
+```bash
+kubectl port-forward -n monitoring svc/prom-stack-grafana 3000:80 &
+kubectl port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 &
+```
+
+### Podman image build fails
+
+```bash
+cd ../../../
+make podman-build
+podman images | grep controller
+```
+
+## Cleanup
+
+### Makefile targets
+
+```bash
+make clean-test
+make clean-monitoring
+make clean-controller
+make clean-cluster
+make clean
+```
+
+### Manual cleanup
+
+```bash
+./cleanup-kwok-nodes-rules.sh
+
+kubectl delete nodereadinessrules -l scale-test=true
+kubectl delete nodes -l kwok.x-k8s.io/node=fake
+
+pkill -f "port-forward.*grafana"
+pkill -f "port-forward.*prometheus"
+
+kind delete cluster --name nrr-test
+helm uninstall prom-stack -n monitoring
+```
+
+## Advanced Usage
+
+### View logs, metrics, and component status
+
+```bash
+make logs
+make metrics
+make status
+```
+
+### Inspect the current setup
+
+```bash
+make verify
+make info
+```
+
+## Additional Resources
+
+- [Monitoring Operations Guide](../../../docs/book/src/operations/monitoring.md)
+- [Main Project README](../../../README.md)
+- [Architecture Draft](../../../docs/architecture.draft.md)
+- [API Reference](../../../docs/book/src/reference/api-spec.md)
+
+-----
+
+Happy testing!
\ No newline at end of file
diff --git a/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh b/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh
new file mode 100755
index 0000000..9b3908d
--- /dev/null
+++ b/hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Cleanup script for KWOK nodes and NodeReadinessRules
+
+set -euo pipefail
+
+echo "=== Cleanup Script ==="
+echo ""
+
+# Delete all KWOK nodes
+echo "🧹 Deleting all KWOK nodes..."
+NODE_COUNT=$(kubectl get nodes -l kwok.x-k8s.io/node=fake --no-headers 2>/dev/null | wc -l | tr -d ' ')
+
+if [ "$NODE_COUNT" -eq 0 ]; then
+  echo "   No KWOK nodes found."
+else
+  echo "   Found $NODE_COUNT KWOK nodes. Deleting..."
+  kubectl delete nodes -l kwok.x-k8s.io/node=fake --grace-period=0 --force
+  echo "   ✓ All KWOK nodes deleted"
+fi
+
+echo ""
+
+# Delete all NodeReadinessRules
+echo "🧹 Deleting all NodeReadinessRules..."
+NRR_COUNT=$(kubectl get nodereadinessrules --no-headers 2>/dev/null | wc -l | tr -d ' ')
+
+if [ "$NRR_COUNT" -eq 0 ]; then
+  echo "   No NodeReadinessRules found."
+else
+  echo "   Found $NRR_COUNT NodeReadinessRule(s). Deleting..."
+  kubectl delete nodereadinessrules --all
+  echo "   ✓ All NodeReadinessRules deleted"
+fi
+
+echo ""
+echo "=== Cleanup Complete ==="
diff --git a/hack/test-workloads/scale/grafana-dashboard.json b/hack/test-workloads/scale/grafana-dashboard.json
new file mode 100644
index 0000000..ef88cf3
--- /dev/null
+++ b/hack/test-workloads/scale/grafana-dashboard.json
@@ -0,0 +1,406 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {"type": "grafana", "uid": "-- Grafana --"},
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "panels": [
+    {
+      "title": "NRR Ready Nodes (%)",
+      "type": "stat",
+      "gridPos": {"h": 6, "w": 8, "x": 0, "y": 0},
+      "id": 1,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "100 * sum(node_readiness_nodes_by_state{state=\"ready\"}) / clamp_min(sum(node_readiness_nodes_by_state), 1)",
+          "legendFormat": "Ready %"
+        }
+      ],
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "value"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {"color": "red", "value": null},
+              {"color": "yellow", "value": 50},
+              {"color": "green", "value": 95}
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "SLI: Fast Evaluations (% under 50ms)",
+      "type": "stat",
+      "gridPos": {"h": 6, "w": 8, "x": 8, "y": 0},
+      "id": 2,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(rate(node_readiness_evaluation_duration_seconds_bucket{le=\"0.05\"}[5m])) / sum(rate(node_readiness_evaluation_duration_seconds_count[5m])) * 100",
+          "legendFormat": "Evaluations < 50ms"
+        }
+      ],
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "value"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {"color": "red", "value": null},
+              {"color": "yellow", "value": 95},
+              {"color": "green", "value": 99}
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Bootstrap Completions",
+      "type": "stat",
+      "gridPos": {"h": 6, "w": 8, "x": 16, "y": 0},
+      "id": 3,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(node_readiness_bootstrap_completed_total)",
+          "legendFormat": "Completed"
+        }
+      ],
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "value"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {"color": "blue", "value": null},
+              {"color": "green", "value": 1}
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Nodes by Readiness State",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
+      "id": 4,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (state) (node_readiness_nodes_by_state)",
+          "legendFormat": "{{state}}"
+        }
+      ],
+      "options": {
+        "legend": {"displayMode": "table", "placement": "right"},
+        "tooltip": {"mode": "multi"}
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 20,
+            "stacking": {"group": "A", "mode": "normal"}
+          }
+        }
+      }
+    },
+    {
+      "title": "Nodes by Rule and State",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
+      "id": 5,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (rule, state) (node_readiness_nodes_by_state)",
+          "legendFormat": "{{rule}} / {{state}}"
+        }
+      ],
+      "options": {
+        "legend": {"displayMode": "table", "placement": "right"},
+        "tooltip": {"mode": "multi"}
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "stacking": {"group": "A", "mode": "normal"}
+          }
+        }
+      }
+    },
+    {
+      "title": "Reconciliation Latency (P50/P95/P99)",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
+      "id": 6,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))",
+          "legendFormat": "P50 {{operation}}"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))",
+          "legendFormat": "P95 {{operation}}"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum by (le, operation) (rate(node_readiness_reconciliation_latency_seconds_bucket[5m])))",
+          "legendFormat": "P99 {{operation}}"
+        }
+      ],
+      "options": {
+        "legend": {"displayMode": "list", "placement": "bottom"}
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {"drawStyle": "line", "fillOpacity": 10, "lineWidth": 2}
+        }
+      }
+    },
+    {
+      "title": "Evaluation Rate by Rule",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
+      "id": 7,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (rule) (rate(node_readiness_evaluation_duration_seconds_count[5m]))",
+          "legendFormat": "{{rule}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {"drawStyle": "line", "fillOpacity": 15, "lineWidth": 2}
+        }
+      }
+    },
+    {
+      "title": "Taint Operations (Throughput)",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 22},
+      "id": 8,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (operation) (rate(node_readiness_taint_operations_total[1m]))",
+          "legendFormat": "{{operation}} rate"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {"drawStyle": "bars", "fillOpacity": 80}
+        }
+      }
+    },
+    {
+      "title": "Failures & Errors",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 22},
+      "id": 9,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (reason) (rate(node_readiness_failures_total[1m])) or vector(0)",
+          "legendFormat": "{{reason}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {"drawStyle": "line", "fillOpacity": 20, "lineWidth": 2}
+        }
+      }
+    },
+    {
+      "title": "Rule Reconciliation Age",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 8, "x": 0, "y": 30},
+      "id": 10,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "time() - node_readiness_rule_last_reconciliation_timestamp_seconds",
+          "legendFormat": "{{rule}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {"drawStyle": "line", "fillOpacity": 20}
+        }
+      }
+    },
+    {
+      "title": "Workqueue Depth (Backlog)",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 8, "x": 8, "y": 30},
+      "id": 11,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(workqueue_depth)",
+          "legendFormat": "Queue Depth"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "custom": {"drawStyle": "line", "fillOpacity": 30}
+        }
+      }
+    },
+    {
+      "title": "Controller Memory Usage",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 8, "x": 16, "y": 30},
+      "id": 12,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(container_memory_working_set_bytes{namespace=\"nrr-system\", container=\"manager\"})",
+          "legendFormat": "Container Working Set"
+        },
+        {
+          "expr": "process_resident_memory_bytes",
+          "legendFormat": "Process RSS"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes",
+          "custom": {"drawStyle": "line", "fillOpacity": 20}
+        }
+      }
+    },
+    {
+      "title": "Controller CPU Usage",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 12, "x": 0, "y": 37},
+      "id": 13,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"nrr-system\", container=\"manager\"}[1m]))",
+          "legendFormat": "Container CPU"
+        },
+        {
+          "expr": "rate(process_cpu_seconds_total[1m])",
+          "legendFormat": "Process CPU"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "custom": {"drawStyle": "line", "fillOpacity": 20}
+        }
+      }
+    },
+    {
+      "title": "Bootstrap Duration by Rule (P95)",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 12, "x": 12, "y": 37},
+      "id": 14,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum by (le, rule) (rate(node_readiness_bootstrap_duration_seconds_bucket[5m])))",
+          "legendFormat": "P95 {{rule}}"
+        }
+      ],
+      "options": {
+        "legend": {"displayMode": "list", "placement": "bottom"}
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {"drawStyle": "line", "fillOpacity": 15, "lineWidth": 2}
+        }
+      }
+    },
+    {
+      "title": "Bootstrap Duration Rate / Samples",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 12, "x": 0, "y": 44},
+      "id": 15,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(rate(node_readiness_bootstrap_duration_seconds_count[5m]))",
+          "legendFormat": "bootstrap samples/sec"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {"drawStyle": "line", "fillOpacity": 20}
+        }
+      }
+    },
+    {
+      "title": "Total Taint Operations",
+      "type": "timeseries",
+      "gridPos": {"h": 7, "w": 12, "x": 12, "y": 44},
+      "id": 16,
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(node_readiness_taint_operations_total{operation=\"add\"})",
+          "legendFormat": "adds"
+        },
+        {
+          "expr": "sum(node_readiness_taint_operations_total{operation=\"remove\"})",
+          "legendFormat": "removes"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "custom": {"drawStyle": "line", "fillOpacity": 20}
+        }
+      }
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "tags": ["nrr", "kubernetes", "scale-test"],
+  "time": {"from": "now-15m", "to": "now"},
+  "timepicker": {"refresh_intervals": ["5s", "10s", "30s", "1m"]},
+  "title": "Node Readiness Controller - Production Scale",
+  "uid": "nrr-scale-dashboard",
+  "version": 1
+}
\ No newline at end of file
diff --git a/hack/test-workloads/scale/kind-config.yaml b/hack/test-workloads/scale/kind-config.yaml
new file mode 100644
index 0000000..e3e8c17
--- /dev/null
+++ b/hack/test-workloads/scale/kind-config.yaml
@@ -0,0 +1,7 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: nrr-test
+nodes:
+- role: control-plane
+- role: worker
+- role: worker
diff --git a/hack/test-workloads/scale/scale-test.sh b/hack/test-workloads/scale/scale-test.sh
new file mode 100755
index 0000000..1d7f9bd
--- /dev/null
+++ b/hack/test-workloads/scale/scale-test.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+################################################################################
+# NRR SCALE TEST UTILITY - HELP GUIDE
+################################################################################
+# USAGE:
+#   ./scale-test.sh <NODE_COUNT> <RULE_COUNT>
+#
+# ARGUMENTS:
+#   NODE_COUNT : Total fake KWOK nodes to create (Default: 10)
+#   RULE_COUNT : Total NodeReadinessRules to evaluate per node (Default: 1)
+#
+# DESCRIPTION:
+#   1. Cleans up previous test artifacts (nodes and rules).
+#   2. Creates N rules with unique conditions and taints.
+#   3. Spawns M KWOK nodes in parallel batches.
+#   4. Measures 'Taint Addition' latency (Controller reacting to new nodes).
+#   5. Patches all nodes to satisfy all rule conditions.
+#   6. Measures 'Taint Removal' latency (Controller finalizing readiness).
+################################################################################
+
+# Input Parameters
+NODE_COUNT=${1:-10}
+RULE_COUNT=${2:-1} 
+BATCH_SIZE=50  
+BASE_RULE_NAME="kwok-network-rule"
+
+# Validate input
+if ! [[ "$NODE_COUNT" =~ ^[0-9]+$ ]] || ! [[ "$RULE_COUNT" =~ ^[0-9]+$ ]]; then
+  echo "Error: Please provide valid positive numbers for node and rule counts"
+  echo "Example: ./scale-test.sh 1000 3"
+  exit 1
+fi
+
+echo "🚀 Starting Scale Test: $NODE_COUNT Nodes | $RULE_COUNT Rules"
+echo "----------------------------------------------------------"
+
+# Step 0: Cleanup
+echo "Step 0: Cleaning up existing resources..."
+kubectl delete nodereadinessrules -l scale-test=true --ignore-not-found=true
+kubectl delete nodes -l kwok.x-k8s.io/node=fake --ignore-not-found=true
+sleep 2
+
+# Step 1: Create Multiple Rules
+echo "Step 1: Creating $RULE_COUNT rules..."
+for r in $(seq 1 $RULE_COUNT); do
+  cat <<EOF | kubectl apply -f -
+apiVersion: readiness.node.x-k8s.io/v1alpha1
+kind: NodeReadinessRule
+metadata:
+  name: ${BASE_RULE_NAME}-$r
+  labels: { scale-test: "true" }
+spec:
+  nodeSelector: { matchLabels: { kwok.x-k8s.io/node: fake } }
+  conditions:
+    - type: "network.kubernetes.io/CNIReady-$r"
+      requiredStatus: "True"
+  taint:
+    key: "readiness.k8s.io/network-unready-$r"
+    value: "true"
+    effect: NoSchedule
+  enforcementMode: "bootstrap-only"
+EOF
+done
+
+# Step 2: Create Nodes
+echo "Step 2: Spawning $NODE_COUNT nodes in parallel..."
+TAINT_START_TIME=$(date +%s); TAINT_START_NANOS=$(date +%N)
+
+create_node() {
+  cat <<EOF | kubectl apply -f - 2>/dev/null
+apiVersion: v1
+kind: Node
+metadata:
+  name: kwok-node-$1
+  labels: { kwok.x-k8s.io/node: fake }
+spec:
+  taints: [{key: "kwok.x-k8s.io/node", value: "fake", effect: "NoSchedule"}]
+status:
+  allocatable: {cpu: "32", memory: "256Gi", pods: "110"}
+  capacity: {cpu: "32", memory: "256Gi", pods: "110"}
+  conditions: [{type: "Ready", status: "True", reason: "KubeletReady", message: "ready", lastHeartbeatTime: "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", lastTransitionTime: "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"}]
+EOF
+}
+
+for batch_start in $(seq 1 $BATCH_SIZE $NODE_COUNT); do
+  batch_end=$((batch_start + BATCH_SIZE - 1))
+  [ $batch_end -gt $NODE_COUNT ] && batch_end=$NODE_COUNT
+  for i in $(seq $batch_start $batch_end); do create_node $i & done
+  wait
+done
+
+# Step 3: Wait for ALL taints
+echo "Step 3: Waiting for Controller to add $((NODE_COUNT * RULE_COUNT)) total taints..."
+while true; do
+  TOTAL_TAINTS=$(kubectl get nodes -l kwok.x-k8s.io/node=fake -o json | jq "[.items[].spec.taints // [] | .[] | select(.key | startswith(\"readiness.k8s.io/network-unready\"))] | length")
+  [ "$TOTAL_TAINTS" -eq $((NODE_COUNT * RULE_COUNT)) ] && break
+  echo -n "[$TOTAL_TAINTS]" && sleep 1
+done
+TAINT_END_TIME=$(date +%s); TAINT_END_NANOS=$(date +%N)
+
+# Step 4: Patch Conditions
+echo -e "\nStep 4: Satisfying conditions for all rules..."
+UNTAINT_START_TIME=$(date +%s); UNTAINT_START_NANOS=$(date +%N)
+
+patch_node_conditions() {
+  PATCH_JSON="["
+  for r in $(seq 1 $RULE_COUNT); do
+    PATCH_JSON+="{\"op\":\"add\",\"path\":\"/status/conditions/-\",\"value\":{\"type\":\"network.kubernetes.io/CNIReady-$r\",\"status\":\"True\",\"lastHeartbeatTime\":\"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\",\"lastTransitionTime\":\"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\",\"reason\":\"CNIReady\",\"message\":\"ready\"}}"
+    [ $r -lt $RULE_COUNT ] && PATCH_JSON+=","
+  done
+  PATCH_JSON+="]"
+  kubectl patch node kwok-node-$1 --subresource=status --type=json -p="$PATCH_JSON" > /dev/null 2>&1
+}
+
+for batch_start in $(seq 1 $BATCH_SIZE $NODE_COUNT); do
+  batch_end=$((batch_start + BATCH_SIZE - 1))
+  [ $batch_end -gt $NODE_COUNT ] && batch_end=$NODE_COUNT
+  for i in $(seq $batch_start $batch_end); do patch_node_conditions $i & done
+  wait
+done
+
+# Step 5: Wait for Removal
+echo "Step 5: Waiting for Taint removal..."
+while true; do
+  REMAINING=$(kubectl get nodes -l kwok.x-k8s.io/node=fake -o json | jq "[.items[].spec.taints // [] | .[] | select(.key | startswith(\"readiness.k8s.io/network-unready\"))] | length")
+  [ "$REMAINING" -eq 0 ] && break
+  echo -n "[$REMAINING]" && sleep 1
+done
+UNTAINT_END_TIME=$(date +%s); UNTAINT_END_NANOS=$(date +%N)
+
+# Step 6: Final Stats
+TAINT_MS=$(echo "scale=0; (($TAINT_END_TIME - $TAINT_START_TIME) * 1000) + (($TAINT_END_NANOS - $TAINT_START_NANOS) / 1000000)" | bc)
+UNTAINT_MS=$(echo "scale=0; (($UNTAINT_END_TIME - $UNTAINT_START_TIME) * 1000) + (($UNTAINT_END_NANOS - $UNTAINT_START_NANOS) / 1000000)" | bc)
+AVG_SIZE=$(kubectl get nodereadinessrules -l scale-test=true -o json | jq '[.items[] | tostring | length] | add / length')
+
+echo -e "\n\n╔════════════════════════════════════════════════════════════════╗"
+echo "║                MULTI-RULE PERFORMANCE SUMMARY                  ║"
+echo "╠════════════════════════════════════════════════════════════════╣"
+printf "║ Total Nodes:           %-40s║\n" "$NODE_COUNT"
+printf "║ Active Rules:          %-40s║\n" "$RULE_COUNT"
+printf "║ Taint Add Time:        %-40s║\n" "${TAINT_MS} ms"
+printf "║ Taint Remove Time:     %-40s║\n" "${UNTAINT_MS} ms"
+echo "║                                                                ║"
+printf "║ Avg Rule Size:         %-40s║\n" "${AVG_SIZE%.*} bytes"
+echo "╚════════════════════════════════════════════════════════════════╝"
diff --git a/hack/test-workloads/scale/servicemonitor.yaml b/hack/test-workloads/scale/servicemonitor.yaml
new file mode 100644
index 0000000..610c41e
--- /dev/null
+++ b/hack/test-workloads/scale/servicemonitor.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: node-readiness-controller-monitor
+  namespace: monitoring
+  labels:
+    release: prom-stack # This matches your helm release
+spec:
+  # This tells Prometheus to look in the nrr-system namespace
+  namespaceSelector:
+    matchNames:
+      - nrr-system
+  # This matches the labels found in your 'kubectl get svc' output
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+      app.kubernetes.io/name: nrrcontroller
+  endpoints:
+  - port: http # Match the spec.ports[0].name from your service
+    scheme: http # Your service is not using HTTPS/TLS
+    interval: 5s # High-resolution for your scale test
diff --git a/hack/test-workloads/scale/setup-monitoring.sh b/hack/test-workloads/scale/setup-monitoring.sh
new file mode 100755
index 0000000..1c0f2e5
--- /dev/null
+++ b/hack/test-workloads/scale/setup-monitoring.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Complete setup script for NRR scale testing with Prometheus and Grafana
+# This script:
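+# 1. Checks that kind, kubectl, helm and podman are installed
+# 2. Creates a Kind cluster and installs NRR using Podman
+# 3. Installs the Prometheus stack (kube-prometheus-stack)
+# 4. Creates the ServiceMonitor for NRR metrics
+# 5. Retrieves the Grafana admin credentials
+# 6. Sets up port forwarding and prints Grafana dashboard import instructions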
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Configuration
+CLUSTER_NAME="${CLUSTER_NAME:-nrr-test}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+
+echo -e "${BLUE}=========================================${NC}"
+echo -e "${BLUE}NRR Scale Test Setup with Monitoring${NC}"
+echo -e "${BLUE}=========================================${NC}"
+echo ""
+
+# Function to print step headers
+print_step() {
+    echo ""
+    echo -e "${GREEN}==> $1${NC}"
+}
+
+# Function to print warnings
+print_warning() {
+    echo -e "${YELLOW}⚠️  $1${NC}"
+}
+
+# Function to print errors
+print_error() {
+    echo -e "${RED}❌ $1${NC}"
+}
+
+# Function to print success
+print_success() {
+    echo -e "${GREEN}✓ $1${NC}"
+}
+
+# Check prerequisites
+print_step "Step 1: Checking prerequisites..."
+
+MISSING_TOOLS=()
+
+if ! command -v kind &> /dev/null; then
+    MISSING_TOOLS+=("kind")
+fi
+
+if ! command -v kubectl &> /dev/null; then
+    MISSING_TOOLS+=("kubectl")
+fi
+
+if ! command -v helm &> /dev/null; then
+    MISSING_TOOLS+=("helm")
+fi
+
+if ! command -v podman &> /dev/null; then
+    MISSING_TOOLS+=("podman")
+fi
+
+if [ ${#MISSING_TOOLS[@]} -ne 0 ]; then
+    print_error "Missing required tools: ${MISSING_TOOLS[*]}"
+    echo ""
+    echo "Please install:"
+    for tool in "${MISSING_TOOLS[@]}"; do
+        echo "  - $tool"
+    done
+    exit 1
+fi
+
+print_success "All prerequisites installed"
+
+# Create Kind cluster and install NRR
+print_step "Step 2: Creating Kind cluster and installing NRR with Podman..."
+cd "$PROJECT_ROOT"
+
+if kind get clusters | grep -q "^${CLUSTER_NAME}$"; then
+    print_warning "Cluster '$CLUSTER_NAME' already exists. Deleting..."
+    kind delete cluster --name "$CLUSTER_NAME"
+fi
+
+# Run the podman-kind-test target
+print_success "Running: make podman-kind-test"
+if ! make podman-kind-test KIND_CLUSTER="$CLUSTER_NAME"; then
+    print_error "Failed to create cluster and install NRR"
+    exit 1
+fi
+
+print_success "NRR installed successfully"
+
+# Add Prometheus Helm repo
+print_step "Step 3: Setting up Prometheus stack..."
+
+print_success "Adding Prometheus Helm repository..."
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+
+# Install Prometheus stack
+print_success "Installing kube-prometheus-stack..."
+helm upgrade --install prom-stack prometheus-community/kube-prometheus-stack \
+  --namespace monitoring \
+  --create-namespace \
+  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+  --set prometheus.prometheusSpec.scrapeInterval="5s" \
+  --set nodeExporter.enabled=false \
+  --wait \
+  --timeout 5m
+
+print_success "Prometheus stack installed"
+
+# Wait for Prometheus pods to be ready
+print_success "Waiting for Prometheus pods to be ready..."
+kubectl wait --for=condition=ready pod \
+  -l app.kubernetes.io/name=prometheus \
+  -n monitoring \
+  --timeout=300s
+
+kubectl wait --for=condition=ready pod \
+  -l app.kubernetes.io/name=grafana \
+  -n monitoring \
+  --timeout=300s
+
+print_success "Prometheus and Grafana are ready"
+
+# Create ServiceMonitor
+print_step "Step 4: Creating ServiceMonitor for NRR metrics..."
+
+if [ ! -f "$SCRIPT_DIR/servicemonitor.yaml" ]; then
+    print_error "ServiceMonitor file not found: $SCRIPT_DIR/servicemonitor.yaml"
+    exit 1
+fi
+
+kubectl apply -f "$SCRIPT_DIR/servicemonitor.yaml"
+print_success "ServiceMonitor created"
+
+# Get Grafana admin password
+print_step "Step 5: Retrieving Grafana credentials..."
+GRAFANA_PASSWORD=$(kubectl get secret --namespace monitoring prom-stack-grafana -o jsonpath="{.data.admin-password}" | base64 --decode)
+
+print_success "Grafana admin password retrieved"
+
+# Setup port forwarding
+print_step "Step 6: Setting up port forwarding..."
+
+# Kill any existing port forwards
+pkill -f "port-forward.*grafana" 2>/dev/null || true
+pkill -f "port-forward.*prometheus" 2>/dev/null || true
+
+# Start port forwarding in background
+kubectl port-forward -n monitoring svc/prom-stack-grafana 3000:80 > /dev/null 2>&1 &
+GRAFANA_PF_PID=$!
+
+kubectl port-forward -n monitoring svc/prom-stack-kube-prometheus-prometheus 9090:9090 > /dev/null 2>&1 &
+PROMETHEUS_PF_PID=$!
+
+# Wait for port forwards to be ready
+sleep 3
+
+print_success "Port forwarding established"
+
+# Print final instructions
+echo ""
+echo -e "${BLUE}=========================================${NC}"
+echo -e "${BLUE}Setup Complete!${NC}"
+echo -e "${BLUE}=========================================${NC}"
+echo ""
+echo -e "${GREEN}📊 Access URLs:${NC}"
+echo -e "  Grafana:    ${BLUE}http://localhost:3000${NC}"
+echo -e "  Prometheus: ${BLUE}http://localhost:9090${NC}"
+echo ""
+echo -e "${GREEN}🔐 Grafana Credentials:${NC}"
+echo -e "  Username: ${BLUE}admin${NC}"
+echo -e "  Password: ${BLUE}${GRAFANA_PASSWORD}${NC}"
+echo ""
+echo -e "${GREEN}📈 Import Dashboard:${NC}"
+echo "  1. Open Grafana: http://localhost:3000"
+echo "  2. Login with credentials above"
+echo "  3. Go to: Dashboards → Import"
+echo "  4. Click 'Upload JSON file'"
+echo "  5. Select: $SCRIPT_DIR/grafana-dashboard.json"
+echo "  6. Select Prometheus datasource"
+echo "  7. Click 'Import'"
+echo ""
+echo -e "${GREEN}🚀 Run Scale Test:${NC}"
+echo "  cd $PROJECT_ROOT"
+echo "  ./hack/test-workloads/scale/scale-test.sh 1000"
+echo ""
+echo -e "${GREEN}🧹 Cleanup:${NC}"
+echo "  ./hack/test-workloads/scale/cleanup-kwok-nodes-rules.sh"
+echo "  kind delete cluster --name $CLUSTER_NAME"
+echo ""
+echo -e "${YELLOW}⚠️  Port forwarding is running in background${NC}"
+echo -e "${YELLOW}   PIDs: Grafana=$GRAFANA_PF_PID, Prometheus=$PROMETHEUS_PF_PID${NC}"
+echo -e "${YELLOW}   To stop: kill $GRAFANA_PF_PID $PROMETHEUS_PF_PID${NC}"
+echo ""
+echo -e "${GREEN}Press Ctrl+C to stop port forwarding and exit${NC}"
+
+# Keep script running to maintain port forwards
+trap "echo ''; echo 'Stopping port forwarding...'; kill $GRAFANA_PF_PID $PROMETHEUS_PF_PID 2>/dev/null; exit 0" INT TERM
+
+# Wait for port forward processes
+wait $GRAFANA_PF_PID $PROMETHEUS_PF_PID
+
+# Made with Bob
diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go
index fc04d27..7a0152f 100644
--- a/internal/controller/node_controller.go
+++ b/internal/controller/node_controller.go
@@ -19,6 +19,7 @@ package controller
 import (
 	"context"
 	"fmt"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -128,6 +129,9 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context
 		if r.isBootstrapCompleted(ctx, node.Name, rule.Name) && rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly {
 			log.Info("Skipping bootstrap-only rule - already completed",
 				"node", node.Name, "rule", rule.Name)
+			// Update metrics to reflect current state even when skipping
+			r.updateNodesByStateMetrics(ctx, rule)
+			metrics.RuleLastReconciliationTime.WithLabelValues(rule.Name).Set(float64(time.Now().Unix()))
 			continue
 		}
 
@@ -156,8 +160,9 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context
 			"rule", rule.Name,
 			"resourceVersion", rule.ResourceVersion)
 
+		var latestRule *readinessv1alpha1.NodeReadinessRule
 		err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
-			latestRule := &readinessv1alpha1.NodeReadinessRule{}
+			latestRule = &readinessv1alpha1.NodeReadinessRule{}
 			if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, latestRule); err != nil {
 				return err
 			}
@@ -212,10 +217,12 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context
 				"resourceVersion", rule.ResourceVersion)
 			// continue with other rules
 		} else {
+			r.updateNodesByStateMetrics(ctx, latestRule)
+			metrics.RuleLastReconciliationTime.WithLabelValues(latestRule.Name).Set(float64(time.Now().Unix()))
 			log.V(4).Info("Successfully persisted rule status from node reconciler",
 				"node", node.Name,
-				"rule", rule.Name,
-				"newResourceVersion", rule.ResourceVersion)
+				"rule", latestRule.Name,
+				"newResourceVersion", latestRule.ResourceVersion)
 		}
 	}
 }
diff --git a/internal/controller/node_controller_test.go b/internal/controller/node_controller_test.go
index fee7160..6dc19d7 100644
--- a/internal/controller/node_controller_test.go
+++ b/internal/controller/node_controller_test.go
@@ -18,24 +18,21 @@ package controller
 
 import (
 	"context"
-	"sync/atomic"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	dto "github.com/prometheus/client_model/go"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes/fake"
 	"k8s.io/client-go/tools/record"
-	"sigs.k8s.io/controller-runtime/pkg/client"
-	fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
-	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	nodereadinessiov1alpha1 "sigs.k8s.io/node-readiness-controller/api/v1alpha1"
+	"sigs.k8s.io/node-readiness-controller/internal/metrics"
 )
 
 var _ = Describe("Node Controller", func() {
@@ -694,247 +691,144 @@ var _ = Describe("Node Controller", func() {
 			}, time.Second*5).Should(BeTrue(), "NodeEvaluation should be updated with new condition and taint status")
 		})
 	})
-
-	// These tests use the controller-runtime fake client (not envtest's
-	// k8sClient) with interceptors to simulate concurrent node modifications.
-	// The fake client enforces resourceVersion checks, so when
-	// MergeFromWithOptimisticLock is used and another write bumps the
-	// resourceVersion, the patch fails with a Conflict error — the same
-	// behavior a real API server would produce.
-	Context("optimistic locking on taint operations", func() {
+	Context("when updating aggregate readiness-state metrics from node reconciliation", func() {
 		var (
-			ctx        context.Context
-			testScheme *runtime.Scheme
+			ctx                 context.Context
+			readinessController *RuleReadinessController
+			nodeReconciler      *NodeReconciler
+			fakeClientset       *fake.Clientset
+			node1               *corev1.Node
+			node2               *corev1.Node
+			rule                *nodereadinessiov1alpha1.NodeReadinessRule
 		)
 
+		readGaugeValue := func(ruleName, state string) float64 {
+			metric := &dto.Metric{}
+			Expect(nodereadinessiov1alpha1.EnforcementModeContinuous).NotTo(BeEmpty())
+			Expect(metrics.NodesByState.WithLabelValues(ruleName, state).Write(metric)).To(Succeed())
+			return metric.GetGauge().GetValue()
+		}
+
 		BeforeEach(func() {
 			ctx = context.Background()
-			testScheme = runtime.NewScheme()
-			Expect(corev1.AddToScheme(testScheme)).To(Succeed())
-		})
-
-		It("should retry and succeed when removeTaintBySpec encounters a conflict", func() {
-			node := &corev1.Node{
-				ObjectMeta: metav1.ObjectMeta{Name: "ol-remove-conflict"},
-				Spec: corev1.NodeSpec{
-					Taints: []corev1.Taint{
-						{Key: "readiness.k8s.io/test", Effect: corev1.TaintEffectNoSchedule},
-						{Key: "other-controller/taint", Effect: corev1.TaintEffectNoSchedule},
-					},
-				},
-			}
 
-			var patchCount atomic.Int32
-
-			// The interceptor simulates a concurrent modification: on the
-			// first Patch call it updates the node (bumping resourceVersion)
-			// before delegating to the real Patch. Because
-			// MergeFromWithOptimisticLock embeds the original resourceVersion,
-			// the fake client detects the mismatch and returns a Conflict.
-			// The retry logic should handle this and succeed on the second attempt.
-			fc := fakeclient.NewClientBuilder().
-				WithScheme(testScheme).
-				WithObjects(node).
-				WithInterceptorFuncs(interceptor.Funcs{
-					Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error {
-						if obj.GetName() == "ol-remove-conflict" && patchCount.Add(1) == 1 {
-							// Simulate concurrent modification by another controller.
-							current := &corev1.Node{}
-							Expect(c.Get(ctx, types.NamespacedName{Name: obj.GetName()}, current)).To(Succeed())
-							current.Spec.Taints = append(current.Spec.Taints, corev1.Taint{
-								Key: "concurrent-controller/new-taint", Effect: corev1.TaintEffectNoSchedule,
-							})
-							Expect(c.Update(ctx, current)).To(Succeed())
-						}
-						return c.Patch(ctx, obj, patch, opts...)
-					},
-				}).
-				Build()
-
-			controller := &RuleReadinessController{
-				Client:        fc,
-				Scheme:        testScheme,
-				clientset:     fake.NewSimpleClientset(),
+			fakeClientset = fake.NewSimpleClientset()
+			readinessController = &RuleReadinessController{
+				Client:        k8sClient,
+				Scheme:        k8sClient.Scheme(),
+				clientset:     fakeClientset,
 				ruleCache:     make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule),
 				EventRecorder: record.NewFakeRecorder(10),
 			}
 
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed())
-
-			err := controller.removeTaintBySpec(ctx, node, corev1.Taint{
-				Key:    "readiness.k8s.io/test",
-				Effect: corev1.TaintEffectNoSchedule,
-			}, "test-rule")
-
-			// Should succeed after retry
-			Expect(err).NotTo(HaveOccurred())
-
-			// Verify the taint was removed and concurrent modification was preserved
-			updated := &corev1.Node{}
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed())
-			Expect(updated.Spec.Taints).To(HaveLen(2))
-
-			// Check that our taint was removed but the others remain
-			taintKeys := make(map[string]bool)
-			for _, taint := range updated.Spec.Taints {
-				taintKeys[taint.Key] = true
+			nodeReconciler = &NodeReconciler{
+				Client:     k8sClient,
+				Scheme:     k8sClient.Scheme(),
+				Controller: readinessController,
 			}
-			Expect(taintKeys).NotTo(HaveKey("readiness.k8s.io/test"))
-			Expect(taintKeys).To(HaveKey("other-controller/taint"))
-			Expect(taintKeys).To(HaveKey("concurrent-controller/new-taint"))
 
-			// Verify that the patch was attempted twice (first failed, second succeeded)
-			Expect(patchCount.Load()).To(BeNumerically(">=", 2))
-		})
+			rule = &nodereadinessiov1alpha1.NodeReadinessRule{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "aggregate-metrics-rule",
+				},
+				Spec: nodereadinessiov1alpha1.NodeReadinessRuleSpec{
+					Conditions: []nodereadinessiov1alpha1.ConditionRequirement{
+						{Type: "AggregateCondition", RequiredStatus: corev1.ConditionTrue},
+					},
+					Taint: corev1.Taint{
+						Key:    "readiness.k8s.io/aggregate-test",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+					NodeSelector: metav1.LabelSelector{
+						MatchLabels: map[string]string{"aggregate-test": "true"},
+					},
+					EnforcementMode: nodereadinessiov1alpha1.EnforcementModeContinuous,
+				},
+			}
 
-		It("should retry and succeed when addTaintBySpec encounters a conflict", func() {
-			node := &corev1.Node{
-				ObjectMeta: metav1.ObjectMeta{Name: "ol-add-conflict"},
+			node1 = &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "aggregate-node-1",
+					Labels: map[string]string{"aggregate-test": "true"},
+				},
 				Spec: corev1.NodeSpec{
 					Taints: []corev1.Taint{
-						{Key: "other-controller/taint", Effect: corev1.TaintEffectNoSchedule},
+						{Key: "readiness.k8s.io/aggregate-test", Effect: corev1.TaintEffectNoSchedule},
 					},
 				},
-			}
-
-			var patchCount atomic.Int32
-
-			// The interceptor simulates a concurrent modification on the first
-			// patch attempt, which should trigger a retry that succeeds.
-			fc := fakeclient.NewClientBuilder().
-				WithScheme(testScheme).
-				WithObjects(node).
-				WithInterceptorFuncs(interceptor.Funcs{
-					Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error {
-						if obj.GetName() == "ol-add-conflict" && patchCount.Add(1) == 1 {
-							current := &corev1.Node{}
-							Expect(c.Get(ctx, types.NamespacedName{Name: obj.GetName()}, current)).To(Succeed())
-							current.Spec.Taints = append(current.Spec.Taints, corev1.Taint{
-								Key: "concurrent-controller/new-taint", Effect: corev1.TaintEffectNoSchedule,
-							})
-							Expect(c.Update(ctx, current)).To(Succeed())
-						}
-						return c.Patch(ctx, obj, patch, opts...)
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{Type: "AggregateCondition", Status: corev1.ConditionTrue},
 					},
-				}).
-				Build()
-
-			controller := &RuleReadinessController{
-				Client:        fc,
-				Scheme:        testScheme,
-				clientset:     fake.NewSimpleClientset(),
-				ruleCache:     make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule),
-				EventRecorder: record.NewFakeRecorder(10),
-			}
-
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed())
-
-			err := controller.addTaintBySpec(ctx, node, corev1.Taint{
-				Key:    "readiness.k8s.io/test",
-				Effect: corev1.TaintEffectNoSchedule,
-			}, "test-rule")
-
-			// Should succeed after retry
-			Expect(err).NotTo(HaveOccurred())
-
-			// Verify both taints are present (ours and the concurrent one)
-			updated := &corev1.Node{}
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed())
-			Expect(updated.Spec.Taints).To(HaveLen(3))
-
-			// Check that all expected taints are present
-			taintKeys := make(map[string]bool)
-			for _, taint := range updated.Spec.Taints {
-				taintKeys[taint.Key] = true
+				},
 			}
-			Expect(taintKeys).To(HaveKey("readiness.k8s.io/test"))
-			Expect(taintKeys).To(HaveKey("other-controller/taint"))
-			Expect(taintKeys).To(HaveKey("concurrent-controller/new-taint"))
 
-			// Verify that the patch was attempted twice (first failed, second succeeded)
-			Expect(patchCount.Load()).To(BeNumerically(">=", 2))
-		})
-
-		It("should succeed when no concurrent modification occurs", func() {
-			node := &corev1.Node{
-				ObjectMeta: metav1.ObjectMeta{Name: "ol-no-conflict"},
+			node2 = &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "aggregate-node-2",
+					Labels: map[string]string{"aggregate-test": "true"},
+				},
 				Spec: corev1.NodeSpec{
 					Taints: []corev1.Taint{
-						{Key: "readiness.k8s.io/test", Effect: corev1.TaintEffectNoSchedule},
-						{Key: "other/taint", Effect: corev1.TaintEffectNoSchedule},
+						{Key: "readiness.k8s.io/aggregate-test", Effect: corev1.TaintEffectNoSchedule},
+					},
+				},
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{Type: "AggregateCondition", Status: corev1.ConditionFalse},
 					},
 				},
 			}
 
-			fc := fakeclient.NewClientBuilder().
-				WithScheme(testScheme).
-				WithObjects(node).
-				Build()
-
-			controller := &RuleReadinessController{
-				Client:        fc,
-				Scheme:        testScheme,
-				clientset:     fake.NewSimpleClientset(),
-				ruleCache:     make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule),
-				EventRecorder: record.NewFakeRecorder(10),
-			}
+			metrics.CleanupRuleMetrics(rule.Name)
+		})
 
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed())
+		JustBeforeEach(func() {
+			Expect(k8sClient.Create(ctx, node1)).To(Succeed())
+			Expect(k8sClient.Create(ctx, node2)).To(Succeed())
+			Expect(k8sClient.Create(ctx, rule)).To(Succeed())
+			readinessController.updateRuleCache(ctx, rule)
+		})
 
-			err := controller.removeTaintBySpec(ctx, node, corev1.Taint{
-				Key:    "readiness.k8s.io/test",
-				Effect: corev1.TaintEffectNoSchedule,
-			}, "test-rule")
-			Expect(err).NotTo(HaveOccurred())
+		AfterEach(func() {
+			metrics.CleanupRuleMetrics(rule.Name)
 
-			updated := &corev1.Node{}
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, updated)).To(Succeed())
-			Expect(updated.Spec.Taints).To(HaveLen(1))
-			Expect(updated.Spec.Taints[0].Key).To(Equal("other/taint"))
-		})
+			_ = k8sClient.Delete(ctx, node1)
+			_ = k8sClient.Delete(ctx, node2)
 
-		It("should skip patch when removing a taint that does not exist", func() {
-			node := &corev1.Node{
-				ObjectMeta: metav1.ObjectMeta{Name: "ol-noop"},
-				Spec: corev1.NodeSpec{
-					Taints: []corev1.Taint{
-						{Key: "other/taint", Effect: corev1.TaintEffectNoSchedule},
-					},
-				},
+			updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{}
+			if err := k8sClient.Get(ctx, types.NamespacedName{Name: rule.Name}, updatedRule); err == nil {
+				updatedRule.Finalizers = nil
+				_ = k8sClient.Update(ctx, updatedRule)
+				_ = k8sClient.Delete(ctx, updatedRule)
 			}
 
-			var patchCalled atomic.Bool
-
-			fc := fakeclient.NewClientBuilder().
-				WithScheme(testScheme).
-				WithObjects(node).
-				WithInterceptorFuncs(interceptor.Funcs{
-					Patch: func(ctx context.Context, c client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error {
-						if obj.GetName() == "ol-noop" {
-							patchCalled.Store(true)
-						}
-						return c.Patch(ctx, obj, patch, opts...)
-					},
-				}).
-				Build()
+			Eventually(func() bool {
+				err := k8sClient.Get(ctx, types.NamespacedName{Name: rule.Name}, &nodereadinessiov1alpha1.NodeReadinessRule{})
+				return apierrors.IsNotFound(err)
+			}, time.Second*10).Should(BeTrue())
 
-			controller := &RuleReadinessController{
-				Client:        fc,
-				Scheme:        testScheme,
-				clientset:     fake.NewSimpleClientset(),
-				ruleCache:     make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule),
-				EventRecorder: record.NewFakeRecorder(10),
-			}
+			readinessController.removeRuleFromCache(ctx, rule.Name)
+		})
 
-			Expect(fc.Get(ctx, types.NamespacedName{Name: node.Name}, node)).To(Succeed())
+		It("should refresh NodesByState using aggregate rule status during node reconciliation", func() {
+			_, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: node1.Name}})
+			Expect(err).NotTo(HaveOccurred())
 
-			err := controller.removeTaintBySpec(ctx, node, corev1.Taint{
-				Key:    "readiness.k8s.io/nonexistent",
-				Effect: corev1.TaintEffectNoSchedule,
-			}, "test-rule")
+			_, err = nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: node2.Name}})
 			Expect(err).NotTo(HaveOccurred())
-			Expect(patchCalled.Load()).To(BeFalse(),
-				"Patch should not be called when taint removal is a no-op")
+
+			Eventually(func() float64 {
+				return readGaugeValue(rule.Name, "ready")
+			}, time.Second*5).Should(Equal(float64(1)))
+
+			Eventually(func() float64 {
+				return readGaugeValue(rule.Name, "not_ready")
+			}, time.Second*5).Should(Equal(float64(1)))
+
+			Consistently(func() float64 {
+				return readGaugeValue(rule.Name, "bootstrapping")
+			}, time.Second).Should(Equal(float64(0)))
 		})
 	})
 })
diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go
index 99347e9..1a6559d 100644
--- a/internal/controller/nodereadinessrule_controller.go
+++ b/internal/controller/nodereadinessrule_controller.go
@@ -46,6 +46,11 @@ import (
 const (
 	// finalizerName is the finalizer added to NodeReadinessRule to ensure cleanup.
 	finalizerName = "readiness.node.x-k8s.io/cleanup-taints"
+
+	// maxLatencyRecordingWindow is the maximum time window after a condition transition
+	// during which we record reconciliation latency metrics. This prevents skewing metrics
+	// when applying new rules to old, existing nodes.
+	maxLatencyRecordingWindow = 5 * time.Minute
 )
 
 // RuleReadinessController manages node taints based on readiness rules.
@@ -190,6 +195,10 @@ func (r *RuleReconciler) reconcileDelete(ctx context.Context, rule *readinessv1a
 	log.V(3).Info("Removing the rule from cache")
 	r.Controller.removeRuleFromCache(ctx, rule.Name)
 
+	// Clean up Prometheus metrics for this rule to prevent ghost metrics
+	log.V(3).Info("Cleaning up Prometheus metrics for deleted rule", "rule", rule.Name)
+	metrics.CleanupRuleMetrics(rule.Name)
+
 	log.V(3).Info("Removing the finalizer from the rule")
 	patch := client.MergeFrom(rule.DeepCopy())
 	controllerutil.RemoveFinalizer(rule, finalizerName)
@@ -268,6 +277,7 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru
 	log.Info("Processing all nodes for rule", "rule", rule.Name, "totalNodes", len(nodeList.Items))
 
 	var appliedNodes []string
+
 	for _, node := range nodeList.Items {
 		if r.ruleAppliesTo(ctx, rule, &node) {
 			appliedNodes = append(appliedNodes, node.Name)
@@ -289,20 +299,30 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru
 		rule.Status.DryRunResults = readinessv1alpha1.DryRunResults{}
 	}
 
+	r.updateNodesByStateMetrics(ctx, rule)
+
+	// Record rule-level reconciliation timestamp
+	metrics.RuleLastReconciliationTime.WithLabelValues(rule.Name).Set(float64(time.Now().Unix()))
+
 	log.Info("Completed processing nodes for rule", "rule", rule.Name, "processedCount", len(appliedNodes))
 	return nil
 }
 
 // evaluateRuleForNode evaluates a single rule against a single node.
 func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule, node *corev1.Node) error {
-	timer := prometheus.NewTimer(metrics.EvaluationDuration)
-	defer timer.ObserveDuration()
+	// Track evaluation duration per rule
+	evalTimer := prometheus.NewTimer(metrics.EvaluationDuration.WithLabelValues(rule.Name))
+	defer evalTimer.ObserveDuration()
+
 	log := ctrl.LoggerFrom(ctx)
 
 	// Evaluate all conditions (ALL logic)
 	allConditionsSatisfied := true
 	conditionResults := make([]readinessv1alpha1.ConditionEvaluationResult, 0, len(rule.Spec.Conditions))
 
+	// Track the most recent condition transition time for latency calculation
+	var mostRecentTransitionTime time.Time
+
 	for _, condReq := range rule.Spec.Conditions {
 		currentStatus := r.getConditionStatus(node, condReq.Type)
 		satisfied := currentStatus == condReq.RequiredStatus
@@ -317,11 +337,25 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
 			RequiredStatus: condReq.RequiredStatus,
 		})
 
+		// Track the most recent transition time across all conditions for latency calculation
+		for _, condition := range node.Status.Conditions {
+			if string(condition.Type) == condReq.Type {
+				if condition.LastTransitionTime.After(mostRecentTransitionTime) {
+					mostRecentTransitionTime = condition.LastTransitionTime.Time
+				}
+				break
+			}
+		}
+
 		log.V(1).Info("Condition evaluation", "node", node.Name, "rule", rule.Name,
 			"conditionType", condReq.Type, "current", currentStatus, "required", condReq.RequiredStatus,
-			"satisfied", satisfied)
+			"satisfied", satisfied, "lastTransitionTime", mostRecentTransitionTime)
 	}
 
+	// Log aggregate condition satisfaction status
+	log.Info("Conditions evaluated", "node", node.Name, "rule", rule.Name,
+		"allConditionsSatisfied", allConditionsSatisfied, "conditionCount", len(rule.Spec.Conditions))
+
 	// Determine taint action
 	shouldRemoveTaint := allConditionsSatisfied
 	currentlyHasTaint := r.hasTaintBySpec(node, rule.Spec.Taint)
@@ -341,11 +375,32 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
 			metrics.Failures.WithLabelValues(rule.Name, "RemoveTaintError").Inc()
 			return fmt.Errorf("failed to remove taint: %w", err)
 		}
+
+		// Only record latency if the condition transitioned recently (e.g., within the last 5 minutes).
+		// This prevents skewing metrics when applying new rules to old, existing nodes.
+		if !mostRecentTransitionTime.IsZero() && time.Since(mostRecentTransitionTime) < maxLatencyRecordingWindow {
+			latency := time.Since(mostRecentTransitionTime).Seconds()
+			metrics.ReconciliationLatency.WithLabelValues(rule.Name, "remove_taint").Observe(latency)
+			log.V(1).Info("Taint removal latency", "node", node.Name, "rule", rule.Name,
+				"latency", fmt.Sprintf("%.3fs", latency),
+				"conditionTransitionTime", mostRecentTransitionTime.Format(time.RFC3339))
+		}
 		metrics.TaintOperations.WithLabelValues(rule.Name, "remove").Inc()
 
 		// Mark bootstrap completed if bootstrap-only mode
 		if rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly {
 			r.markBootstrapCompleted(ctx, node.Name, rule.Name)
+
+			// Calculate bootstrap duration from node creation to taint removal
+			// Use the node's creation timestamp directly.
+			bootstrapDuration := time.Since(node.CreationTimestamp.Time).Seconds()
+			metrics.BootstrapDuration.WithLabelValues(rule.Name).Observe(bootstrapDuration)
+
+			log.Info("Bootstrap completed",
+				"node", node.Name,
+				"rule", rule.Name,
+				"duration", fmt.Sprintf("%.2fs", bootstrapDuration),
+				"nodeCreated", node.CreationTimestamp.Format(time.RFC3339))
 		}
 
 	case !shouldRemoveTaint && !currentlyHasTaint:
@@ -355,6 +410,16 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
 			metrics.Failures.WithLabelValues(rule.Name, "AddTaintError").Inc()
 			return fmt.Errorf("failed to add taint: %w", err)
 		}
+
+		// Calculate end-to-end latency from condition change to taint addition completion
+		// Only record if we have a valid condition transition time
+		if !mostRecentTransitionTime.IsZero() {
+			latency := time.Since(mostRecentTransitionTime).Seconds()
+			metrics.ReconciliationLatency.WithLabelValues(rule.Name, "add_taint").Observe(latency)
+			log.V(1).Info("Taint addition latency", "node", node.Name, "rule", rule.Name,
+				"latency", fmt.Sprintf("%.3fs", latency),
+				"conditionTransitionTime", mostRecentTransitionTime.Format(time.RFC3339))
+		}
 		metrics.TaintOperations.WithLabelValues(rule.Name, "add").Inc()
 
 	case !shouldRemoveTaint && currentlyHasTaint:
@@ -381,6 +446,12 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule
 	// Update evaluation status
 	r.updateNodeEvaluationStatus(rule, node.Name, conditionResults, taintStatus)
 
+	// Log reconciliation completion with per-node details
+	now := time.Now()
+	log.Info("Node reconciliation completed", "node", node.Name, "rule", rule.Name,
+		"taintStatus", taintStatus, "allConditionsSatisfied", allConditionsSatisfied,
+		"timestamp", now.Unix())
+
 	return nil
 }
 
@@ -706,3 +777,38 @@ func (r *RuleReadinessController) getPreviousNodeEvaluation(rule *readinessv1alp
 	}
 	return nil
 }
+
+func (r *RuleReadinessController) updateNodesByStateMetrics(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule) {
+	nodeStates := map[string]int{
+		"ready":         0,
+		"not_ready":     0,
+		"bootstrapping": 0,
+	}
+	// Tally each node recorded in rule.Status.NodeEvaluations into exactly one state.
+	for _, evaluation := range rule.Status.NodeEvaluations {
+		allConditionsSatisfied := true
+		for _, conditionResult := range evaluation.ConditionResults {
+			if conditionResult.CurrentStatus != conditionResult.RequiredStatus {
+				allConditionsSatisfied = false
+				break
+			}
+		}
+		// With no condition results the flag stays true, so such a node counts as satisfied.
+		hasTaint := evaluation.TaintStatus == readinessv1alpha1.TaintStatusPresent
+
+		switch {
+		case allConditionsSatisfied && !hasTaint:
+			nodeStates["ready"]++
+		case !allConditionsSatisfied && hasTaint &&
+			rule.Spec.EnforcementMode == readinessv1alpha1.EnforcementModeBootstrapOnly &&
+			!r.isBootstrapCompleted(ctx, evaluation.NodeName, rule.Name):
+			nodeStates["bootstrapping"]++
+		default:
+			nodeStates["not_ready"]++
+		}
+	}
+	// Every key of nodeStates is written, zeros included, so counts from a previous call are overwritten.
+	for state, count := range nodeStates {
+		metrics.NodesByState.WithLabelValues(rule.Name, state).Set(float64(count))
+	}
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index bb0d906..7dc41c2 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -22,7 +22,7 @@ import (
 )
 
 var (
-	// RulesTotal tracks the number of NodeReadinessRules .
+	// RulesTotal tracks the number of NodeReadinessRules.
 	RulesTotal = prometheus.NewGauge(
 		prometheus.GaugeOpts{
 			Name: "node_readiness_rules_total",
@@ -40,12 +40,13 @@ var (
 	)
 
 	// EvaluationDuration tracks the duration of rule evaluations.
-	EvaluationDuration = prometheus.NewHistogram(
+	EvaluationDuration = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Name:    "node_readiness_evaluation_duration_seconds",
 			Help:    "Duration of rule evaluations",
 			Buckets: prometheus.DefBuckets,
 		},
+		[]string{"rule"},
 	)
 
 	// Failures tracks the number of operational failures.
@@ -65,13 +66,78 @@ var (
 		},
 		[]string{"rule"},
 	)
+
+	// ReconciliationLatency tracks end-to-end latency from condition change to taint operation.
+	// This measures how quickly the controller responds to node condition changes.
+	// Note: Latency is measured from the LastTransitionTime of the node's conditions; labels are limited to rule and operation to avoid high cardinality.
+	ReconciliationLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "node_readiness_reconciliation_latency_seconds",
+			Help:    "End-to-end latency from node condition change to taint operation completion",
+			Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300}, // 10ms to 5min
+		},
+		[]string{"rule", "operation"}, // operation: add_taint, remove_taint
+	)
+
+	// BootstrapDuration tracks the time from node creation to bootstrap completion (taint removal).
+	// This measures the end-to-end bootstrap time for nodes in bootstrap-only mode.
+	// The start time is the node's CreationTimestamp; it is not exported as a separate metric.
+	BootstrapDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "node_readiness_bootstrap_duration_seconds",
+			Help:    "Time from node creation to bootstrap completion (taint removal) for bootstrap-only rules",
+			Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600, 1200}, // 1s to 20min
+		},
+		[]string{"rule"},
+	)
+
+	// NodesByState tracks nodes in each readiness state per rule.
+	// Provides a quick overview of cluster health.
+	NodesByState = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "node_readiness_nodes_by_state",
+			Help: "Number of nodes in each readiness state per rule",
+		},
+		[]string{"rule", "state"}, // state: ready, not_ready, bootstrapping
+	)
+
+	// RuleLastReconciliationTime tracks when a rule was last reconciled.
+	// This provides rule-level visibility for admins to detect stuck rules.
+	RuleLastReconciliationTime = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "node_readiness_rule_last_reconciliation_timestamp_seconds",
+			Help: "Unix timestamp of the last rule reconciliation",
+		},
+		[]string{"rule"},
+	)
 )
 
 func init() {
-	// Register custom metrics with the global prometheus registry
 	metrics.Registry.MustRegister(RulesTotal)
 	metrics.Registry.MustRegister(TaintOperations)
 	metrics.Registry.MustRegister(EvaluationDuration)
 	metrics.Registry.MustRegister(Failures)
 	metrics.Registry.MustRegister(BootstrapCompleted)
+	metrics.Registry.MustRegister(ReconciliationLatency)
+	metrics.Registry.MustRegister(BootstrapDuration)
+	metrics.Registry.MustRegister(NodesByState)
+	metrics.Registry.MustRegister(RuleLastReconciliationTime)
+}
+
+// CleanupRuleMetrics deletes the Prometheus series labelled with ruleName from the per-rule vectors below.
+// The rule deletion path calls it so that "ghost" series for a deleted rule stop being exported.
+func CleanupRuleMetrics(ruleName string) {
+	ruleLabel := prometheus.Labels{"rule": ruleName}
+
+	// Delete() matches on the complete label set, so it only fits vectors whose sole label is "rule".
+	BootstrapCompleted.Delete(ruleLabel)
+	RuleLastReconciliationTime.Delete(ruleLabel)
+
+	// DeletePartialMatch() removes every series whose "rule" label matches, whatever other labels it carries.
+	TaintOperations.DeletePartialMatch(ruleLabel)
+	EvaluationDuration.DeletePartialMatch(ruleLabel)
+	Failures.DeletePartialMatch(ruleLabel)
+	ReconciliationLatency.DeletePartialMatch(ruleLabel)
+	BootstrapDuration.DeletePartialMatch(ruleLabel)
+	NodesByState.DeletePartialMatch(ruleLabel)
+}