From ca8a8b93fc02785ef3cfdb09e5f4271b34a5db03 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 13:46:21 -0400 Subject: [PATCH 01/20] feat(stack): allow controller metrics scraping --- charts/kubex-automation-stack/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index dfa83ea..f42e72f 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -216,7 +216,7 @@ prometheus: target_label: namespace - source_labels: [__meta_kubernetes_endpointslice_name] action: keep - regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' + regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|kubex-webhook-service.*|.*kubex-automation-engine.*metrics-service.*).*' - source_labels: [__meta_kubernetes_service_name] action: replace target_label: service From f18919b64ea34f6b91b9ad5055ecd7d02ac94842 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 14:16:46 -0400 Subject: [PATCH 02/20] fix(stack): drop webhook from metrics allowlist --- charts/kubex-automation-stack/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index f42e72f..1de97f3 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -216,7 +216,7 @@ prometheus: target_label: namespace - source_labels: [__meta_kubernetes_endpointslice_name] action: keep - regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|kubex-webhook-service.*|.*kubex-automation-engine.*metrics-service.*).*' + regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|.*kubex-automation-engine.*metrics-service.*).*' - source_labels: [__meta_kubernetes_service_name] action: replace target_label: service From 48d7b49a20dd15aaca02326465dffeccacf0b73d Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 14:39:23 -0400 Subject: [PATCH 03/20] fix(stack): tighten controller metrics allowlist --- charts/kubex-automation-stack/Chart.yaml | 2 +- charts/kubex-automation-stack/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kubex-automation-stack/Chart.yaml b/charts/kubex-automation-stack/Chart.yaml index 14176bb..eb743a4 100644 --- a/charts/kubex-automation-stack/Chart.yaml +++ b/charts/kubex-automation-stack/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: Kubex Collection Stack name: kubex-automation-stack -version: 1.0.8 +version: 1.0.9 type: application icon: https://www.kubex.ai/wp-content/uploads/kubex-by-densify-logo.png dependencies: diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 1de97f3..e467926 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -216,7 +216,7 @@ prometheus: target_label: namespace - source_labels: [__meta_kubernetes_endpointslice_name] action: keep - regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|.*kubex-automation-engine.*metrics-service.*).*' + regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|.*kubex-automation-engine.*metrics-service).*' - source_labels: [__meta_kubernetes_service_name] action: replace target_label: service From 7ec4ec8952b794313818dcf9ba2487535d8a960e Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 15:04:31 -0400 Subject: [PATCH 04/20] fix(stack): keep controller metrics samples --- charts/kubex-automation-stack/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index e467926..e31a0ba 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -225,7 +225,7 @@ prometheus: target_label: node metric_relabel_configs: - source_labels: [__name__] - regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' + regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|automation_controller_controller_action_(total|duration_seconds(?:_bucket|_sum|_count)?)|controller_runtime_.*|workqueue_.*|go_.*|process_.*|rest_client_.*|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' action: keep ################################################################# From faea326fa3917450728b01580542cd58937d5c13 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 15:15:17 -0400 Subject: [PATCH 05/20] fix(stack): dedicate controller metrics scrape --- charts/kubex-automation-stack/Chart.yaml | 2 +- charts/kubex-automation-stack/values.yaml | 40 +++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/charts/kubex-automation-stack/Chart.yaml b/charts/kubex-automation-stack/Chart.yaml index eb743a4..5816939 100644 --- a/charts/kubex-automation-stack/Chart.yaml +++ b/charts/kubex-automation-stack/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: Kubex Collection Stack name: kubex-automation-stack -version: 1.0.9 +version: 1.0.10 type: application icon: https://www.kubex.ai/wp-content/uploads/kubex-by-densify-logo.png dependencies: diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index e31a0ba..df7dc96 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -216,7 +216,7 @@ prometheus: target_label: namespace - source_labels: [__meta_kubernetes_endpointslice_name] action: keep - regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics|.*kubex-automation-engine.*metrics-service).*' + regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' - source_labels: [__meta_kubernetes_service_name] action: replace target_label: service @@ -225,9 +225,45 @@ prometheus: target_label: node metric_relabel_configs: - source_labels: [__name__] - regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|automation_controller_controller_action_(total|duration_seconds(?:_bucket|_sum|_count)?)|controller_runtime_.*|workqueue_.*|go_.*|process_.*|rest_client_.*|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' + regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' action: keep + - job_name: 'kubex-automation-engine-metrics-endpointslice' + honor_labels: true + kubernetes_sd_configs: + - role: endpointslice + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_endpointslice_name] + action: keep + regex: '.*kubex-automation-engine.*metrics-service.*' + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + ################################################################# # Ephemeral Storage Metrics Exporter # Collects and exposes ephemeral storage metrics via a DaemonSet From d3da635d768383b73834141bc5aa3bbbd876e7fe Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 15:23:46 -0400 Subject: [PATCH 06/20] fix(stack): make controller metrics scrape explicit http --- charts/kubex-automation-stack/values.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index df7dc96..f06832a 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -230,13 +230,11 @@ prometheus: - job_name: 'kubex-automation-engine-metrics-endpointslice' honor_labels: true + scheme: http kubernetes_sd_configs: - role: endpointslice relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) + # The controller chart exposes metrics over HTTP on port 8080. - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ From d5d2a7a2eaf92a1905e77157b03d475be0d7be28 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 15:24:34 -0400 Subject: [PATCH 07/20] fix(stack): pin controller metrics scrape to http --- charts/kubex-automation-stack/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/Chart.yaml b/charts/kubex-automation-stack/Chart.yaml index 5816939..bf627bb 100644 --- a/charts/kubex-automation-stack/Chart.yaml +++ b/charts/kubex-automation-stack/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: Kubex Collection Stack name: kubex-automation-stack -version: 1.0.10 +version: 1.0.11 type: application icon: https://www.kubex.ai/wp-content/uploads/kubex-by-densify-logo.png dependencies: From 4b9909be57261f03029d4ca18d2672f843c72164 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 15:59:36 -0400 Subject: [PATCH 08/20] chore(stack): set chart version to 1.0.9 --- charts/kubex-automation-stack/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/Chart.yaml b/charts/kubex-automation-stack/Chart.yaml index bf627bb..eb743a4 100644 --- a/charts/kubex-automation-stack/Chart.yaml +++ b/charts/kubex-automation-stack/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: Kubex Collection Stack name: kubex-automation-stack -version: 1.0.11 +version: 1.0.9 type: application icon: https://www.kubex.ai/wp-content/uploads/kubex-by-densify-logo.png dependencies: From 74a2275123ce263aa386ad6b90fc43ce2c64520c Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 16:11:37 -0400 Subject: [PATCH 09/20] fix(stack): deduplicate shared endpointslice relabels --- charts/kubex-automation-stack/values.yaml | 61 ++++++++--------------- 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index f06832a..4ed311e 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -193,36 +193,35 @@ prometheus: kubernetes_sd_configs: - role: endpointslice relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + - &kubexEndpointsliceMetricsPath + source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + - &kubexEndpointsliceAddress + source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: (.+?)(?::\d+)?;(\d+) replacement: $1:$2 - - action: labelmap + - &kubexEndpointsliceParamLabels + action: labelmap regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) replacement: __param_$1 - - action: labelmap + - &kubexEndpointsliceServiceLabels + action: labelmap regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] + - &kubexEndpointsliceNamespace + source_labels: [__meta_kubernetes_namespace] action: replace target_label: namespace + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: service - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node metric_relabel_configs: - source_labels: [__name__] regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' @@ -234,33 +233,15 @@ prometheus: kubernetes_sd_configs: - role: endpointslice relabel_configs: - # The controller chart exposes metrics over HTTP on port 8080. - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace + - *kubexEndpointsliceMetricsPath + - *kubexEndpointsliceAddress + - *kubexEndpointsliceParamLabels + - *kubexEndpointsliceServiceLabels + - *kubexEndpointsliceNamespace - source_labels: [__meta_kubernetes_endpointslice_name] action: keep - regex: '.*kubex-automation-engine.*metrics-service.*' - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: service - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node + regex: '.+-kubex-automation-engine-metrics-service' + # Store the controller-runtime, Go, process, and workqueue metrics as-is. ################################################################# # Ephemeral Storage Metrics Exporter From f5d77f1d005895e653a97a98d2c45940332ba6e6 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 16:27:38 -0400 Subject: [PATCH 10/20] fix(stack): restore shared scrape labels --- charts/kubex-automation-stack/values.yaml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 4ed311e..214c56d 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -193,6 +193,11 @@ prometheus: kubernetes_sd_configs: - role: endpointslice relabel_configs: + # Scheme annotation overrides the job's default scrape protocol when a target serves HTTPS. + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) - &kubexEndpointsliceMetricsPath source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace @@ -215,10 +220,14 @@ prometheus: source_labels: [__meta_kubernetes_namespace] action: replace target_label: namespace - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + - &kubexEndpointsliceServiceName + source_labels: [__meta_kubernetes_service_name] action: replace - target_label: __scheme__ - regex: (https?) + target_label: service + - &kubexEndpointsliceNodeName + source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' @@ -229,6 +238,7 @@ prometheus: - job_name: 'kubex-automation-engine-metrics-endpointslice' honor_labels: true + # The controller chart exposes metrics on port 8080 over plain HTTP. scheme: http kubernetes_sd_configs: - role: endpointslice @@ -238,6 +248,9 @@ prometheus: - *kubexEndpointsliceParamLabels - *kubexEndpointsliceServiceLabels - *kubexEndpointsliceNamespace + - *kubexEndpointsliceServiceName + - *kubexEndpointsliceNodeName + # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '.+-kubex-automation-engine-metrics-service' From c602a899a364a6b6f06f917bef95459cfff9957a Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 16:48:44 -0400 Subject: [PATCH 11/20] docs(stack): clarify controller metrics scrape intent --- charts/kubex-automation-stack/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 214c56d..b008fcd 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -238,7 +238,7 @@ prometheus: - job_name: 'kubex-automation-engine-metrics-endpointslice' honor_labels: true - # The controller chart exposes metrics on port 8080 over plain HTTP. + # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. scheme: http kubernetes_sd_configs: - role: endpointslice @@ -254,7 +254,7 @@ prometheus: - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '.+-kubex-automation-engine-metrics-service' - # Store the controller-runtime, Go, process, and workqueue metrics as-is. + # Store controller_runtime_*, go_*, process_*, workqueue_*, rest_client_*, and automation_controller_* metrics as-is. ################################################################# # Ephemeral Storage Metrics Exporter From d94a1d719bb796b53a69b9ac2932cfd6a2a339de Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Wed, 6 May 2026 16:55:06 -0400 Subject: [PATCH 12/20] docs(stack): clarify controller metrics scrape behavior --- charts/kubex-automation-stack/values.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index b008fcd..d62bfa5 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -239,6 +239,7 @@ prometheus: - job_name: 'kubex-automation-engine-metrics-endpointslice' honor_labels: true # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. + # No bearer token is required for this /metrics endpoint. scheme: http kubernetes_sd_configs: - role: endpointslice @@ -250,11 +251,14 @@ prometheus: - *kubexEndpointsliceNamespace - *kubexEndpointsliceServiceName - *kubexEndpointsliceNodeName - # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). + # Matches any Helm release name prefix (for example 'controller-' or 'prod-') + # and the EndpointSlice-generated suffix appended after the service name. - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '.+-kubex-automation-engine-metrics-service' - # Store controller_runtime_*, go_*, process_*, workqueue_*, rest_client_*, and automation_controller_* metrics as-is. + # Stores all scraped metrics as-is (no metric_relabel_configs filter). + # Expected families: controller_runtime_*, go_*, process_*, workqueue_*, rest_client_*, automation_controller_* + # Add a metric_relabel_configs allowlist here if cardinality becomes a concern. ################################################################# # Ephemeral Storage Metrics Exporter From 6e4c87cb945c309f47cd0fddbbbda83c88eb4a1b Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 09:49:22 -0400 Subject: [PATCH 13/20] fix(stack): scope controller metrics scrape by service --- charts/kubex-automation-stack/values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index d62bfa5..096b466 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -251,9 +251,9 @@ prometheus: - *kubexEndpointsliceNamespace - *kubexEndpointsliceServiceName - *kubexEndpointsliceNodeName - # Matches any Helm release name prefix (for example 'controller-' or 'prod-') - # and the EndpointSlice-generated suffix appended after the service name. - - source_labels: [__meta_kubernetes_endpointslice_name] + # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). + # This intentionally supports scraping multiple controller releases when their service names share this suffix. + - source_labels: [__meta_kubernetes_service_name] action: keep regex: '.+-kubex-automation-engine-metrics-service' # Stores all scraped metrics as-is (no metric_relabel_configs filter). From e72a399b1c4a43b8a5ab9bffcfa899f6d80d4644 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 12:40:41 -0400 Subject: [PATCH 14/20] fix(stack): tighten controller metrics scraping --- charts/kubex-automation-stack/values.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 096b466..df81b7a 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -237,8 +237,8 @@ prometheus: action: keep - job_name: 'kubex-automation-engine-metrics-endpointslice' - honor_labels: true # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. + # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations. # No bearer token is required for this /metrics endpoint. scheme: http kubernetes_sd_configs: @@ -249,13 +249,13 @@ prometheus: - *kubexEndpointsliceParamLabels - *kubexEndpointsliceServiceLabels - *kubexEndpointsliceNamespace - - *kubexEndpointsliceServiceName - - *kubexEndpointsliceNodeName # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). # This intentionally supports scraping multiple controller releases when their service names share this suffix. - source_labels: [__meta_kubernetes_service_name] action: keep - regex: '.+-kubex-automation-engine-metrics-service' + regex: '.+-kubex-automation-engine-metrics-service$' + - *kubexEndpointsliceServiceName + - *kubexEndpointsliceNodeName # Stores all scraped metrics as-is (no metric_relabel_configs filter). # Expected families: controller_runtime_*, go_*, process_*, workqueue_*, rest_client_*, automation_controller_* # Add a metric_relabel_configs allowlist here if cardinality becomes a concern. From 39fafc4388528840a910f1f6a4a5acf1c6a326a1 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 13:33:58 -0400 Subject: [PATCH 15/20] fix(stack): enforce controller metrics port explicitly --- charts/kubex-automation-stack/values.yaml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index df81b7a..686b0cd 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -220,6 +220,9 @@ prometheus: source_labels: [__meta_kubernetes_namespace] action: replace target_label: namespace + - source_labels: [__meta_kubernetes_endpointslice_name] + action: keep + regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' - &kubexEndpointsliceServiceName source_labels: [__meta_kubernetes_service_name] action: replace @@ -228,27 +231,29 @@ prometheus: source_labels: [__meta_kubernetes_pod_node_name] action: replace target_label: node - - source_labels: [__meta_kubernetes_endpointslice_name] - action: keep - regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' metric_relabel_configs: - source_labels: [__name__] regex: '^(DCGM_FI_(DEV_(FB_(FREE|USED)|GPU_UTIL|POWER_USAGE)|PROF_(DRAM_ACTIVE|GR_ENGINE_ACTIVE|PIPE_TENSOR_ACTIVE))|ephemeral_storage_.*|kube_(cronjob_(created|info|labels|next_schedule_time|status_(active|last_schedule_time))|daemonset_(created|labels|status_number_available)|deployment_(created|labels|metadata_generation|spec_strategy_rollingupdate_max_(surge|unavailable))|horizontalpodautoscaler_(info|labels|spec_(max_replicas|min_replicas|target_metric)|status_(condition|current_replicas|target_metric))|job_(created|info|labels|owner|spec_(completions|parallelism)|status_(active|completion_time|start_time))|namespace_(annotations|labels)|node_(info|labels|role|spec_taint|status_(allocatable|capacity))|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_(exitcode|timestamp)|restarts_total|terminated(?:_reason)?))|created|info|labels|owner|status_(phase|qos_class))|replicaset_(created|labels|owner|spec_replicas)|replicationcontroller_(created|spec_replicas)|resourcequota(?:_created)?|statefulset_(created|labels|replicas))|node_(cpu_(core_throttles_total|seconds_total)|disk_(read_bytes_total|reads_completed_total|writes_completed_total|written_bytes_total)|memory_(Buffers_bytes|Cached_bytes|MemFree_bytes|MemTotal_bytes|SReclaimable_bytes)|network_(receive_(bytes_total|packets_total)|speed_bytes|transmit_(bytes_total|packets_total))|vmstat_oom_kill)|openshift_clusterresourcequota_(created|labels|namespace_usage|selector|usage))$' action: keep - job_name: 'kubex-automation-engine-metrics-endpointslice' - # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. - # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations. + # The controller chart exposes unauthenticated metrics on port 8443 over plain HTTP. + # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations or bearer tokens. + # The target is scraped by service-name suffix so multiple controller releases can be collected when needed. # No bearer token is required for this /metrics endpoint. scheme: http kubernetes_sd_configs: - role: endpointslice relabel_configs: - *kubexEndpointsliceMetricsPath - - *kubexEndpointsliceAddress - *kubexEndpointsliceParamLabels - *kubexEndpointsliceServiceLabels - *kubexEndpointsliceNamespace + - source_labels: [__address__] + action: replace + target_label: __address__ + regex: '(.+?)(?::\d+)?' + replacement: '$1:8443' # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). # This intentionally supports scraping multiple controller releases when their service names share this suffix. - source_labels: [__meta_kubernetes_service_name] From c4e9be12e4d5d04e2ac9cc30e5ab6f4acec6930a Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 13:34:22 -0400 Subject: [PATCH 16/20] fix(stack): make controller scrape path explicit --- charts/kubex-automation-stack/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 686b0cd..ac299ad 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -245,8 +245,8 @@ prometheus: kubernetes_sd_configs: - role: endpointslice relabel_configs: - - *kubexEndpointsliceMetricsPath - - *kubexEndpointsliceParamLabels + - target_label: __metrics_path__ + replacement: /metrics - *kubexEndpointsliceServiceLabels - *kubexEndpointsliceNamespace - source_labels: [__address__] From 288f3c00339cf7f810dcd980255914ae132b38e6 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 14:15:15 -0400 Subject: [PATCH 17/20] fix(stack): align controller scrape with validated defaults --- charts/kubex-automation-stack/values.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index ac299ad..072e88a 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -237,7 +237,7 @@ prometheus: action: keep - job_name: 'kubex-automation-engine-metrics-endpointslice' - # The controller chart exposes unauthenticated metrics on port 8443 over plain HTTP. + # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations or bearer tokens. # The target is scraped by service-name suffix so multiple controller releases can be collected when needed. # No bearer token is required for this /metrics endpoint. @@ -245,6 +245,7 @@ prometheus: kubernetes_sd_configs: - role: endpointslice relabel_configs: + # This job uses a fixed controller metrics path/port instead of the shared annotation-driven overrides. - target_label: __metrics_path__ replacement: /metrics - *kubexEndpointsliceServiceLabels @@ -253,7 +254,7 @@ prometheus: action: replace target_label: __address__ regex: '(.+?)(?::\d+)?' - replacement: '$1:8443' + replacement: '$1:8080' # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). # This intentionally supports scraping multiple controller releases when their service names share this suffix. - source_labels: [__meta_kubernetes_service_name] From a21e70bdb524f8471b95d907a0a78791ba571207 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Thu, 7 May 2026 14:15:41 -0400 Subject: [PATCH 18/20] docs(stack): clarify controller scrape contract --- charts/kubex-automation-stack/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 072e88a..d989ffd 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -239,6 +239,8 @@ prometheus: - job_name: 'kubex-automation-engine-metrics-endpointslice' # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations or bearer tokens. + # The job is intentionally fixed to the validated controller metrics defaults. + # The shared path/port/param anchors stay with the shared job; this job keeps its own explicit endpoint. # The target is scraped by service-name suffix so multiple controller releases can be collected when needed. # No bearer token is required for this /metrics endpoint. scheme: http From dbf848dbfef04d20a0facfebd4aadefb646b6c03 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Mon, 11 May 2026 14:11:03 -0400 Subject: [PATCH 19/20] fix(stack): simplify controller metrics relabeling --- charts/kubex-automation-stack/values.yaml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index d989ffd..6036ac1 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -198,19 +198,16 @@ prometheus: action: replace target_label: __scheme__ regex: (https?) - - &kubexEndpointsliceMetricsPath - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - - &kubexEndpointsliceAddress - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: (.+?)(?::\d+)?;(\d+) replacement: $1:$2 - - &kubexEndpointsliceParamLabels - action: labelmap + - action: labelmap regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) replacement: __param_$1 - &kubexEndpointsliceServiceLabels @@ -252,16 +249,16 @@ prometheus: replacement: /metrics - *kubexEndpointsliceServiceLabels - *kubexEndpointsliceNamespace - - source_labels: [__address__] - action: replace - target_label: __address__ - regex: '(.+?)(?::\d+)?' - replacement: '$1:8080' # Matches any Helm release name prefix (for example 'controller-' or 'prod-'). # This intentionally supports scraping multiple controller releases when their service names share this suffix. - source_labels: [__meta_kubernetes_service_name] action: keep regex: '.+-kubex-automation-engine-metrics-service$' + - source_labels: [__address__] + action: replace + target_label: __address__ + regex: '(.+?)(?::\d+)?' + replacement: '$1:8080' - *kubexEndpointsliceServiceName - *kubexEndpointsliceNodeName # Stores all scraped metrics as-is (no metric_relabel_configs filter). From 8f3c861792735b94e13e964218e688a11018f2f5 Mon Sep 17 00:00:00 2001 From: Geoffrey Asare Date: Tue, 12 May 2026 12:15:59 -0400 Subject: [PATCH 20/20] fix(stack): close remaining metrics review items --- charts/kubex-automation-stack/values.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/charts/kubex-automation-stack/values.yaml b/charts/kubex-automation-stack/values.yaml index 6036ac1..f3c74f6 100644 --- a/charts/kubex-automation-stack/values.yaml +++ b/charts/kubex-automation-stack/values.yaml @@ -217,6 +217,9 @@ prometheus: source_labels: [__meta_kubernetes_namespace] action: replace target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + action: drop + regex: '.+-kubex-automation-engine-metrics-service$' - source_labels: [__meta_kubernetes_endpointslice_name] action: keep regex: '((kubex|densify)-(kube-state-metrics|prometheus-node-exporter|ephemeral-storage-collector)|.*dcgm|k8s-ephemeral-storage-metrics).*' @@ -237,7 +240,7 @@ prometheus: # The controller chart exposes unauthenticated metrics on port 8080 over plain HTTP. # Unlike the shared endpointslice job above, this scrape does not honor scheme annotations or bearer tokens. # The job is intentionally fixed to the validated controller metrics defaults. - # The shared path/port/param anchors stay with the shared job; this job keeps its own explicit endpoint. + # The shared annotation-driven path/port/param rules stay with the shared job; this job keeps its own explicit endpoint. # The target is scraped by service-name suffix so multiple controller releases can be collected when needed. # No bearer token is required for this /metrics endpoint. scheme: http