From 68068eaea9007942a269a2b52b1cb7fd6d2b5820 Mon Sep 17 00:00:00 2001
From: kbhos <kbhos@splunk.com>
Date: Sun, 26 Apr 2026 18:24:19 +0530
Subject: [PATCH 1/5] feat(ai-platform): SAIA service exposure to external
 requests

---
 tools/cluster_setup/cluster-config.yaml       | 156 ++++++-
 tools/cluster_setup/eks_cluster_with_stack.sh | 394 +++++++++++++++++-
 tools/cluster_setup/k0s-cluster-config.yaml   |  24 +-
 tools/cluster_setup/k0s_cluster_with_stack.sh |  91 ++++
 .../splunk-operator-cluster.yaml              |   1 -
 5 files changed, 623 insertions(+), 43 deletions(-)

diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml
index 513b425..9111689 100644
--- a/tools/cluster_setup/cluster-config.yaml
+++ b/tools/cluster_setup/cluster-config.yaml
@@ -13,7 +13,26 @@
 
 # ---------- Cluster Configuration ----------
 cluster:
-  useExisting: false   # true = do not create cluster; use existing one (script fails if cluster not found)
+  # ------------------------------------------------------------------------
+  # LIFECYCLE WORKFLOW (to avoid VPC/IGW quota churn and DELETE_FAILED loops)
+  # ------------------------------------------------------------------------
+  #   1. FIRST install (cluster does not exist yet):
+  #        useExisting: false            # eksctl creates the cluster + VPC
+  #        ./eks_cluster_with_stack.sh install
+  #
+  #   2. AFTER first install succeeds, flip this one line:
+  #        useExisting: true             # subsequent `install` only reconciles
+  #                                      # operators/CRs on the existing cluster.
+  #      Re-running `install` is now safe and does NOT create new VPCs/IGWs.
+  #
+  #   3. When you genuinely want to tear down:
+  #        ALWAYS use `delete-full` (NOT `delete`). It uninstalls CRs/operators
+  #        first so the AWS Load Balancer Controller removes its NLBs + SGs
+  #        before CFN deletes the VPC -- this prevents DELETE_FAILED stacks
+  #        leaving orphan VPCs behind and eating your per-region quota.
+  #        ./eks_cluster_with_stack.sh delete-full
+  # ------------------------------------------------------------------------
+  useExisting: true   # true = do not create cluster; use existing one (script fails if cluster not found)
   name: "ai-tier-sok-test-east2"                   # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens)
   region: "us-east-2"                     # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1)
   k8sVersion: "1.31"                      # Kubernetes version (1.29, 1.30, 1.31 supported)
@@ -79,7 +98,7 @@ nodeGroups:
     desiredCapacity: 2                    # Initial number of GPU nodes
     minSize: 2                            # Minimum GPU nodes
     maxSize: 4                            # Maximum GPU nodes (set equal to desiredCapacity for H100)
-    volumeSize: 1000                      # EBS volume size per GPU node (GB) - larger for model storage
+    volumeSize: 500                      # EBS volume size per GPU node (GB) - larger for model storage
     volumeType: "gp3"                     # EBS volume type
 
     # ── H100 ONLY ──────────────────────────────────────────────────────────────
@@ -99,7 +118,7 @@ nodeGroups:
 # Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install).
 # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required).
 storage:
-  s3Bucket: "ai-platform-bucket-minio-us-east-2"  # Used when objectStore.type is aws
+  s3Bucket: "ai-platform-bucket-us-east-2"  # Used when objectStore.type is aws
   storageClass: "gp3"                        # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2)
   vectorDbSize: "50Gi"                       # VectorDB persistent volume size
 
@@ -108,12 +127,8 @@ storage:
   # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://)
   # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme)
   objectStore:
-    type: "minio"                         # aws | s3compat | minio | seaweedfs (external only for non-aws)
-    bucket: "ai-platform-bucket-minio-us-east-2"
-    endpoint: "http://13.59.216.105:9000"    # MinIO API (9000) or SeaweedFS S3 gateway (8333)
-    auth:
-      rootUser: "minioadmin"
-      rootPassword: "minioadmin"              # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root
+    type: "aws"                         # aws | s3compat | minio | seaweedfs (external only for non-aws)
+    bucket: "ai-platform-bucket-us-east-2"            # Object-store bucket name (same bucket as storage.s3Bucket above)
 
 # ---------- Container Images Configuration ----------
 images:
@@ -153,7 +168,7 @@ images:
     #   Result: "docker.io/myorg/splunk-ai-operator:v1.0.0"
     # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config)
     #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8"
-    image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31"
+    image: "docker.io/kbhos698/splunk-ai-operator:ai-tier"
 
   # Splunk Enterprise Images
   splunk:
@@ -176,8 +191,8 @@ images:
     # Option 2: Full path with different registry
     #   headImage: "docker.io/rayproject/ray:2.44.0"
     #   Result: "docker.io/rayproject/ray:2.44.0"
-    headImage: "ml-platform/ray/ray-head:build-008"
-    workerImage: "ml-platform/ray/ray-worker-gpu:build-008"
+    headImage: "ml-platform/ray/ray-head:build-v2-008"
+    workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008"
 
   # Weaviate Vector Database
   weaviate:
@@ -189,9 +204,14 @@ images:
   # SAIA (Splunk AI Assistant) Images
   saia:
     # Relative paths - registry prefix auto-applied
-    apiImage: "ml-platform/saia/saia-api:build-005"
-    dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003"
-
+    # NOTE: keep dataLoaderImage in sync with apiImage/apiV2Image. Tags older than
+    # v2-008 (specifically pre v2.0.4-13-g3b677604) ship a broken URL-compat shim
+    # that ignores VECTOR_DB_GRPC_* env vars and falls back to grpc.<host>:443 TLS,
+    # causing the vector-db-setup posthook Job to fail with a Weaviate gRPC health
+    # check error. See pkg/ai/features/saia/impl.go (reconcilePostInstallHook).
+    apiImage: "ml-platform/saia/saia-api:build-v2-009"
+    apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009"
+    dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009"
   # Supporting Images
   fluentBit:
     # Docker Hub public image (has full path, registry prefix ignored)
@@ -204,6 +224,14 @@ images:
     # Public image - full path so registry prefix is NOT applied; validation checks this URL
     image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1"
 
+  # NGINX reverse proxy used by the SAIA reconciler to route v1 / v2 requests
+  # by path. OPTIONAL: omit this block to use the script default
+  # (docker.io/library/nginx:1.27-alpine). Add it only to pin a specific tag
+  # or point at an internal mirror in airgapped clusters.
+  #
+  # nginx:
+  #   image: "harbor.internal/library/nginx:1.27-alpine"
+
 # ---------- Operator Versions ----------
 operators:
   ray:
@@ -246,6 +274,104 @@ aiPlatform:
     serviceAccountName: "ray-worker-sa"
     imageRegistry: ""                     # Leave empty for default
 
+  # ---------------------------------------------------------------------------
+  # Public SAIA exposure
+  # ---------------------------------------------------------------------------
+  # The operator always renders a public Kubernetes Service named
+  # `<aiPlatform.name>-saia-service` whose endpoints are the in-cluster nginx
+  # pods (nginx routes requests by path to SAIA v1 / v2). HOW that Service is
+  # reached from outside the cluster depends on two settings below:
+  #
+  #   - aiPlatform.serviceTemplate.{type, nodePort, annotations}
+  #   - aiPlatform.awsLoadBalancerController.install
+  #
+  # Pick ONE of the three modes below. Each mode lists: what you put in this
+  # file, what the install script does, and what you (the customer) must
+  # provision outside the cluster.
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 1 — Operator-managed AWS NLB (default, simplest on EKS)
+  # ---------------------------------------------------------------------------
+  #   serviceTemplate:
+  #     type: LoadBalancer
+  #     annotations:
+  #       service.beta.kubernetes.io/aws-load-balancer-type:             "external"
+  #       service.beta.kubernetes.io/aws-load-balancer-scheme:           "internet-facing"  # or "internal"
+  #       service.beta.kubernetes.io/aws-load-balancer-nlb-target-type:  "instance"
+  #   awsLoadBalancerController:
+  #     install: true
+  #
+  # Script does:
+  #   * Installs AWS Load Balancer Controller (LBC) with IRSA and tags
+  #     public/private subnets for auto-discovery.
+  #   * Creates the LoadBalancer-typed Service; LBC reads the annotations and
+  #     provisions an internet-facing AWS NLB (~2-3 min). Public DNS appears
+  #     in `.status.loadBalancer.ingress[0].hostname`.
+  # You must do:
+  #   * Nothing on the AWS side — fully automated.
+  #   * (Optional) Add an ACM cert listener annotation if you want TLS
+  #     termination at the NLB.
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 2 — Bring-your-own AWS LB (you already have an NLB / ALB)
+  # ---------------------------------------------------------------------------
+  #   serviceTemplate:
+  #     type: NodePort
+  #     nodePort: 30080            # any free port in 30000-32767
+  #   awsLoadBalancerController:
+  #     install: false             # no LBC, no operator-created LB
+  #
+  # Script does:
+  #   * Creates the public Service as NodePort 30080 on every worker.
+  #   * Skips LBC install entirely.
+  # You must do (in AWS, outside the script):
+  #   1. Pre-create an NLB or ALB (any scheme).
+  #   2. Create a target group:
+  #        - Target type:    instance
+  #        - Protocol/Port:  TCP/30080 (NLB) or HTTP/30080 (ALB)
+  #        - Health check:   HTTP /nginx_health on port "traffic-port", 200 OK
+  #   3. Attach the EKS managed-nodegroup ASG to the target group so
+  #      membership tracks node scale-in/out, e.g. via Terraform:
+  #        resource "aws_autoscaling_attachment" "saia" {
+  #          autoscaling_group_name = "eks-<cluster>-<nodegroup>-NodeGroup-XXXX"
+  #          lb_target_group_arn    = "arn:aws:elasticloadbalancing:...:targetgroup/my-saia-tg/..."
+  #        }
+  #   4. Worker node SG: allow ingress TCP/30080 from the NLB subnet CIDRs
+  #      (NLB) or from the ALB's security group (ALB).
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 3 — On-prem / k0s / airgap (HAProxy, F5, MetalLB, hardware LB, …)
+  # ---------------------------------------------------------------------------
+  #   serviceTemplate:
+  #     type: NodePort
+  #     nodePort: 30080
+  #   awsLoadBalancerController:
+  #     install: false             # has no effect off-AWS, leave false
+  #
+  # Script does:
+  #   * Same as Mode 2 — creates the public Service as NodePort 30080.
+  # You must do (outside the cluster):
+  #   * Point your existing L4 LB (HAProxy / F5 / MetalLB / hardware) at every
+  #     worker node IP on TCP/30080, with HTTP health-check /nginx_health.
+  #     Sample HAProxy backend:
+  #         backend saia_be
+  #             option httpchk GET /nginx_health
+  #             server worker1 10.0.1.11:30080 check
+  #             server worker2 10.0.1.12:30080 check
+  #
+  # ---------------------------------------------------------------------------
+
+  # Active mode below — EDIT to switch. Default is MODE 1.
+  serviceTemplate:
+    type: LoadBalancer
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-type: "external"
+      service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance"
+
+  awsLoadBalancerController:
+    install: true
+
   # CPU Scheduling
   cpuScheduling:
     nodeSelector: {}
diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh
index 7426ae1..b6982fb 100755
--- a/tools/cluster_setup/eks_cluster_with_stack.sh
+++ b/tools/cluster_setup/eks_cluster_with_stack.sh
@@ -91,6 +91,14 @@ load_config() {
     SAIA_SERVICE_SA="$(yq eval '.aiPlatform.serviceAccounts.saiaService' "$cfg")"
     DEFAULT_ACCELERATOR="$(yq eval '.aiPlatform.defaultAcceleratorType' "$cfg")"
     WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")"
+    SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")"
+    SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")"
+    # AWS Load Balancer Controller (LBC) install toggle. When the key is absent
+    # the script defaults to false, i.e. it assumes customers bring their own LB
+    # and point it at the NodePort (MODE 2/3 in cluster-config.yaml). Set to true
+    # for an operator-managed NLB/ALB provisioned via the `aws-load-balancer-type:
+    # external` annotation (MODE 1) or for TargetGroupBinding-based registration.
+    INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")"
     INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")"
     INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")"
     INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")"
@@ -120,9 +128,11 @@ load_config() {
     RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$cfg")"
     WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$cfg")"
     SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$cfg")"
+    SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image // ""' "$cfg")"
     SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$cfg")"
     FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")"
     OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")"
+    NGINX_IMAGE="$(yq eval '.images.nginx.image // "docker.io/library/nginx:1.27-alpine"' "$cfg")"
 
     # Subnets - read as arrays (support both cluster.subnets and top-level subnets)
     PRIVATE_SUBNETS=()
@@ -172,6 +182,9 @@ load_config() {
     SAIA_SERVICE_SA="saia-service-sa"
     DEFAULT_ACCELERATOR="L40S"
     WORKER_IMAGE_REGISTRY=""
+    SAIA_SERVICE_TYPE=""
+    SAIA_SERVICE_NODE_PORT=""
+    INSTALL_LBC="false"
     INGRESS_HOST="ai.example.com"
     INGRESS_CLASS="nginx"
     INGRESS_TLS_SECRET="ai-platform-tls"
@@ -179,6 +192,8 @@ load_config() {
     SPLUNK_OPERATOR_FILE="./splunk-operator-cluster.yaml"
     SPLUNK_AI_FILE="./artifacts.yaml"
     SPLUNK_IMAGE="splunk/splunk:10.2.0-dev1"
+    SAIA_API_V2_IMAGE=""
+    NGINX_IMAGE="docker.io/library/nginx:1.27-alpine"
     RAY_VERSION="v1.2.2"
     NVIDIA_VERSION="v0.17.3"
     ENABLE_CPU=true
@@ -230,6 +245,19 @@ load_config() {
   # Splunk operators
   SPLUNK_AI_NS="splunk-ai-operator-system"
 
+  # AWS Load Balancer Controller (LBC) — required when a Service of type=LoadBalancer
+  # uses the "service.beta.kubernetes.io/aws-load-balancer-type: external" annotation
+  # (the in-tree EKS cloud controller intentionally skips those Services). Pinned
+  # chart and policy versions keep installs reproducible against a vetted upstream
+  # release (supply-chain hygiene: codeguard-0-supply-chain-security).
+  LBC_NS="kube-system"
+  LBC_SA="aws-load-balancer-controller"
+  LBC_RELEASE="aws-load-balancer-controller"
+  LBC_ROLE_NAME="AWSLoadBalancerControllerRole-${CLUSTER_NAME}"
+  LBC_POLICY_NAME="AWSLoadBalancerControllerIAMPolicy-${CLUSTER_NAME}"
+  LBC_CHART_VERSION="1.8.2"   # helm chart version (appVersion v2.8.2)
+  LBC_POLICY_VERSION="v2.8.2" # upstream tag used to fetch iam_policy.json
+
   log "Configuration loaded: cluster=${CLUSTER_NAME}, region=${REGION}, namespace=${AI_NS}"
 }
 
@@ -386,47 +414,67 @@ configure_images() {
   local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE")
   local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE")
   local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE")
+  local saia_api_v2_full=""
   local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE")
   local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE")
   local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE")
+  local nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE")
+  if [[ -n "${SAIA_API_V2_IMAGE}" && "${SAIA_API_V2_IMAGE}" != "null" ]]; then
+    saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE")
+  fi
 
   # Escape special characters for sed
   local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g')
   local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g')
   local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g')
   local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g')
+  local saia_api_v2_escaped=""
   local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g')
   local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g')
   local otel_collector_escaped=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g')
+  local nginx_escaped=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g')
   local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g')
+  if [[ -n "${saia_api_v2_full}" ]]; then
+    saia_api_v2_escaped=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g')
+  fi
 
-  SEDOPTION="-i"
+  local SED_INPLACE
   if [[ "$OSTYPE" == "darwin"* ]]; then
-    SEDOPTION="-i ''"
+    SED_INPLACE=(sed -i "")
+  else
+    SED_INPLACE=(sed -i)
   fi
   # Replace RELATED_IMAGE_ env vars by matching the env var name (not the value pattern)
   # This works regardless of what registry/image was there before
-  sed $SEDOPTION "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE"
+  if [[ -n "${saia_api_v2_escaped}" ]]; then
+    "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_escaped}|" "$SPLUNK_AI_FILE"
+  fi
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE"
 
   # Replace operator image (the container image itself, not env var)
   # Find the line with "image:" that's near "splunk-ai-operator" and replace it
-  sed $SEDOPTION "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE"
 
   log "  ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full"
   log "  ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full"
   log "  ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full"
   log "  ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full"
+  if [[ -n "${saia_api_v2_full}" ]]; then
+    log "  ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full"
+  fi
   log "  ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full"
   log "  ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full"
   log "  ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full"
+  log "  ✓ Updated RELATED_IMAGE_NGINX: $nginx_full"
   log "  ✓ Updated operator image: $operator_full"
   log "  ✓ Updated MODEL_VERSION: $MODEL_VERSION"
   log "  ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION"
@@ -441,10 +489,10 @@ configure_images() {
   local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g')
 
   # Replace RELATED_IMAGE_SPLUNK_ENTERPRISE env var
-  sed $SEDOPTION "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE"
 
   # Replace splunk-operator image (the container image itself)
-  sed $SEDOPTION "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE"
+  "${SED_INPLACE[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE"
 
   log "  ✓ Updated Splunk Enterprise image: $splunk_full"
   log "  ✓ Updated Splunk Operator image: $splunk_operator_full"
@@ -1365,6 +1413,139 @@ install_cert_manager() {
   check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller"
 }
 
+# ---------- AWS Load Balancer Controller (LBC) ----------
+# LBC watches Services with the "aws-load-balancer-type: external" annotation
+# (the in-tree cloud controller skips those Services on purpose) and drives
+# NLB/ALB provisioning through the AWS ELBv2 API. Without LBC installed, such
+# Services stay in EXTERNAL-IP=<pending> forever. LBC also gives us IP-mode
+# targeting, ACM-backed TLS termination, and modern NLB attributes — all
+# features the in-tree controller does not support.
+
+# Fetches the upstream-recommended IAM policy for LBC from a pinned git tag and
+# creates a customer-managed policy in the account (idempotent). Emits the ARN
+# on stdout so the caller can attach it via eksctl. Uses a cluster-scoped name
+# so teardown of one cluster won't remove a policy shared with other clusters.
+ensure_lbc_iam_policy() {
+  # Resolve the caller's account ID; construct the canonical policy ARN
+  # deterministically (IAM policy names are unique per account). This avoids
+  # parsing AWS CLI text output -- some CLI/JMESPath combinations have been
+  # observed to emit multi-line "None\nNone" for `Policies[?...].Arn | [0]`
+  # when no match exists, which would otherwise slip past a "!= None" guard.
+  local acct policy_arn
+  acct="$(aws sts get-caller-identity --query Account --output text 2>/dev/null | tr -d '[:space:]')"
+  if [[ -z "$acct" || ! "$acct" =~ ^[0-9]{12}$ ]]; then
+    err "Could not resolve a valid AWS account ID via STS (got: '${acct}')"
+  fi
+  policy_arn="arn:aws:iam::${acct}:policy/${LBC_POLICY_NAME}"
+
+  if aws iam get-policy --policy-arn "$policy_arn" >/dev/null 2>&1; then
+    log "✓ LBC IAM policy already exists: ${policy_arn}" >&2
+    printf "%s" "$policy_arn"
+    return 0
+  fi
+
+  local policy_doc  # kept in memory: the caller runs this inside $(...), so a temp file registered in TMP_FILES here would never reach the parent shell and would leak
+  local url="https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/${LBC_POLICY_VERSION}/docs/install/iam_policy.json"
+  log "Fetching LBC IAM policy ${LBC_POLICY_VERSION} from ${url}" >&2
+  if ! policy_doc="$(curl -fsSL --max-time 60 "$url")"; then
+    err "Failed to download AWS LBC IAM policy from ${url}. Check network access or bump LBC_POLICY_VERSION."
+  fi
+  if ! jq -e . <<<"$policy_doc" >/dev/null 2>&1; then
+    err "Downloaded LBC IAM policy is not valid JSON. Refusing to proceed."
+  fi
+
+  local created
+  created="$(aws iam create-policy \
+    --policy-name "${LBC_POLICY_NAME}" \
+    --policy-document "${policy_doc}" \
+    --description "AWS Load Balancer Controller policy for ${CLUSTER_NAME} (${LBC_POLICY_VERSION})" \
+    --query 'Policy.Arn' --output text | tr -d '[:space:]')"  # stderr is left visible so the real AWS error (e.g. AccessDenied) is shown; only stdout is captured
+  if [[ -z "$created" || "$created" != arn:aws:iam::* ]]; then
+    err "create-policy did not return a valid ARN for ${LBC_POLICY_NAME} (got: '${created}')"
+  fi
+  log "✓ Created LBC IAM policy ${LBC_POLICY_NAME}: ${created}" >&2
+  printf "%s" "$created"
+}
+
+# Creates the IRSA-bound ServiceAccount used by the LBC deployment. Uses eksctl
+# so the trust policy is pinned to this cluster's OIDC provider and SA subject.
+ensure_lbc_irsa() {
+  log "Ensuring IRSA for AWS Load Balancer Controller (${LBC_NS}/${LBC_SA})..."
+  local policy_arn; policy_arn="$(ensure_lbc_iam_policy)"  # only stdout (the ARN) is captured; the helper sends its log lines to stderr
+  if [[ -z "$policy_arn" || "$policy_arn" != arn:aws:iam::* ]]; then  # re-validate here: an exit inside $(...) would only end the subshell
+    err "LBC IAM policy ARN is empty/invalid ('${policy_arn}'); cannot configure IRSA"
+  fi
+
+  eksctl create iamserviceaccount \
+    --cluster "${CLUSTER_NAME}" \
+    --region "${REGION}" \
+    --namespace "${LBC_NS}" \
+    --name "${LBC_SA}" \
+    --role-name "${LBC_ROLE_NAME}" \
+    --attach-policy-arn "${policy_arn}" \
+    --approve \
+    --override-existing-serviceaccounts
+
+  wait_resource_exists "${LBC_NS}" sa "${LBC_SA}" 180  # NOTE(review): 180 is presumably a timeout in seconds -- confirm against wait_resource_exists
+  log "✓ LBC IRSA role and service account configured"
+}
+
+# Tags user-provided subnets so LBC can auto-discover where to place LBs.
+# eksctl already tags subnets it creates, so this is a no-op when the cluster
+# was created without explicit cluster.subnets.
+tag_lbc_subnets() {
+  if [[ ${#PUBLIC_SUBNETS[@]} -eq 0 && ${#PRIVATE_SUBNETS[@]} -eq 0 ]]; then  # both lists empty -> nothing for this function to tag
+    log "No user-provided subnets; eksctl-created subnets are already tagged for LBC discovery."
+    return 0
+  fi
+  log "Tagging user-provided subnets for AWS Load Balancer Controller discovery..."
+  if [[ ${#PUBLIC_SUBNETS[@]} -gt 0 ]]; then  # public subnets get the role/elb tag plus the cluster tag
+    log "  Public subnets (${#PUBLIC_SUBNETS[@]}): kubernetes.io/role/elb=1"
+    aws ec2 create-tags --region "${REGION}" \
+      --resources "${PUBLIC_SUBNETS[@]}" \
+      --tags Key=kubernetes.io/role/elb,Value=1 \
+             "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared"
+  fi
+  if [[ ${#PRIVATE_SUBNETS[@]} -gt 0 ]]; then  # private subnets get the role/internal-elb tag plus the cluster tag
+    log "  Private subnets (${#PRIVATE_SUBNETS[@]}): kubernetes.io/role/internal-elb=1"
+    aws ec2 create-tags --region "${REGION}" \
+      --resources "${PRIVATE_SUBNETS[@]}" \
+      --tags Key=kubernetes.io/role/internal-elb,Value=1 \
+             "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared"
+  fi
+  log "✓ Subnets tagged for LBC auto-discovery"
+}
+
+install_aws_load_balancer_controller() {
+  log "Installing AWS Load Balancer Controller (helm chart ${LBC_CHART_VERSION})..."
+
+  local vpc_id
+  vpc_id="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" \
+    --query 'cluster.resourcesVpcConfig.vpcId' --output text 2>/dev/null || true)"  # tolerate a CLI failure here; the empty/"None" check below reports it
+  if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then
+    err "Could not determine VPC ID for cluster ${CLUSTER_NAME}. LBC install requires vpcId."
+  fi
+
+  if ! aws iam get-role --role-name "${LBC_ROLE_NAME}" >/dev/null 2>&1; then  # precondition: the IRSA role must already exist
+    err "IRSA role ${LBC_ROLE_NAME} not found. ensure_lbc_irsa must run first."
+  fi
+
+  helm repo add eks https://aws.github.io/eks-charts >/dev/null  # stdout silenced; errors still reach stderr
+  helm repo update >/dev/null
+  helm_retry 5 upgrade --install "${LBC_RELEASE}" eks/aws-load-balancer-controller \
+    --namespace "${LBC_NS}" \
+    --version "${LBC_CHART_VERSION}" \
+    --set clusterName="${CLUSTER_NAME}" \
+    --set region="${REGION}" \
+    --set vpcId="${vpc_id}" \
+    --set serviceAccount.create=false \
+    --set serviceAccount.name="${LBC_SA}" \
+    --wait --timeout 10m
+
+  check_ready "${LBC_NS}" "app.kubernetes.io/name=aws-load-balancer-controller"  # NOTE(review): presumably waits for pods matching this label -- confirm against check_ready
+  log "✓ AWS Load Balancer Controller ${LBC_CHART_VERSION} installed and ready"
+}
+
 # ---------- External S3-compatible object storage (credentials only; no in-cluster install) ----------
 ensure_s3compat_credentials() {
   # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs).
@@ -1536,6 +1717,39 @@ ensure_s3_upload_splunk_app() {
   fi
 }
 
+ensure_external_objstore_upload_splunk_app() {
+  if [[ -z "${SPLUNK_APP_LOCAL_PATH:-}" ]]; then  # ':-' keeps an unset variable on the skip path instead of aborting if nounset is active
+    log "SPLUNK_APP_LOCAL_PATH not set; skipping app upload to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/apps/"
+    return 0
+  fi
+  if [[ ! -f "${SPLUNK_APP_LOCAL_PATH}" ]]; then
+    warn "SPLUNK_APP_LOCAL_PATH='${SPLUNK_APP_LOCAL_PATH}' not found; skipping upload"
+    return 0
+  fi
+  if [[ -z "${OBJ_STORE_ENDPOINT:-}" ]]; then  # same guard style as should_wait_for_splunk_app_install
+    warn "OBJ_STORE_ENDPOINT not set; cannot upload Splunk app to external object store"
+    return 0
+  fi
+
+  local base key
+  base="$(basename "${SPLUNK_APP_LOCAL_PATH}")"
+  key="apps/${base}"  # objects are stored under the apps/ prefix using the file's base name
+  log "Ensuring Splunk app '${base}' exists at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}"
+
+  if AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+    aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3api head-object --bucket "${OBJ_STORE_BUCKET}" --key "${key}" >/dev/null 2>&1; then
+    log "App already present at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}; skipping upload"
+  else
+    AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+      aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3 cp "${SPLUNK_APP_LOCAL_PATH}" "s3://${OBJ_STORE_BUCKET}/${key}"
+    log "Uploaded ${base} to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}"
+  fi
+}
+
+should_wait_for_splunk_app_install() {
+  [[ -f "${SPLUNK_APP_LOCAL_PATH:-}" ]]  # succeeds only for an existing regular file; -f is false for an unset/empty path
+}
+
 ensure_namespace() { kubectl get ns "$1" >/dev/null 2>&1 || kubectl create ns "$1"; }
 
 ensure_bucket_policy() {
@@ -2112,6 +2326,119 @@ show_platform_access_info() {
   log ""
 }
 
+saia_service_template_enabled() {
+  [[ -n "${SAIA_SERVICE_TYPE:-}" && "${SAIA_SERVICE_TYPE}" != "null" && "${SAIA_SERVICE_TYPE}" != "ClusterIP" ]]
+}
+
+saia_aiservice_name() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}"
+  printf "%s-saia" "${platform_name}"
+}
+
+wait_for_aiservice_exists() {
+  local name="$1" timeout="${2:-600}" waited=0
+  while ! kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do
+    [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}"
+    sleep 5
+    waited=$((waited + 5))
+  done
+}
+
+apply_saia_service_annotations() {
+  local aiservice_name="$1"
+  local annotation_keys key value
+
+  annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)"
+  [[ -z "${annotation_keys}" ]] && return 0
+
+  local annotate_args=()
+  while IFS= read -r key; do
+    [[ -z "${key}" || "${key}" == "null" ]] && continue
+    value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")"
+    [[ -z "${value}" || "${value}" == "null" ]] && continue
+    annotate_args+=("${key}=${value}")
+  done <<< "${annotation_keys}"
+
+  if [[ ${#annotate_args[@]} -gt 0 ]]; then
+    log "Applying SAIA Service annotations to AIService/${aiservice_name}..."
+    kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite
+  fi
+}
+
+patch_saia_public_service_workaround() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}"
+  local aiservice_name public_svc_name
+
+  aiservice_name="$(saia_aiservice_name "${platform_name}")"
+  public_svc_name="${aiservice_name}-saia-service"
+
+  wait_for_aiservice_exists "${aiservice_name}"
+
+  if saia_service_template_enabled; then
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings..."
+    if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"NodePort\",
+        \"ports\": [
+          {
+            \"name\": \"http\",
+            \"port\": 8080,
+            \"targetPort\": 8080,
+            \"nodePort\": ${SAIA_SERVICE_NODE_PORT}
+          }
+        ]
+      }
+    }
+  }
+}"
+    else
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"${SAIA_SERVICE_TYPE}\"
+      }
+    }
+  }
+}"
+    fi
+  fi
+
+  apply_saia_service_annotations "${aiservice_name}"
+
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null
+
+  if saia_service_template_enabled; then
+    log "Recreating SAIA public Service to ensure patched settings take effect..."
+    kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true
+    wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300
+  fi
+}
+
+wait_for_saia_load_balancer() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}" timeout="${2:-1200}" waited=0
+  local svc_name hostname=""
+  svc_name="$(saia_aiservice_name "${platform_name}")-saia-service"
+
+  [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]] || return 0
+
+  log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name} to receive an external hostname..."
+  while true; do
+    hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)"
+    [[ -z "${hostname}" ]] && hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)"
+    if [[ -n "${hostname}" ]]; then
+      log "✓ SAIA external endpoint: ${hostname}"
+      return 0
+    fi
+    [[ $waited -ge $timeout ]] && err "Timed out waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name}"
+    sleep 5
+    waited=$((waited + 5))
+  done
+}
+
 # Quick status check function - can be called standalone
 check_aiplatform_status() {
   local platform_name="${1:-${AI_PLATFORM_NAME}}"
@@ -2262,6 +2589,14 @@ YAML
       ;;
   esac
 
+  local svc_template_yaml=""
+  if saia_service_template_enabled; then
+    svc_template_yaml="  serviceTemplate:"$'\n'"    spec:"$'\n'"      type: ${SAIA_SERVICE_TYPE}"$'\n'
+    if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+      svc_template_yaml+="      ports:"$'\n'"      - name: http"$'\n'"        port: 8080"$'\n'"        targetPort: 8080"$'\n'"        nodePort: ${SAIA_SERVICE_NODE_PORT}"$'\n'
+    fi
+  fi
+
   cat <<YAML | kubectl -n "${AI_NS}" apply --server-side --force-conflicts -f -
 apiVersion: ai.splunk.com/v1
 kind: AIPlatform
@@ -2279,6 +2614,7 @@ spec:
     - name: saia
       version: "1.1.0"
       serviceAccountName: ${SAIA_SERVICE_SA}
+${svc_template_yaml}
   storage:
     vectorDB:
       size: ${VECTORDB_SIZE}
@@ -2314,6 +2650,8 @@ spec:
 YAML
 
   wait_aiplatform_ready
+  patch_saia_public_service_workaround "${AI_PLATFORM_NAME}"
+  wait_for_saia_load_balancer "${AI_PLATFORM_NAME}" 1200
 }
 
 # Wait until Splunk AI Assistant app shows as installed in Standalone status
@@ -2616,6 +2954,7 @@ delete_cluster_minimal() {
   delete_iamserviceaccount_if_exists "${AI_NS}" "${RAY_WORKER_SA}"
   delete_iamserviceaccount_if_exists "${AI_NS}" "${SAIA_SERVICE_SA}"
   delete_iamserviceaccount_if_exists "${EBS_NS}" "${EBS_SA}"
+  delete_iamserviceaccount_if_exists "${LBC_NS}" "${LBC_SA}"
   echo ""
 
   log "Step 2: Deleting IAM roles..."
@@ -2624,6 +2963,7 @@ delete_cluster_minimal() {
   delete_role_if_exists "IRSA-${CLUSTER_NAME}-${RAY_WORKER_SA}"
   delete_role_if_exists "IRSA-${CLUSTER_NAME}-${SAIA_SERVICE_SA}"
   delete_role_if_exists "${EBS_IRSA_ROLE_NAME}"
+  delete_role_if_exists "${LBC_ROLE_NAME}"
   echo ""
 
   log "Step 3: Cleaning up any eksctl-created EBS CSI addon roles..."
@@ -2720,6 +3060,7 @@ delete_cluster_minimal() {
   else
     delete_policy_if_exists "${AI_BUCKET_POLICY_NAME}"
   fi
+  delete_policy_if_exists "${LBC_POLICY_NAME}"
   echo ""
 
   log "Step 8: Purging all IRSA roles associated with this cluster's OIDC provider..."
@@ -2779,6 +3120,7 @@ delete_everything() {
   helm uninstall "${AUTOSCALER_RELEASE}" -n "${AUTOSCALER_NS}" || true
   kubectl delete -f https://github.com/splunk/splunk-operator/releases/download/2.8.1/splunk-operator-cluster.yaml --ignore-not-found
   kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.2.2" --ignore-not-found
+  helm uninstall "${LBC_RELEASE}" -n "${LBC_NS}" || true
   helm uninstall kube-prometheus -n monitoring || true
   helm uninstall cert-manager -n cert-manager || true
   kubectl delete storageclass gp3 --ignore-not-found
@@ -2856,7 +3198,7 @@ preflight_env() {
   fi
 
   pf_header "Tools"
-  for t in aws eksctl kubectl helm git jq yq; do
+  for t in aws eksctl kubectl helm git jq yq curl; do
     if command -v "$t" >/dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi
   done
 
@@ -3102,6 +3444,7 @@ install_ai_platform_stack() {
   log "=== Setting up Splunk AI Platform stack ==="
   if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then
     log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA."
+    ensure_external_objstore_upload_splunk_app
   else
     ensure_s3_bucket_and_prefixes
     ensure_s3_upload_splunk_app
@@ -3169,19 +3512,36 @@ reconcile_flow() {
   fi
   install_kube_prometheus
   install_cert_manager
+  # AWS Load Balancer Controller (LBC) — only install when the operator itself
+  # needs to provision NLBs/ALBs (Service type=LoadBalancer with the
+  # `aws-load-balancer-type: external` annotation) or when binding k8s Services
+  # to customer-managed target groups via TargetGroupBinding CRs. Customers who
+  # bring their own LB and point it at NodePort (Path A) should leave this off.
+  if [[ "${INSTALL_LBC}" == "true" ]]; then
+    log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller"
+    tag_lbc_subnets
+    ensure_lbc_irsa
+    install_aws_load_balancer_controller
+  else
+    log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install (bring-your-own-LB / NodePort path)"
+  fi
   ensure_s3compat_credentials
   install_otel_operator_and_contrib_collector
   install_ray_operator
   install_splunk_operator
   install_splunk_ai_operator
   install_ai_platform_stack
-  wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200
+  if should_wait_for_splunk_app_install; then
+    wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200
+  else
+    log "Skipping Splunk AI Assistant app wait because no local app archive is configured"
+  fi
   # push_saia_conf_into_pod
 }
 
 # ---------- MAIN ----------
 main_install() {
-  for t in aws eksctl kubectl helm git jq yq; do need "$t"; done
+  for t in aws eksctl kubectl helm git jq yq curl; do need "$t"; done
 
   # Load configuration from YAML file
   load_config
diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml
index 124373f..3935404 100644
--- a/tools/cluster_setup/k0s-cluster-config.yaml
+++ b/tools/cluster_setup/k0s-cluster-config.yaml
@@ -15,7 +15,7 @@ cluster:
   name: airgap-cluster
   # region: us-east-2                    # Ignored for on-prem, but required in config
   sshUser: ec2-user                       # CHANGE THIS: SSH user for remote nodes
-  sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem  # CHANGE THIS: Path to SSH private key
+  sshKeyPath: /Users/kiran/.ssh/ai-key-arif.pem  # CHANGE THIS: Path to SSH private key
 
 # ---------- Node Configuration ----------
 nodes:
@@ -25,12 +25,12 @@ nodes:
 
   existingIPs:
     controllers:
-      - 3.144.14.96                       # CHANGE THIS: Your controller server IP
+      - 3.149.241.167                     # CHANGE THIS: Your controller server IP
     workers:
-      - 3.14.134.16                       # CHANGE THIS: CPU worker 1
-      - 13.59.78.115                      # CHANGE THIS: GPU worker 1
-      - 3.15.20.136                       # CHANGE THIS: GPU worker 2
+      - 18.221.244.241                    # CHANGE THIS: worker node IP
+      - 18.191.19.128                     # CHANGE THIS: worker node IP
+      - 3.137.209.219                     # CHANGE THIS: worker node IP
 
 # ---------- Storage Configuration ----------
 # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external).
 # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required).
@@ -103,7 +102,7 @@ images:
     # Build & push with:
     #   IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \
     #     make docker-build-amd64 docker-push
-    image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28"
+    image: "docker.io/kbhos698/splunk-ai-operator:ai-tier"
 
   splunk:
     image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" #TODO this update
@@ -175,8 +174,8 @@ kubernetes:
 
 # ---------- File Paths ----------
 files:
-  splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml"
-  aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml"
+  splunkOperator: "./splunk-operator-cluster.yaml"
+  aiPlatform: "./artifacts.yaml"
 
 # ---------- Splunk Configuration ----------
 splunk:
@@ -213,7 +212,12 @@ aiPlatform:
   # To ENABLE external exposure for on-prem / airgap customers, NodePort is the
   # recommended default: any k8s node IP + the configured nodePort yields a
   # reachable endpoint from VPN-connected users. No cloud LB / cert-manager
-  # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB.
+  # needed. Use LoadBalancer only if the cluster has MetalLB/cloud LB support.
+  #
+  # Note: the current operator image preserves serviceTemplate.spec.type, but
+  # not nested serviceTemplate.metadata.annotations. The k0s installer applies
+  # any annotations below directly to the generated AIService after creation,
+  # which the current operator already copies onto the rendered Service.
   serviceTemplate:
     type: NodePort          # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP)
     nodePort: 30080         # Fixed NodePort (30000-32767). Required for stable DNS.
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index 1f45cfc..915118d 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -3675,6 +3675,96 @@ YAML
   log "AIPlatform CR installed successfully"
 }
 
+saia_service_template_enabled_k0s() {
+  local svc_type
+  svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+  [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]]
+}
+
+wait_for_k0s_aiservice_exists() {
+  local name="$1" timeout="${2:-600}" waited=0
+  while ! kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do
+    [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}"
+    sleep 5
+    waited=$((waited + 5))
+  done
+}
+
+apply_k0s_saia_service_annotations() {
+  local aiservice_name="$1"
+  local annotation_keys key value
+
+  annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)"
+  [[ -z "${annotation_keys}" ]] && return 0
+
+  local annotate_args=()
+  while IFS= read -r key; do
+    [[ -z "${key}" || "${key}" == "null" ]] && continue
+    value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")"
+    [[ -z "${value}" || "${value}" == "null" ]] && continue
+    annotate_args+=("${key}=${value}")
+  done <<< "${annotation_keys}"
+
+  if [[ ${#annotate_args[@]} -gt 0 ]]; then
+    log "Applying SAIA Service annotations to AIService/${aiservice_name}..."
+    kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite
+  fi
+}
+
+patch_k0s_saia_public_service_workaround() {
+  local platform_name="${CLUSTER_NAME}-ai-platform"
+  local aiservice_name="${platform_name}-saia"
+  local public_svc_name="${aiservice_name}-saia-service"
+  local svc_type svc_node_port
+
+  svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+  svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+
+  wait_for_k0s_aiservice_exists "${aiservice_name}"
+
+  if saia_service_template_enabled_k0s; then
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings..."
+    if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"NodePort\",
+        \"ports\": [
+          {
+            \"name\": \"http\",
+            \"port\": 8080,
+            \"targetPort\": 8080,
+            \"nodePort\": ${svc_node_port}
+          }
+        ]
+      }
+    }
+  }
+}"
+    else
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"${svc_type}\"
+      }
+    }
+  }
+}"
+    fi
+  fi
+
+  apply_k0s_saia_service_annotations "${aiservice_name}"
+
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null
+
+  if saia_service_template_enabled_k0s; then
+    log "Recreating SAIA public Service to ensure patched settings take effect..."
+    kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true
+  fi
+}
+
 # ====== INSTALL FULL STACK ======
 install_ai_platform_stack() {
   log "Installing complete AI Platform stack..."
@@ -3770,6 +3860,7 @@ install_ai_platform_stack() {
   # Install AI Platform operator and CR while Splunk Standalone boots
   install_splunk_ai_operator
   install_ai_platform_cr
+  patch_k0s_saia_public_service_workaround
 
   # Now wait for Splunk Standalone to be ready (likely already done by now)
   wait_for_splunk_standalone
diff --git a/tools/cluster_setup/splunk-operator-cluster.yaml b/tools/cluster_setup/splunk-operator-cluster.yaml
index 0732ea3..467879e 100644
--- a/tools/cluster_setup/splunk-operator-cluster.yaml
+++ b/tools/cluster_setup/splunk-operator-cluster.yaml
@@ -55325,7 +55325,6 @@ subjects:
 apiVersion: v1
 data:
   OPERATOR_NAME: '"splunk-operator"'
-  # TODO identify whats this ??
   RELATED_IMAGE_SPLUNK_ENTERPRISE: 667741767953.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:splunk-redhat-8-amd64-10.2.0-ef65e8205e4d-6d943f7-28228924
   WATCH_NAMESPACE: ""
 kind: ConfigMap

From 6433a1565de3234b43003afdef0a5432332e8239 Mon Sep 17 00:00:00 2001
From: kbhos <kbhos@splunk.com>
Date: Wed, 29 Apr 2026 14:17:35 +0530
Subject: [PATCH 2/5] WIP: pre-merge in-progress work on saia-gateway-changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- cluster-config.yaml: rewrite SAIA exposure as 3 NodePort-free modes,
  drop redundant nginx image entry; add byoTargetGroup config block
- eks_cluster_with_stack.sh: read BYO_TG_*; add validate_byo_target_group_config,
  apply_byo_target_group_binding, patch_saia_service_disable_nodeport;
  update patch_saia_public_service_workaround for NodePort-free mode
- k0s-cluster-config.yaml: switch SAIA exposure to LoadBalancer + MetalLB;
  add metallb config block; revert object storage to type=minio with AWS S3
  endpoint (the only working path on k0s — type=aws is silently swapped to
  in-cluster MinIO by the install script)
- k0s_cluster_with_stack.sh: add install_metallb function (chart pin 0.14.8,
  L2 / BGP advertisements); patch_k0s_saia_service_disable_nodeport; fix
  describe_pod node-count whitespace bug
- artifacts.yaml: minor diff (will be overwritten by upcoming merge)

Pre-merge of origin/ai-tier-v2-k0s; will be subsumed by the merge commit.

Made-with: Cursor
---
 tools/cluster_setup/artifacts.yaml            |  67 ++++---
 tools/cluster_setup/cluster-config.yaml       | 136 ++++++++------
 tools/cluster_setup/eks_cluster_with_stack.sh | 162 ++++++++++++++--
 tools/cluster_setup/k0s-cluster-config.yaml   | 165 +++++++++++-----
 tools/cluster_setup/k0s_cluster_with_stack.sh | 177 +++++++++++++++++-
 5 files changed, 552 insertions(+), 155 deletions(-)

diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml
index 69c3664..9b5b51f 100644
--- a/tools/cluster_setup/artifacts.yaml
+++ b/tools/cluster_setup/artifacts.yaml
@@ -1061,11 +1061,18 @@ spec:
                 items:
                   description: FeatureSpec defines the features to enable in the AIPlatform
                   properties:
+                    env:
+                      additionalProperties:
+                        type: string
+                      description: Env specifies environment variables to propagate
+                        to the child AIService.
+                      type: object
                     name:
                       description: Name of the feature, e.g. "saia" or "seca"
                       enum:
                       - saia
                       - seca
+                      - weaviate-service
                       type: string
                     scaleFactor:
                       description: ScaleFactor is the desired fixed number of replicas
@@ -2085,6 +2092,11 @@ spec:
                       type: object
                       x-kubernetes-map-type: atomic
                     type: array
+                  otelImage:
+                    default: otel/opentelemetry-collector-contrib:0.122.1
+                    description: OTelImage is the OpenTelemetry Collector sidecar
+                      image
+                    type: string
                   rayHeadGroupImage:
                     description: Ray head group image, e.g. "rayproject/ray-head:latest"
                     type: string
@@ -2225,7 +2237,8 @@ spec:
                 type: object
               objectStorage:
                 description: |-
-                  ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models
+                  ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models.
+                  It is optional for platforms that only enable features that do not require object storage.
                   Supported providers: S3, GCS, Azure Blob Storage, MinIO
                 properties:
                   endpoint:
@@ -2237,8 +2250,8 @@ spec:
                   path:
                     description: |-
                       Remote volume URI in the format s3://bucketname/<path prefix>, gs://bucketname/<path prefix>,
-                      azure://containername/<path prefix>, minio://bucketname/<path prefix>, seaweedfs://bucketname/<path prefix>, or s3compat://bucketname/<path prefix>
-                    pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$
+                      azure://containername/<path prefix>, or minio://bucketname/<path prefix>
+                    pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$
                     type: string
                   region:
                     description: Region of the remote storage volume. Required for
@@ -2908,8 +2921,6 @@ spec:
                     pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$
                     type: string
                 type: object
-            required:
-            - objectStorage
             type: object
           status:
             description: AIPlatformStatus defines observed state
@@ -4084,11 +4095,18 @@ spec:
               features:
                 description: Feature defines the features to be enabled for the AIService
                 properties:
+                  env:
+                    additionalProperties:
+                      type: string
+                    description: Env specifies environment variables to propagate
+                      to the child AIService.
+                    type: object
                   name:
                     description: Name of the feature, e.g. "saia" or "seca"
                     enum:
                     - saia
                     - seca
+                    - weaviate-service
                     type: string
                   scaleFactor:
                     description: ScaleFactor is the desired fixed number of replicas
@@ -4866,27 +4884,15 @@ spec:
                 properties:
                   endpoint:
                     description: |-
-                      Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS)
-                      Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.)
+                      Optional override endpoint (only needed for S3-compatible services like MinIO)
+                      Must be a valid HTTP/HTTPS URL
                     pattern: ^https?://.*$
                     type: string
                   path:
                     description: |-
                       Remote volume URI in the format s3://bucketname/<path prefix>, gs://bucketname/<path prefix>,
-                      azure://containername/<path prefix>, s3compat://bucketname/<path prefix> (generic S3-compatible), minio://, or seaweedfs://
-                    pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$
-                    type: string
-                  provider:
-                    description: |-
-                      Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint.
-                      Values: aws, minio, seaweedfs, s3compat, gcs, azure
-                    enum:
-                    - aws
-                    - minio
-                    - seaweedfs
-                    - s3compat
-                    - gcs
-                    - azure
+                      azure://containername/<path prefix>, or minio://bucketname/<path prefix>
+                    pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$
                     type: string
                   region:
                     description: Region of the remote storage volume. Required for
@@ -4894,8 +4900,7 @@ spec:
                     minLength: 1
                     type: string
                   secretRef:
-                    description: Secret name containing storage credentials (e.g.
-                      s3_access_key, s3_secret_key for S3-compatible backends)
+                    description: Secret name containing storage credentials
                     maxLength: 253
                     minLength: 1
                     type: string
@@ -5682,19 +5687,19 @@ spec:
             fieldRef:
               fieldPath: metadata.name
         - name: RELATED_IMAGE_RAY_HEAD
-          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-002
+          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-008
         - name: RELATED_IMAGE_RAY_WORKER
-          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-002
+          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-008
         - name: RELATED_IMAGE_WEAVIATE
           value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a
+        - name: RELATED_IMAGE_WEAVIATE_SERVICE
+          value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a
         - name: RELATED_IMAGE_SAIA_API
-          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-002
+          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-009
         - name: RELATED_IMAGE_SAIA_API_V2
-          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-002
+          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-009
         - name: RELATED_IMAGE_POST_INSTALL_HOOK
-          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc
-        - name: SPLUNK_METRICS_INDEX_NAME
-          value: _metrics
+          value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-009
         - name: RELATED_IMAGE_FLUENT_BIT
           value: docker.io/fluent/fluent-bit:1.9.6
         - name: RELATED_IMAGE_OTEL_COLLECTOR
@@ -5705,7 +5710,7 @@ spec:
           value: v0.3.14-36-g1549f5a
         - name: RAY_VERSION
           value: 2.53.0
-        image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25
+        image: docker.io/kbhos698/splunk-ai-operator:ai-tier
         livenessProbe:
           httpGet:
             path: /healthz
diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml
index 9111689..bafef6a 100644
--- a/tools/cluster_setup/cluster-config.yaml
+++ b/tools/cluster_setup/cluster-config.yaml
@@ -275,91 +275,99 @@ aiPlatform:
     imageRegistry: ""                     # Leave empty for default
 
   # ---------------------------------------------------------------------------
-  # Public SAIA exposure
+  # Public SAIA exposure (NodePort-free)
   # ---------------------------------------------------------------------------
-  # The operator always renders a public Kubernetes Service named
+  # The operator renders a public Kubernetes Service named
   # `<aiPlatform.name>-saia-service` whose endpoints are the in-cluster nginx
-  # pods (nginx terminates path routing to saia v1 / v2). HOW that Service is
-  # reached from outside the cluster depends on two settings below:
+  # pods (nginx terminates path routing to saia v1 / v2). The install script
+  # then configures HOW that Service is reached from outside the cluster.
   #
-  #   - aiPlatform.serviceTemplate.{type, nodePort, annotations}
-  #   - aiPlatform.awsLoadBalancerController.install
+  # IMPORTANT: this template intentionally does NOT use Service.type=NodePort.
+  # Many enterprise security policies prohibit opening 30000-32767 on every
+  # worker. All three modes below are NodePort-free — the script sets
+  # `allocateLoadBalancerNodePorts: false` on LoadBalancer Services so
+  # kube-proxy never opens a node port; for the BYO mode the Service stays
+  # ClusterIP and AWS LBC registers pod IPs into the customer's target group.
   #
-  # Pick ONE of the three modes below. Each row shows: what you put in this
-  # file, what the install script does, and what you (the customer) must
-  # provision outside the cluster.
+  # Pick ONE of the modes below by editing the active block at the bottom of
+  # this section. Each mode shows: the YAML to use, what the script does, and
+  # what you must provision outside the cluster.
   #
   # ---------------------------------------------------------------------------
-  # MODE 1 — Operator-managed AWS NLB (default, simplest on EKS)
+  # MODE 1 — Operator-managed AWS NLB, IP-target mode (DEFAULT)
   # ---------------------------------------------------------------------------
   #   serviceTemplate:
   #     type: LoadBalancer
   #     annotations:
   #       service.beta.kubernetes.io/aws-load-balancer-type:             "external"
-  #       service.beta.kubernetes.io/aws-load-balancer-scheme:           "internet-facing"  # or "internal"
-  #       service.beta.kubernetes.io/aws-load-balancer-nlb-target-type:  "instance"
+  #       service.beta.kubernetes.io/aws-load-balancer-scheme:           "internet-facing"   # or "internal"
+  #       service.beta.kubernetes.io/aws-load-balancer-nlb-target-type:  "ip"                # ← pods, not nodes
+  #       service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
+  #     # Optional TLS termination at the NLB:
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:..."
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy: "ELBSecurityPolicy-TLS13-1-2-2021-06"
   #   awsLoadBalancerController:
   #     install: true
+  #   byoTargetGroup:
+  #     enabled: false
   #
   # Script does:
-  #   * Installs AWS Load Balancer Controller (LBC) with IRSA and tags
-  #     public/private subnets for auto-discovery.
-  #   * Creates the LoadBalancer-typed Service; LBC reads the annotations and
-  #     provisions an internet-facing AWS NLB (~2-3 min). Public DNS appears
-  #     in `.status.loadBalancer.ingress[0].hostname`.
-  # You must do:
-  #   * Nothing on the AWS side — fully automated.
-  #   * (Optional) Add an ACM cert listener annotation if you want TLS
-  #     termination at the NLB.
+  #   * Installs AWS Load Balancer Controller (LBC) with IRSA, tags subnets.
+  #   * Creates the LoadBalancer Service; LBC provisions an NLB whose targets
+  #     are pod IPs (no NodePort, no kube-proxy hop, real client IP preserved).
+  #   * Patches the rendered Service to set `allocateLoadBalancerNodePorts:
+  #     false` and `externalTrafficPolicy: Local`.
+  # You must do: nothing on the AWS side. DNS appears in
+  # `.status.loadBalancer.ingress[0].hostname` after ~2-3 min.
   #
   # ---------------------------------------------------------------------------
-  # MODE 2 — Bring-your-own AWS LB (you already have an NLB / ALB)
+  # MODE 2 — Bring-your-own AWS LB (TargetGroupBinding, IP-target)
   # ---------------------------------------------------------------------------
+  # Customer already owns the NLB / ALB / target group. LBC is installed only
+  # to manage target-group membership; it does NOT create LBs in this mode.
+  #
   #   serviceTemplate:
-  #     type: NodePort
-  #     nodePort: 30080            # any free port in 30000-32767
+  #     type: ClusterIP                # LB is owned by the customer
   #   awsLoadBalancerController:
-  #     install: false             # no LBC, no operator-created LB
+  #     install: true                  # required for TargetGroupBinding
+  #   byoTargetGroup:
+  #     enabled: true
+  #     targetGroupArn: "arn:aws:elasticloadbalancing:<region>:<account>:targetgroup/<your-tg>/<id>"
+  #     securityGroupId: "sg-xxxxxxxxxxxxxxxxx"   # the customer's LB security group
   #
   # Script does:
-  #   * Creates the public Service as NodePort 30080 on every worker.
-  #   * Skips LBC install entirely.
-  # You must do (in AWS, outside the script):
-  #   1. Pre-create an NLB or ALB (any scheme).
-  #   2. Create a target group:
-  #        - Target type:    instance
-  #        - Protocol/Port:  TCP/30080 (NLB) or HTTP/30080 (ALB)
-  #        - Health check:   HTTP /nginx_health on port "traffic-port", 200 OK
-  #   3. Attach the EKS managed-nodegroup ASG to the target group so
-  #      membership tracks node scale-in/out, e.g. via Terraform:
-  #        resource "aws_autoscaling_attachment" "saia" {
-  #          autoscaling_group_name = "eks-<cluster>-<nodegroup>-NodeGroup-XXXX"
-  #          lb_target_group_arn    = "arn:aws:elasticloadbalancing:...:targetgroup/my-saia-tg/..."
-  #        }
-  #   4. Worker node SG: allow ingress TCP/30080 from the NLB subnet CIDRs
-  #      (NLB) or from the ALB's security group (ALB).
+  #   * Installs LBC.
+  #   * Leaves the public Service as ClusterIP.
+  #   * Applies a TargetGroupBinding CR with `targetType: ip` so LBC registers
+  #     nginx pod IPs into the customer's target group as endpoints change.
+  # You must do (outside the cluster):
+  #   1. Pre-create the target group in the EKS VPC with:
+  #        - Target type:   ip
+  #        - Protocol/Port: TCP/8080 (NLB) or HTTP/8080 (ALB)  ← pod port, not 30080
+  #        - Health check:  HTTP /nginx_health on traffic-port, 200 OK
+  #   2. Attach the target group to your existing LB listener.
+  #   3. Worker pod SG ingress 8080 from the LB SG only — the
+  #      TargetGroupBinding `networking.ingress.from.securityGroup` block
+  #      configured by the script does this for you.
   #
   # ---------------------------------------------------------------------------
-  # MODE 3 — On-prem / k0s / airgap (HAProxy, F5, MetalLB, hardware LB, …)
+  # MODE 3 — On-prem / k0s / airgap (NOT applicable to this EKS template)
   # ---------------------------------------------------------------------------
-  #   serviceTemplate:
-  #     type: NodePort
-  #     nodePort: 30080
-  #   awsLoadBalancerController:
-  #     install: false             # has no effect off-AWS, leave false
-  #
-  # Script does:
-  #   * Same as Mode 2 — creates the public Service as NodePort 30080.
-  # You must do (outside the cluster):
-  #   * Point your existing L4 LB (HAProxy / F5 / MetalLB / hardware) at every
-  #     worker node IP on TCP/30080, with HTTP health-check /nginx_health.
-  #     Sample HAProxy backend:
-  #         backend saia_be
-  #             option httpchk GET /nginx_health
-  #             server worker1 10.0.1.11:30080 check
-  #             server worker2 10.0.1.12:30080 check
+  # Use the dedicated `k0s-cluster-config.yaml` template, which configures
+  # MetalLB to allocate a routable VIP. The user-facing contract there is
+  # identical (`type: LoadBalancer`) — only the LB provider changes.
   #
   # ---------------------------------------------------------------------------
+  # SECURITY NOTES (apply to all modes)
+  # ---------------------------------------------------------------------------
+  #   * Always terminate TLS at the LB (ACM cert on AWS) and place an auth
+  #     layer in front (oauth2-proxy, Cognito on the ALB, API Gateway, …)
+  #     before exposing on the public internet.
+  #   * Restrict the LB listener to trusted source CIDRs / SGs (never
+  #     0.0.0.0/0 to a sensitive endpoint).
+  #   * Pod SG ingress should allow 8080 only from the LB SG.
+  # ---------------------------------------------------------------------------
 
   # Active mode below — EDIT to switch. Default is MODE 1.
   serviceTemplate:
@@ -367,11 +375,21 @@ aiPlatform:
     annotations:
       service.beta.kubernetes.io/aws-load-balancer-type: "external"
       service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
 
   awsLoadBalancerController:
     install: true
 
+  # Bring-your-own AWS target group (Mode 2). Set enabled: true and provide
+  # both targetGroupArn and securityGroupId; the script will then leave the
+  # SAIA Service as ClusterIP and apply a TargetGroupBinding (LBC manages
+  # target registration into your existing target group).
+  byoTargetGroup:
+    enabled: false
+    # targetGroupArn: "arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-saia-tg/abc123"
+    # securityGroupId: "sg-0123456789abcdef0"
+
   # CPU Scheduling
   cpuScheduling:
     nodeSelector: {}
diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh
index b6982fb..9c815d8 100755
--- a/tools/cluster_setup/eks_cluster_with_stack.sh
+++ b/tools/cluster_setup/eks_cluster_with_stack.sh
@@ -93,12 +93,18 @@ load_config() {
     WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")"
     SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")"
     SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")"
-    # AWS Load Balancer Controller (LBC) install toggle. Default: false — the
-    # script assumes customers bring their own LB and point it at NodePort
-    # (Path A). Set to true only when you want operator-managed NLB/ALB
-    # provisioning via the `aws-load-balancer-type: external` annotation or
-    # dynamic target registration via TargetGroupBinding CRs (Path B).
+    # AWS Load Balancer Controller (LBC) install toggle. Required for both
+    # operator-managed NLB provisioning (Mode 1) and customer-owned LB
+    # registration via TargetGroupBinding (Mode 2). Off-AWS users (k0s) leave
+    # this false.
     INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")"
+    # Bring-your-own AWS target group (Mode 2). When enabled the script keeps
+    # the public Service as ClusterIP and applies a TargetGroupBinding so LBC
+    # registers nginx pod IPs into the customer's pre-existing target group.
+    # Requires INSTALL_LBC=true.
+    BYO_TG_ENABLED="$(yq eval '.aiPlatform.byoTargetGroup.enabled // false' "$cfg")"
+    BYO_TG_ARN="$(yq eval '.aiPlatform.byoTargetGroup.targetGroupArn // ""' "$cfg")"
+    BYO_TG_SG_ID="$(yq eval '.aiPlatform.byoTargetGroup.securityGroupId // ""' "$cfg")"
     INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")"
     INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")"
     INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")"
@@ -185,6 +191,9 @@ load_config() {
     SAIA_SERVICE_TYPE=""
     SAIA_SERVICE_NODE_PORT=""
     INSTALL_LBC="false"
+    BYO_TG_ENABLED="false"
+    BYO_TG_ARN=""
+    BYO_TG_SG_ID=""
     INGRESS_HOST="ai.example.com"
     INGRESS_CLASS="nginx"
     INGRESS_TLS_SECRET="ai-platform-tls"
@@ -2365,18 +2374,120 @@ apply_saia_service_annotations() {
   fi
 }
 
+byo_target_group_enabled() {
+  [[ "${BYO_TG_ENABLED:-false}" == "true" ]]
+}
+
+# Validates BYO target-group configuration before any kubectl/aws calls are
+# issued. Missing or malformed required fields are reported through `err`
+# (treated as fatal, because the rest of the install would silently misroute
+# traffic); a serviceTemplate.type=LoadBalancer mismatch only logs a WARNING.
+validate_byo_target_group_config() {
+  byo_target_group_enabled || return 0
+
+  if [[ "${INSTALL_LBC:-false}" != "true" ]]; then
+    err "byoTargetGroup.enabled=true requires awsLoadBalancerController.install=true (LBC manages the TargetGroupBinding)."
+  fi
+  if [[ -z "${BYO_TG_ARN:-}" || "${BYO_TG_ARN}" == "null" ]]; then
+    err "byoTargetGroup.enabled=true requires byoTargetGroup.targetGroupArn to be set."
+  fi
+  if [[ "${BYO_TG_ARN}" != arn:aws:elasticloadbalancing:* ]]; then
+    err "byoTargetGroup.targetGroupArn must look like 'arn:aws:elasticloadbalancing:<region>:<account>:targetgroup/<name>/<id>' (got: ${BYO_TG_ARN})."
+  fi
+  if [[ -z "${BYO_TG_SG_ID:-}" || "${BYO_TG_SG_ID}" == "null" ]]; then
+    err "byoTargetGroup.enabled=true requires byoTargetGroup.securityGroupId (the customer LB's SG) so LBC opens pod-SG ingress correctly."
+  fi
+  if [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]]; then
+    log "WARNING: byoTargetGroup.enabled=true with serviceTemplate.type=LoadBalancer creates BOTH an operator-managed LB AND a TargetGroupBinding. Set serviceTemplate.type=ClusterIP for pure BYO." >&2
+  fi
+}
+
+# Apply a TargetGroupBinding CR pointing at the customer's pre-provisioned
+# target group. AWS LBC reads this CR and registers the SAIA Service's pod
+# IPs (targetType: ip) into the customer's TG, then deregisters them on pod
+# rotation. The networking.ingress block has LBC open the pod SG to the LB's
+# SG only — never 0.0.0.0/0 (codeguard-0-iac-security).
+apply_byo_target_group_binding() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}"
+  local svc_name
+  svc_name="$(saia_aiservice_name "${platform_name}")-saia-service"
+
+  byo_target_group_enabled || return 0
+
+  log "Applying TargetGroupBinding for BYO target group ${BYO_TG_ARN}..."
+  cat <<YAML | kubectl -n "${AI_NS}" apply -f -
+apiVersion: elbv2.k8s.aws/v1beta1
+kind: TargetGroupBinding
+metadata:
+  name: ${svc_name}-tgb
+  namespace: ${AI_NS}
+spec:
+  serviceRef:
+    name: ${svc_name}
+    port: 8080
+  targetGroupARN: ${BYO_TG_ARN}
+  targetType: ip
+  networking:
+    ingress:
+      - from:
+          - securityGroup:
+              groupID: ${BYO_TG_SG_ID}
+        ports:
+          - protocol: TCP
+            port: 8080
+YAML
+  log "✓ TargetGroupBinding ${AI_NS}/${svc_name}-tgb applied"
+}
+
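+# Disable kube-proxy NodePort allocation on the rendered SAIA Service. The operator's reconcileSAIAService only
+# touches Selector/Ports on existing Services (pkg/ai/features/saia/impl.go), so this patch survives subsequent
+# reconciles. externalTrafficPolicy=Local preserves real client IP for MetalLB-style providers; for AWS NLB
+# ip-target mode it is a no-op since LBC bypasses kube-proxy entirely.
+# NOTE(review): per the Kubernetes Service docs, setting allocateLoadBalancerNodePorts=false on an EXISTING Service
+# does not release already-allocated nodePorts, and externalTrafficPolicy=Local adds a healthCheckNodePort — verify.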
+patch_saia_service_disable_nodeport() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}"
+  local svc_name
+  svc_name="$(saia_aiservice_name "${platform_name}")-saia-service"
+
+  # Only meaningful when the Service is type=LoadBalancer; ClusterIP services
+  # don't allocate NodePorts.
+  local svc_type
+  svc_type="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.spec.type}' 2>/dev/null || true)"
+  [[ "${svc_type}" != "LoadBalancer" ]] && return 0
+
+  log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..."
+  kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{
+  "spec": {
+    "allocateLoadBalancerNodePorts": false,
+    "externalTrafficPolicy": "Local"
+  }
+}' >/dev/null
+  log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local"
+}
+
 patch_saia_public_service_workaround() {
   local platform_name="${1:-${AI_PLATFORM_NAME}}"
-  local aiservice_name public_svc_name
+  local aiservice_name public_svc_name effective_type
 
   aiservice_name="$(saia_aiservice_name "${platform_name}")"
   public_svc_name="${aiservice_name}-saia-service"
 
   wait_for_aiservice_exists "${aiservice_name}"
 
-  if saia_service_template_enabled; then
-    log "Patching AIService/${aiservice_name} with SAIA public exposure settings..."
-    if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+  # In BYO mode the customer owns the LB; force the SAIA Service to ClusterIP
+  # regardless of what serviceTemplate.type says — TargetGroupBinding wires
+  # everything else.
+  if byo_target_group_enabled; then
+    effective_type="ClusterIP"
+  else
+    effective_type="${SAIA_SERVICE_TYPE}"
+  fi
+
+  if [[ -n "${effective_type:-}" && "${effective_type}" != "null" ]]; then
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${effective_type})..."
+    if [[ "${effective_type}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+      log "WARNING: NodePort exposure is discouraged; consider Mode 1 (LoadBalancer + LBC) or Mode 2 (BYO target group) instead." >&2
       kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
   \"spec\": {
     \"serviceTemplate\": {
@@ -2399,7 +2510,7 @@ patch_saia_public_service_workaround() {
   \"spec\": {
     \"serviceTemplate\": {
       \"spec\": {
-        \"type\": \"${SAIA_SERVICE_TYPE}\"
+        \"type\": \"${effective_type}\"
       }
     }
   }
@@ -2411,11 +2522,16 @@ patch_saia_public_service_workaround() {
 
   kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null
 
-  if saia_service_template_enabled; then
+  if [[ -n "${effective_type:-}" && "${effective_type}" != "null" && "${effective_type}" != "ClusterIP" ]]; then
     log "Recreating SAIA public Service to ensure patched settings take effect..."
     kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true
     wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300
   fi
+
+  # NodePort-free hardening: disable kube-proxy NodePort allocation on
+  # LoadBalancer Services and apply BYO TargetGroupBinding if configured.
+  patch_saia_service_disable_nodeport "${platform_name}"
+  apply_byo_target_group_binding "${platform_name}"
 }
 
 wait_for_saia_load_balancer() {
@@ -2423,6 +2539,13 @@ wait_for_saia_load_balancer() {
   local svc_name hostname=""
   svc_name="$(saia_aiservice_name "${platform_name}")-saia-service"
 
+  # In BYO mode the Service is ClusterIP and the customer's LB DNS is not
+  # surfaced via .status.loadBalancer; skip the wait. Mode 1 (operator-
+  # managed NLB) still gates on SAIA_SERVICE_TYPE=LoadBalancer.
+  if byo_target_group_enabled; then
+    log "byoTargetGroup.enabled=true — skipping wait for operator-managed LB hostname (LB is customer-managed)."
+    return 0
+  fi
   [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]] || return 0
 
   log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name} to receive an external hostname..."
@@ -3512,18 +3635,23 @@ reconcile_flow() {
   fi
   install_kube_prometheus
   install_cert_manager
-  # AWS Load Balancer Controller (LBC) — only install when the operator itself
-  # needs to provision NLBs/ALBs (Service type=LoadBalancer with the
-  # `aws-load-balancer-type: external` annotation) or when binding k8s Services
-  # to customer-managed target groups via TargetGroupBinding CRs. Customers who
-  # bring their own LB and point it at NodePort (Path A) should leave this off.
+  # Validate BYO target-group config before any side-effecting calls. Fail
+  # fast if the customer set byoTargetGroup.enabled=true without LBC or
+  # required ARN/SG fields — better an early error than a silently-broken
+  # data path.
+  validate_byo_target_group_config
+  # AWS Load Balancer Controller (LBC) — required when the operator provisions
+  # NLBs/ALBs (Mode 1: Service type=LoadBalancer + `aws-load-balancer-type:
+  # external` annotation) or when binding the SAIA Service to a customer-
+  # managed target group via TargetGroupBinding (Mode 2: byoTargetGroup
+  # enabled). Off-AWS deployments leave this false.
   if [[ "${INSTALL_LBC}" == "true" ]]; then
     log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller"
     tag_lbc_subnets
     ensure_lbc_irsa
     install_aws_load_balancer_controller
   else
-    log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install (bring-your-own-LB / NodePort path)"
+    log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install"
   fi
   ensure_s3compat_credentials
   install_otel_operator_and_contrib_collector
diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml
index 3935404..e107334 100644
--- a/tools/cluster_setup/k0s-cluster-config.yaml
+++ b/tools/cluster_setup/k0s-cluster-config.yaml
@@ -13,7 +13,7 @@
 # ---------- Cluster Configuration ----------
 cluster:
   name: airgap-cluster
-  # region: us-east-2                    # Ignored for on-prem, but required in config
+  region: us-east-2                       # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem
   sshUser: ec2-user                       # CHANGE THIS: SSH user for remote nodes
   sshKeyPath: /Users/kiran/.ssh/ai-key-arif.pem  # CHANGE THIS: Path to SSH private key
 
@@ -31,31 +31,76 @@ nodes:
       - 18.191.19.128
       - 3.137.209.219
 # ---------- Storage Configuration ----------
-# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external).
-# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required).
+# Object storage choices (`storage.objectStore.type`):
+#   * aws       — real AWS S3. NOT functional on k0s today: per the notes under
+#                 `storage:` below, the installer falls back to an in-cluster
+#                 MinIO. This template uses type=minio + the AWS S3 endpoint.
+#   * minio     — external MinIO (AWS-spec compliant). Provide endpoint + auth.
+#   * seaweedfs — external SeaweedFS S3 gateway. Provide endpoint + auth.
+#   * s3compat  — generic S3-compatible. Provide endpoint + auth.
 storage:
-  s3Bucket: "ai-platform-bucket-minio-us-east-2"  # Used when objectStore.type is aws
-  storageClass: "local-path"                 # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2)
-  vectorDbSize: "50Gi"                       # VectorDB persistent volume size
+  # ---------------------------------------------------------------------
+  # Real AWS S3 on k0s — via the S3-compatible path (type=minio) with AWS S3 endpoint
+  # ---------------------------------------------------------------------
+  # WHY NOT type=aws?
+  #   The k0s install script and operator do NOT actually support
+  #   objectStore.type=aws today. When set, the script silently:
+  #     * installs an in-cluster MinIO into the minio-system namespace,
+  #     * points SAIA's S3COMPAT_OBJECT_STORE_ENDPOINT_URL at that MinIO
+  #       (http://minio.minio-system.svc.cluster.local:9000),
+  #     * ignores AIPlatform.spec.objectStorage.path = s3://...
+  #   Result: SAIA always uses the in-cluster MinIO regardless of the
+  #   objectStore.type setting. This was verified by inspecting pod env
+  #   vars after a clean install with type=aws — see operator bug filed
+  #   for proper k0s AWS S3 support.
+  #
+  # WORKAROUND — use the S3-compatible path (type=minio below; the CRD rejects s3compat) with the AWS S3 regional endpoint:
+  #   The s3compat adapter is just boto3 with explicit endpoint_url +
+  #   credentials. AWS S3 IS S3-compatible, so the same code path works
+  #   when pointed at https://s3.<region>.amazonaws.com with a real AWS
+  #   access-key/secret pair. SAIA pods authenticate with the static
+  #   AWS keys you put in auth.rootUser / auth.rootPassword, the
+  #   installer creates a Kubernetes Secret named "minio-credentials"
+  #   with those keys, and SAIA's storage_adapters/factory.py uses them
+  #   to sign SigV4 requests to AWS S3 — which AWS accepts as valid.
+  #
+  # SECURITY NOTES (codeguard-1-hardcoded-credentials,
+  # codeguard-0-additional-cryptography):
+  #   * The AWS access key below MUST come from a dedicated IAM USER (not
+  #     a root account) with a least-privilege policy scoped to the
+  #     bucket only:
+  #         s3:GetObject, s3:PutObject, s3:DeleteObject,
+  #         s3:GetObjectTagging, s3:PutObjectTagging
+  #         on arn:aws:s3:::<bucket>/*
+  #         s3:ListBucket, s3:GetBucketLocation
+  #         on arn:aws:s3:::<bucket>
+  #   * Rotate the access key every 90 days at most; deactivate the
+  #     prior key after rollout.
+  #   * Do NOT commit these values to source control — populate from a
+  #     real secrets manager (Vault / AWS Secrets Manager / sops) at
+  #     deploy time, not from this YAML file.
+  #   * `endpoint` MUST be HTTPS — never use plaintext for S3 traffic.
+  #
+  # When the operator is fixed to support k0s+IRSA-style or k0s+IMDS
+  # auth properly, switch back to type=aws and remove the auth block.
+  # ---------------------------------------------------------------------
+  s3Bucket: "ai-platform-bucket-us-east-2"      # CHANGE THIS — must match objectStore.bucket below
+  storageClass: "local-path"                     # k0s default storage class (NOT "gp3" — gp3 is EKS-only)
+  vectorDbSize: "50Gi"                           # VectorDB persistent volume size
 
   objectStore:
-    # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns
-    # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a
-    # missing key. The SAIA v2 S3ConversationStore (added by Tony in
-    # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002)
-    # calls GetObjectTagging on the conversation key *before* the first
-    # PutObject, so every brand-new draft: conversation hit a 502 from the
-    # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and
-    # hosts the same bucket name at :9000, so swapping the endpoint is
-    # sufficient. Fallback: flip back by setting type: "seaweedfs" and
-    # endpoint to :8333 (but note the 502 on every draft conversation).
-    type: "minio"                                # aws | s3compat | minio | seaweedfs (external only for non-aws)
-    bucket: "ai-platform-bucket-minio-us-east-2"
-    # endpoint: "http://3.144.157.201:8333"      # SeaweedFS (deprecated — see comment above)
-    endpoint: "http://13.59.216.105:9000"        # MinIO (AWS-spec compliant GetObjectTagging semantics)
+    # type=minio (NOT s3compat) is required: the AIPlatform CRD pattern
+    # only accepts ^(s3|gs|azure|minio)://... — `s3compat://` is rejected by
+    # the API server. The path scheme is purely a label for the CRD; the
+    # runtime endpoint below decides which backend SAIA actually talks to.
+    # Pointing endpoint at AWS S3 with real AWS keys makes this configuration
+    # use real AWS S3, not MinIO, despite the type label.
+    type: "minio"
+    bucket: "ai-platform-bucket-us-east-2"       # CHANGE THIS — must match storage.s3Bucket above
+    endpoint: "https://s3.us-east-2.amazonaws.com"   # CHANGE THIS — AWS regional S3 endpoint, MUST be HTTPS
     auth:
-      rootUser: "minioadmin"
-      rootPassword: "minioadmin"
+      rootUser: "<paste-AWS_ACCESS_KEY_ID-here>"      # CHANGE THIS — AWS_ACCESS_KEY_ID (AKIA...)
+      rootPassword: "<paste-AWS_SECRET_ACCESS_KEY-here>"   # CHANGE THIS — never ship a real key in this file
 
 # ---------- Container Images Configuration ----------
 images:
@@ -190,37 +235,36 @@ aiPlatform:
   workerGroupConfig:
     imageRegistry: ""
 
-  # ---------- SAIA public exposure (OPTIONAL) ----------
-  # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods)
-  # defaults to ClusterIP, meaning it is only reachable from inside the cluster.
-  #
-  # Two call patterns hit this Service:
-  #   (A) Splunk Enterprise pod      → saia-service   (works with ClusterIP)
-  #   (B) End user's browser         → saia-service   (needs external exposure)
+  # ---------- SAIA public exposure (NodePort-free) ----------
+  # The SAIA "public" Service (nginx reverse proxy in front of v1 + v2 API
+  # pods) defaults to ClusterIP — only reachable from inside the cluster. Two
+  # call patterns hit it:
+  #   (A) Splunk Enterprise pod   → saia-service   (works with ClusterIP)
+  #   (B) End user's browser      → saia-service   (needs external exposure)
   #
   # Pattern B is used by the v2 chat UI (/query streaming, conversations,
   # feedback, admin endpoints). Without external exposure the v2 chat UI
-  # breaks for users, even though v1 one-shot SPL features still work.
+  # breaks for users; v1 one-shot SPL still works.
   #
-  # To DISABLE external exposure (use ClusterIP only), either:
-  #   * Delete / comment-out the entire `serviceTemplate:` block below, OR
-  #   * Set `type: ClusterIP` explicitly.
-  # Either is treated identically — the installer skips emitting serviceTemplate
-  # into the AIPlatform CR and the operator falls through to the ClusterIP
-  # default in reconcileSAIAService().
+  # The supported on-prem path is `type: LoadBalancer` backed by MetalLB
+  # (allocates a routable VIP from a pool you provide; ARP / BGP-announces it
+  # on your network). NodePort is intentionally avoided so we never open
+  # 30000-32767 on every worker node.
   #
-  # To ENABLE external exposure for on-prem / airgap customers, NodePort is the
-  # recommended default: any k8s node IP + the configured nodePort yields a
-  # reachable endpoint from VPN-connected users. No cloud LB / cert-manager
-  # needed. Use LoadBalancer only if the cluster has MetalLB/cloud LB support.
+  # The installer:
+  #   * Installs MetalLB (set metallb.install: true below).
+  #   * Applies an IPAddressPool + L2Advertisement (or BGPAdvertisement) from
+  #     the metallb config below.
+  #   * Renders the SAIA Service as type: LoadBalancer; MetalLB allocates a
+  #     VIP from the pool and announces it.
+  #   * Patches the Service with `allocateLoadBalancerNodePorts: false` and
+  #     `externalTrafficPolicy: Local` so kube-proxy does not open a NodePort.
   #
-  # Note: the current operator image preserves serviceTemplate.spec.type, but
-  # not nested serviceTemplate.metadata.annotations. The k0s installer applies
-  # any annotations below directly to the generated AIService after creation,
-  # which the current operator already copies onto the rendered Service.
+  # To DISABLE external exposure (ClusterIP only), comment out the whole
+  # serviceTemplate block AND set metallb.install: false.
   serviceTemplate:
-    type: NodePort          # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP)
-    nodePort: 30080         # Fixed NodePort (30000-32767). Required for stable DNS.
+    type: LoadBalancer      # ClusterIP | LoadBalancer (NodePort is not used on k0s)
+    # No nodePort field — explicitly NodePort-free.
 
   features:
     - name: "saia"
@@ -238,6 +282,35 @@ aiPlatform:
         value: "true"
         effect: "NoSchedule"
 
+# ---------- MetalLB (k0s LoadBalancer provider) ----------
+# Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal
+# / k0s cluster. Pinned chart version for supply-chain reproducibility
+# (codeguard-0-supply-chain-security).
+metallb:
+  install: true                    # set false if MetalLB is already installed or not needed
+  chartVersion: "0.14.8"           # metallb/metallb Helm chart (matches MetalLB v0.14.8)
+  namespace: "metallb-system"
+
+  # Address pool — a range of IPs MetalLB can hand out to LoadBalancer
+  # Services. Must be routable from clients (VPN-connected users) to your k0s
+  # workers. Use IPs that are NOT used elsewhere on the LAN.
+  pool:
+    name: "saia-pool"
+    addresses:
+      - "10.20.30.100-10.20.30.110"   # CHANGE THIS to a free range on your network
+
+  # Advertisement mode: "layer2" works on most LANs without network gear
+  # changes (one elected node answers ARP for the VIP at a time; failover
+  # takes on the order of seconds). Use "bgp" only if your fabric supports
+  # BGP peering — then also populate metallb.bgpPeers below.
+  mode: "layer2"                  # layer2 | bgp
+
+  # Required only when mode=bgp. Leave empty for layer2.
+  bgpPeers: []
+    # - peerAddress: "10.0.0.1"
+    #   peerASN: 65001
+    #   myASN: 65000
+
 # ---------- Image Pull Secrets ----------
 imagePullSecrets:
   secrets:
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index 915118d..c5384c0 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -3711,6 +3711,155 @@ apply_k0s_saia_service_annotations() {
   fi
 }
 
+# ---------- MetalLB (k0s LoadBalancer provider) ----------
+# k0s ships without a Service.type=LoadBalancer provider. MetalLB fills that
+# gap by allocating a VIP from a customer-provided pool and announcing it via
+# Layer-2 (ARP/NDP) or BGP. We pin the chart version for supply-chain
+# reproducibility (codeguard-0-supply-chain-security).
+
+metallb_enabled_k0s() {
+  # Succeeds only when metallb.install reads as the literal `true` in CONFIG_FILE.
+  local install_flag; install_flag="$(yq eval '.metallb.install // false' "${CONFIG_FILE}" 2>/dev/null || echo false)"
+  if [[ "${install_flag}" != "true" ]]; then return 1; fi
+}
+
+install_metallb() {
+  metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; }
+
+  local ns chart_version pool_name addr_count mode
+  ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)"
+  chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)"
+  pool_name="$(yq eval '.metallb.pool.name // "saia-pool"' "${CONFIG_FILE}" 2>/dev/null)"
+  addr_count="$(yq eval '.metallb.pool.addresses // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)"
+  mode="$(yq eval '.metallb.mode // "layer2"' "${CONFIG_FILE}" 2>/dev/null)"
+
+  # Validate the config up front so a bad pool/mode stops the install before Helm runs.
+  if [[ "${addr_count}" == "0" ]]; then
+    err "metallb.install=true but metallb.pool.addresses is empty. Provide at least one IP range routable on your network."; return 1
+  fi
+  if [[ "${mode}" != "layer2" && "${mode}" != "bgp" ]]; then
+    err "metallb.mode must be 'layer2' or 'bgp' (got: ${mode})."; return 1
+  fi
+
+  log "Installing MetalLB ${chart_version} into namespace ${ns}..."
+  helm repo add metallb https://metallb.github.io/metallb >/dev/null 2>&1 || true
+  helm repo update >/dev/null 2>&1 || true
+
+  kubectl get ns "${ns}" >/dev/null 2>&1 || kubectl create ns "${ns}"
+
+  helm upgrade --install metallb metallb/metallb \
+    --namespace "${ns}" \
+    --version "${chart_version}" \
+    --wait --timeout 5m
+
+  # Wait for the controller webhook to be Ready before applying CRs, otherwise
+  # the IPAddressPool / L2Advertisement applies race the validating webhook.
+  log "Waiting for MetalLB controller to be ready..."
+  kubectl -n "${ns}" rollout status deploy/metallb-controller --timeout=180s
+
+  # Render IPAddressPool entries; addr_count (read above) is the number of configured ranges.
+  local addresses_yaml="" i addr
+  for ((i=0; i<addr_count; i++)); do
+    addr="$(yq eval ".metallb.pool.addresses[${i}]" "${CONFIG_FILE}" 2>/dev/null)"
+    [[ -z "${addr}" || "${addr}" == "null" ]] && continue
+    addresses_yaml+="    - ${addr}"$'\n'
+  done
+
+  log "Applying MetalLB IPAddressPool '${pool_name}' (${addr_count} range(s))..."
+  cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: IPAddressPool
+metadata:
+  name: ${pool_name}
+  namespace: ${ns}
+spec:
+  addresses:
+${addresses_yaml}
+YAML
+
+  if [[ "${mode}" == "layer2" ]]; then
+    log "Applying MetalLB L2Advertisement for pool '${pool_name}'..."
+    cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: L2Advertisement
+metadata:
+  name: ${pool_name}-l2
+  namespace: ${ns}
+spec:
+  ipAddressPools:
+    - ${pool_name}
+YAML
+  else
+    # BGP mode — render BGPPeers from config and attach a BGPAdvertisement.
+    local peer_count
+    peer_count="$(yq eval '.metallb.bgpPeers // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)"
+    if [[ "${peer_count}" == "0" ]]; then
+      err "metallb.mode=bgp requires metallb.bgpPeers to be non-empty (peerAddress, peerASN, myASN per peer)."; return 1
+    fi
+    local p
+    for ((p=0; p<peer_count; p++)); do
+      local peer_addr peer_asn my_asn
+      # `// ""` turns a missing field into an empty string (yq would print
+      # "null" otherwise), so the -z checks below actually catch omissions.
+      peer_addr="$(yq eval ".metallb.bgpPeers[${p}].peerAddress // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      peer_asn="$(yq eval ".metallb.bgpPeers[${p}].peerASN // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      my_asn="$(yq eval ".metallb.bgpPeers[${p}].myASN // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      if [[ -z "${peer_addr}" || -z "${peer_asn}" || -z "${my_asn}" ]]; then
+        err "metallb.bgpPeers[${p}] missing peerAddress / peerASN / myASN."; return 1
+      fi
+      cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: BGPPeer
+metadata:
+  name: bgp-peer-${p}
+  namespace: ${ns}
+spec:
+  peerAddress: ${peer_addr}
+  peerASN: ${peer_asn}
+  myASN: ${my_asn}
+YAML
+    done
+    cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: BGPAdvertisement
+metadata:
+  name: ${pool_name}-bgp
+  namespace: ${ns}
+spec:
+  ipAddressPools:
+    - ${pool_name}
+YAML
+  fi
+
+  log "✓ MetalLB ${chart_version} installed (${mode}, pool=${pool_name})"
+}
+
+# Turn off NodePort allocation on the rendered SAIA LoadBalancer Service and
+# set externalTrafficPolicy=Local (keeps the client IP: no SNAT hop). The
+# operator's reconcileSAIAService only mutates Selector/Ports on existing
+# Services (pkg/ai/features/saia/impl.go), so this patch survives reconciles.
+# NOTE(review): Kubernetes does not release NodePorts that were allocated
+# before allocateLoadBalancerNodePorts=false was set, and Local policy adds a
+# healthCheckNodePort — TODO confirm no 30000-32767 port remains in use.
+patch_k0s_saia_service_disable_nodeport() {
+  local platform_name="${CLUSTER_NAME}-ai-platform"
+  local aiservice_name="${platform_name}-saia"
+  local svc_name="${aiservice_name}-saia-service"
+
+  local svc_type
+  svc_type="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.spec.type}' 2>/dev/null || true)"
+  [[ "${svc_type}" != "LoadBalancer" ]] && return 0
+
+  log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..."
+  kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{
+  "spec": {
+    "allocateLoadBalancerNodePorts": false,
+    "externalTrafficPolicy": "Local"
+  }
+}' >/dev/null
+  log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local"
+}
+
 patch_k0s_saia_public_service_workaround() {
   local platform_name="${CLUSTER_NAME}-ai-platform"
   local aiservice_name="${platform_name}-saia"
@@ -3723,8 +3872,9 @@ patch_k0s_saia_public_service_workaround() {
   wait_for_k0s_aiservice_exists "${aiservice_name}"
 
   if saia_service_template_enabled_k0s; then
-    log "Patching AIService/${aiservice_name} with SAIA public exposure settings..."
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..."
     if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then
+      log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true." >&2
       kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
   \"spec\": {
     \"serviceTemplate\": {
@@ -3762,7 +3912,17 @@ patch_k0s_saia_public_service_workaround() {
   if saia_service_template_enabled_k0s; then
     log "Recreating SAIA public Service to ensure patched settings take effect..."
     kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true
+    # Wait briefly for the operator to recreate it before patching NodePort
+    # allocation off; if it doesn't come back the patch will be a no-op.
+    local waited=0
+    while ! kubectl -n "${AI_NS}" get svc "${public_svc_name}" >/dev/null 2>&1; do
+      [[ ${waited} -ge 300 ]] && break
+      sleep 5
+      waited=$((waited + 5))
+    done
   fi
+
+  patch_k0s_saia_service_disable_nodeport
 }
 
 # ====== INSTALL FULL STACK ======
@@ -3857,6 +4017,14 @@ install_ai_platform_stack() {
   # Apply Splunk Standalone CR (non-blocking — pod boots in background)
   install_splunk_standalone
 
+  # MetalLB must be installed BEFORE the AIPlatform CR is reconciled — the
+  # operator renders a Service.type=LoadBalancer for SAIA and we need a
+  # provider in the cluster to allocate a VIP, otherwise the Service is
+  # stuck in EXTERNAL-IP=<pending> indefinitely. No-op when
+  # metallb.install=false (e.g., user is bringing their own MetalLB or wants
+  # ClusterIP only).
+  install_metallb
+
   # Install AI Platform operator and CR while Splunk Standalone boots
   install_splunk_ai_operator
   install_ai_platform_cr
@@ -3880,7 +4048,12 @@ check_platform_health() {
   # Check 1: Cluster nodes
   log "Checking cluster nodes..."
   local not_ready
-  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l || echo "0")
+  # `wc -l` on macOS returns "       N" with leading whitespace and the `||
+  # echo` fallback can append a second value, so the resulting string was
+  # tripping the `[[ -gt 0 ]]` test ("[[: 0\n0: syntax error"). Strip
+  # whitespace and default to 0 if grep returns 1 (no matches).
+  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l | tr -d '[:space:]')
+  not_ready="${not_ready:-0}"
   if [[ "${not_ready}" -gt 0 ]]; then
     warn "Found ${not_ready} node(s) not in Ready state"
     kubectl get nodes

From 293cffb3daa70c6fa35496e63e86d4a82cccbd95 Mon Sep 17 00:00:00 2001
From: kbhos <kbhos@splunk.com>
Date: Thu, 30 Apr 2026 01:57:17 +0530
Subject: [PATCH 3/5] metalLB changes

---
 tools/cluster_setup/eks_cluster_with_stack.sh | 20 ++++++-
 tools/cluster_setup/k0s-cluster-config.yaml   |  3 +
 tools/cluster_setup/k0s_cluster_with_stack.sh | 60 +++++++++++++++----
 3 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh
index 9c815d8..eef3d51 100755
--- a/tools/cluster_setup/eks_cluster_with_stack.sh
+++ b/tools/cluster_setup/eks_cluster_with_stack.sh
@@ -947,6 +947,11 @@ ${public_subnets}"
     fi
   else
     log "No subnets specified - eksctl will create new subnets automatically"
+    # One NAT gateway => one Elastic IP. HighlyAvailable uses one NAT per AZ
+    # (often 3 EIPs) and commonly trips the default regional EIP quota (5).
+    vpc_config="vpc:
+  nat:
+    gateway: Single"
   fi
 
   cat <<EOF > eks-cluster-config.yaml
@@ -956,6 +961,8 @@ metadata:
   name: ${CLUSTER_NAME}
   region: ${REGION}
   version: "${K8S_VERSION}"
+autoModeConfig:
+  enabled: false
 iam:
   withOIDC: true
 addons:
@@ -3343,7 +3350,18 @@ preflight_env() {
     fi
   fi
   if [[ $subnet_count -eq 0 ]]; then
-    pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically"
+    pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically (NAT mode: Single = 1 Elastic IP)"
+    pf_header "Elastic IP headroom (new VPC)"
+    local eip_cnt
+    eip_cnt="$(aws ec2 describe-addresses --region "${REGION}" --query 'length(Addresses)' --output text 2>/dev/null || true)"
+    if [[ -n "${eip_cnt}" && "${eip_cnt}" =~ ^[0-9]+$ ]]; then
+      pf_ok "Allocated Elastic IPs in ${REGION}: ${eip_cnt}"
+      if (( eip_cnt >= 5 )); then
+        pf_warn "Typical default EIP quota is 5 per region. At ${eip_cnt}+ addresses, NAT gateway EIP allocation may fail (you saw: maximum number of addresses). Release unused EIPs in EC2 → Elastic IPs or request a quota increase before create cluster."
+      fi
+    else
+      pf_warn "Could not list Elastic IPs (aws ec2 describe-addresses). If create fails on NAT/EIP, check quotas and unused addresses."
+    fi
   else
     local all_subnets=("${PRIVATE_SUBNETS[@]}" "${PUBLIC_SUBNETS[@]}")
     local vpc_id=""
diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml
index bcb37b9..e796987 100644
--- a/tools/cluster_setup/k0s-cluster-config.yaml
+++ b/tools/cluster_setup/k0s-cluster-config.yaml
@@ -313,6 +313,9 @@ aiPlatform:
 # Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal
 # / k0s cluster. Pinned chart version for supply-chain reproducibility
 # (codeguard-0-supply-chain-security).
+#
+# If serviceTemplate.type=NodePort, the installer skips MetalLB entirely even
+# when metallb.install=true (NodePort does not use a LoadBalancer provider).
 metallb:
   install: true                    # set false if MetalLB is already installed or not needed
   chartVersion: "0.14.8"           # metallb/metallb Helm chart (matches MetalLB v0.14.8)
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index e1f5683..56b635a 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -388,6 +388,17 @@ configure_images() {
   log "✓ All images configured successfully"
 }
 
+# True when objectStore.auth (user+password) still contains '<', '>',
+# "CHANGEME" or "changeme". Such non-empty placeholders pass the length
+# preflight and would make SAIA fail at startup with InvalidAccessKeyId.
+object_store_auth_looks_like_placeholder() {
+  local creds="${MINIO_ROOT_USER}${MINIO_ROOT_PASSWORD}" marker
+  for marker in '<' '>' 'CHANGEME' 'changeme'; do
+    [[ "${creds}" == *"${marker}"* ]] && return 0
+  done
+  return 1
+}
+
 # ====== PREFLIGHT CHECKS ======
 preflight_checks() {
   pf_header "Required tools"
@@ -423,6 +434,9 @@ preflight_checks() {
     [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required"
   fi
   [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)"
+  if object_store_auth_looks_like_placeholder; then
+    pf_fail "objectStore.auth still contains template placeholders (e.g. <...> or CHANGEME). Replace with a real access key and secret in your config (keep secrets in a Git-ignored file such as tools/cluster_setup/k0s-config.local.yaml)."
+  fi
 
   pf_header "Infrastructure mode"
   pf_ok "Using existing infrastructure (on-prem/baremetal)"
@@ -1118,6 +1132,10 @@ ensure_namespace() {
 # the Kubernetes credentials secret so the operator and workloads can auth.
 ensure_s3compat_credentials() {
   log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..."
+  if object_store_auth_looks_like_placeholder; then
+    err "Refusing to create minio-credentials: objectStore.auth contains template placeholders; fix ${CONFIG_FILE}"
+    return 1
+  fi
   if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then
     err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint"
     return 1
@@ -2637,15 +2655,23 @@ install_ai_platform_cr() {
 
   # Ensure object storage credentials secret exists in AI namespace
   log "Creating/updating S3-compatible credentials secret (minio-credentials) in ${AI_NS}..."
-  kubectl -n "${AI_NS}" create secret generic minio-credentials \
-    --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \
-    --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
-    --from-literal=s3_access_key="${MINIO_ROOT_USER}" \
-    --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \
-    --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \
-    --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \
-    --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f -
-  log "✓ Object storage credentials secret ready"
+  if object_store_auth_looks_like_placeholder; then
+    if kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then
+      warn "Skipping minio-credentials apply: auth in ${CONFIG_FILE} still looks like a template (e.g. contains '<'). Preserving existing secret."
+    else
+      err "minio-credentials missing and cannot be created: fix objectStore.auth in ${CONFIG_FILE} (remove <...> placeholders)."
+    fi
+  else
+    kubectl -n "${AI_NS}" create secret generic minio-credentials \
+      --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \
+      --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+      --from-literal=s3_access_key="${MINIO_ROOT_USER}" \
+      --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \
+      --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \
+      --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \
+      --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f -
+    log "✓ Object storage credentials secret ready"
+  fi
 
   # Build imagePullSecrets YAML from created secrets
   local image_pull_secrets=""
@@ -2825,6 +2851,14 @@ saia_service_template_enabled_k0s() {
   [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]]
 }
 
+# Succeeds when aiPlatform.serviceTemplate.type in CONFIG_FILE is exactly
+# "NodePort"; install_metallb uses this to skip the Helm install in that mode.
+k0s_saia_service_template_is_nodeport() {
+  local configured_type
+  configured_type="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")"
+  if [[ "${configured_type}" != "NodePort" ]]; then return 1; fi
+}
+
 wait_for_k0s_aiservice_exists() {
   local name="$1" timeout="${2:-600}" waited=0
   while ! kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do
@@ -2870,6 +2904,12 @@ metallb_enabled_k0s() {
 install_metallb() {
   metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; }
 
+  if k0s_saia_service_template_is_nodeport; then
+    log "Skipping MetalLB install: aiPlatform.serviceTemplate.type=NodePort (LoadBalancer provider not used for SAIA)."
+    log "NOTE: metallb.install=true has no effect while SAIA uses NodePort. Set metallb.install=false to match config, or use type=LoadBalancer to install MetalLB."
+    return 0
+  fi
+
   local ns chart_version pool_name addr_count mode
   ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)"
   chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)"
@@ -3018,7 +3058,7 @@ patch_k0s_saia_public_service_workaround() {
   if saia_service_template_enabled_k0s; then
     log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..."
     if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then
-      log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true." >&2
+      log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true (MetalLB install is skipped automatically when type=NodePort)." >&2
       kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
   \"spec\": {
     \"serviceTemplate\": {

From 7b100b797cbb3ffb4da8d70af52e4152283062cc Mon Sep 17 00:00:00 2001
From: kbhos-splunk <kbhos@splunk.com>
Date: Thu, 30 Apr 2026 02:26:59 +0530
Subject: [PATCH 4/5] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index ee0e3aa..b4d508f 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -3222,12 +3222,10 @@ check_platform_health() {
   # Check 1: Cluster nodes
   log "Checking cluster nodes..."
   local not_ready
-  # `wc -l` on macOS returns "       N" with leading whitespace and the `||
-  # echo` fallback can append a second value, so the resulting string was
-  # tripping the `[[ -gt 0 ]]` test ("[[: 0\n0: syntax error"). Strip
-  # whitespace and default to 0 if grep returns 1 (no matches).
-  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l | tr -d '[:space:]')
-  not_ready="${not_ready:-0}"
+  # Count nodes whose status is not Ready without relying on grep exit codes.
+  # This avoids `set -euo pipefail` aborting the script when all nodes are
+  # Ready, while still producing a whitespace-free numeric result.
+  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk 'index($0, " Ready ") == 0 { count++ } END { print count+0 }')
   if [[ "${not_ready}" -gt 0 ]]; then
     warn "Found ${not_ready} node(s) not in Ready state"
     kubectl get nodes

From bfc43001b5fa4163e49267783c105d79537e0874 Mon Sep 17 00:00:00 2001
From: kbhos-splunk <kbhos@splunk.com>
Date: Thu, 30 Apr 2026 02:27:06 +0530
Subject: [PATCH 5/5] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 tools/cluster_setup/cluster-config.yaml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml
index ebd7b82..6109331 100644
--- a/tools/cluster_setup/cluster-config.yaml
+++ b/tools/cluster_setup/cluster-config.yaml
@@ -271,9 +271,11 @@ aiPlatform:
   # Public SAIA exposure (NodePort-free)
   # ---------------------------------------------------------------------------
   # The operator renders a public Kubernetes Service named
-  # `<aiPlatform.name>-saia-service` whose endpoints are the in-cluster nginx
-  # pods (nginx terminates path routing to saia v1 / v2). The install script
-  # then configures HOW that Service is reached from outside the cluster.
+  # `<aiService.name>-saia-service`; because the AIService is typically named
+  # `<aiPlatform.name>-saia`, the resulting Service is usually
+  # `<aiPlatform.name>-saia-saia-service`. Its endpoints are the in-cluster
+  # nginx pods (nginx terminates path routing to saia v1 / v2). The install
+  # script then configures HOW that Service is reached from outside the cluster.
   #
   # IMPORTANT: this template intentionally does NOT use Service.type=NodePort.
   # Many enterprise security policies prohibit opening 30000-32767 on every