diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml
index c0ed83a..6109331 100644
--- a/tools/cluster_setup/cluster-config.yaml
+++ b/tools/cluster_setup/cluster-config.yaml
@@ -13,8 +13,27 @@
 
 # ---------- Cluster Configuration ----------
 cluster:
-  useExisting: false   # true = do not create cluster; use existing one (script fails if cluster not found)
-  name: "my-ai-cluster"                     # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens)
+  # ------------------------------------------------------------------------
+  # LIFECYCLE WORKFLOW (to avoid VPC/IGW quota churn and DELETE_FAILED loops)
+  # ------------------------------------------------------------------------
+  #   1. FIRST install (cluster does not exist yet):
+  #        useExisting: false            # eksctl creates the cluster + VPC
+  #        ./eks_cluster_with_stack.sh install
+  #
+  #   2. AFTER first install succeeds, flip this one line:
+  #        useExisting: true             # subsequent `install` only reconciles
+  #                                      # operators/CRs on the existing cluster.
+  #      Re-running `install` is now safe and does NOT create new VPCs/IGWs.
+  #
+  #   3. When you genuinely want to tear down:
+  #        ALWAYS use `delete-full` (NOT `delete`). It uninstalls CRs/operators
+  #        first so the AWS Load Balancer Controller removes its NLBs + SGs
+  #        before CFN deletes the VPC -- this prevents DELETE_FAILED stacks
+  #        leaving orphan VPCs behind and eating your per-region quota.
+  #        ./eks_cluster_with_stack.sh delete-full
+  # ------------------------------------------------------------------------
+  useExisting: false  # true = do not create cluster; use existing one (script fails if cluster not found)
+  name: "my-ai-cluster"                             # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens)
   region: "us-east-2"                     # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1)
   k8sVersion: "1.31"                      # Kubernetes version (1.29, 1.30, 1.31 supported)
   # When true: require subnets (existing VPC). On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC).
@@ -73,7 +92,7 @@ nodeGroups:
     desiredCapacity: 2                    # Initial number of GPU nodes
     minSize: 2                            # Minimum GPU nodes
     maxSize: 4                            # Maximum GPU nodes (set equal to desiredCapacity for H100)
-    volumeSize: 1000                      # EBS volume size per GPU node (GB) - larger for model storage
+    volumeSize: 500                      # EBS volume size per GPU node (GB) - larger for model storage
     volumeType: "gp3"                     # EBS volume type
 
     # ── H100 ONLY ──────────────────────────────────────────────────────────────
@@ -93,7 +112,7 @@ nodeGroups:
 # Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install).
 # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required).
 storage:
-  s3Bucket: "ai-platform-bucket-minio-us-east-2"  # Used when objectStore.type is aws
+  s3Bucket: "ai-platform-bucket-us-east-2"  # Used when objectStore.type is aws
   storageClass: "gp3"                        # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2)
   vectorDbSize: "50Gi"                       # VectorDB persistent volume size
 
@@ -102,12 +121,8 @@ storage:
   # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://)
   # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme)
   objectStore:
-    type: "minio"                         # aws | s3compat | minio | seaweedfs (external only for non-aws)
-    bucket: "ai-platform-bucket-minio-us-east-2"
-    endpoint: "http://10.0.0.5:9000"         # CHANGE THIS: MinIO API (9000) or SeaweedFS S3 gateway (8333)
-    auth:
-      rootUser: ""                            # CHANGE THIS: S3-compatible access key (or MinIO root user)
-      rootPassword: ""                        # CHANGE THIS: S3-compatible secret key (or MinIO root password)
+    type: "aws"                         # aws | s3compat | minio | seaweedfs (external only for non-aws)
+    bucket: "ai-platform-bucket-us-east-2"            # Object store bucket name (same value as storage.s3Bucket above)
 
 # ---------- Container Images Configuration ----------
 images:
@@ -147,7 +162,7 @@ images:
     #   Result: "docker.io/myorg/splunk-ai-operator:v1.0.0"
     # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config)
     #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8"
-    image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31"
+    image: "docker.io/kbhos698/splunk-ai-operator:ai-tier"
 
   # Splunk Enterprise Images
   splunk:
@@ -170,8 +185,8 @@ images:
     # Option 2: Full path with different registry
     #   headImage: "docker.io/rayproject/ray:2.44.0"
     #   Result: "docker.io/rayproject/ray:2.44.0"
-    headImage: "ml-platform/ray/ray-head:build-008"
-    workerImage: "ml-platform/ray/ray-worker-gpu:build-008"
+    headImage: "ml-platform/ray/ray-head:build-v2-008"
+    workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008"
 
   # Weaviate Vector Database
   weaviate:
@@ -183,9 +198,14 @@ images:
   # SAIA (Splunk AI Assistant) Images
   saia:
     # Relative paths - registry prefix auto-applied
-    apiImage: "ml-platform/saia/saia-api:build-005"
-    dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003"
-
+    # NOTE: keep dataLoaderImage in sync with apiImage/apiV2Image. Tags older than
+    # v2-008 (specifically pre v2.0.4-13-g3b677604) ship a broken URL-compat shim
+    # that ignores VECTOR_DB_GRPC_* env vars and falls back to grpc.<host>:443 TLS,
+    # causing the vector-db-setup posthook Job to fail with a Weaviate gRPC health
+    # check error. See pkg/ai/features/saia/impl.go (reconcilePostInstallHook).
+    apiImage: "ml-platform/saia/saia-api:build-v2-009"
+    apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009"
+    dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009"
   # Supporting Images
   fluentBit:
     # Docker Hub public image (has full path, registry prefix ignored)
@@ -198,6 +218,14 @@ images:
     # Public image - full path so registry prefix is NOT applied; validation checks this URL
     image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1"
 
+  # NGINX reverse proxy used by the SAIA reconciler to route v1 / v2 requests
+  # by path. OPTIONAL: omit this block to use the script default
+  # (docker.io/library/nginx:1.27-alpine). Add it only to pin a specific tag
+  # or point at an internal mirror in airgapped clusters.
+  #
+  # nginx:
+  #   image: "harbor.internal/library/nginx:1.27-alpine"
+
 # ---------- Operator Versions ----------
 operators:
   ray:
@@ -239,6 +267,124 @@ aiPlatform:
     serviceAccountName: "ray-worker-sa"
     imageRegistry: ""                     # Leave empty for default
 
+  # ---------------------------------------------------------------------------
+  # Public SAIA exposure (NodePort-free)
+  # ---------------------------------------------------------------------------
+  # The operator renders a public Kubernetes Service named
+  # `<aiService.name>-saia-service`; because the AIService is typically named
+  # `<aiPlatform.name>-saia`, the resulting Service is usually
+  # `<aiPlatform.name>-saia-saia-service`. Its endpoints are the in-cluster
+  # nginx pods (nginx does path-based routing to SAIA v1 / v2). The install
+  # script then configures HOW that Service is reached from outside the cluster.
+  #
+  # IMPORTANT: this template intentionally does NOT use Service.type=NodePort.
+  # Many enterprise security policies prohibit opening 30000-32767 on every
+  # worker. All three modes below are NodePort-free — the script sets
+  # `allocateLoadBalancerNodePorts: false` on LoadBalancer Services so
+  # kube-proxy never opens a node port; for the BYO mode the Service stays
+  # ClusterIP and AWS LBC registers pod IPs into the customer's target group.
+  #
+  # Pick ONE of the modes below by editing the active block at the bottom of
+  # this section. Each mode shows: the YAML to use, what the script does, and
+  # what you must provision outside the cluster.
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 1 — Operator-managed AWS NLB, IP-target mode (DEFAULT)
+  # ---------------------------------------------------------------------------
+  #   serviceTemplate:
+  #     type: LoadBalancer
+  #     annotations:
+  #       service.beta.kubernetes.io/aws-load-balancer-type:             "external"
+  #       service.beta.kubernetes.io/aws-load-balancer-scheme:           "internet-facing"   # or "internal"
+  #       service.beta.kubernetes.io/aws-load-balancer-nlb-target-type:  "ip"                # ← pods, not nodes
+  #       service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
+  #     # Optional TLS termination at the NLB:
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:..."
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
+  #     # service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy: "ELBSecurityPolicy-TLS13-1-2-2021-06"
+  #   awsLoadBalancerController:
+  #     install: true
+  #   byoTargetGroup:
+  #     enabled: false
+  #
+  # Script does:
+  #   * Installs AWS Load Balancer Controller (LBC) with IRSA, tags subnets.
+  #   * Creates the LoadBalancer Service; LBC provisions an NLB whose targets
+  #     are pod IPs (no NodePort, no kube-proxy hop, real client IP preserved).
+  #   * Patches the rendered Service to set `allocateLoadBalancerNodePorts:
+  #     false` and `externalTrafficPolicy: Local`.
+  # You must do: nothing on the AWS side. DNS appears in
+  # `.status.loadBalancer.ingress[0].hostname` after ~2-3 min.
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 2 — Bring-your-own AWS LB (TargetGroupBinding, IP-target)
+  # ---------------------------------------------------------------------------
+  # Customer already owns the NLB / ALB / target group. LBC is installed only
+  # to manage target-group membership; it does NOT create LBs in this mode.
+  #
+  #   serviceTemplate:
+  #     type: ClusterIP                # LB is owned by the customer
+  #   awsLoadBalancerController:
+  #     install: true                  # required for TargetGroupBinding
+  #   byoTargetGroup:
+  #     enabled: true
+  #     targetGroupArn: "arn:aws:elasticloadbalancing:<region>:<account>:targetgroup/<your-tg>/<id>"
+  #     securityGroupId: "sg-xxxxxxxxxxxxxxxxx"   # the customer's LB security group
+  #
+  # Script does:
+  #   * Installs LBC.
+  #   * Leaves the public Service as ClusterIP.
+  #   * Applies a TargetGroupBinding CR with `targetType: ip` so LBC registers
+  #     nginx pod IPs into the customer's target group as endpoints change.
+  # You must do (outside the cluster):
+  #   1. Pre-create the target group in the EKS VPC with:
+  #        - Target type:   ip
+  #        - Protocol/Port: TCP/8080 (NLB) or HTTP/8080 (ALB)  ← pod port, not 30080
+  #        - Health check:  HTTP /nginx_health on traffic-port, 200 OK
+  #   2. Attach the target group to your existing LB listener.
+  #   3. Worker pod SG ingress 8080 from the LB SG only — the
+  #      TargetGroupBinding `networking.ingress.from.securityGroup` block
+  #      configured by the script does this for you.
+  #
+  # ---------------------------------------------------------------------------
+  # MODE 3 — On-prem / k0s / airgap (NOT applicable to this EKS template)
+  # ---------------------------------------------------------------------------
+  # Use the dedicated `k0s-cluster-config.yaml` template, which configures
+  # MetalLB to allocate a routable VIP. The user-facing contract there is
+  # identical (`type: LoadBalancer`) — only the LB provider changes.
+  #
+  # ---------------------------------------------------------------------------
+  # SECURITY NOTES (apply to all modes)
+  # ---------------------------------------------------------------------------
+  #   * Always terminate TLS at the LB (ACM cert on AWS) and place an auth
+  #     layer in front (oauth2-proxy, Cognito on the ALB, API Gateway, …)
+  #     before exposing on the public internet.
+  #   * Restrict the LB listener to trusted source CIDRs / SGs (never
+  #     0.0.0.0/0 to a sensitive endpoint).
+  #   * Pod SG ingress should allow 8080 only from the LB SG.
+  # ---------------------------------------------------------------------------
+
+  # Active mode below — EDIT to switch. Default is MODE 1.
+  serviceTemplate:
+    type: LoadBalancer
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-type: "external"
+      service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
+
+  awsLoadBalancerController:
+    install: true
+
+  # Bring-your-own AWS target group (Mode 2). Set enabled: true and provide
+  # both targetGroupArn and securityGroupId; the script will then leave the
+  # SAIA Service as ClusterIP and apply a TargetGroupBinding (LBC manages
+  # target registration into your existing target group).
+  byoTargetGroup:
+    enabled: false
+    # targetGroupArn: "arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-saia-tg/abc123"
+    # securityGroupId: "sg-0123456789abcdef0"
+
   # CPU Scheduling
   cpuScheduling:
     nodeSelector: {}
diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh
index 93df2bd..dccb365 100755
--- a/tools/cluster_setup/eks_cluster_with_stack.sh
+++ b/tools/cluster_setup/eks_cluster_with_stack.sh
@@ -91,6 +91,20 @@ load_config() {
     SAIA_SERVICE_SA="$(yq eval '.aiPlatform.serviceAccounts.saiaService' "$cfg")"
     DEFAULT_ACCELERATOR="$(yq eval '.aiPlatform.defaultAcceleratorType' "$cfg")"
     WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")"
+    SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")"
+    SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")"
+    # AWS Load Balancer Controller (LBC) install toggle. Required for both
+    # operator-managed NLB provisioning (Mode 1) and customer-owned LB
+    # registration via TargetGroupBinding (Mode 2). Off-AWS users (k0s) leave
+    # this false.
+    INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")"
+    # Bring-your-own AWS target group (Mode 2). When enabled the script keeps
+    # the public Service as ClusterIP and applies a TargetGroupBinding so LBC
+    # registers nginx pod IPs into the customer's pre-existing target group.
+    # Requires INSTALL_LBC=true.
+    BYO_TG_ENABLED="$(yq eval '.aiPlatform.byoTargetGroup.enabled // false' "$cfg")"
+    BYO_TG_ARN="$(yq eval '.aiPlatform.byoTargetGroup.targetGroupArn // ""' "$cfg")"
+    BYO_TG_SG_ID="$(yq eval '.aiPlatform.byoTargetGroup.securityGroupId // ""' "$cfg")"
     INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")"
     INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")"
     INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")"
@@ -120,9 +134,11 @@ load_config() {
     RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$cfg")"
     WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$cfg")"
     SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$cfg")"
+    SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image // ""' "$cfg")"
     SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$cfg")"
     FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")"
     OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")"
+    NGINX_IMAGE="$(yq eval '.images.nginx.image // "docker.io/library/nginx:1.27-alpine"' "$cfg")"
 
     # Subnets - read as arrays (support both cluster.subnets and top-level subnets)
     PRIVATE_SUBNETS=()
@@ -172,6 +188,12 @@ load_config() {
     SAIA_SERVICE_SA="saia-service-sa"
     DEFAULT_ACCELERATOR="L40S"
     WORKER_IMAGE_REGISTRY=""
+    SAIA_SERVICE_TYPE=""
+    SAIA_SERVICE_NODE_PORT=""
+    INSTALL_LBC="false"
+    BYO_TG_ENABLED="false"
+    BYO_TG_ARN=""
+    BYO_TG_SG_ID=""
     INGRESS_HOST="ai.example.com"
     INGRESS_CLASS="nginx"
     INGRESS_TLS_SECRET="ai-platform-tls"
@@ -179,6 +201,8 @@ load_config() {
     SPLUNK_OPERATOR_FILE="./splunk-operator-cluster.yaml"
     SPLUNK_AI_FILE="./artifacts.yaml"
     SPLUNK_IMAGE="splunk/splunk:10.2.0-dev1"
+    SAIA_API_V2_IMAGE=""
+    NGINX_IMAGE="docker.io/library/nginx:1.27-alpine"
     RAY_VERSION="v1.2.2"
     NVIDIA_VERSION="v0.17.3"
     ENABLE_CPU=true
@@ -230,6 +254,19 @@ load_config() {
   # Splunk operators
   SPLUNK_AI_NS="splunk-ai-operator-system"
 
+  # AWS Load Balancer Controller (LBC) — required when a Service of type=LoadBalancer
+  # uses the "service.beta.kubernetes.io/aws-load-balancer-type: external" annotation
+  # (the in-tree EKS cloud controller intentionally skips those Services). Pinned
+  # chart and policy versions keep installs reproducible against a vetted upstream
+  # release (supply-chain hygiene: codeguard-0-supply-chain-security).
+  LBC_NS="kube-system"
+  LBC_SA="aws-load-balancer-controller"
+  LBC_RELEASE="aws-load-balancer-controller"
+  LBC_ROLE_NAME="AWSLoadBalancerControllerRole-${CLUSTER_NAME}"
+  LBC_POLICY_NAME="AWSLoadBalancerControllerIAMPolicy-${CLUSTER_NAME}"
+  LBC_CHART_VERSION="1.8.2"   # helm chart version (appVersion v2.8.2)
+  LBC_POLICY_VERSION="v2.8.2" # upstream tag used to fetch iam_policy.json
+
   log "Configuration loaded: cluster=${CLUSTER_NAME}, region=${REGION}, namespace=${AI_NS}"
 }
 
@@ -386,47 +423,67 @@ configure_images() {
   local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE")
   local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE")
   local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE")
+  local saia_api_v2_full=""
   local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE")
   local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE")
   local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE")
+  local nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE")
+  if [[ -n "${SAIA_API_V2_IMAGE}" && "${SAIA_API_V2_IMAGE}" != "null" ]]; then
+    saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE")
+  fi
 
   # Escape special characters for sed
   local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g')
   local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g')
   local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g')
   local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g')
+  local saia_api_v2_escaped=""
   local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g')
   local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g')
   local otel_collector_escaped=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g')
+  local nginx_escaped=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g')
   local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g')
+  if [[ -n "${saia_api_v2_full}" ]]; then
+    saia_api_v2_escaped=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g')
+  fi
 
-  SEDOPTION="-i"
+  local SED_INPLACE
   if [[ "$OSTYPE" == "darwin"* ]]; then
-    SEDOPTION="-i ''"
+    SED_INPLACE=(sed -i "")
+  else
+    SED_INPLACE=(sed -i)
   fi
   # Replace RELATED_IMAGE_ env vars by matching the env var name (not the value pattern)
   # This works regardless of what registry/image was there before
-  sed $SEDOPTION "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE"
-  sed $SEDOPTION "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE"
+  if [[ -n "${saia_api_v2_escaped}" ]]; then
+    "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_escaped}|" "$SPLUNK_AI_FILE"
+  fi
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_escaped}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE"
 
   # Replace operator image (the container image itself, not env var)
   # Find the line with "image:" that's near "splunk-ai-operator" and replace it
-  sed $SEDOPTION "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE"
+  "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE"
 
   log "  ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full"
   log "  ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full"
   log "  ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full"
   log "  ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full"
+  if [[ -n "${saia_api_v2_full}" ]]; then
+    log "  ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full"
+  fi
   log "  ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full"
   log "  ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full"
   log "  ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full"
+  log "  ✓ Updated RELATED_IMAGE_NGINX: $nginx_full"
   log "  ✓ Updated operator image: $operator_full"
   log "  ✓ Updated MODEL_VERSION: $MODEL_VERSION"
   log "  ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION"
@@ -441,10 +498,10 @@ configure_images() {
   local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g')
 
   # Replace RELATED_IMAGE_SPLUNK_ENTERPRISE env var
-  sed $SEDOPTION "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE"
+  "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE"
 
   # Replace splunk-operator image (the container image itself)
-  sed $SEDOPTION "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE"
+  "${SED_INPLACE[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE"
 
   log "  ✓ Updated Splunk Enterprise image: $splunk_full"
   log "  ✓ Updated Splunk Operator image: $splunk_operator_full"
@@ -890,6 +947,11 @@ ${public_subnets}"
     fi
   else
     log "No subnets specified - eksctl will create new subnets automatically"
+    # One NAT gateway => one Elastic IP. HighlyAvailable uses one NAT per AZ
+    # (often 3 EIPs) and commonly trips the default regional EIP quota (5).
+    vpc_config="vpc:
+  nat:
+    gateway: Single"
   fi
 
   cat <<EOF > eks-cluster-config.yaml
@@ -899,6 +961,8 @@ metadata:
   name: ${CLUSTER_NAME}
   region: ${REGION}
   version: "${K8S_VERSION}"
+autoModeConfig:
+  enabled: false
 iam:
   withOIDC: true
 addons:
@@ -1365,6 +1429,139 @@ install_cert_manager() {
   check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller"
 }
 
+# ---------- AWS Load Balancer Controller (LBC) ----------
+# LBC watches Services with the "aws-load-balancer-type: external" annotation
+# (the in-tree cloud controller skips those Services on purpose) and drives
+# NLB/ALB provisioning through the AWS ELBv2 API. Without LBC installed, such
+# Services stay in EXTERNAL-IP=<pending> forever. LBC also gives us IP-mode
+# targeting, ACM-backed TLS termination, and modern NLB attributes — all
+# features the in-tree controller does not support.
+
+# Fetches the upstream-recommended IAM policy for LBC from a pinned git tag and
+# creates a customer-managed policy in the account (idempotent). Emits the ARN
+# on stdout so the caller can attach it via eksctl. Uses a cluster-scoped name
+# so teardown of one cluster won't remove a policy shared with other clusters.
+ensure_lbc_iam_policy() {
+  # Resolve the caller's account ID; construct the canonical policy ARN
+  # deterministically (IAM policy names are unique per account). This avoids
+  # parsing AWS CLI text output -- some CLI/JMESPath combinations have been
+  # observed to emit multi-line "None\nNone" for `Policies[?...].Arn | [0]`
+  # when no match exists, which would otherwise slip past a "!= None" guard.
+  local acct policy_arn created
+  acct="$(aws sts get-caller-identity --query Account --output text 2>/dev/null | tr -d '[:space:]')"
+  if [[ -z "$acct" || ! "$acct" =~ ^[0-9]{12}$ ]]; then
+    err "Could not resolve a valid AWS account ID via STS (got: '${acct}')"
+  fi
+  policy_arn="arn:aws:iam::${acct}:policy/${LBC_POLICY_NAME}"
+
+  if aws iam get-policy --policy-arn "$policy_arn" >/dev/null 2>&1; then
+    log "✓ LBC IAM policy already exists: ${policy_arn}" >&2
+    printf "%s" "$policy_arn"
+    return 0
+  fi
+
+  local tmp; tmp="$(mktemp)"; TMP_FILES+=("$tmp")
+  local url="https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/${LBC_POLICY_VERSION}/docs/install/iam_policy.json"
+  log "Fetching LBC IAM policy ${LBC_POLICY_VERSION} from ${url}" >&2
+  if ! curl -fsSL --max-time 60 "$url" -o "$tmp"; then
+    err "Failed to download AWS LBC IAM policy from ${url}. Check network access or bump LBC_POLICY_VERSION."
+  fi
+  if ! jq -e . "$tmp" >/dev/null 2>&1; then
+    err "Downloaded LBC IAM policy is not valid JSON. Refusing to proceed."
+  fi
+
+  # stderr stays attached on purpose: only stdout is captured, so an AWS failure (AccessDenied, quota, ...) remains visible.
+  created="$(aws iam create-policy \
+    --policy-name "${LBC_POLICY_NAME}" \
+    --policy-document "file://${tmp}" \
+    --description "AWS Load Balancer Controller policy for ${CLUSTER_NAME} (${LBC_POLICY_VERSION})" \
+    --query 'Policy.Arn' --output text | tr -d '[:space:]')"
+  if [[ -z "$created" || "$created" != arn:aws:iam::* ]]; then
+    err "create-policy did not return a valid ARN for ${LBC_POLICY_NAME} (got: '${created}')"
+  fi
+  log "✓ Created LBC IAM policy ${LBC_POLICY_NAME}: ${created}" >&2
+  printf "%s" "$created"
+}
+
+# Creates the IRSA-bound ServiceAccount used by the LBC deployment. Uses eksctl
+# so the trust policy is pinned to this cluster's OIDC provider and SA subject.
+ensure_lbc_irsa() {
+  log "Ensuring IRSA for AWS Load Balancer Controller (${LBC_NS}/${LBC_SA})..."
+  local policy_arn; policy_arn="$(ensure_lbc_iam_policy)"  # ARN is taken from the helper's stdout; its log lines go to stderr
+  if [[ -z "$policy_arn" || "$policy_arn" != arn:aws:iam::* ]]; then  # reject an empty or non-ARN capture before calling eksctl
+    err "LBC IAM policy ARN is empty/invalid ('${policy_arn}'); cannot configure IRSA"
+  fi
+
+  eksctl create iamserviceaccount \
+    --cluster "${CLUSTER_NAME}" \
+    --region "${REGION}" \
+    --namespace "${LBC_NS}" \
+    --name "${LBC_SA}" \
+    --role-name "${LBC_ROLE_NAME}" \
+    --attach-policy-arn "${policy_arn}" \
+    --approve \
+    --override-existing-serviceaccounts
+
+  wait_resource_exists "${LBC_NS}" sa "${LBC_SA}" 180  # NOTE(review): 180 is presumably a timeout in seconds — confirm against the helper
+  log "✓ LBC IRSA role and service account configured"
+}
+
+# Tags user-provided subnets so LBC can auto-discover where to place LBs.
+# eksctl already tags subnets it creates, so this is a no-op when the cluster
+# was created without explicit cluster.subnets.
+tag_lbc_subnets() {
+  if [[ ${#PUBLIC_SUBNETS[@]} -eq 0 && ${#PRIVATE_SUBNETS[@]} -eq 0 ]]; then  # both arrays empty: nothing user-supplied to tag
+    log "No user-provided subnets; eksctl-created subnets are already tagged for LBC discovery."
+    return 0
+  fi
+  log "Tagging user-provided subnets for AWS Load Balancer Controller discovery..."
+  if [[ ${#PUBLIC_SUBNETS[@]} -gt 0 ]]; then  # public subnets: elb role tag plus the shared cluster tag
+    log "  Public subnets (${#PUBLIC_SUBNETS[@]}): kubernetes.io/role/elb=1"
+    aws ec2 create-tags --region "${REGION}" \
+      --resources "${PUBLIC_SUBNETS[@]}" \
+      --tags Key=kubernetes.io/role/elb,Value=1 \
+             "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared"
+  fi
+  if [[ ${#PRIVATE_SUBNETS[@]} -gt 0 ]]; then  # private subnets: internal-elb role tag plus the shared cluster tag
+    log "  Private subnets (${#PRIVATE_SUBNETS[@]}): kubernetes.io/role/internal-elb=1"
+    aws ec2 create-tags --region "${REGION}" \
+      --resources "${PRIVATE_SUBNETS[@]}" \
+      --tags Key=kubernetes.io/role/internal-elb,Value=1 \
+             "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared"
+  fi
+  log "✓ Subnets tagged for LBC auto-discovery"
+}
+
+install_aws_load_balancer_controller() {  # Helm-installs the LBC chart at the pinned LBC_CHART_VERSION into LBC_NS
+  log "Installing AWS Load Balancer Controller (helm chart ${LBC_CHART_VERSION})..."
+
+  local vpc_id  # VPC of the existing cluster; passed to the chart as vpcId below
+  vpc_id="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" \
+    --query 'cluster.resourcesVpcConfig.vpcId' --output text 2>/dev/null || true)"
+  if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then  # lookup failure yields "" (errors are suppressed above); "None" is also rejected
+    err "Could not determine VPC ID for cluster ${CLUSTER_NAME}. LBC install requires vpcId."
+  fi
+
+  if ! aws iam get-role --role-name "${LBC_ROLE_NAME}" >/dev/null 2>&1; then  # fail fast when the IRSA role is missing
+    err "IRSA role ${LBC_ROLE_NAME} not found. ensure_lbc_irsa must run first."
+  fi
+
+  helm repo add eks https://aws.github.io/eks-charts >/dev/null
+  helm repo update >/dev/null
+  helm_retry 5 upgrade --install "${LBC_RELEASE}" eks/aws-load-balancer-controller \
+    --namespace "${LBC_NS}" \
+    --version "${LBC_CHART_VERSION}" \
+    --set clusterName="${CLUSTER_NAME}" \
+    --set region="${REGION}" \
+    --set vpcId="${vpc_id}" \
+    --set serviceAccount.create=false \
+    --set serviceAccount.name="${LBC_SA}" \
+    --wait --timeout 10m
+
+  check_ready "${LBC_NS}" "app.kubernetes.io/name=aws-load-balancer-controller"
+  log "✓ AWS Load Balancer Controller ${LBC_CHART_VERSION} installed and ready"
+}
+
 # ---------- External S3-compatible object storage (credentials only; no in-cluster install) ----------
 ensure_s3compat_credentials() {
   # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs).
@@ -1536,6 +1733,39 @@ ensure_s3_upload_splunk_app() {
   fi
 }
 
+ensure_external_objstore_upload_splunk_app() {
+  # Copies the local Splunk app archive to apps/<basename> in the external store unless the object already exists.
+  if [[ -z "${SPLUNK_APP_LOCAL_PATH}" ]]; then
+    log "SPLUNK_APP_LOCAL_PATH not set; skipping app upload to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/apps/"
+    return 0
+  elif [[ ! -f "${SPLUNK_APP_LOCAL_PATH}" ]]; then
+    warn "SPLUNK_APP_LOCAL_PATH='${SPLUNK_APP_LOCAL_PATH}' not found; skipping upload"
+    return 0
+  elif [[ -z "${OBJ_STORE_ENDPOINT}" ]]; then
+    warn "OBJ_STORE_ENDPOINT not set; cannot upload Splunk app to external object store"
+    return 0
+  fi
+
+  local archive_name object_key
+  archive_name="$(basename "${SPLUNK_APP_LOCAL_PATH}")"
+  object_key="apps/${archive_name}"
+  log "Ensuring Splunk app '${archive_name}' exists at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${object_key}"
+
+  # Both aws calls get the object-store credentials as per-command environment variables.
+  if ! AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+    aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3api head-object --bucket "${OBJ_STORE_BUCKET}" --key "${object_key}" >/dev/null 2>&1; then
+    AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+      aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3 cp "${SPLUNK_APP_LOCAL_PATH}" "s3://${OBJ_STORE_BUCKET}/${object_key}"
+    log "Uploaded ${archive_name} to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${object_key}"
+  else
+    log "App already present at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${object_key}; skipping upload"
+  fi
+}
+
+should_wait_for_splunk_app_install() {
+  [[ -f "${SPLUNK_APP_LOCAL_PATH:-}" ]]  # succeeds only if an app archive path is set and names a regular file
+}
+
 ensure_namespace() { kubectl get ns "$1" >/dev/null 2>&1 || kubectl create ns "$1"; }
 
 ensure_bucket_policy() {
@@ -2112,6 +2342,233 @@ show_platform_access_info() {
   log ""
 }
 
+saia_service_template_enabled() {
+  case "${SAIA_SERVICE_TYPE:-}" in ""|null|ClusterIP) return 1 ;; *) return 0 ;; esac
+}
+
+saia_aiservice_name() {
+  # Prints "<platform>-saia" (no trailing newline); the platform defaults to AI_PLATFORM_NAME.
+  printf '%s-saia' "${1:-${AI_PLATFORM_NAME}}"
+}
+
+wait_for_aiservice_exists() {
+  local name="$1" timeout="${2:-600}" elapsed=0
+  # Poll every 5s until the AIService object is visible; err is invoked once the timeout has elapsed.
+  until kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do
+    if (( elapsed >= timeout )); then err "Timed out waiting for AIService ${AI_NS}/${name}"; fi
+    sleep 5; elapsed=$((elapsed + 5))
+  done
+}
+
+apply_saia_service_annotations() {
+  # Copies each non-empty aiPlatform.serviceTemplate.annotations entry from CONFIG_FILE onto the named AIService.
+  local aiservice_name="$1" keys_blob anno_key anno_val
+  local -a pairs=()
+
+  keys_blob="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)"
+  if [[ -z "${keys_blob}" ]]; then return 0; fi
+
+  # One yq lookup per key; keys or values that are empty or the literal "null" are skipped.
+  while IFS= read -r anno_key; do
+    case "${anno_key}" in ""|null) continue ;; esac
+    anno_val="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${anno_key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")"
+    case "${anno_val}" in ""|null) continue ;; esac
+    pairs+=("${anno_key}=${anno_val}")
+  done <<< "${keys_blob}"
+
+  (( ${#pairs[@]} > 0 )) || return 0
+  log "Applying SAIA Service annotations to AIService/${aiservice_name}..."
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${pairs[@]}" --overwrite
+}
+
+byo_target_group_enabled() {
+  test "${BYO_TG_ENABLED:-false}" = "true"  # exit status 0 only for the literal string "true"
+}
+
+# Fail-fast validation of the byoTargetGroup settings; a no-op unless BYO mode is
+# enabled. Missing or malformed required fields are reported through err; the
+# LoadBalancer overlap only logs a warning. The ARN check accepts any AWS
+# partition (aws, aws-us-gov, aws-cn, ...) but requires a targetgroup/ resource.
+validate_byo_target_group_config() {
+  byo_target_group_enabled || return 0
+
+  if [[ "${INSTALL_LBC:-false}" != "true" ]]; then
+    err "byoTargetGroup.enabled=true requires awsLoadBalancerController.install=true (LBC manages the TargetGroupBinding)."
+  fi
+  if [[ -z "${BYO_TG_ARN:-}" || "${BYO_TG_ARN}" == "null" ]]; then
+    err "byoTargetGroup.enabled=true requires byoTargetGroup.targetGroupArn to be set."
+  fi
+  if [[ "${BYO_TG_ARN}" != arn:aws*:elasticloadbalancing:*:*:targetgroup/* ]]; then
+    err "byoTargetGroup.targetGroupArn must look like 'arn:<partition>:elasticloadbalancing:<region>:<account>:targetgroup/<name>/<id>' (got: ${BYO_TG_ARN})."
+  fi
+  if [[ -z "${BYO_TG_SG_ID:-}" || "${BYO_TG_SG_ID}" == "null" ]]; then
+    err "byoTargetGroup.enabled=true requires byoTargetGroup.securityGroupId (the customer LB's SG) so LBC opens pod-SG ingress correctly."
+  fi
+  if [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]]; then
+    log "WARNING: byoTargetGroup.enabled=true with serviceTemplate.type=LoadBalancer creates BOTH an operator-managed LB AND a TargetGroupBinding. Set serviceTemplate.type=ClusterIP for pure BYO." >&2
+  fi
+}
+
+# Creates/updates a TargetGroupBinding that points at the customer's existing
+# target group (BYO mode only; otherwise this returns immediately). Per the
+# manifest below, targets use targetType: ip and the networking.ingress rule
+# admits TCP 8080 only from the customer LB's security group — never
+# 0.0.0.0/0 (codeguard-0-iac-security).
+apply_byo_target_group_binding() {
+  byo_target_group_enabled || return 0
+
+  local platform_name="${1:-${AI_PLATFORM_NAME}}" service binding
+  service="$(saia_aiservice_name "${platform_name}")-saia-service"
+  binding="${service}-tgb"
+
+  log "Applying TargetGroupBinding for BYO target group ${BYO_TG_ARN}..."
+  kubectl -n "${AI_NS}" apply -f - <<YAML
+apiVersion: elbv2.k8s.aws/v1beta1
+kind: TargetGroupBinding
+metadata:
+  name: ${binding}
+  namespace: ${AI_NS}
+spec:
+  serviceRef:
+    name: ${service}
+    port: 8080
+  targetGroupARN: ${BYO_TG_ARN}
+  targetType: ip
+  networking:
+    ingress:
+      - from:
+          - securityGroup:
+              groupID: ${BYO_TG_SG_ID}
+        ports:
+          - protocol: TCP
+            port: 8080
+YAML
+  log "✓ TargetGroupBinding ${AI_NS}/${binding} applied"
+}
+
+# Turns off NodePort allocation on the rendered SAIA Service and sets
+# externalTrafficPolicy=Local; a no-op unless the live Service is type
+# LoadBalancer. Author's notes (not verifiable from this script): the operator's
+# reconcileSAIAService only touches Selector/Ports on existing Services
+# (pkg/ai/features/saia/impl.go), so the patch survives reconciles; Local keeps
+# the client IP for MetalLB-style providers and is a no-op for AWS NLB ip targets.
+patch_saia_service_disable_nodeport() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}" service live_type
+  service="$(saia_aiservice_name "${platform_name}")-saia-service"
+
+  # A failed lookup yields an empty type, which is handled like "not LoadBalancer".
+  live_type="$(kubectl -n "${AI_NS}" get svc "${service}" -o jsonpath='{.spec.type}' 2>/dev/null || true)"
+  if [[ "${live_type}" != "LoadBalancer" ]]; then
+    return 0
+  fi
+
+  log "Patching Service ${AI_NS}/${service} to disable NodePort allocation..."
+  # Merge-patch only these two spec fields; kubectl's stdout is discarded.
+  kubectl -n "${AI_NS}" patch svc "${service}" --type=merge -p '{
+  "spec": {
+    "allocateLoadBalancerNodePorts": false,
+    "externalTrafficPolicy": "Local"
+  }
+}' >/dev/null
+  log "✓ Service ${AI_NS}/${service}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local"
+}
+
+patch_saia_public_service_workaround() {  # $1 = platform name (default AI_PLATFORM_NAME); patches the <platform>-saia AIService, then its Service
+  local platform_name="${1:-${AI_PLATFORM_NAME}}"
+  local aiservice_name public_svc_name effective_type
+
+  aiservice_name="$(saia_aiservice_name "${platform_name}")"
+  public_svc_name="${aiservice_name}-saia-service"
+
+  wait_for_aiservice_exists "${aiservice_name}"
+
+  # In BYO mode the customer owns the LB; force the SAIA Service to ClusterIP
+  # regardless of what serviceTemplate.type says — TargetGroupBinding wires
+  # everything else.
+  if byo_target_group_enabled; then
+    effective_type="ClusterIP"
+  else
+    effective_type="${SAIA_SERVICE_TYPE:-}"  # tolerate an unset type, as the other SAIA helpers do
+  fi
+
+  if [[ -n "${effective_type:-}" && "${effective_type}" != "null" ]]; then
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${effective_type})..."
+    if [[ "${effective_type}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+      log "WARNING: NodePort exposure is discouraged; consider Mode 1 (LoadBalancer + LBC) or Mode 2 (BYO target group) instead." >&2
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"NodePort\",
+        \"ports\": [
+          {
+            \"name\": \"http\",
+            \"port\": 8080,
+            \"targetPort\": 8080,
+            \"nodePort\": ${SAIA_SERVICE_NODE_PORT}
+          }
+        ]
+      }
+    }
+  }
+}"
+    else
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"${effective_type}\"
+      }
+    }
+  }
+}"
+    fi
+  fi
+
+  apply_saia_service_annotations "${aiservice_name}"
+
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null  # presumably nudges the operator to reconcile — confirm
+
+  if [[ -n "${effective_type:-}" && "${effective_type}" != "null" && "${effective_type}" != "ClusterIP" ]]; then
+    log "Recreating SAIA public Service to ensure patched settings take effect..."
+    kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true  # best-effort delete
+    wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300
+  fi
+
+  # NodePort-free hardening: disable kube-proxy NodePort allocation on
+  # LoadBalancer Services and apply BYO TargetGroupBinding if configured.
+  patch_saia_service_disable_nodeport "${platform_name}"
+  apply_byo_target_group_binding "${platform_name}"
+}
+
+wait_for_saia_load_balancer() {
+  local platform_name="${1:-${AI_PLATFORM_NAME}}" timeout="${2:-1200}" elapsed=0
+  local service endpoint=""
+  service="$(saia_aiservice_name "${platform_name}")-saia-service"
+
+  # BYO mode: nothing to wait for — the LB is customer-managed and its DNS name
+  # is not surfaced via .status.loadBalancer. Outside BYO mode the wait applies
+  # only when SAIA_SERVICE_TYPE is LoadBalancer.
+  if byo_target_group_enabled; then
+    log "byoTargetGroup.enabled=true — skipping wait for operator-managed LB hostname (LB is customer-managed)."
+    return 0
+  elif [[ "${SAIA_SERVICE_TYPE:-}" != "LoadBalancer" ]]; then
+    return 0
+  fi
+
+  log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${service} to receive an external hostname..."
+  while :; do
+    # Prefer the ingress hostname; fall back to the ingress IP when no hostname is reported.
+    endpoint="$(kubectl -n "${AI_NS}" get svc "${service}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)"
+    if [[ -z "${endpoint}" ]]; then
+      endpoint="$(kubectl -n "${AI_NS}" get svc "${service}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)"
+    fi
+    [[ -z "${endpoint}" ]] || { log "✓ SAIA external endpoint: ${endpoint}"; return 0; }
+    if (( elapsed >= timeout )); then err "Timed out waiting for SAIA LoadBalancer Service ${AI_NS}/${service}"; fi
+    sleep 5; elapsed=$((elapsed + 5))
+  done
+}
+
 # Quick status check function - can be called standalone
 check_aiplatform_status() {
   local platform_name="${1:-${AI_PLATFORM_NAME}}"
@@ -2262,6 +2719,14 @@ YAML
       ;;
   esac
 
+  local svc_template_yaml=""
+  if saia_service_template_enabled; then
+    svc_template_yaml="  serviceTemplate:"$'\n'"    spec:"$'\n'"      type: ${SAIA_SERVICE_TYPE}"$'\n'
+    if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then
+      svc_template_yaml+="      ports:"$'\n'"      - name: http"$'\n'"        port: 8080"$'\n'"        targetPort: 8080"$'\n'"        nodePort: ${SAIA_SERVICE_NODE_PORT}"$'\n'
+    fi
+  fi
+
   cat <<YAML | kubectl -n "${AI_NS}" apply --server-side --force-conflicts -f -
 apiVersion: ai.splunk.com/v1
 kind: AIPlatform
@@ -2279,6 +2744,7 @@ spec:
     - name: saia
       version: "1.1.0"
       serviceAccountName: ${SAIA_SERVICE_SA}
+${svc_template_yaml}
   storage:
     vectorDB:
       size: ${VECTORDB_SIZE}
@@ -2314,6 +2780,8 @@ spec:
 YAML
 
   wait_aiplatform_ready
+  patch_saia_public_service_workaround "${AI_PLATFORM_NAME}"
+  wait_for_saia_load_balancer "${AI_PLATFORM_NAME}" 1200
 }
 
 # Wait until Splunk AI Assistant app shows as installed in Standalone status
@@ -2616,6 +3084,7 @@ delete_cluster_minimal() {
   delete_iamserviceaccount_if_exists "${AI_NS}" "${RAY_WORKER_SA}"
   delete_iamserviceaccount_if_exists "${AI_NS}" "${SAIA_SERVICE_SA}"
   delete_iamserviceaccount_if_exists "${EBS_NS}" "${EBS_SA}"
+  delete_iamserviceaccount_if_exists "${LBC_NS}" "${LBC_SA}"
   echo ""
 
   log "Step 2: Deleting IAM roles..."
@@ -2624,6 +3093,7 @@ delete_cluster_minimal() {
   delete_role_if_exists "IRSA-${CLUSTER_NAME}-${RAY_WORKER_SA}"
   delete_role_if_exists "IRSA-${CLUSTER_NAME}-${SAIA_SERVICE_SA}"
   delete_role_if_exists "${EBS_IRSA_ROLE_NAME}"
+  delete_role_if_exists "${LBC_ROLE_NAME}"
   echo ""
 
   log "Step 3: Cleaning up any eksctl-created EBS CSI addon roles..."
@@ -2720,6 +3190,7 @@ delete_cluster_minimal() {
   else
     delete_policy_if_exists "${AI_BUCKET_POLICY_NAME}"
   fi
+  delete_policy_if_exists "${LBC_POLICY_NAME}"
   echo ""
 
   log "Step 8: Purging all IRSA roles associated with this cluster's OIDC provider..."
@@ -2779,6 +3250,7 @@ delete_everything() {
   helm uninstall "${AUTOSCALER_RELEASE}" -n "${AUTOSCALER_NS}" || true
   kubectl delete -f https://github.com/splunk/splunk-operator/releases/download/2.8.1/splunk-operator-cluster.yaml --ignore-not-found
   kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.2.2" --ignore-not-found
+  helm uninstall "${LBC_RELEASE}" -n "${LBC_NS}" || true
   helm uninstall kube-prometheus -n monitoring || true
   helm uninstall cert-manager -n cert-manager || true
   kubectl delete storageclass gp3 --ignore-not-found
@@ -2856,7 +3328,7 @@ preflight_env() {
   fi
 
   pf_header "Tools"
-  for t in aws eksctl kubectl helm git jq yq; do
+  for t in aws eksctl kubectl helm git jq yq curl; do
     if command -v "$t" >/dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi
   done
 
@@ -2878,7 +3350,18 @@ preflight_env() {
     fi
   fi
   if [[ $subnet_count -eq 0 ]]; then
-    pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically"
+    pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically (NAT mode: Single = 1 Elastic IP)"
+    pf_header "Elastic IP headroom (new VPC)"
+    local eip_cnt
+    eip_cnt="$(aws ec2 describe-addresses --region "${REGION}" --query 'length(Addresses)' --output text 2>/dev/null || true)"
+    if [[ -n "${eip_cnt}" && "${eip_cnt}" =~ ^[0-9]+$ ]]; then
+      pf_ok "Allocated Elastic IPs in ${REGION}: ${eip_cnt}"
+      if (( eip_cnt >= 5 )); then
+        pf_warn "Typical default EIP quota is 5 per region. At ${eip_cnt}+ addresses, NAT gateway EIP allocation may fail (you saw: maximum number of addresses). Release unused EIPs in EC2 → Elastic IPs or request a quota increase before create cluster."
+      fi
+    else
+      pf_warn "Could not list Elastic IPs (aws ec2 describe-addresses). If create fails on NAT/EIP, check quotas and unused addresses."
+    fi
   else
     local all_subnets=("${PRIVATE_SUBNETS[@]}" "${PUBLIC_SUBNETS[@]}")
     local vpc_id=""
@@ -3102,6 +3585,7 @@ install_ai_platform_stack() {
   log "=== Setting up Splunk AI Platform stack ==="
   if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then
     log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA."
+    ensure_external_objstore_upload_splunk_app
   else
     ensure_s3_bucket_and_prefixes
     ensure_s3_upload_splunk_app
@@ -3169,19 +3653,41 @@ reconcile_flow() {
   fi
   install_kube_prometheus
   install_cert_manager
+  # Validate BYO target-group config before any side-effecting calls. Fail
+  # fast if the customer set byoTargetGroup.enabled=true without LBC or
+  # required ARN/SG fields — better an early error than a silently-broken
+  # data path.
+  validate_byo_target_group_config
+  # AWS Load Balancer Controller (LBC) — required when the operator provisions
+  # NLBs/ALBs (Mode 1: Service type=LoadBalancer + `aws-load-balancer-type:
+  # external` annotation) or when binding the SAIA Service to a customer-
+  # managed target group via TargetGroupBinding (Mode 2: byoTargetGroup
+  # enabled). Off-AWS deployments leave this false.
+  if [[ "${INSTALL_LBC}" == "true" ]]; then
+    log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller"
+    tag_lbc_subnets
+    ensure_lbc_irsa
+    install_aws_load_balancer_controller
+  else
+    log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install"
+  fi
   ensure_s3compat_credentials
   install_otel_operator_and_contrib_collector
   install_ray_operator
   install_splunk_operator
   install_splunk_ai_operator
   install_ai_platform_stack
-  wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200
+  if should_wait_for_splunk_app_install; then
+    wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200
+  else
+    log "Skipping Splunk AI Assistant app wait because no local app archive is configured"
+  fi
   # push_saia_conf_into_pod
 }
 
 # ---------- MAIN ----------
 main_install() {
-  for t in aws eksctl kubectl helm git jq yq; do need "$t"; done
+  for t in aws eksctl kubectl helm git jq yq curl; do need "$t"; done
 
   # Load configuration from YAML file
   load_config
diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml
index 9faa669..7e733af 100644
--- a/tools/cluster_setup/k0s-cluster-config.yaml
+++ b/tools/cluster_setup/k0s-cluster-config.yaml
@@ -13,7 +13,7 @@
 # ---------- Cluster Configuration ----------
 cluster:
   name: airgap-cluster
-  # region: us-east-2                    # Ignored for on-prem, but required in config
+  region: us-east-2                       # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem
   sshUser: ec2-user                       # CHANGE THIS: SSH user for remote nodes
   sshKeyPath: ~/.ssh/id_rsa                  # CHANGE THIS: Path to SSH private key
 
@@ -38,7 +38,7 @@ nodes:
 #   - /var/lib/k0s must have at least 100 GB free on controllers
 #   If using a dedicated disk, mount it at /var/lib/k0s before running this script.
 #
-# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external).
+# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install).
 # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required).
 storage:
   storageClass: "local-path"                 # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2)
@@ -58,8 +58,8 @@ storage:
     # endpoint: "http://3.144.157.201:8333"      # SeaweedFS (deprecated — see comment above)
     endpoint: "http://10.0.0.5:9000"              # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint
     auth:
-      rootUser: "minioadmin"
-      rootPassword: "minioadmin"
+      rootUser: "<paste-AWS_ACCESS_KEY_ID-here>"          # CHANGE THIS — AWS_ACCESS_KEY_ID (AKIA…) or MinIO root user
+      rootPassword: "<paste-AWS_SECRET_ACCESS_KEY-here>"  # CHANGE THIS — AWS secret OR MinIO root password; NEVER commit real keys
 
 # ---------- Container Images Configuration ----------
 images:
@@ -132,32 +132,36 @@ aiPlatform:
   workerGroupConfig:
     imageRegistry: ""
 
-  # ---------- SAIA public exposure (OPTIONAL) ----------
-  # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods)
-  # defaults to ClusterIP, meaning it is only reachable from inside the cluster.
-  #
-  # Two call patterns hit this Service:
-  #   (A) Splunk Enterprise pod      → saia-service   (works with ClusterIP)
-  #   (B) End user's browser         → saia-service   (needs external exposure)
+  # ---------- SAIA public exposure (NodePort-free) ----------
+  # The SAIA "public" Service (nginx reverse proxy in front of v1 + v2 API
+  # pods) defaults to ClusterIP — only reachable from inside the cluster. Two
+  # call patterns hit it:
+  #   (A) Splunk Enterprise pod   → saia-service   (works with ClusterIP)
+  #   (B) End user's browser      → saia-service   (needs external exposure)
   #
   # Pattern B is used by the v2 chat UI (/query streaming, conversations,
   # feedback, admin endpoints). Without external exposure the v2 chat UI
-  # breaks for users, even though v1 one-shot SPL features still work.
+  # breaks for users; v1 one-shot SPL still works.
+  #
+  # The supported on-prem path is `type: LoadBalancer` backed by MetalLB
+  # (allocates a routable VIP from a pool you provide; ARP / BGP-announces it
+  # on your network). NodePort is intentionally avoided so we never open
+  # 30000-32767 on every worker node.
   #
-  # To DISABLE external exposure (use ClusterIP only), either:
-  #   * Delete / comment-out the entire `serviceTemplate:` block below, OR
-  #   * Set `type: ClusterIP` explicitly.
-  # Either is treated identically — the installer skips emitting serviceTemplate
-  # into the AIPlatform CR and the operator falls through to the ClusterIP
-  # default in reconcileSAIAService().
+  # The installer:
+  #   * Installs MetalLB (set metallb.install: true below).
+  #   * Applies an IPAddressPool + L2Advertisement (or BGPAdvertisement) from
+  #     the metallb config below.
+  #   * Renders the SAIA Service as type: LoadBalancer; MetalLB allocates a
+  #     VIP from the pool and announces it.
+  #   * Patches the Service with `allocateLoadBalancerNodePorts: false` and
+  #     `externalTrafficPolicy: Local` so kube-proxy does not open a NodePort.
   #
-  # To ENABLE external exposure for on-prem / airgap customers, NodePort is the
-  # recommended default: any k8s node IP + the configured nodePort yields a
-  # reachable endpoint from VPN-connected users. No cloud LB / cert-manager
-  # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB.
+  # To DISABLE external exposure (ClusterIP only), comment out the whole
+  # serviceTemplate block AND set metallb.install: false.
   serviceTemplate:
-    type: NodePort          # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP)
-    nodePort: 30080         # Fixed NodePort (30000-32767). Required for stable DNS.
+    type: LoadBalancer      # ClusterIP | LoadBalancer (NodePort is not used on k0s)
+    # No nodePort field — explicitly NodePort-free.
 
   features:
     - name: "saia"
@@ -175,6 +179,38 @@ aiPlatform:
         value: "true"
         effect: "NoSchedule"
 
+# ---------- MetalLB (k0s LoadBalancer provider) ----------
+# Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal
+# / k0s cluster. Pinned chart version for supply-chain reproducibility
+# (codeguard-0-supply-chain-security).
+#
+# If serviceTemplate.type=NodePort, the installer skips MetalLB entirely even
+# when metallb.install=true (NodePort does not use a LoadBalancer provider).
+metallb:
+  install: true                    # set false if MetalLB is already installed or not needed
+  chartVersion: "0.14.8"           # metallb/metallb Helm chart (matches MetalLB v0.14.8)
+  namespace: "metallb-system"
+
+  # Address pool — a range of IPs MetalLB can hand out to LoadBalancer
+  # Services. Must be routable from clients (VPN-connected users) to your k0s
+  # workers. Use IPs that are NOT used elsewhere on the LAN.
+  pool:
+    name: "saia-pool"
+    addresses:
+      - "10.20.30.100-10.20.30.110"   # CHANGE THIS to a free range on your network
+
+  # Advertisement mode: "layer2" works on most LANs without network gear
+  # changes (one elected node answers ARP for the VIP at a time; failover ~
+  # seconds). Use "bgp" only if your fabric supports BGP peering — then also
+  # populate metallb.bgpPeers below.
+  mode: "layer2"                  # layer2 | bgp
+
+  # Required only when mode=bgp. Leave empty for layer2.
+  bgpPeers: []
+    # - peerAddress: "10.0.0.1"
+    #   peerASN: 65001
+    #   myASN: 65000
+
 # ---------- Image Pull Secrets ----------
 imagePullSecrets:
   secrets:
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index 2adcffe..b4d508f 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -388,6 +388,17 @@ configure_images() {
   log "✓ All images configured successfully"
 }
 
+# Returns 0 when the object-store access key/secret still contains template
+# markers ('<', '>', CHANGEME or changeme). Non-empty placeholders would pass the
+# length preflight and land in minio-credentials, and SAIA then fails at startup with InvalidAccessKeyId.
+object_store_auth_looks_like_placeholder() {
+  # Key and secret are checked as one concatenated string.
+  local creds="${MINIO_ROOT_USER}${MINIO_ROOT_PASSWORD}"
+  if [[ "${creds}" == *"<"* || "${creds}" == *">"* ]]; then return 0; fi
+  if [[ "${creds}" == *CHANGEME* || "${creds}" == *changeme* ]]; then return 0; fi
+  return 1
+}
+
 # ====== PREFLIGHT CHECKS ======
 preflight_checks() {
   pf_header "Required tools"
@@ -423,6 +434,9 @@ preflight_checks() {
     [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required"
   fi
   [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)"
+  if object_store_auth_looks_like_placeholder; then
+    pf_fail "objectStore.auth still contains template placeholders (e.g. <...> or CHANGEME). Replace with a real access key and secret in your config (keep secrets in a Git-ignored file such as tools/cluster_setup/k0s-config.local.yaml)."
+  fi
 
   pf_header "Infrastructure mode"
   pf_ok "Using existing infrastructure (on-prem/baremetal)"
@@ -1118,6 +1132,10 @@ ensure_namespace() {
 # the Kubernetes credentials secret so the operator and workloads can auth.
 ensure_s3compat_credentials() {
   log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..."
+  if object_store_auth_looks_like_placeholder; then
+    err "Refusing to create minio-credentials: objectStore.auth contains template placeholders; fix ${CONFIG_FILE}"
+    return 1
+  fi
   if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then
     err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint"
     return 1
@@ -2637,15 +2655,23 @@ install_ai_platform_cr() {
 
   # Ensure object storage credentials secret exists in AI namespace
   log "Creating/updating S3-compatible credentials secret (minio-credentials) in ${AI_NS}..."
-  kubectl -n "${AI_NS}" create secret generic minio-credentials \
-    --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \
-    --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
-    --from-literal=s3_access_key="${MINIO_ROOT_USER}" \
-    --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \
-    --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \
-    --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \
-    --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f -
-  log "✓ Object storage credentials secret ready"
+  if object_store_auth_looks_like_placeholder; then
+    if kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then
+      warn "Skipping minio-credentials apply: auth in ${CONFIG_FILE} still looks like a template (e.g. contains '<'). Preserving existing secret."
+    else
+      err "minio-credentials missing and cannot be created: fix objectStore.auth in ${CONFIG_FILE} (remove <...> placeholders)."
+    fi
+  else
+    kubectl -n "${AI_NS}" create secret generic minio-credentials \
+      --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \
+      --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \
+      --from-literal=s3_access_key="${MINIO_ROOT_USER}" \
+      --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \
+      --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \
+      --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \
+      --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f -
+    log "✓ Object storage credentials secret ready"
+  fi
 
   # Build imagePullSecrets YAML from created secrets
   local image_pull_secrets=""
@@ -2819,6 +2845,270 @@ YAML
   log "AIPlatform CR installed successfully"
 }
 
+# Succeeds when aiPlatform.serviceTemplate.type in ${CONFIG_FILE} is set (non-empty, not "null") and is not ClusterIP.
+saia_service_template_enabled_k0s() {
+  local configured_type; configured_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+  case "${configured_type}" in ""|null|ClusterIP) return 1 ;; *) return 0 ;; esac
+}
+
+# True when SAIA public Service is explicitly NodePort. MetalLB is not used in
+# that mode, so install_metallb skips the Helm install even if metallb.install=true.
+k0s_saia_service_template_is_nodeport() {
+  # A yq failure or a missing key both yield "" here, which compares unequal below.
+  local configured_type; configured_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+  test "${configured_type}" = "NodePort"
+}
+
+# Poll every 5s until AIService $1 exists in ${AI_NS}. After $2 seconds (default 600) report via err
+# and return 1 explicitly, so the loop terminates even if err only logs instead of exiting.
+wait_for_k0s_aiservice_exists() {
+  local name="$1" timeout="${2:-600}" waited=0
+  until kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do
+    [[ ${waited} -ge ${timeout} ]] && { err "Timed out waiting for AIService ${AI_NS}/${name}"; return 1; }
+    sleep 5; waited=$((waited + 5))
+  done
+}
+
+# Copy each non-empty aiPlatform.serviceTemplate.annotations entry from ${CONFIG_FILE} onto
+# AIService $1 in ${AI_NS} via kubectl annotate --overwrite; returns 0 when there is nothing to apply.
+apply_k0s_saia_service_annotations() {
+  local aiservice_name="$1" key_list anno_key anno_val
+  local pairs=()
+
+  key_list="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)"
+  if [[ -z "${key_list}" ]]; then return 0; fi
+
+  while IFS= read -r anno_key; do
+    # Skip blank/"null" keys and entries whose value resolves to empty or "null".
+    if [[ -z "${anno_key}" || "${anno_key}" == "null" ]]; then continue; fi
+    anno_val="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${anno_key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")"
+    if [[ -n "${anno_val}" && "${anno_val}" != "null" ]]; then pairs+=("${anno_key}=${anno_val}"); fi
+  done <<< "${key_list}"
+
+  (( ${#pairs[@]} > 0 )) || return 0
+  log "Applying SAIA Service annotations to AIService/${aiservice_name}..."
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${pairs[@]}" --overwrite
+}
+
+# ---------- MetalLB (k0s LoadBalancer provider) ----------
+# k0s ships without a Service.type=LoadBalancer provider. MetalLB fills that
+# gap by allocating a VIP from a customer-provided pool and announcing it via
+# Layer-2 (ARP/NDP) or BGP. We pin the chart version for supply-chain
+# reproducibility (codeguard-0-supply-chain-security).
+
+# Succeeds only when metallb.install in ${CONFIG_FILE} evaluates to the literal "true" (default: false).
+metallb_enabled_k0s() {
+  local install_flag; install_flag="$(yq eval '.metallb.install // false' "${CONFIG_FILE}" 2>/dev/null || echo false)"
+  test "${install_flag}" = "true"
+}
+
+# Install MetalLB with Helm, then apply its IPAddressPool and either an L2Advertisement (layer2)
+# or BGPPeer(s) + BGPAdvertisement (bgp). No-op when disabled or SAIA uses NodePort; 1 on bad config.
+install_metallb() {
+  metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; }
+
+  if k0s_saia_service_template_is_nodeport; then
+    log "Skipping MetalLB install: aiPlatform.serviceTemplate.type=NodePort (LoadBalancer provider not used for SAIA)."
+    log "NOTE: metallb.install=true has no effect while SAIA uses NodePort. Set metallb.install=false to match config, or use type=LoadBalancer to install MetalLB."
+    return 0
+  fi
+
+  local ns chart_version pool_name addr_count mode
+  ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)"
+  chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)"
+  pool_name="$(yq eval '.metallb.pool.name // "saia-pool"' "${CONFIG_FILE}" 2>/dev/null)"
+  addr_count="$(yq eval '.metallb.pool.addresses // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)"
+  mode="$(yq eval '.metallb.mode // "layer2"' "${CONFIG_FILE}" 2>/dev/null)"
+
+  if [[ "${addr_count}" == "0" ]]; then
+    err "metallb.install=true but metallb.pool.addresses is empty. Provide at least one IP range routable on your network."
+    return 1  # explicit: do not rely on err exiting the script
+  fi
+  if [[ "${mode}" != "layer2" && "${mode}" != "bgp" ]]; then
+    err "metallb.mode must be 'layer2' or 'bgp' (got: ${mode})."
+    return 1
+  fi
+
+  log "Installing MetalLB ${chart_version} into namespace ${ns}..."
+  helm repo add metallb https://metallb.github.io/metallb >/dev/null 2>&1 || true
+  helm repo update >/dev/null 2>&1 || true
+
+  kubectl get ns "${ns}" >/dev/null 2>&1 || kubectl create ns "${ns}"
+
+  helm upgrade --install metallb metallb/metallb \
+    --namespace "${ns}" \
+    --version "${chart_version}" \
+    --wait --timeout 5m
+
+  # Wait for the controller webhook to be Ready before applying CRs, otherwise
+  # the IPAddressPool / L2Advertisement applies race the validating webhook.
+  log "Waiting for MetalLB controller to be ready..."
+  kubectl -n "${ns}" rollout status deploy/metallb-controller --timeout=180s
+
+  # Render IPAddressPool address list; addr_count (read above) already holds the list length.
+  local addresses_yaml="" i addr
+  for ((i=0; i<addr_count; i++)); do
+    addr="$(yq eval ".metallb.pool.addresses[${i}]" "${CONFIG_FILE}" 2>/dev/null)"
+    [[ -z "${addr}" || "${addr}" == "null" ]] && continue
+    addresses_yaml+="    - ${addr}"$'\n'
+  done
+
+  log "Applying MetalLB IPAddressPool '${pool_name}' (${addr_count} range(s))..."
+  cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: IPAddressPool
+metadata:
+  name: ${pool_name}
+  namespace: ${ns}
+spec:
+  addresses:
+${addresses_yaml}
+YAML
+
+  if [[ "${mode}" == "layer2" ]]; then
+    log "Applying MetalLB L2Advertisement for pool '${pool_name}'..."
+    cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: L2Advertisement
+metadata:
+  name: ${pool_name}-l2
+  namespace: ${ns}
+spec:
+  ipAddressPools:
+    - ${pool_name}
+YAML
+  else
+    # BGP mode — render BGPPeers from config and attach a BGPAdvertisement.
+    local peer_count p peer_addr peer_asn my_asn
+    peer_count="$(yq eval '.metallb.bgpPeers // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)"
+    if [[ "${peer_count}" == "0" ]]; then
+      err "metallb.mode=bgp requires metallb.bgpPeers to be non-empty (peerAddress, peerASN, myASN per peer)."
+      return 1
+    fi
+    for ((p=0; p<peer_count; p++)); do
+      # yq prints the literal "null" for a missing key; `// ""` maps it to empty so the -z check fires.
+      peer_addr="$(yq eval ".metallb.bgpPeers[${p}].peerAddress // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      peer_asn="$(yq eval ".metallb.bgpPeers[${p}].peerASN // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      my_asn="$(yq eval ".metallb.bgpPeers[${p}].myASN // \"\"" "${CONFIG_FILE}" 2>/dev/null)"
+      [[ -z "${peer_addr}" || -z "${peer_asn}" || -z "${my_asn}" ]] && \
+        { err "metallb.bgpPeers[${p}] missing peerAddress / peerASN / myASN."; return 1; }
+      cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: BGPPeer
+metadata:
+  name: bgp-peer-${p}
+  namespace: ${ns}
+spec:
+  peerAddress: ${peer_addr}
+  peerASN: ${peer_asn}
+  myASN: ${my_asn}
+YAML
+    done
+    cat <<YAML | kubectl -n "${ns}" apply -f -
+apiVersion: metallb.io/v1beta1
+kind: BGPAdvertisement
+metadata:
+  name: ${pool_name}-bgp
+  namespace: ${ns}
+spec:
+  ipAddressPools:
+    - ${pool_name}
+YAML
+  fi
+
+  log "✓ MetalLB ${chart_version} installed (${mode}, pool=${pool_name})"
+}
+
+# Disable kube-proxy NodePort allocation on the rendered SAIA Service so
+# kube-proxy never opens 30000-32767 on workers. The operator's
+# reconcileSAIAService only mutates Selector/Ports on existing Services
+# (pkg/ai/features/saia/impl.go), so this patch survives subsequent
+# reconciles. externalTrafficPolicy=Local preserves the real client IP for
+# MetalLB-style L4 providers (the announcing node forwards directly to a
+# local pod with no SNAT).
+patch_k0s_saia_service_disable_nodeport() {
+  # Name resolves to <CLUSTER_NAME>-ai-platform-saia-saia-service.
+  local svc_name="${CLUSTER_NAME}-ai-platform-saia-saia-service"
+  local live_type merge_patch
+  # A failed lookup (e.g. Service absent) leaves live_type empty, so the function returns 0 untouched.
+  live_type="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.spec.type}' 2>/dev/null || true)"
+  if [[ "${live_type}" != "LoadBalancer" ]]; then return 0; fi
+
+  merge_patch='{
+  "spec": {
+    "allocateLoadBalancerNodePorts": false,
+    "externalTrafficPolicy": "Local"
+  }
+}'
+  log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..."
+  kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p "${merge_patch}" >/dev/null
+  log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local"
+}
+
+# Push aiPlatform.serviceTemplate type/nodePort/annotations from ${CONFIG_FILE} onto the SAIA AIService, recreate the
+# public Service when exposure is enabled, then call patch_k0s_saia_service_disable_nodeport. Returns 1 on a bad nodePort.
+patch_k0s_saia_public_service_workaround() {
+  local aiservice_name="${CLUSTER_NAME}-ai-platform-saia"
+  local public_svc_name="${aiservice_name}-saia-service"
+  local svc_type svc_node_port waited=0
+  svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+  svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "")
+
+  # nodePort is spliced unquoted into a JSON patch below, so insist on a bare integer before touching the cluster.
+  if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then
+    [[ "${svc_node_port}" =~ ^[0-9]+$ ]] || { err "aiPlatform.serviceTemplate.nodePort must be an integer (got: ${svc_node_port})."; return 1; }
+  fi
+
+  wait_for_k0s_aiservice_exists "${aiservice_name}"
+
+  if saia_service_template_enabled_k0s; then
+    log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..."
+    if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then
+      log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true (MetalLB install is skipped automatically when type=NodePort)." >&2
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"NodePort\",
+        \"ports\": [
+          {
+            \"name\": \"http\",
+            \"port\": 8080,
+            \"targetPort\": 8080,
+            \"nodePort\": ${svc_node_port}
+          }
+        ]
+      }
+    }
+  }
+}"
+    else
+      kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{
+  \"spec\": {
+    \"serviceTemplate\": {
+      \"spec\": {
+        \"type\": \"${svc_type}\"
+      }
+    }
+  }
+}"
+    fi
+  fi
+
+  apply_k0s_saia_service_annotations "${aiservice_name}"
+  kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null
+
+  if saia_service_template_enabled_k0s; then
+    log "Recreating SAIA public Service to ensure patched settings take effect..."
+    kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true
+    # Give the operator up to 300s to recreate it; if it never reappears the final patch call is a no-op.
+    while ! kubectl -n "${AI_NS}" get svc "${public_svc_name}" >/dev/null 2>&1; do
+      [[ ${waited} -ge 300 ]] && break
+      sleep 5; waited=$((waited + 5))
+    done
+  fi
+
+  patch_k0s_saia_service_disable_nodeport
+}
+
 # ====== INSTALL FULL STACK ======
 install_ai_platform_stack() {
   log "Installing complete AI Platform stack..."
@@ -2901,9 +3191,18 @@ install_ai_platform_stack() {
   # Apply Splunk Standalone CR (non-blocking — pod boots in background)
   install_splunk_standalone
 
+  # MetalLB must be installed BEFORE the AIPlatform CR is reconciled — the
+  # operator renders a Service.type=LoadBalancer for SAIA and we need a
+  # provider in the cluster to allocate a VIP, otherwise the Service is
+  # stuck in EXTERNAL-IP=<pending> indefinitely. No-op when
+  # metallb.install=false (e.g., user is bringing their own MetalLB or wants
+  # ClusterIP only).
+  install_metallb
+
   # Install AI Platform operator and CR while Splunk Standalone boots
   install_splunk_ai_operator
   install_ai_platform_cr
+  patch_k0s_saia_public_service_workaround
 
   # Now wait for Splunk Standalone to be ready (likely already done by now)
   wait_for_splunk_standalone
@@ -2923,7 +3222,10 @@ check_platform_health() {
   # Check 1: Cluster nodes
   log "Checking cluster nodes..."
   local not_ready
-  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l || echo "0")
+  # Count nodes whose status is not Ready without relying on grep exit codes.
+  # This avoids `set -euo pipefail` aborting the script when all nodes are
+  # Ready, while still producing a whitespace-free numeric result.
+  not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk 'index($0, " Ready ") == 0 { count++ } END { print count+0 }')
   if [[ "${not_ready}" -gt 0 ]]; then
     warn "Found ${not_ready} node(s) not in Ready state"
     kubectl get nodes