diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index c0ed83a..6109331 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -13,8 +13,27 @@ # ---------- Cluster Configuration ---------- cluster: - useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) - name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) + # ------------------------------------------------------------------------ + # LIFECYCLE WORKFLOW (to avoid VPC/IGW quota churn and DELETE_FAILED loops) + # ------------------------------------------------------------------------ + # 1. FIRST install (cluster does not exist yet): + # useExisting: false # eksctl creates the cluster + VPC + # ./eks_cluster_with_stack.sh install + # + # 2. AFTER first install succeeds, flip this one line: + # useExisting: true # subsequent `install` only reconciles + # # operators/CRs on the existing cluster. + # Re-running `install` is now safe and does NOT create new VPCs/IGWs. + # + # 3. When you genuinely want to tear down: + # ALWAYS use `delete-full` (NOT `delete`). It uninstalls CRs/operators + # first so the AWS Load Balancer Controller removes its NLBs + SGs + # before CFN deletes the VPC -- this prevents DELETE_FAILED stacks + # leaving orphan VPCs behind and eating your per-region quota. + # ./eks_cluster_with_stack.sh delete-full + # ------------------------------------------------------------------------ + useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) + name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) # When true: require subnets (existing VPC). 
On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC). @@ -73,7 +92,7 @@ nodeGroups: desiredCapacity: 2 # Initial number of GPU nodes minSize: 2 # Minimum GPU nodes maxSize: 4 # Maximum GPU nodes (set equal to desiredCapacity for H100) - volumeSize: 1000 # EBS volume size per GPU node (GB) - larger for model storage + volumeSize: 500 # EBS volume size per GPU node (GB) - larger for model storage volumeType: "gp3" # EBS volume type # ── H100 ONLY ────────────────────────────────────────────────────────────── @@ -93,7 +112,7 @@ nodeGroups: # Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws + s3Bucket: "ai-platform-bucket-us-east-2" # Used when objectStore.type is aws storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size @@ -102,12 +121,8 @@ storage: # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://) # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme) objectStore: - type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - bucket: "ai-platform-bucket-minio-us-east-2" - endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO API (9000) or SeaweedFS S3 gateway (8333) - auth: - rootUser: "" # CHANGE THIS: S3-compatible access key (or MinIO root user) - rootPassword: "" # CHANGE THIS: S3-compatible secret key (or MinIO root password) + type: "aws" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-us-east-2" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root # ---------- Container Images 
Configuration ---------- images: @@ -147,7 +162,7 @@ images: # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config) #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8" - image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31" + image: "docker.io/kbhos698/splunk-ai-operator:ai-tier" # Splunk Enterprise Images splunk: @@ -170,8 +185,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-008" - workerImage: "ml-platform/ray/ray-worker-gpu:build-008" + headImage: "ml-platform/ray/ray-head:build-v2-008" + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # Weaviate Vector Database weaviate: @@ -183,9 +198,14 @@ images: # SAIA (Splunk AI Assistant) Images saia: # Relative paths - registry prefix auto-applied - apiImage: "ml-platform/saia/saia-api:build-005" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" - + # NOTE: keep dataLoaderImage in sync with apiImage/apiV2Image. Tags older than + # v2-008 (specifically pre v2.0.4-13-g3b677604) ship a broken URL-compat shim + # that ignores VECTOR_DB_GRPC_* env vars and falls back to grpc.:443 TLS, + # causing the vector-db-setup posthook Job to fail with a Weaviate gRPC health + # check error. See pkg/ai/features/saia/impl.go (reconcilePostInstallHook). 
+ apiImage: "ml-platform/saia/saia-api:build-v2-009" + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" # Supporting Images fluentBit: # Docker Hub public image (has full path, registry prefix ignored) @@ -198,6 +218,14 @@ images: # Public image - full path so registry prefix is NOT applied; validation checks this URL image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # NGINX reverse proxy used by the SAIA reconciler to route v1 / v2 requests + # by path. OPTIONAL: omit this block to use the script default + # (docker.io/library/nginx:1.27-alpine). Add it only to pin a specific tag + # or point at an internal mirror in airgapped clusters. + # + # nginx: + # image: "harbor.internal/library/nginx:1.27-alpine" + # ---------- Operator Versions ---------- operators: ray: @@ -239,6 +267,124 @@ aiPlatform: serviceAccountName: "ray-worker-sa" imageRegistry: "" # Leave empty for default + # --------------------------------------------------------------------------- + # Public SAIA exposure (NodePort-free) + # --------------------------------------------------------------------------- + # The operator renders a public Kubernetes Service named + # `<aiservice-name>-saia-service`; because the AIService is typically named + # `<platform-name>-saia`, the resulting Service is usually + # `<platform-name>-saia-saia-service`. Its endpoints are the in-cluster + # nginx pods (nginx terminates path routing to saia v1 / v2). The install + # script then configures HOW that Service is reached from outside the cluster. + # + # IMPORTANT: this template intentionally does NOT use Service.type=NodePort. + # Many enterprise security policies prohibit opening 30000-32767 on every + # worker. 
All three modes below are NodePort-free — the script sets + # `allocateLoadBalancerNodePorts: false` on LoadBalancer Services so + # kube-proxy never opens a node port; for the BYO mode the Service stays + # ClusterIP and AWS LBC registers pod IPs into the customer's target group. + # + # Pick ONE of the modes below by editing the active block at the bottom of + # this section. Each mode shows: the YAML to use, what the script does, and + # what you must provision outside the cluster. + # + # --------------------------------------------------------------------------- + # MODE 1 — Operator-managed AWS NLB, IP-target mode (DEFAULT) + # --------------------------------------------------------------------------- + # serviceTemplate: + # type: LoadBalancer + # annotations: + # service.beta.kubernetes.io/aws-load-balancer-type: "external" + # service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" # or "internal" + # service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip" # ← pods, not nodes + # service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true" + # # Optional TLS termination at the NLB: + # # service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:..." + # # service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443" + # # service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy: "ELBSecurityPolicy-TLS13-1-2-2021-06" + # awsLoadBalancerController: + # install: true + # byoTargetGroup: + # enabled: false + # + # Script does: + # * Installs AWS Load Balancer Controller (LBC) with IRSA, tags subnets. + # * Creates the LoadBalancer Service; LBC provisions an NLB whose targets + # are pod IPs (no NodePort, no kube-proxy hop, real client IP preserved). + # * Patches the rendered Service to set `allocateLoadBalancerNodePorts: + # false` and `externalTrafficPolicy: Local`. + # You must do: nothing on the AWS side. DNS appears in + # `.status.loadBalancer.ingress[0].hostname` after ~2-3 min. 
+ # + # --------------------------------------------------------------------------- + # MODE 2 — Bring-your-own AWS LB (TargetGroupBinding, IP-target) + # --------------------------------------------------------------------------- + # Customer already owns the NLB / ALB / target group. LBC is installed only + # to manage target-group membership; it does NOT create LBs in this mode. + # + # serviceTemplate: + # type: ClusterIP # LB is owned by the customer + # awsLoadBalancerController: + # install: true # required for TargetGroupBinding + # byoTargetGroup: + # enabled: true + # targetGroupArn: "arn:aws:elasticloadbalancing:<region>:<account-id>:targetgroup/<name>/<id>" + # securityGroupId: "sg-xxxxxxxxxxxxxxxxx" # the customer's LB security group + # + # Script does: + # * Installs LBC. + # * Leaves the public Service as ClusterIP. + # * Applies a TargetGroupBinding CR with `targetType: ip` so LBC registers + # nginx pod IPs into the customer's target group as endpoints change. + # You must do (outside the cluster): + # 1. Pre-create the target group in the EKS VPC with: + # - Target type: ip + # - Protocol/Port: TCP/8080 (NLB) or HTTP/8080 (ALB) ← pod port, not 30080 + # - Health check: HTTP /nginx_health on traffic-port, 200 OK + # 2. Attach the target group to your existing LB listener. + # 3. Worker pod SG ingress 8080 from the LB SG only — the + # TargetGroupBinding `networking.ingress.from.securityGroup` block + # configured by the script does this for you. + # + # --------------------------------------------------------------------------- + # MODE 3 — On-prem / k0s / airgap (NOT applicable to this EKS template) + # --------------------------------------------------------------------------- + # Use the dedicated `k0s-cluster-config.yaml` template, which configures + # MetalLB to allocate a routable VIP. The user-facing contract there is + # identical (`type: LoadBalancer`) — only the LB provider changes. 
+ # + # --------------------------------------------------------------------------- + # SECURITY NOTES (apply to all modes) + # --------------------------------------------------------------------------- + # * Always terminate TLS at the LB (ACM cert on AWS) and place an auth + # layer in front (oauth2-proxy, Cognito on the ALB, API Gateway, …) + # before exposing on the public internet. + # * Restrict the LB listener to trusted source CIDRs / SGs (never + # 0.0.0.0/0 to a sensitive endpoint). + # * Pod SG ingress should allow 8080 only from the LB SG. + # --------------------------------------------------------------------------- + + # Active mode below — EDIT to switch. Default is MODE 1. + serviceTemplate: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "external" + service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip" + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true" + + awsLoadBalancerController: + install: true + + # Bring-your-own AWS target group (Mode 2). Set enabled: true and provide + # both targetGroupArn and securityGroupId; the script will then leave the + # SAIA Service as ClusterIP and apply a TargetGroupBinding (LBC manages + # target registration into your existing target group). 
+ byoTargetGroup: + enabled: false + # targetGroupArn: "arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-saia-tg/abc123" + # securityGroupId: "sg-0123456789abcdef0" + # CPU Scheduling cpuScheduling: nodeSelector: {} diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 93df2bd..dccb365 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -91,6 +91,20 @@ load_config() { SAIA_SERVICE_SA="$(yq eval '.aiPlatform.serviceAccounts.saiaService' "$cfg")" DEFAULT_ACCELERATOR="$(yq eval '.aiPlatform.defaultAcceleratorType' "$cfg")" WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")" + SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")" + SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")" + # AWS Load Balancer Controller (LBC) install toggle. Required for both + # operator-managed NLB provisioning (Mode 1) and customer-owned LB + # registration via TargetGroupBinding (Mode 2). Off-AWS users (k0s) leave + # this false. + INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")" + # Bring-your-own AWS target group (Mode 2). When enabled the script keeps + # the public Service as ClusterIP and applies a TargetGroupBinding so LBC + # registers nginx pod IPs into the customer's pre-existing target group. + # Requires INSTALL_LBC=true. 
+ BYO_TG_ENABLED="$(yq eval '.aiPlatform.byoTargetGroup.enabled // false' "$cfg")" + BYO_TG_ARN="$(yq eval '.aiPlatform.byoTargetGroup.targetGroupArn // ""' "$cfg")" + BYO_TG_SG_ID="$(yq eval '.aiPlatform.byoTargetGroup.securityGroupId // ""' "$cfg")" INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")" INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")" INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")" @@ -120,9 +134,11 @@ load_config() { RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$cfg")" WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$cfg")" SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$cfg")" + SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image // ""' "$cfg")" SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$cfg")" FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")" + NGINX_IMAGE="$(yq eval '.images.nginx.image // "docker.io/library/nginx:1.27-alpine"' "$cfg")" # Subnets - read as arrays (support both cluster.subnets and top-level subnets) PRIVATE_SUBNETS=() @@ -172,6 +188,12 @@ load_config() { SAIA_SERVICE_SA="saia-service-sa" DEFAULT_ACCELERATOR="L40S" WORKER_IMAGE_REGISTRY="" + SAIA_SERVICE_TYPE="" + SAIA_SERVICE_NODE_PORT="" + INSTALL_LBC="false" + BYO_TG_ENABLED="false" + BYO_TG_ARN="" + BYO_TG_SG_ID="" INGRESS_HOST="ai.example.com" INGRESS_CLASS="nginx" INGRESS_TLS_SECRET="ai-platform-tls" @@ -179,6 +201,8 @@ load_config() { SPLUNK_OPERATOR_FILE="./splunk-operator-cluster.yaml" SPLUNK_AI_FILE="./artifacts.yaml" SPLUNK_IMAGE="splunk/splunk:10.2.0-dev1" + SAIA_API_V2_IMAGE="" + NGINX_IMAGE="docker.io/library/nginx:1.27-alpine" RAY_VERSION="v1.2.2" NVIDIA_VERSION="v0.17.3" ENABLE_CPU=true @@ -230,6 +254,19 @@ load_config() { # Splunk operators SPLUNK_AI_NS="splunk-ai-operator-system" + # AWS Load Balancer Controller (LBC) — required when a Service of type=LoadBalancer + # uses the 
"service.beta.kubernetes.io/aws-load-balancer-type: external" annotation + # (the in-tree EKS cloud controller intentionally skips those Services). Pinned + # chart and policy versions keep installs reproducible against a vetted upstream + # release (supply-chain hygiene: codeguard-0-supply-chain-security). + LBC_NS="kube-system" + LBC_SA="aws-load-balancer-controller" + LBC_RELEASE="aws-load-balancer-controller" + LBC_ROLE_NAME="AWSLoadBalancerControllerRole-${CLUSTER_NAME}" + LBC_POLICY_NAME="AWSLoadBalancerControllerIAMPolicy-${CLUSTER_NAME}" + LBC_CHART_VERSION="1.8.2" # helm chart version (appVersion v2.8.2) + LBC_POLICY_VERSION="v2.8.2" # upstream tag used to fetch iam_policy.json + log "Configuration loaded: cluster=${CLUSTER_NAME}, region=${REGION}, namespace=${AI_NS}" } @@ -386,47 +423,67 @@ configure_images() { local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE") local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE") local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE") + local saia_api_v2_full="" local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE") local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE") local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE") + local nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE") + if [[ -n "${SAIA_API_V2_IMAGE}" && "${SAIA_API_V2_IMAGE}" != "null" ]]; then + saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE") + fi # Escape special characters for sed local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g') local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g') local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g') local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g') + local saia_api_v2_escaped="" local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 
's/[\/&]/\\&/g') local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g') local otel_collector_escaped=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g') + local nginx_escaped=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g') local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g') + if [[ -n "${saia_api_v2_full}" ]]; then + saia_api_v2_escaped=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g') + fi - SEDOPTION="-i" + local SED_INPLACE if [[ "$OSTYPE" == "darwin"* ]]; then - SEDOPTION="-i ''" + SED_INPLACE=(sed -i "") + else + SED_INPLACE=(sed -i) fi # Replace RELATED_IMAGE_ env vars by matching the env var name (not the value pattern) # This works regardless of what registry/image was there before - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: 
${ray_worker_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" + if [[ -n "${saia_api_v2_escaped}" ]]; then + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_escaped}|" "$SPLUNK_AI_FILE" + fi + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" # Replace operator image (the container image itself, not env var) # Find the line with "image:" that's near "splunk-ai-operator" and replace it - sed $SEDOPTION "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" log " ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full" log " ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full" log " ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full" log " ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full" + if [[ -n "${saia_api_v2_full}" ]]; then + log " ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full" + fi log " ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full" log " ✓ Updated RELATED_IMAGE_FLUENT_BIT: 
$fluent_bit_full" log " ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full" + log " ✓ Updated RELATED_IMAGE_NGINX: $nginx_full" log " ✓ Updated operator image: $operator_full" log " ✓ Updated MODEL_VERSION: $MODEL_VERSION" log " ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION" @@ -441,10 +498,10 @@ configure_images() { local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g') # Replace RELATED_IMAGE_SPLUNK_ENTERPRISE env var - sed $SEDOPTION "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" # Replace splunk-operator image (the container image itself) - sed $SEDOPTION "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" log " ✓ Updated Splunk Enterprise image: $splunk_full" log " ✓ Updated Splunk Operator image: $splunk_operator_full" @@ -890,6 +947,11 @@ ${public_subnets}" fi else log "No subnets specified - eksctl will create new subnets automatically" + # One NAT gateway => one Elastic IP. HighlyAvailable uses one NAT per AZ + # (often 3 EIPs) and commonly trips the default regional EIP quota (5). 
+ vpc_config="vpc: + nat: + gateway: Single" fi cat < eks-cluster-config.yaml @@ -899,6 +961,8 @@ metadata: name: ${CLUSTER_NAME} region: ${REGION} version: "${K8S_VERSION}" +autoModeConfig: + enabled: false iam: withOIDC: true addons: @@ -1365,6 +1429,139 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } +# ---------- AWS Load Balancer Controller (LBC) ---------- +# LBC watches Services with the "aws-load-balancer-type: external" annotation +# (the in-tree cloud controller skips those Services on purpose) and drives +# NLB/ALB provisioning through the AWS ELBv2 API. Without LBC installed, such +# Services stay in EXTERNAL-IP=<pending> forever. LBC also gives us IP-mode +# targeting, ACM-backed TLS termination, and modern NLB attributes — all +# features the in-tree controller does not support. + +# Fetches the upstream-recommended IAM policy for LBC from a pinned git tag and +# creates a customer-managed policy in the account (idempotent). Emits the ARN +# on stdout so the caller can attach it via eksctl. Uses a cluster-scoped name +# so teardown of one cluster won't remove a policy shared with other clusters. +ensure_lbc_iam_policy() { + # Resolve the caller's account ID; construct the canonical policy ARN + # deterministically (IAM policy names are unique per account). This avoids + # parsing AWS CLI text output -- some CLI/JMESPath combinations have been + # observed to emit multi-line "None\nNone" for `Policies[?...].Arn | [0]` + # when no match exists, which would otherwise slip past a "!= None" guard. + local acct policy_arn + acct="$(aws sts get-caller-identity --query Account --output text 2>/dev/null | tr -d '[:space:]')" + if [[ -z "$acct" || ! 
"$acct" =~ ^[0-9]{12}$ ]]; then + err "Could not resolve a valid AWS account ID via STS (got: '${acct}')" + fi + policy_arn="arn:aws:iam::${acct}:policy/${LBC_POLICY_NAME}" + + if aws iam get-policy --policy-arn "$policy_arn" >/dev/null 2>&1; then + log "✓ LBC IAM policy already exists: ${policy_arn}" >&2 + printf "%s" "$policy_arn" + return 0 + fi + + local tmp; tmp="$(mktemp)"; TMP_FILES+=("$tmp") + local url="https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/${LBC_POLICY_VERSION}/docs/install/iam_policy.json" + log "Fetching LBC IAM policy ${LBC_POLICY_VERSION} from ${url}" >&2 + if ! curl -fsSL --max-time 60 "$url" -o "$tmp"; then + err "Failed to download AWS LBC IAM policy from ${url}. Check network access or bump LBC_POLICY_VERSION." + fi + if ! jq -e . "$tmp" >/dev/null 2>&1; then + err "Downloaded LBC IAM policy is not valid JSON. Refusing to proceed." + fi + + local created + created="$(aws iam create-policy \ + --policy-name "${LBC_POLICY_NAME}" \ + --policy-document "file://${tmp}" \ + --description "AWS Load Balancer Controller policy for ${CLUSTER_NAME} (${LBC_POLICY_VERSION})" \ + --query 'Policy.Arn' --output text 2>/dev/null | tr -d '[:space:]')" + if [[ -z "$created" || "$created" != arn:aws:iam::* ]]; then + err "create-policy did not return a valid ARN for ${LBC_POLICY_NAME} (got: '${created}')" + fi + log "✓ Created LBC IAM policy ${LBC_POLICY_NAME}: ${created}" >&2 + printf "%s" "$created" +} + +# Creates the IRSA-bound ServiceAccount used by the LBC deployment. Uses eksctl +# so the trust policy is pinned to this cluster's OIDC provider and SA subject. +ensure_lbc_irsa() { + log "Ensuring IRSA for AWS Load Balancer Controller (${LBC_NS}/${LBC_SA})..." 
+ local policy_arn; policy_arn="$(ensure_lbc_iam_policy)" + if [[ -z "$policy_arn" || "$policy_arn" != arn:aws:iam::* ]]; then + err "LBC IAM policy ARN is empty/invalid ('${policy_arn}'); cannot configure IRSA" + fi + + eksctl create iamserviceaccount \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --namespace "${LBC_NS}" \ + --name "${LBC_SA}" \ + --role-name "${LBC_ROLE_NAME}" \ + --attach-policy-arn "${policy_arn}" \ + --approve \ + --override-existing-serviceaccounts + + wait_resource_exists "${LBC_NS}" sa "${LBC_SA}" 180 + log "✓ LBC IRSA role and service account configured" +} + +# Tags user-provided subnets so LBC can auto-discover where to place LBs. +# eksctl already tags subnets it creates, so this is a no-op when the cluster +# was created without explicit cluster.subnets. +tag_lbc_subnets() { + if [[ ${#PUBLIC_SUBNETS[@]} -eq 0 && ${#PRIVATE_SUBNETS[@]} -eq 0 ]]; then + log "No user-provided subnets; eksctl-created subnets are already tagged for LBC discovery." + return 0 + fi + log "Tagging user-provided subnets for AWS Load Balancer Controller discovery..." + if [[ ${#PUBLIC_SUBNETS[@]} -gt 0 ]]; then + log " Public subnets (${#PUBLIC_SUBNETS[@]}): kubernetes.io/role/elb=1" + aws ec2 create-tags --region "${REGION}" \ + --resources "${PUBLIC_SUBNETS[@]}" \ + --tags Key=kubernetes.io/role/elb,Value=1 \ + "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared" + fi + if [[ ${#PRIVATE_SUBNETS[@]} -gt 0 ]]; then + log " Private subnets (${#PRIVATE_SUBNETS[@]}): kubernetes.io/role/internal-elb=1" + aws ec2 create-tags --region "${REGION}" \ + --resources "${PRIVATE_SUBNETS[@]}" \ + --tags Key=kubernetes.io/role/internal-elb,Value=1 \ + "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared" + fi + log "✓ Subnets tagged for LBC auto-discovery" +} + +install_aws_load_balancer_controller() { + log "Installing AWS Load Balancer Controller (helm chart ${LBC_CHART_VERSION})..." 
+ + local vpc_id + vpc_id="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" \ + --query 'cluster.resourcesVpcConfig.vpcId' --output text 2>/dev/null || true)" + if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then + err "Could not determine VPC ID for cluster ${CLUSTER_NAME}. LBC install requires vpcId." + fi + + if ! aws iam get-role --role-name "${LBC_ROLE_NAME}" >/dev/null 2>&1; then + err "IRSA role ${LBC_ROLE_NAME} not found. ensure_lbc_irsa must run first." + fi + + helm repo add eks https://aws.github.io/eks-charts >/dev/null + helm repo update >/dev/null + helm_retry 5 upgrade --install "${LBC_RELEASE}" eks/aws-load-balancer-controller \ + --namespace "${LBC_NS}" \ + --version "${LBC_CHART_VERSION}" \ + --set clusterName="${CLUSTER_NAME}" \ + --set region="${REGION}" \ + --set vpcId="${vpc_id}" \ + --set serviceAccount.create=false \ + --set serviceAccount.name="${LBC_SA}" \ + --wait --timeout 10m + + check_ready "${LBC_NS}" "app.kubernetes.io/name=aws-load-balancer-controller" + log "✓ AWS Load Balancer Controller ${LBC_CHART_VERSION} installed and ready" +} + # ---------- External S3-compatible object storage (credentials only; no in-cluster install) ---------- ensure_s3compat_credentials() { # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs). @@ -1536,6 +1733,39 @@ ensure_s3_upload_splunk_app() { fi } +ensure_external_objstore_upload_splunk_app() { + if [[ -z "${SPLUNK_APP_LOCAL_PATH}" ]]; then + log "SPLUNK_APP_LOCAL_PATH not set; skipping app upload to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/apps/" + return 0 + fi + if [[ ! 
-f "${SPLUNK_APP_LOCAL_PATH}" ]]; then + warn "SPLUNK_APP_LOCAL_PATH='${SPLUNK_APP_LOCAL_PATH}' not found; skipping upload" + return 0 + fi + if [[ -z "${OBJ_STORE_ENDPOINT}" ]]; then + warn "OBJ_STORE_ENDPOINT not set; cannot upload Splunk app to external object store" + return 0 + fi + + local base key + base="$(basename "${SPLUNK_APP_LOCAL_PATH}")" + key="apps/${base}" + log "Ensuring Splunk app '${base}' exists at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}" + + if AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3api head-object --bucket "${OBJ_STORE_BUCKET}" --key "${key}" >/dev/null 2>&1; then + log "App already present at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}; skipping upload" + else + AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3 cp "${SPLUNK_APP_LOCAL_PATH}" "s3://${OBJ_STORE_BUCKET}/${key}" + log "Uploaded ${base} to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}" + fi +} + +should_wait_for_splunk_app_install() { + [[ -n "${SPLUNK_APP_LOCAL_PATH:-}" && -f "${SPLUNK_APP_LOCAL_PATH}" ]] +} + ensure_namespace() { kubectl get ns "$1" >/dev/null 2>&1 || kubectl create ns "$1"; } ensure_bucket_policy() { @@ -2112,6 +2342,233 @@ show_platform_access_info() { log "" } +saia_service_template_enabled() { + [[ -n "${SAIA_SERVICE_TYPE:-}" && "${SAIA_SERVICE_TYPE}" != "null" && "${SAIA_SERVICE_TYPE}" != "ClusterIP" ]] +} + +saia_aiservice_name() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + printf "%s-saia" "${platform_name}" +} + +wait_for_aiservice_exists() { + local name="$1" timeout="${2:-600}" waited=0 + while ! 
kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do + [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}" + sleep 5 + waited=$((waited + 5)) + done +} + +apply_saia_service_annotations() { + local aiservice_name="$1" + local annotation_keys key value + + annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)" + [[ -z "${annotation_keys}" ]] && return 0 + + local annotate_args=() + while IFS= read -r key; do + [[ -z "${key}" || "${key}" == "null" ]] && continue + value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")" + [[ -z "${value}" || "${value}" == "null" ]] && continue + annotate_args+=("${key}=${value}") + done <<< "${annotation_keys}" + + if [[ ${#annotate_args[@]} -gt 0 ]]; then + log "Applying SAIA Service annotations to AIService/${aiservice_name}..." + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite + fi +} + +byo_target_group_enabled() { + [[ "${BYO_TG_ENABLED:-false}" == "true" ]] +} + +# Validates BYO target-group configuration and warns about misconfigurations +# before any kubectl/aws calls are issued. Caller decides whether to err or +# return on warnings — we treat missing required fields as fatal because the +# rest of the install would silently misroute traffic. +validate_byo_target_group_config() { + byo_target_group_enabled || return 0 + + if [[ "${INSTALL_LBC:-false}" != "true" ]]; then + err "byoTargetGroup.enabled=true requires awsLoadBalancerController.install=true (LBC manages the TargetGroupBinding)." + fi + if [[ -z "${BYO_TG_ARN:-}" || "${BYO_TG_ARN}" == "null" ]]; then + err "byoTargetGroup.enabled=true requires byoTargetGroup.targetGroupArn to be set." 
+ fi + if [[ "${BYO_TG_ARN}" != arn:aws:elasticloadbalancing:* ]]; then + err "byoTargetGroup.targetGroupArn must look like 'arn:aws:elasticloadbalancing:::targetgroup//' (got: ${BYO_TG_ARN})." + fi + if [[ -z "${BYO_TG_SG_ID:-}" || "${BYO_TG_SG_ID}" == "null" ]]; then + err "byoTargetGroup.enabled=true requires byoTargetGroup.securityGroupId (the customer LB's SG) so LBC opens pod-SG ingress correctly." + fi + if [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]]; then + log "WARNING: byoTargetGroup.enabled=true with serviceTemplate.type=LoadBalancer creates BOTH an operator-managed LB AND a TargetGroupBinding. Set serviceTemplate.type=ClusterIP for pure BYO." >&2 + fi +} + +# Apply a TargetGroupBinding CR pointing at the customer's pre-provisioned +# target group. AWS LBC reads this CR and registers the SAIA Service's pod +# IPs (targetType: ip) into the customer's TG, then deregisters them on pod +# rotation. The networking.ingress block has LBC open the pod SG to the LB's +# SG only — never 0.0.0.0/0 (codeguard-0-iac-security). +apply_byo_target_group_binding() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + local svc_name + svc_name="$(saia_aiservice_name "${platform_name}")-saia-service" + + byo_target_group_enabled || return 0 + + log "Applying TargetGroupBinding for BYO target group ${BYO_TG_ARN}..." + cat </dev/null || true)" + [[ "${svc_type}" != "LoadBalancer" ]] && return 0 + + log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..." 
+ kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{ + "spec": { + "allocateLoadBalancerNodePorts": false, + "externalTrafficPolicy": "Local" + } +}' >/dev/null + log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local" +} + +patch_saia_public_service_workaround() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + local aiservice_name public_svc_name effective_type + + aiservice_name="$(saia_aiservice_name "${platform_name}")" + public_svc_name="${aiservice_name}-saia-service" + + wait_for_aiservice_exists "${aiservice_name}" + + # In BYO mode the customer owns the LB; force the SAIA Service to ClusterIP + # regardless of what serviceTemplate.type says — TargetGroupBinding wires + # everything else. + if byo_target_group_enabled; then + effective_type="ClusterIP" + else + effective_type="${SAIA_SERVICE_TYPE}" + fi + + if [[ -n "${effective_type:-}" && "${effective_type}" != "null" ]]; then + log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${effective_type})..." + if [[ "${effective_type}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + log "WARNING: NodePort exposure is discouraged; consider Mode 1 (LoadBalancer + LBC) or Mode 2 (BYO target group) instead." 
>&2 + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"NodePort\", + \"ports\": [ + { + \"name\": \"http\", + \"port\": 8080, + \"targetPort\": 8080, + \"nodePort\": ${SAIA_SERVICE_NODE_PORT} + } + ] + } + } + } +}" + else + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"${effective_type}\" + } + } + } +}" + fi + fi + + apply_saia_service_annotations "${aiservice_name}" + + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null + + if [[ -n "${effective_type:-}" && "${effective_type}" != "null" && "${effective_type}" != "ClusterIP" ]]; then + log "Recreating SAIA public Service to ensure patched settings take effect..." + kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true + wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300 + fi + + # NodePort-free hardening: disable kube-proxy NodePort allocation on + # LoadBalancer Services and apply BYO TargetGroupBinding if configured. + patch_saia_service_disable_nodeport "${platform_name}" + apply_byo_target_group_binding "${platform_name}" +} + +wait_for_saia_load_balancer() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" timeout="${2:-1200}" waited=0 + local svc_name hostname="" + svc_name="$(saia_aiservice_name "${platform_name}")-saia-service" + + # In BYO mode the Service is ClusterIP and the customer's LB DNS is not + # surfaced via .status.loadBalancer; skip the wait. Mode 1 (operator- + # managed NLB) still gates on SAIA_SERVICE_TYPE=LoadBalancer. + if byo_target_group_enabled; then + log "byoTargetGroup.enabled=true — skipping wait for operator-managed LB hostname (LB is customer-managed)." 
+ return 0 + fi + [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]] || return 0 + + log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name} to receive an external hostname..." + while true; do + hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)" + [[ -z "${hostname}" ]] && hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)" + if [[ -n "${hostname}" ]]; then + log "✓ SAIA external endpoint: ${hostname}" + return 0 + fi + [[ $waited -ge $timeout ]] && err "Timed out waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name}" + sleep 5 + waited=$((waited + 5)) + done +} + # Quick status check function - can be called standalone check_aiplatform_status() { local platform_name="${1:-${AI_PLATFORM_NAME}}" @@ -2262,6 +2719,14 @@ YAML ;; esac + local svc_template_yaml="" + if saia_service_template_enabled; then + svc_template_yaml=" serviceTemplate:"$'\n'" spec:"$'\n'" type: ${SAIA_SERVICE_TYPE}"$'\n' + if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + svc_template_yaml+=" ports:"$'\n'" - name: http"$'\n'" port: 8080"$'\n'" targetPort: 8080"$'\n'" nodePort: ${SAIA_SERVICE_NODE_PORT}"$'\n' + fi + fi + cat </dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi done @@ -2878,7 +3350,18 @@ preflight_env() { fi fi if [[ $subnet_count -eq 0 ]]; then - pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically" + pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically (NAT mode: Single = 1 Elastic IP)" + pf_header "Elastic IP headroom (new VPC)" + local eip_cnt + eip_cnt="$(aws ec2 describe-addresses --region "${REGION}" --query 'length(Addresses)' --output text 2>/dev/null || true)" + if [[ -n "${eip_cnt}" && "${eip_cnt}" =~ ^[0-9]+$ ]]; 
then + pf_ok "Allocated Elastic IPs in ${REGION}: ${eip_cnt}" + if (( eip_cnt >= 5 )); then + pf_warn "Typical default EIP quota is 5 per region. At ${eip_cnt}+ addresses, NAT gateway EIP allocation may fail (you saw: maximum number of addresses). Release unused EIPs in EC2 → Elastic IPs or request a quota increase before create cluster." + fi + else + pf_warn "Could not list Elastic IPs (aws ec2 describe-addresses). If create fails on NAT/EIP, check quotas and unused addresses." + fi else local all_subnets=("${PRIVATE_SUBNETS[@]}" "${PUBLIC_SUBNETS[@]}") local vpc_id="" @@ -3102,6 +3585,7 @@ install_ai_platform_stack() { log "=== Setting up Splunk AI Platform stack ===" if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA." + ensure_external_objstore_upload_splunk_app else ensure_s3_bucket_and_prefixes ensure_s3_upload_splunk_app @@ -3169,19 +3653,41 @@ reconcile_flow() { fi install_kube_prometheus install_cert_manager + # Validate BYO target-group config before any side-effecting calls. Fail + # fast if the customer set byoTargetGroup.enabled=true without LBC or + # required ARN/SG fields — better an early error than a silently-broken + # data path. + validate_byo_target_group_config + # AWS Load Balancer Controller (LBC) — required when the operator provisions + # NLBs/ALBs (Mode 1: Service type=LoadBalancer + `aws-load-balancer-type: + # external` annotation) or when binding the SAIA Service to a customer- + # managed target group via TargetGroupBinding (Mode 2: byoTargetGroup + # enabled). Off-AWS deployments leave this false. 
+ if [[ "${INSTALL_LBC}" == "true" ]]; then + log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller" + tag_lbc_subnets + ensure_lbc_irsa + install_aws_load_balancer_controller + else + log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install" + fi ensure_s3compat_credentials install_otel_operator_and_contrib_collector install_ray_operator install_splunk_operator install_splunk_ai_operator install_ai_platform_stack - wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200 + if should_wait_for_splunk_app_install; then + wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200 + else + log "Skipping Splunk AI Assistant app wait because no local app archive is configured" + fi # push_saia_conf_into_pod } # ---------- MAIN ---------- main_install() { - for t in aws eksctl kubectl helm git jq yq; do need "$t"; done + for t in aws eksctl kubectl helm git jq yq curl; do need "$t"; done # Load configuration from YAML file load_config diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 9faa669..7e733af 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -13,7 +13,7 @@ # ---------- Cluster Configuration ---------- cluster: name: airgap-cluster - # region: us-east-2 # Ignored for on-prem, but required in config + region: us-east-2 # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key @@ -38,7 +38,7 @@ nodes: # - /var/lib/k0s must have at least 100 GB free on controllers # If using a dedicated disk, mount it at /var/lib/k0s before running this script. # -# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). 
+# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) @@ -58,8 +58,8 @@ storage: # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" + rootUser: "" # CHANGE THIS — AWS_ACCESS_KEY_ID (AKIA…) or MinIO root user + rootPassword: "" # CHANGE THIS — AWS secret OR MinIO root password; NEVER commit real keys # ---------- Container Images Configuration ---------- images: @@ -132,32 +132,36 @@ aiPlatform: workerGroupConfig: imageRegistry: "" - # ---------- SAIA public exposure (OPTIONAL) ---------- - # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) - # defaults to ClusterIP, meaning it is only reachable from inside the cluster. - # - # Two call patterns hit this Service: - # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) - # (B) End user's browser → saia-service (needs external exposure) + # ---------- SAIA public exposure (NodePort-free) ---------- + # The SAIA "public" Service (nginx reverse proxy in front of v1 + v2 API + # pods) defaults to ClusterIP — only reachable from inside the cluster. Two + # call patterns hit it: + # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) + # (B) End user's browser → saia-service (needs external exposure) # # Pattern B is used by the v2 chat UI (/query streaming, conversations, # feedback, admin endpoints). Without external exposure the v2 chat UI - # breaks for users, even though v1 one-shot SPL features still work. + # breaks for users; v1 one-shot SPL still works. 
+ # + # The supported on-prem path is `type: LoadBalancer` backed by MetalLB + # (allocates a routable VIP from a pool you provide; ARP / BGP-announces it + # on your network). NodePort is intentionally avoided so we never open + # ports 30000-32767 on every worker node. # - # To DISABLE external exposure (use ClusterIP only), either: - # * Delete / comment-out the entire `serviceTemplate:` block below, OR - # * Set `type: ClusterIP` explicitly. - # Either is treated identically — the installer skips emitting serviceTemplate - # into the AIPlatform CR and the operator falls through to the ClusterIP - # default in reconcileSAIAService(). + # The installer: + # * Installs MetalLB when metallb.install is true (see the metallb section below). + # * Applies an IPAddressPool + L2Advertisement (or BGPAdvertisement) from + # the metallb config below. + # * Renders the SAIA Service as type: LoadBalancer; MetalLB allocates a + # VIP from the pool and announces it. + # * Patches the Service with `allocateLoadBalancerNodePorts: false` and + # `externalTrafficPolicy: Local` so kube-proxy does not open a NodePort. # - # To ENABLE external exposure for on-prem / airgap customers, NodePort is the - # recommended default: any k8s node IP + the configured nodePort yields a - # reachable endpoint from VPN-connected users. No cloud LB / cert-manager - # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. + # To DISABLE external exposure (ClusterIP only), comment out the whole + # serviceTemplate block (or set `type: ClusterIP`) AND set metallb.install: false. serviceTemplate: - type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) - nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. + type: LoadBalancer # ClusterIP | LoadBalancer (NodePort is discouraged on k0s; the installer warns if it is set) + # No nodePort field — explicitly NodePort-free. 
features: - name: "saia" @@ -175,6 +179,38 @@ aiPlatform: value: "true" effect: "NoSchedule" +# ---------- MetalLB (k0s LoadBalancer provider) ---------- +# Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal +# / k0s cluster. Pinned chart version for supply-chain reproducibility +# (codeguard-0-supply-chain-security). +# +# If serviceTemplate.type=NodePort, the installer skips MetalLB entirely even +# when metallb.install=true (NodePort does not use a LoadBalancer provider). +metallb: + install: true # set false if MetalLB is already installed or not needed + chartVersion: "0.14.8" # metallb/metallb Helm chart (matches MetalLB v0.14.8) + namespace: "metallb-system" + + # Address pool — a range of IPs MetalLB can hand out to LoadBalancer + # Services. Must be routable from clients (VPN-connected users) to your k0s + # workers. Use IPs that are NOT used elsewhere on the LAN. + pool: + name: "saia-pool" + addresses: + - "10.20.30.100-10.20.30.110" # CHANGE THIS to a free range on your network + + # Advertisement mode: "layer2" works on most LANs without network gear + # changes (one elected node answers ARP for the VIP at a time; failover ~ + # seconds). Use "bgp" only if your fabric supports BGP peering — then also + # populate metallb.bgpPeers below. + mode: "layer2" # layer2 | bgp + + # Required only when mode=bgp. Leave empty for layer2. + bgpPeers: [] + # - peerAddress: "10.0.0.1" + # peerASN: 65001 + # myASN: 65000 + # ---------- Image Pull Secrets ---------- imagePullSecrets: secrets: diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 2adcffe..b4d508f 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -388,6 +388,17 @@ configure_images() { log "✓ All images configured successfully" } +# True if objectStore.auth values are still obvious template text. 
Non-empty +# placeholders otherwise pass the length preflight and get applied into +# minio-credentials, which makes SAIA fail at startup with InvalidAccessKeyId. +object_store_auth_looks_like_placeholder() { + case "${MINIO_ROOT_USER}${MINIO_ROOT_PASSWORD}" in + *\<*|*\>*) return 0 ;; + *CHANGEME*|*changeme*) return 0 ;; + esac + return 1 +} + # ====== PREFLIGHT CHECKS ====== preflight_checks() { pf_header "Required tools" @@ -423,6 +434,9 @@ preflight_checks() { [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)" + if object_store_auth_looks_like_placeholder; then + pf_fail "objectStore.auth still contains template placeholders (e.g. <...> or CHANGEME). Replace with a real access key and secret in your config (keep secrets in a Git-ignored file such as tools/cluster_setup/k0s-config.local.yaml)." + fi pf_header "Infrastructure mode" pf_ok "Using existing infrastructure (on-prem/baremetal)" @@ -1118,6 +1132,10 @@ ensure_namespace() { # the Kubernetes credentials secret so the operator and workloads can auth. ensure_s3compat_credentials() { log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..." + if object_store_auth_looks_like_placeholder; then + err "Refusing to create minio-credentials: objectStore.auth contains template placeholders; fix ${CONFIG_FILE}" + return 1 + fi if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" return 1 @@ -2637,15 +2655,23 @@ install_ai_platform_cr() { # Ensure object storage credentials secret exists in AI namespace log "Creating/updating S3-compatible credentials secret (minio-credentials) in ${AI_NS}..." 
- kubectl -n "${AI_NS}" create secret generic minio-credentials \ - --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ - --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ - --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ - --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ - --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ - --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - log "✓ Object storage credentials secret ready" + if object_store_auth_looks_like_placeholder; then + if kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then + warn "Skipping minio-credentials apply: auth in ${CONFIG_FILE} still looks like a template (e.g. contains '<'). Preserving existing secret." + else + err "minio-credentials missing and cannot be created: fix objectStore.auth in ${CONFIG_FILE} (remove <...> placeholders)." + fi + else + kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ Object storage credentials secret ready" + fi # Build imagePullSecrets YAML from created secrets local image_pull_secrets="" @@ -2819,6 +2845,270 @@ YAML log "AIPlatform CR installed successfully" } +saia_service_template_enabled_k0s() { + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]] +} + +# True when SAIA public Service is explicitly NodePort. 
MetalLB is not used in +# that mode, so install_metallb skips the Helm install even if metallb.install=true. +k0s_saia_service_template_is_nodeport() { + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ "${svc_type}" == "NodePort" ]] +} + +wait_for_k0s_aiservice_exists() { + local name="$1" timeout="${2:-600}" waited=0 + while ! kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do + [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}" + sleep 5 + waited=$((waited + 5)) + done +} + +apply_k0s_saia_service_annotations() { + local aiservice_name="$1" + local annotation_keys key value + + annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)" + [[ -z "${annotation_keys}" ]] && return 0 + + local annotate_args=() + while IFS= read -r key; do + [[ -z "${key}" || "${key}" == "null" ]] && continue + value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")" + [[ -z "${value}" || "${value}" == "null" ]] && continue + annotate_args+=("${key}=${value}") + done <<< "${annotation_keys}" + + if [[ ${#annotate_args[@]} -gt 0 ]]; then + log "Applying SAIA Service annotations to AIService/${aiservice_name}..." + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite + fi +} + +# ---------- MetalLB (k0s LoadBalancer provider) ---------- +# k0s ships without a Service.type=LoadBalancer provider. MetalLB fills that +# gap by allocating a VIP from a customer-provided pool and announcing it via +# Layer-2 (ARP/NDP) or BGP. We pin the chart version for supply-chain +# reproducibility (codeguard-0-supply-chain-security). 
+ +metallb_enabled_k0s() { + local v + v="$(yq eval '.metallb.install // false' "${CONFIG_FILE}" 2>/dev/null || echo false)" + [[ "${v}" == "true" ]] +} + +install_metallb() { + metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; } + + if k0s_saia_service_template_is_nodeport; then + log "Skipping MetalLB install: aiPlatform.serviceTemplate.type=NodePort (LoadBalancer provider not used for SAIA)." + log "NOTE: metallb.install=true has no effect while SAIA uses NodePort. Set metallb.install=false to match config, or use type=LoadBalancer to install MetalLB." + return 0 + fi + + local ns chart_version pool_name addr_count mode + ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)" + chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)" + pool_name="$(yq eval '.metallb.pool.name // "saia-pool"' "${CONFIG_FILE}" 2>/dev/null)" + addr_count="$(yq eval '.metallb.pool.addresses // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)" + mode="$(yq eval '.metallb.mode // "layer2"' "${CONFIG_FILE}" 2>/dev/null)" + + if [[ "${addr_count}" == "0" ]]; then + err "metallb.install=true but metallb.pool.addresses is empty. Provide at least one IP range routable on your network." + fi + if [[ "${mode}" != "layer2" && "${mode}" != "bgp" ]]; then + err "metallb.mode must be 'layer2' or 'bgp' (got: ${mode})." + fi + + log "Installing MetalLB ${chart_version} into namespace ${ns}..." + helm repo add metallb https://metallb.github.io/metallb >/dev/null 2>&1 || true + helm repo update >/dev/null 2>&1 || true + + kubectl get ns "${ns}" >/dev/null 2>&1 || kubectl create ns "${ns}" + + helm upgrade --install metallb metallb/metallb \ + --namespace "${ns}" \ + --version "${chart_version}" \ + --wait --timeout 5m + + # Wait for the controller webhook to be Ready before applying CRs, otherwise + # the IPAddressPool / L2Advertisement applies race the validating webhook. 
+ log "Waiting for MetalLB controller to be ready..." + kubectl -n "${ns}" rollout status deploy/metallb-controller --timeout=180s + + # Render IPAddressPool with the configured address ranges. + local addresses_yaml="" + local i + local pool_count + pool_count="$(yq eval '.metallb.pool.addresses | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)" + for ((i=0; i/dev/null)" + [[ -z "${addr}" || "${addr}" == "null" ]] && continue + addresses_yaml+=" - ${addr}"$'\n' + done + + log "Applying MetalLB IPAddressPool '${pool_name}' (${addr_count} range(s))..." + cat </dev/null || echo 0)" + if [[ "${peer_count}" == "0" ]]; then + err "metallb.mode=bgp requires metallb.bgpPeers to be non-empty (peerAddress, peerASN, myASN per peer)." + fi + local p + for ((p=0; p/dev/null)" + peer_asn="$(yq eval ".metallb.bgpPeers[${p}].peerASN" "${CONFIG_FILE}" 2>/dev/null)" + my_asn="$(yq eval ".metallb.bgpPeers[${p}].myASN" "${CONFIG_FILE}" 2>/dev/null)" + [[ -z "${peer_addr}" || -z "${peer_asn}" || -z "${my_asn}" ]] && \ + err "metallb.bgpPeers[${p}] missing peerAddress / peerASN / myASN." + cat </dev/null || true)" + [[ "${svc_type}" != "LoadBalancer" ]] && return 0 + + log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..." 
+ kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{ + "spec": { + "allocateLoadBalancerNodePorts": false, + "externalTrafficPolicy": "Local" + } +}' >/dev/null + log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local" +} + +patch_k0s_saia_public_service_workaround() { + local platform_name="${CLUSTER_NAME}-ai-platform" + local aiservice_name="${platform_name}-saia" + local public_svc_name="${aiservice_name}-saia-service" + local svc_type svc_node_port + + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + + wait_for_k0s_aiservice_exists "${aiservice_name}" + + if saia_service_template_enabled_k0s; then + log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..." + if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then + log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true (MetalLB install is skipped automatically when type=NodePort)." 
>&2 + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"NodePort\", + \"ports\": [ + { + \"name\": \"http\", + \"port\": 8080, + \"targetPort\": 8080, + \"nodePort\": ${svc_node_port} + } + ] + } + } + } +}" + else + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"${svc_type}\" + } + } + } +}" + fi + fi + + apply_k0s_saia_service_annotations "${aiservice_name}" + + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null + + if saia_service_template_enabled_k0s; then + log "Recreating SAIA public Service to ensure patched settings take effect..." + kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true + # Wait briefly for the operator to recreate it before patching NodePort + # allocation off; if it doesn't come back the patch will be a no-op. + local waited=0 + while ! kubectl -n "${AI_NS}" get svc "${public_svc_name}" >/dev/null 2>&1; do + [[ ${waited} -ge 300 ]] && break + sleep 5 + waited=$((waited + 5)) + done + fi + + patch_k0s_saia_service_disable_nodeport +} + # ====== INSTALL FULL STACK ====== install_ai_platform_stack() { log "Installing complete AI Platform stack..." @@ -2901,9 +3191,18 @@ install_ai_platform_stack() { # Apply Splunk Standalone CR (non-blocking — pod boots in background) install_splunk_standalone + # MetalLB must be installed BEFORE the AIPlatform CR is reconciled — the + # operator renders a Service.type=LoadBalancer for SAIA and we need a + # provider in the cluster to allocate a VIP, otherwise the Service is + # stuck in EXTERNAL-IP= indefinitely. No-op when + # metallb.install=false (e.g., user is bringing their own MetalLB or wants + # ClusterIP only). 
+ install_metallb + # Install AI Platform operator and CR while Splunk Standalone boots install_splunk_ai_operator install_ai_platform_cr + patch_k0s_saia_public_service_workaround # Now wait for Splunk Standalone to be ready (likely already done by now) wait_for_splunk_standalone @@ -2923,7 +3222,10 @@ check_platform_health() { # Check 1: Cluster nodes log "Checking cluster nodes..." local not_ready - not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l || echo "0") + # Count nodes whose status is not Ready without relying on grep exit codes. + # This avoids `set -euo pipefail` aborting the script when all nodes are + # Ready, while still producing a whitespace-free numeric result. + not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk 'index($0, " Ready ") == 0 { count++ } END { print count+0 }') if [[ "${not_ready}" -gt 0 ]]; then warn "Found ${not_ready} node(s) not in Ready state" kubectl get nodes