From 68068eaea9007942a269a2b52b1cb7fd6d2b5820 Mon Sep 17 00:00:00 2001 From: kbhos Date: Sun, 26 Apr 2026 18:24:19 +0530 Subject: [PATCH 1/5] feat(ai-platform): SAIA service exposure to external requests --- tools/cluster_setup/cluster-config.yaml | 156 ++++++- tools/cluster_setup/eks_cluster_with_stack.sh | 394 +++++++++++++++++- tools/cluster_setup/k0s-cluster-config.yaml | 24 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 91 ++++ .../splunk-operator-cluster.yaml | 1 - 5 files changed, 623 insertions(+), 43 deletions(-) diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index 513b425..9111689 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -13,7 +13,26 @@ # ---------- Cluster Configuration ---------- cluster: - useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) + # ------------------------------------------------------------------------ + # LIFECYCLE WORKFLOW (to avoid VPC/IGW quota churn and DELETE_FAILED loops) + # ------------------------------------------------------------------------ + # 1. FIRST install (cluster does not exist yet): + # useExisting: false # eksctl creates the cluster + VPC + # ./eks_cluster_with_stack.sh install + # + # 2. AFTER first install succeeds, flip this one line: + # useExisting: true # subsequent `install` only reconciles + # # operators/CRs on the existing cluster. + # Re-running `install` is now safe and does NOT create new VPCs/IGWs. + # + # 3. When you genuinely want to tear down: + # ALWAYS use `delete-full` (NOT `delete`). It uninstalls CRs/operators + # first so the AWS Load Balancer Controller removes its NLBs + SGs + # before CFN deletes the VPC -- this prevents DELETE_FAILED stacks + # leaving orphan VPCs behind and eating your per-region quota. 
+ # ./eks_cluster_with_stack.sh delete-full + # ------------------------------------------------------------------------ + useExisting: true # true = do not create cluster; use existing one (script fails if cluster not found) name: "ai-tier-sok-test-east2" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) @@ -79,7 +98,7 @@ nodeGroups: desiredCapacity: 2 # Initial number of GPU nodes minSize: 2 # Minimum GPU nodes maxSize: 4 # Maximum GPU nodes (set equal to desiredCapacity for H100) - volumeSize: 1000 # EBS volume size per GPU node (GB) - larger for model storage + volumeSize: 500 # EBS volume size per GPU node (GB) - larger for model storage volumeType: "gp3" # EBS volume type # ── H100 ONLY ────────────────────────────────────────────────────────────── @@ -99,7 +118,7 @@ nodeGroups: # Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). 
storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws + s3Bucket: "ai-platform-bucket-us-east-2" # Used when objectStore.type is aws storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size @@ -108,12 +127,8 @@ storage: # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://) # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme) objectStore: - type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - bucket: "ai-platform-bucket-minio-us-east-2" - endpoint: "http://13.59.216.105:9000" # MinIO API (9000) or SeaweedFS S3 gateway (8333) - auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root + type: "aws" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-us-east-2" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root # ---------- Container Images Configuration ---------- images: @@ -153,7 +168,7 @@ images: # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config) #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8" - image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31" + image: "docker.io/kbhos698/splunk-ai-operator:ai-tier" # Splunk Enterprise Images splunk: @@ -176,8 +191,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-008" - workerImage: "ml-platform/ray/ray-worker-gpu:build-008" + headImage: "ml-platform/ray/ray-head:build-v2-008" + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # 
Weaviate Vector Database weaviate: @@ -189,9 +204,14 @@ images: # SAIA (Splunk AI Assistant) Images saia: # Relative paths - registry prefix auto-applied - apiImage: "ml-platform/saia/saia-api:build-005" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" - + # NOTE: keep dataLoaderImage in sync with apiImage/apiV2Image. Tags older than + # v2-008 (specifically pre v2.0.4-13-g3b677604) ship a broken URL-compat shim + # that ignores VECTOR_DB_GRPC_* env vars and falls back to grpc.:443 TLS, + # causing the vector-db-setup posthook Job to fail with a Weaviate gRPC health + # check error. See pkg/ai/features/saia/impl.go (reconcilePostInstallHook). + apiImage: "ml-platform/saia/saia-api:build-v2-009" + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" # Supporting Images fluentBit: # Docker Hub public image (has full path, registry prefix ignored) @@ -204,6 +224,14 @@ images: # Public image - full path so registry prefix is NOT applied; validation checks this URL image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # NGINX reverse proxy used by the SAIA reconciler to route v1 / v2 requests + # by path. OPTIONAL: omit this block to use the script default + # (docker.io/library/nginx:1.27-alpine). Add it only to pin a specific tag + # or point at an internal mirror in airgapped clusters. 
+ # + # nginx: + # image: "harbor.internal/library/nginx:1.27-alpine" + # ---------- Operator Versions ---------- operators: ray: @@ -246,6 +274,104 @@ aiPlatform: serviceAccountName: "ray-worker-sa" imageRegistry: "" # Leave empty for default + # --------------------------------------------------------------------------- + # Public SAIA exposure + # --------------------------------------------------------------------------- + # The operator always renders a public Kubernetes Service named + # `-saia-service` whose endpoints are the in-cluster nginx + # pods (nginx terminates path routing to saia v1 / v2). HOW that Service is + # reached from outside the cluster depends on two settings below: + # + # - aiPlatform.serviceTemplate.{type, nodePort, annotations} + # - aiPlatform.awsLoadBalancerController.install + # + # Pick ONE of the three modes below. Each row shows: what you put in this + # file, what the install script does, and what you (the customer) must + # provision outside the cluster. + # + # --------------------------------------------------------------------------- + # MODE 1 — Operator-managed AWS NLB (default, simplest on EKS) + # --------------------------------------------------------------------------- + # serviceTemplate: + # type: LoadBalancer + # annotations: + # service.beta.kubernetes.io/aws-load-balancer-type: "external" + # service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" # or "internal" + # service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance" + # awsLoadBalancerController: + # install: true + # + # Script does: + # * Installs AWS Load Balancer Controller (LBC) with IRSA and tags + # public/private subnets for auto-discovery. + # * Creates the LoadBalancer-typed Service; LBC reads the annotations and + # provisions an internet-facing AWS NLB (~2-3 min). Public DNS appears + # in `.status.loadBalancer.ingress[0].hostname`. + # You must do: + # * Nothing on the AWS side — fully automated. 
+ # * (Optional) Add an ACM cert listener annotation if you want TLS + # termination at the NLB. + # + # --------------------------------------------------------------------------- + # MODE 2 — Bring-your-own AWS LB (you already have an NLB / ALB) + # --------------------------------------------------------------------------- + # serviceTemplate: + # type: NodePort + # nodePort: 30080 # any free port in 30000-32767 + # awsLoadBalancerController: + # install: false # no LBC, no operator-created LB + # + # Script does: + # * Creates the public Service as NodePort 30080 on every worker. + # * Skips LBC install entirely. + # You must do (in AWS, outside the script): + # 1. Pre-create an NLB or ALB (any scheme). + # 2. Create a target group: + # - Target type: instance + # - Protocol/Port: TCP/30080 (NLB) or HTTP/30080 (ALB) + # - Health check: HTTP /nginx_health on port "traffic-port", 200 OK + # 3. Attach the EKS managed-nodegroup ASG to the target group so + # membership tracks node scale-in/out, e.g. via Terraform: + # resource "aws_autoscaling_attachment" "saia" { + # autoscaling_group_name = "eks---NodeGroup-XXXX" + # lb_target_group_arn = "arn:aws:elasticloadbalancing:...:targetgroup/my-saia-tg/..." + # } + # 4. Worker node SG: allow ingress TCP/30080 from the NLB subnet CIDRs + # (NLB) or from the ALB's security group (ALB). + # + # --------------------------------------------------------------------------- + # MODE 3 — On-prem / k0s / airgap (HAProxy, F5, MetalLB, hardware LB, …) + # --------------------------------------------------------------------------- + # serviceTemplate: + # type: NodePort + # nodePort: 30080 + # awsLoadBalancerController: + # install: false # has no effect off-AWS, leave false + # + # Script does: + # * Same as Mode 2 — creates the public Service as NodePort 30080. 
+ # You must do (outside the cluster): + # * Point your existing L4 LB (HAProxy / F5 / MetalLB / hardware) at every + # worker node IP on TCP/30080, with HTTP health-check /nginx_health. + # Sample HAProxy backend: + # backend saia_be + # option httpchk GET /nginx_health + # server worker1 10.0.1.11:30080 check + # server worker2 10.0.1.12:30080 check + # + # --------------------------------------------------------------------------- + + # Active mode below — EDIT to switch. Default is MODE 1. + serviceTemplate: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "external" + service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance" + + awsLoadBalancerController: + install: true + # CPU Scheduling cpuScheduling: nodeSelector: {} diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 7426ae1..b6982fb 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -91,6 +91,14 @@ load_config() { SAIA_SERVICE_SA="$(yq eval '.aiPlatform.serviceAccounts.saiaService' "$cfg")" DEFAULT_ACCELERATOR="$(yq eval '.aiPlatform.defaultAcceleratorType' "$cfg")" WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")" + SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")" + SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")" + # AWS Load Balancer Controller (LBC) install toggle. Default: false — the + # script assumes customers bring their own LB and point it at NodePort + # (Path A). Set to true only when you want operator-managed NLB/ALB + # provisioning via the `aws-load-balancer-type: external` annotation or + # dynamic target registration via TargetGroupBinding CRs (Path B). 
+ INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")" INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")" INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")" INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")" @@ -120,9 +128,11 @@ load_config() { RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$cfg")" WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$cfg")" SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$cfg")" + SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image // ""' "$cfg")" SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$cfg")" FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")" + NGINX_IMAGE="$(yq eval '.images.nginx.image // "docker.io/library/nginx:1.27-alpine"' "$cfg")" # Subnets - read as arrays (support both cluster.subnets and top-level subnets) PRIVATE_SUBNETS=() @@ -172,6 +182,9 @@ load_config() { SAIA_SERVICE_SA="saia-service-sa" DEFAULT_ACCELERATOR="L40S" WORKER_IMAGE_REGISTRY="" + SAIA_SERVICE_TYPE="" + SAIA_SERVICE_NODE_PORT="" + INSTALL_LBC="false" INGRESS_HOST="ai.example.com" INGRESS_CLASS="nginx" INGRESS_TLS_SECRET="ai-platform-tls" @@ -179,6 +192,8 @@ load_config() { SPLUNK_OPERATOR_FILE="./splunk-operator-cluster.yaml" SPLUNK_AI_FILE="./artifacts.yaml" SPLUNK_IMAGE="splunk/splunk:10.2.0-dev1" + SAIA_API_V2_IMAGE="" + NGINX_IMAGE="docker.io/library/nginx:1.27-alpine" RAY_VERSION="v1.2.2" NVIDIA_VERSION="v0.17.3" ENABLE_CPU=true @@ -230,6 +245,19 @@ load_config() { # Splunk operators SPLUNK_AI_NS="splunk-ai-operator-system" + # AWS Load Balancer Controller (LBC) — required when a Service of type=LoadBalancer + # uses the "service.beta.kubernetes.io/aws-load-balancer-type: external" annotation + # (the in-tree EKS cloud controller intentionally skips those Services). 
Pinned + # chart and policy versions keep installs reproducible against a vetted upstream + # release (supply-chain hygiene: codeguard-0-supply-chain-security). + LBC_NS="kube-system" + LBC_SA="aws-load-balancer-controller" + LBC_RELEASE="aws-load-balancer-controller" + LBC_ROLE_NAME="AWSLoadBalancerControllerRole-${CLUSTER_NAME}" + LBC_POLICY_NAME="AWSLoadBalancerControllerIAMPolicy-${CLUSTER_NAME}" + LBC_CHART_VERSION="1.8.2" # helm chart version (appVersion v2.8.2) + LBC_POLICY_VERSION="v2.8.2" # upstream tag used to fetch iam_policy.json + log "Configuration loaded: cluster=${CLUSTER_NAME}, region=${REGION}, namespace=${AI_NS}" } @@ -386,47 +414,67 @@ configure_images() { local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE") local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE") local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE") + local saia_api_v2_full="" local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE") local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE") local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE") + local nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE") + if [[ -n "${SAIA_API_V2_IMAGE}" && "${SAIA_API_V2_IMAGE}" != "null" ]]; then + saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE") + fi # Escape special characters for sed local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g') local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g') local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g') local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g') + local saia_api_v2_escaped="" local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g') local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g') local otel_collector_escaped=$(echo "$otel_collector_full" | 
sed 's/[\/&]/\\&/g') + local nginx_escaped=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g') local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g') + if [[ -n "${saia_api_v2_full}" ]]; then + saia_api_v2_escaped=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g') + fi - SEDOPTION="-i" + local SED_INPLACE if [[ "$OSTYPE" == "darwin"* ]]; then - SEDOPTION="-i ''" + SED_INPLACE=(sed -i "") + else + SED_INPLACE=(sed -i) fi # Replace RELATED_IMAGE_ env vars by matching the env var name (not the value pattern) # This works regardless of what registry/image was there before - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" 
"$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" + if [[ -n "${saia_api_v2_escaped}" ]]; then + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_escaped}|" "$SPLUNK_AI_FILE" + fi + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" # Replace operator image (the container image itself, not env var) # Find the line with "image:" that's near "splunk-ai-operator" and replace it - sed $SEDOPTION "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" log " ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full" log " ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full" log " ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full" log " ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full" + if [[ -n "${saia_api_v2_full}" ]]; then + log " ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full" + fi log " ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full" log " ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full" log " ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full" + log " ✓ Updated RELATED_IMAGE_NGINX: $nginx_full" log " ✓ 
Updated operator image: $operator_full" log " ✓ Updated MODEL_VERSION: $MODEL_VERSION" log " ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION" @@ -441,10 +489,10 @@ configure_images() { local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g') # Replace RELATED_IMAGE_SPLUNK_ENTERPRISE env var - sed $SEDOPTION "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" # Replace splunk-operator image (the container image itself) - sed $SEDOPTION "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" log " ✓ Updated Splunk Enterprise image: $splunk_full" log " ✓ Updated Splunk Operator image: $splunk_operator_full" @@ -1365,6 +1413,139 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } +# ---------- AWS Load Balancer Controller (LBC) ---------- +# LBC watches Services with the "aws-load-balancer-type: external" annotation +# (the in-tree cloud controller skips those Services on purpose) and drives +# NLB/ALB provisioning through the AWS ELBv2 API. Without LBC installed, such +# Services stay in EXTERNAL-IP= forever. LBC also gives us IP-mode +# targeting, ACM-backed TLS termination, and modern NLB attributes — all +# features the in-tree controller does not support. + +# Fetches the upstream-recommended IAM policy for LBC from a pinned git tag and +# creates a customer-managed policy in the account (idempotent). Emits the ARN +# on stdout so the caller can attach it via eksctl. Uses a cluster-scoped name +# so teardown of one cluster won't remove a policy shared with other clusters. 
+ensure_lbc_iam_policy() { + # Resolve the caller's account ID; construct the canonical policy ARN + # deterministically (IAM policy names are unique per account). This avoids + # parsing AWS CLI text output -- some CLI/JMESPath combinations have been + # observed to emit multi-line "None\nNone" for `Policies[?...].Arn | [0]` + # when no match exists, which would otherwise slip past a "!= None" guard. + local acct policy_arn + acct="$(aws sts get-caller-identity --query Account --output text 2>/dev/null | tr -d '[:space:]')" + if [[ -z "$acct" || ! "$acct" =~ ^[0-9]{12}$ ]]; then + err "Could not resolve a valid AWS account ID via STS (got: '${acct}')" + fi + policy_arn="arn:aws:iam::${acct}:policy/${LBC_POLICY_NAME}" + + if aws iam get-policy --policy-arn "$policy_arn" >/dev/null 2>&1; then + log "✓ LBC IAM policy already exists: ${policy_arn}" >&2 + printf "%s" "$policy_arn" + return 0 + fi + + local tmp; tmp="$(mktemp)"; TMP_FILES+=("$tmp") + local url="https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/${LBC_POLICY_VERSION}/docs/install/iam_policy.json" + log "Fetching LBC IAM policy ${LBC_POLICY_VERSION} from ${url}" >&2 + if ! curl -fsSL --max-time 60 "$url" -o "$tmp"; then + err "Failed to download AWS LBC IAM policy from ${url}. Check network access or bump LBC_POLICY_VERSION." + fi + if ! jq -e . "$tmp" >/dev/null 2>&1; then + err "Downloaded LBC IAM policy is not valid JSON. Refusing to proceed." 
+ fi + + local created + created="$(aws iam create-policy \ + --policy-name "${LBC_POLICY_NAME}" \ + --policy-document "file://${tmp}" \ + --description "AWS Load Balancer Controller policy for ${CLUSTER_NAME} (${LBC_POLICY_VERSION})" \ + --query 'Policy.Arn' --output text 2>/dev/null | tr -d '[:space:]')" + if [[ -z "$created" || "$created" != arn:aws:iam::* ]]; then + err "create-policy did not return a valid ARN for ${LBC_POLICY_NAME} (got: '${created}')" + fi + log "✓ Created LBC IAM policy ${LBC_POLICY_NAME}: ${created}" >&2 + printf "%s" "$created" +} + +# Creates the IRSA-bound ServiceAccount used by the LBC deployment. Uses eksctl +# so the trust policy is pinned to this cluster's OIDC provider and SA subject. +ensure_lbc_irsa() { + log "Ensuring IRSA for AWS Load Balancer Controller (${LBC_NS}/${LBC_SA})..." + local policy_arn; policy_arn="$(ensure_lbc_iam_policy)" + if [[ -z "$policy_arn" || "$policy_arn" != arn:aws:iam::* ]]; then + err "LBC IAM policy ARN is empty/invalid ('${policy_arn}'); cannot configure IRSA" + fi + + eksctl create iamserviceaccount \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --namespace "${LBC_NS}" \ + --name "${LBC_SA}" \ + --role-name "${LBC_ROLE_NAME}" \ + --attach-policy-arn "${policy_arn}" \ + --approve \ + --override-existing-serviceaccounts + + wait_resource_exists "${LBC_NS}" sa "${LBC_SA}" 180 + log "✓ LBC IRSA role and service account configured" +} + +# Tags user-provided subnets so LBC can auto-discover where to place LBs. +# eksctl already tags subnets it creates, so this is a no-op when the cluster +# was created without explicit cluster.subnets. +tag_lbc_subnets() { + if [[ ${#PUBLIC_SUBNETS[@]} -eq 0 && ${#PRIVATE_SUBNETS[@]} -eq 0 ]]; then + log "No user-provided subnets; eksctl-created subnets are already tagged for LBC discovery." + return 0 + fi + log "Tagging user-provided subnets for AWS Load Balancer Controller discovery..." 
+ if [[ ${#PUBLIC_SUBNETS[@]} -gt 0 ]]; then + log " Public subnets (${#PUBLIC_SUBNETS[@]}): kubernetes.io/role/elb=1" + aws ec2 create-tags --region "${REGION}" \ + --resources "${PUBLIC_SUBNETS[@]}" \ + --tags Key=kubernetes.io/role/elb,Value=1 \ + "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared" + fi + if [[ ${#PRIVATE_SUBNETS[@]} -gt 0 ]]; then + log " Private subnets (${#PRIVATE_SUBNETS[@]}): kubernetes.io/role/internal-elb=1" + aws ec2 create-tags --region "${REGION}" \ + --resources "${PRIVATE_SUBNETS[@]}" \ + --tags Key=kubernetes.io/role/internal-elb,Value=1 \ + "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=shared" + fi + log "✓ Subnets tagged for LBC auto-discovery" +} + +install_aws_load_balancer_controller() { + log "Installing AWS Load Balancer Controller (helm chart ${LBC_CHART_VERSION})..." + + local vpc_id + vpc_id="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" \ + --query 'cluster.resourcesVpcConfig.vpcId' --output text 2>/dev/null || true)" + if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then + err "Could not determine VPC ID for cluster ${CLUSTER_NAME}. LBC install requires vpcId." + fi + + if ! aws iam get-role --role-name "${LBC_ROLE_NAME}" >/dev/null 2>&1; then + err "IRSA role ${LBC_ROLE_NAME} not found. ensure_lbc_irsa must run first." 
+ fi + + helm repo add eks https://aws.github.io/eks-charts >/dev/null + helm repo update >/dev/null + helm_retry 5 upgrade --install "${LBC_RELEASE}" eks/aws-load-balancer-controller \ + --namespace "${LBC_NS}" \ + --version "${LBC_CHART_VERSION}" \ + --set clusterName="${CLUSTER_NAME}" \ + --set region="${REGION}" \ + --set vpcId="${vpc_id}" \ + --set serviceAccount.create=false \ + --set serviceAccount.name="${LBC_SA}" \ + --wait --timeout 10m + + check_ready "${LBC_NS}" "app.kubernetes.io/name=aws-load-balancer-controller" + log "✓ AWS Load Balancer Controller ${LBC_CHART_VERSION} installed and ready" +} + # ---------- External S3-compatible object storage (credentials only; no in-cluster install) ---------- ensure_s3compat_credentials() { # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs). @@ -1536,6 +1717,39 @@ ensure_s3_upload_splunk_app() { fi } +ensure_external_objstore_upload_splunk_app() { + if [[ -z "${SPLUNK_APP_LOCAL_PATH}" ]]; then + log "SPLUNK_APP_LOCAL_PATH not set; skipping app upload to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/apps/" + return 0 + fi + if [[ ! 
-f "${SPLUNK_APP_LOCAL_PATH}" ]]; then + warn "SPLUNK_APP_LOCAL_PATH='${SPLUNK_APP_LOCAL_PATH}' not found; skipping upload" + return 0 + fi + if [[ -z "${OBJ_STORE_ENDPOINT}" ]]; then + warn "OBJ_STORE_ENDPOINT not set; cannot upload Splunk app to external object store" + return 0 + fi + + local base key + base="$(basename "${SPLUNK_APP_LOCAL_PATH}")" + key="apps/${base}" + log "Ensuring Splunk app '${base}' exists at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}" + + if AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3api head-object --bucket "${OBJ_STORE_BUCKET}" --key "${key}" >/dev/null 2>&1; then + log "App already present at ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}; skipping upload" + else + AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + aws --endpoint-url "${OBJ_STORE_ENDPOINT}" s3 cp "${SPLUNK_APP_LOCAL_PATH}" "s3://${OBJ_STORE_BUCKET}/${key}" + log "Uploaded ${base} to ${OBJ_STORE_TYPE}://${OBJ_STORE_BUCKET}/${key}" + fi +} + +should_wait_for_splunk_app_install() { + [[ -n "${SPLUNK_APP_LOCAL_PATH:-}" && -f "${SPLUNK_APP_LOCAL_PATH}" ]] +} + ensure_namespace() { kubectl get ns "$1" >/dev/null 2>&1 || kubectl create ns "$1"; } ensure_bucket_policy() { @@ -2112,6 +2326,119 @@ show_platform_access_info() { log "" } +saia_service_template_enabled() { + [[ -n "${SAIA_SERVICE_TYPE:-}" && "${SAIA_SERVICE_TYPE}" != "null" && "${SAIA_SERVICE_TYPE}" != "ClusterIP" ]] +} + +saia_aiservice_name() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + printf "%s-saia" "${platform_name}" +} + +wait_for_aiservice_exists() { + local name="$1" timeout="${2:-600}" waited=0 + while ! 
kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do + [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}" + sleep 5 + waited=$((waited + 5)) + done +} + +apply_saia_service_annotations() { + local aiservice_name="$1" + local annotation_keys key value + + annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)" + [[ -z "${annotation_keys}" ]] && return 0 + + local annotate_args=() + while IFS= read -r key; do + [[ -z "${key}" || "${key}" == "null" ]] && continue + value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")" + [[ -z "${value}" || "${value}" == "null" ]] && continue + annotate_args+=("${key}=${value}") + done <<< "${annotation_keys}" + + if [[ ${#annotate_args[@]} -gt 0 ]]; then + log "Applying SAIA Service annotations to AIService/${aiservice_name}..." + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite + fi +} + +patch_saia_public_service_workaround() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + local aiservice_name public_svc_name + + aiservice_name="$(saia_aiservice_name "${platform_name}")" + public_svc_name="${aiservice_name}-saia-service" + + wait_for_aiservice_exists "${aiservice_name}" + + if saia_service_template_enabled; then + log "Patching AIService/${aiservice_name} with SAIA public exposure settings..." 
+ if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"NodePort\", + \"ports\": [ + { + \"name\": \"http\", + \"port\": 8080, + \"targetPort\": 8080, + \"nodePort\": ${SAIA_SERVICE_NODE_PORT} + } + ] + } + } + } +}" + else + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"${SAIA_SERVICE_TYPE}\" + } + } + } +}" + fi + fi + + apply_saia_service_annotations "${aiservice_name}" + + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null + + if saia_service_template_enabled; then + log "Recreating SAIA public Service to ensure patched settings take effect..." + kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true + wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300 + fi +} + +wait_for_saia_load_balancer() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" timeout="${2:-1200}" waited=0 + local svc_name hostname="" + svc_name="$(saia_aiservice_name "${platform_name}")-saia-service" + + [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]] || return 0 + + log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name} to receive an external hostname..." 
+ while true; do + hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)" + [[ -z "${hostname}" ]] && hostname="$(kubectl -n "${AI_NS}" get svc "${svc_name}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)" + if [[ -n "${hostname}" ]]; then + log "✓ SAIA external endpoint: ${hostname}" + return 0 + fi + [[ $waited -ge $timeout ]] && err "Timed out waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name}" + sleep 5 + waited=$((waited + 5)) + done +} + # Quick status check function - can be called standalone check_aiplatform_status() { local platform_name="${1:-${AI_PLATFORM_NAME}}" @@ -2262,6 +2589,14 @@ YAML ;; esac + local svc_template_yaml="" + if saia_service_template_enabled; then + svc_template_yaml=" serviceTemplate:"$'\n'" spec:"$'\n'" type: ${SAIA_SERVICE_TYPE}"$'\n' + if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + svc_template_yaml+=" ports:"$'\n'" - name: http"$'\n'" port: 8080"$'\n'" targetPort: 8080"$'\n'" nodePort: ${SAIA_SERVICE_NODE_PORT}"$'\n' + fi + fi + cat </dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi done @@ -3102,6 +3444,7 @@ install_ai_platform_stack() { log "=== Setting up Splunk AI Platform stack ===" if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA." 
+ ensure_external_objstore_upload_splunk_app else ensure_s3_bucket_and_prefixes ensure_s3_upload_splunk_app @@ -3169,19 +3512,36 @@ reconcile_flow() { fi install_kube_prometheus install_cert_manager + # AWS Load Balancer Controller (LBC) — only install when the operator itself + # needs to provision NLBs/ALBs (Service type=LoadBalancer with the + # `aws-load-balancer-type: external` annotation) or when binding k8s Services + # to customer-managed target groups via TargetGroupBinding CRs. Customers who + # bring their own LB and point it at NodePort (Path A) should leave this off. + if [[ "${INSTALL_LBC}" == "true" ]]; then + log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller" + tag_lbc_subnets + ensure_lbc_irsa + install_aws_load_balancer_controller + else + log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install (bring-your-own-LB / NodePort path)" + fi ensure_s3compat_credentials install_otel_operator_and_contrib_collector install_ray_operator install_splunk_operator install_splunk_ai_operator install_ai_platform_stack - wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200 + if should_wait_for_splunk_app_install; then + wait_splunk_ai_assistant_installed "Splunk_AI_Assistant_Cloud.tgz" 1200 + else + log "Skipping Splunk AI Assistant app wait because no local app archive is configured" + fi # push_saia_conf_into_pod } # ---------- MAIN ---------- main_install() { - for t in aws eksctl kubectl helm git jq yq; do need "$t"; done + for t in aws eksctl kubectl helm git jq yq curl; do need "$t"; done # Load configuration from YAML file load_config diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 124373f..3935404 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -15,7 +15,7 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in 
config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: /Users/kiran/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -25,12 +25,11 @@ nodes: existingIPs: controllers: - - 3.144.14.96 # CHANGE THIS: Your controller server IP + - 3.149.241.167 workers: - - 3.14.134.16 # CHANGE THIS: CPU worker 1 - - 13.59.78.115 # CHANGE THIS: GPU worker 1 - - 3.15.20.136 # CHANGE THIS: GPU worker 2 - + - 18.221.244.241 + - 18.191.19.128 + - 3.137.209.219 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). @@ -103,7 +102,7 @@ images: # Build & push with: # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28" + image: "docker.io/kbhos698/splunk-ai-operator:ai-tier" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" #TODO this update @@ -175,8 +174,8 @@ kubernetes: # ---------- File Paths ---------- files: - splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml" - aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml" + splunkOperator: "./splunk-operator-cluster.yaml" + aiPlatform: "./artifacts.yaml" # ---------- Splunk Configuration ---------- splunk: @@ -213,7 +212,12 @@ aiPlatform: # To ENABLE external exposure for on-prem / airgap customers, NodePort is the # recommended default: any k8s node IP + the configured nodePort yields a # reachable endpoint from VPN-connected users. 
No cloud LB / cert-manager - # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. + # needed. Use LoadBalancer only if the cluster has MetalLB/cloud LB support. + # + # Note: the current operator image preserves serviceTemplate.spec.type, but + # not nested serviceTemplate.metadata.annotations. The k0s installer applies + # any annotations below directly to the generated AIService after creation, + # which the current operator already copies onto the rendered Service. serviceTemplate: type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 1f45cfc..915118d 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -3675,6 +3675,96 @@ YAML log "AIPlatform CR installed successfully" } +saia_service_template_enabled_k0s() { + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]] +} + +wait_for_k0s_aiservice_exists() { + local name="$1" timeout="${2:-600}" waited=0 + while ! 
kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do + [[ $waited -ge $timeout ]] && err "Timed out waiting for AIService ${AI_NS}/${name}" + sleep 5 + waited=$((waited + 5)) + done +} + +apply_k0s_saia_service_annotations() { + local aiservice_name="$1" + local annotation_keys key value + + annotation_keys="$(yq eval '.aiPlatform.serviceTemplate.annotations // {} | keys | .[]' "${CONFIG_FILE}" 2>/dev/null || true)" + [[ -z "${annotation_keys}" ]] && return 0 + + local annotate_args=() + while IFS= read -r key; do + [[ -z "${key}" || "${key}" == "null" ]] && continue + value="$(yq eval ".aiPlatform.serviceTemplate.annotations.\"${key}\"" "${CONFIG_FILE}" 2>/dev/null || echo "")" + [[ -z "${value}" || "${value}" == "null" ]] && continue + annotate_args+=("${key}=${value}") + done <<< "${annotation_keys}" + + if [[ ${#annotate_args[@]} -gt 0 ]]; then + log "Applying SAIA Service annotations to AIService/${aiservice_name}..." + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" "${annotate_args[@]}" --overwrite + fi +} + +patch_k0s_saia_public_service_workaround() { + local platform_name="${CLUSTER_NAME}-ai-platform" + local aiservice_name="${platform_name}-saia" + local public_svc_name="${aiservice_name}-saia-service" + local svc_type svc_node_port + + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + + wait_for_k0s_aiservice_exists "${aiservice_name}" + + if saia_service_template_enabled_k0s; then + log "Patching AIService/${aiservice_name} with SAIA public exposure settings..." 
+ if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"NodePort\", + \"ports\": [ + { + \"name\": \"http\", + \"port\": 8080, + \"targetPort\": 8080, + \"nodePort\": ${svc_node_port} + } + ] + } + } + } +}" + else + kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ + \"spec\": { + \"serviceTemplate\": { + \"spec\": { + \"type\": \"${svc_type}\" + } + } + } +}" + fi + fi + + apply_k0s_saia_service_annotations "${aiservice_name}" + + kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null + + if saia_service_template_enabled_k0s; then + log "Recreating SAIA public Service to ensure patched settings take effect..." + kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true + fi +} + # ====== INSTALL FULL STACK ====== install_ai_platform_stack() { log "Installing complete AI Platform stack..." @@ -3770,6 +3860,7 @@ install_ai_platform_stack() { # Install AI Platform operator and CR while Splunk Standalone boots install_splunk_ai_operator install_ai_platform_cr + patch_k0s_saia_public_service_workaround # Now wait for Splunk Standalone to be ready (likely already done by now) wait_for_splunk_standalone diff --git a/tools/cluster_setup/splunk-operator-cluster.yaml b/tools/cluster_setup/splunk-operator-cluster.yaml index 0732ea3..467879e 100644 --- a/tools/cluster_setup/splunk-operator-cluster.yaml +++ b/tools/cluster_setup/splunk-operator-cluster.yaml @@ -55325,7 +55325,6 @@ subjects: apiVersion: v1 data: OPERATOR_NAME: '"splunk-operator"' - # TODO identify whats this ?? 
RELATED_IMAGE_SPLUNK_ENTERPRISE: 667741767953.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:splunk-redhat-8-amd64-10.2.0-ef65e8205e4d-6d943f7-28228924 WATCH_NAMESPACE: "" kind: ConfigMap From 6433a1565de3234b43003afdef0a5432332e8239 Mon Sep 17 00:00:00 2001 From: kbhos Date: Wed, 29 Apr 2026 14:17:35 +0530 Subject: [PATCH 2/5] WIP: pre-merge in-progress work on saia-gateway-changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cluster-config.yaml: rewrite SAIA exposure as 3 NodePort-free modes, drop redundant nginx image entry; add byoTargetGroup config block - eks_cluster_with_stack.sh: read BYO_TG_*; add validate_byo_target_group_config, apply_byo_target_group_binding, patch_saia_service_disable_nodeport; update patch_saia_public_service_workaround for NodePort-free mode - k0s-cluster-config.yaml: switch SAIA exposure to LoadBalancer + MetalLB; add metallb config block; revert object storage to type=minio with AWS S3 endpoint (the only working path on k0s — type=aws is silently swapped to in-cluster MinIO by the install script) - k0s_cluster_with_stack.sh: add install_metallb function (chart pin 0.14.8, L2 / BGP advertisements); patch_k0s_saia_service_disable_nodeport; fix describe_pod node-count whitespace bug - artifacts.yaml: minor diff (will be overwritten by upcoming merge) Pre-merge of origin/ai-tier-v2-k0s; will be subsumed by the merge commit. 
Made-with: Cursor --- tools/cluster_setup/artifacts.yaml | 67 ++++--- tools/cluster_setup/cluster-config.yaml | 136 ++++++++------ tools/cluster_setup/eks_cluster_with_stack.sh | 162 ++++++++++++++-- tools/cluster_setup/k0s-cluster-config.yaml | 165 +++++++++++----- tools/cluster_setup/k0s_cluster_with_stack.sh | 177 +++++++++++++++++- 5 files changed, 552 insertions(+), 155 deletions(-) diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 69c3664..9b5b51f 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -1061,11 +1061,18 @@ spec: items: description: FeatureSpec defines the features to enable in the AIPlatform properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -2085,6 +2092,11 @@ spec: type: object x-kubernetes-map-type: atomic type: array + otelImage: + default: otel/opentelemetry-collector-contrib:0.122.1 + description: OTelImage is the OpenTelemetry Collector sidecar + image + type: string rayHeadGroupImage: description: Ray head group image, e.g. "rayproject/ray-head:latest" type: string @@ -2225,7 +2237,8 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + It is optional for platforms that only enable features that do not require object storage. 
Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: @@ -2237,8 +2250,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -2908,8 +2921,6 @@ spec: pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string type: object - required: - - objectStorage type: object status: description: AIPlatformStatus defines observed state @@ -4084,11 +4095,18 @@ spec: features: description: Feature defines the features to be enabled for the AIService properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -4866,27 +4884,15 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) - Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) 
+ Optional override endpoint (only needed for S3-compatible services like MinIO) + Must be a valid HTTP/HTTPS URL pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ - type: string - provider: - description: |- - Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. - Values: aws, minio, seaweedfs, s3compat, gcs, azure - enum: - - aws - - minio - - seaweedfs - - s3compat - - gcs - - azure + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -4894,8 +4900,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials (e.g. 
- s3_access_key, s3_secret_key for S3-compatible backends) + description: Secret name containing storage credentials maxLength: 253 minLength: 1 type: string @@ -5682,19 +5687,19 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-008 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-008 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a + - name: RELATED_IMAGE_WEAVIATE_SERVICE + value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-009 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-009 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc - - name: SPLUNK_METRICS_INDEX_NAME - value: _metrics + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-009 - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR @@ -5705,7 +5710,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 + image: docker.io/kbhos698/splunk-ai-operator:ai-tier livenessProbe: httpGet: path: 
/healthz diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index 9111689..bafef6a 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -275,91 +275,99 @@ aiPlatform: imageRegistry: "" # Leave empty for default # --------------------------------------------------------------------------- - # Public SAIA exposure + # Public SAIA exposure (NodePort-free) # --------------------------------------------------------------------------- - # The operator always renders a public Kubernetes Service named + # The operator renders a public Kubernetes Service named # `-saia-service` whose endpoints are the in-cluster nginx - # pods (nginx terminates path routing to saia v1 / v2). HOW that Service is - # reached from outside the cluster depends on two settings below: + # pods (nginx terminates path routing to saia v1 / v2). The install script + # then configures HOW that Service is reached from outside the cluster. # - # - aiPlatform.serviceTemplate.{type, nodePort, annotations} - # - aiPlatform.awsLoadBalancerController.install + # IMPORTANT: this template intentionally does NOT use Service.type=NodePort. + # Many enterprise security policies prohibit opening 30000-32767 on every + # worker. All three modes below are NodePort-free — the script sets + # `allocateLoadBalancerNodePorts: false` on LoadBalancer Services so + # kube-proxy never opens a node port; for the BYO mode the Service stays + # ClusterIP and AWS LBC registers pod IPs into the customer's target group. # - # Pick ONE of the three modes below. Each row shows: what you put in this - # file, what the install script does, and what you (the customer) must - # provision outside the cluster. + # Pick ONE of the modes below by editing the active block at the bottom of + # this section. Each mode shows: the YAML to use, what the script does, and + # what you must provision outside the cluster. 
# # --------------------------------------------------------------------------- - # MODE 1 — Operator-managed AWS NLB (default, simplest on EKS) + # MODE 1 — Operator-managed AWS NLB, IP-target mode (DEFAULT) # --------------------------------------------------------------------------- # serviceTemplate: # type: LoadBalancer # annotations: # service.beta.kubernetes.io/aws-load-balancer-type: "external" - # service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" # or "internal" - # service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance" + # service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" # or "internal" + # service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip" # ← pods, not nodes + # service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true" + # # Optional TLS termination at the NLB: + # # service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:..." + # # service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443" + # # service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy: "ELBSecurityPolicy-TLS13-1-2-2021-06" # awsLoadBalancerController: # install: true + # byoTargetGroup: + # enabled: false # # Script does: - # * Installs AWS Load Balancer Controller (LBC) with IRSA and tags - # public/private subnets for auto-discovery. - # * Creates the LoadBalancer-typed Service; LBC reads the annotations and - # provisions an internet-facing AWS NLB (~2-3 min). Public DNS appears - # in `.status.loadBalancer.ingress[0].hostname`. - # You must do: - # * Nothing on the AWS side — fully automated. - # * (Optional) Add an ACM cert listener annotation if you want TLS - # termination at the NLB. + # * Installs AWS Load Balancer Controller (LBC) with IRSA, tags subnets. + # * Creates the LoadBalancer Service; LBC provisions an NLB whose targets + # are pod IPs (no NodePort, no kube-proxy hop, real client IP preserved). 
+ # * Patches the rendered Service to set `allocateLoadBalancerNodePorts: + # false` and `externalTrafficPolicy: Local`. + # You must do: nothing on the AWS side. DNS appears in + # `.status.loadBalancer.ingress[0].hostname` after ~2-3 min. # # --------------------------------------------------------------------------- - # MODE 2 — Bring-your-own AWS LB (you already have an NLB / ALB) + # MODE 2 — Bring-your-own AWS LB (TargetGroupBinding, IP-target) # --------------------------------------------------------------------------- + # Customer already owns the NLB / ALB / target group. LBC is installed only + # to manage target-group membership; it does NOT create LBs in this mode. + # # serviceTemplate: - # type: NodePort - # nodePort: 30080 # any free port in 30000-32767 + # type: ClusterIP # LB is owned by the customer # awsLoadBalancerController: - # install: false # no LBC, no operator-created LB + # install: true # required for TargetGroupBinding + # byoTargetGroup: + # enabled: true + # targetGroupArn: "arn:aws:elasticloadbalancing:::targetgroup//" + # securityGroupId: "sg-xxxxxxxxxxxxxxxxx" # the customer's LB security group # # Script does: - # * Creates the public Service as NodePort 30080 on every worker. - # * Skips LBC install entirely. - # You must do (in AWS, outside the script): - # 1. Pre-create an NLB or ALB (any scheme). - # 2. Create a target group: - # - Target type: instance - # - Protocol/Port: TCP/30080 (NLB) or HTTP/30080 (ALB) - # - Health check: HTTP /nginx_health on port "traffic-port", 200 OK - # 3. Attach the EKS managed-nodegroup ASG to the target group so - # membership tracks node scale-in/out, e.g. via Terraform: - # resource "aws_autoscaling_attachment" "saia" { - # autoscaling_group_name = "eks---NodeGroup-XXXX" - # lb_target_group_arn = "arn:aws:elasticloadbalancing:...:targetgroup/my-saia-tg/..." - # } - # 4. Worker node SG: allow ingress TCP/30080 from the NLB subnet CIDRs - # (NLB) or from the ALB's security group (ALB). 
+ # * Installs LBC. + # * Leaves the public Service as ClusterIP. + # * Applies a TargetGroupBinding CR with `targetType: ip` so LBC registers + # nginx pod IPs into the customer's target group as endpoints change. + # You must do (outside the cluster): + # 1. Pre-create the target group in the EKS VPC with: + # - Target type: ip + # - Protocol/Port: TCP/8080 (NLB) or HTTP/8080 (ALB) ← pod port, not 30080 + # - Health check: HTTP /nginx_health on traffic-port, 200 OK + # 2. Attach the target group to your existing LB listener. + # 3. Worker pod SG ingress 8080 from the LB SG only — the + # TargetGroupBinding `networking.ingress.from.securityGroup` block + # configured by the script does this for you. # # --------------------------------------------------------------------------- - # MODE 3 — On-prem / k0s / airgap (HAProxy, F5, MetalLB, hardware LB, …) + # MODE 3 — On-prem / k0s / airgap (NOT applicable to this EKS template) # --------------------------------------------------------------------------- - # serviceTemplate: - # type: NodePort - # nodePort: 30080 - # awsLoadBalancerController: - # install: false # has no effect off-AWS, leave false - # - # Script does: - # * Same as Mode 2 — creates the public Service as NodePort 30080. - # You must do (outside the cluster): - # * Point your existing L4 LB (HAProxy / F5 / MetalLB / hardware) at every - # worker node IP on TCP/30080, with HTTP health-check /nginx_health. - # Sample HAProxy backend: - # backend saia_be - # option httpchk GET /nginx_health - # server worker1 10.0.1.11:30080 check - # server worker2 10.0.1.12:30080 check + # Use the dedicated `k0s-cluster-config.yaml` template, which configures + # MetalLB to allocate a routable VIP. The user-facing contract there is + # identical (`type: LoadBalancer`) — only the LB provider changes. 
# # --------------------------------------------------------------------------- + # SECURITY NOTES (apply to all modes) + # --------------------------------------------------------------------------- + # * Always terminate TLS at the LB (ACM cert on AWS) and place an auth + # layer in front (oauth2-proxy, Cognito on the ALB, API Gateway, …) + # before exposing on the public internet. + # * Restrict the LB listener to trusted source CIDRs / SGs (never + # 0.0.0.0/0 to a sensitive endpoint). + # * Pod SG ingress should allow 8080 only from the LB SG. + # --------------------------------------------------------------------------- # Active mode below — EDIT to switch. Default is MODE 1. serviceTemplate: @@ -367,11 +375,21 @@ aiPlatform: annotations: service.beta.kubernetes.io/aws-load-balancer-type: "external" service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "instance" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip" + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true" awsLoadBalancerController: install: true + # Bring-your-own AWS target group (Mode 2). Set enabled: true and provide + # both targetGroupArn and securityGroupId; the script will then leave the + # SAIA Service as ClusterIP and apply a TargetGroupBinding (LBC manages + # target registration into your existing target group). 
+ byoTargetGroup: + enabled: false + # targetGroupArn: "arn:aws:elasticloadbalancing:us-east-2:123456789012:targetgroup/my-saia-tg/abc123" + # securityGroupId: "sg-0123456789abcdef0" + # CPU Scheduling cpuScheduling: nodeSelector: {} diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index b6982fb..9c815d8 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -93,12 +93,18 @@ load_config() { WORKER_IMAGE_REGISTRY="$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry' "$cfg")" SAIA_SERVICE_TYPE="$(yq eval '.aiPlatform.serviceTemplate.type // ""' "$cfg")" SAIA_SERVICE_NODE_PORT="$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "$cfg")" - # AWS Load Balancer Controller (LBC) install toggle. Default: false — the - # script assumes customers bring their own LB and point it at NodePort - # (Path A). Set to true only when you want operator-managed NLB/ALB - # provisioning via the `aws-load-balancer-type: external` annotation or - # dynamic target registration via TargetGroupBinding CRs (Path B). + # AWS Load Balancer Controller (LBC) install toggle. Required for both + # operator-managed NLB provisioning (Mode 1) and customer-owned LB + # registration via TargetGroupBinding (Mode 2). Off-AWS users (k0s) leave + # this false. INSTALL_LBC="$(yq eval '.aiPlatform.awsLoadBalancerController.install // false' "$cfg")" + # Bring-your-own AWS target group (Mode 2). When enabled the script keeps + # the public Service as ClusterIP and applies a TargetGroupBinding so LBC + # registers nginx pod IPs into the customer's pre-existing target group. + # Requires INSTALL_LBC=true. 
+ BYO_TG_ENABLED="$(yq eval '.aiPlatform.byoTargetGroup.enabled // false' "$cfg")" + BYO_TG_ARN="$(yq eval '.aiPlatform.byoTargetGroup.targetGroupArn // ""' "$cfg")" + BYO_TG_SG_ID="$(yq eval '.aiPlatform.byoTargetGroup.securityGroupId // ""' "$cfg")" INGRESS_HOST="$(yq eval '.aiPlatform.ingress.host' "$cfg")" INGRESS_CLASS="$(yq eval '.aiPlatform.ingress.className' "$cfg")" INGRESS_TLS_SECRET="$(yq eval '.aiPlatform.ingress.tlsSecretName' "$cfg")" @@ -185,6 +191,9 @@ load_config() { SAIA_SERVICE_TYPE="" SAIA_SERVICE_NODE_PORT="" INSTALL_LBC="false" + BYO_TG_ENABLED="false" + BYO_TG_ARN="" + BYO_TG_SG_ID="" INGRESS_HOST="ai.example.com" INGRESS_CLASS="nginx" INGRESS_TLS_SECRET="ai-platform-tls" @@ -2365,18 +2374,120 @@ apply_saia_service_annotations() { fi } +byo_target_group_enabled() { + [[ "${BYO_TG_ENABLED:-false}" == "true" ]] +} + +# Validates BYO target-group configuration and warns about misconfigurations +# before any kubectl/aws calls are issued. Caller decides whether to err or +# return on warnings — we treat missing required fields as fatal because the +# rest of the install would silently misroute traffic. +validate_byo_target_group_config() { + byo_target_group_enabled || return 0 + + if [[ "${INSTALL_LBC:-false}" != "true" ]]; then + err "byoTargetGroup.enabled=true requires awsLoadBalancerController.install=true (LBC manages the TargetGroupBinding)." + fi + if [[ -z "${BYO_TG_ARN:-}" || "${BYO_TG_ARN}" == "null" ]]; then + err "byoTargetGroup.enabled=true requires byoTargetGroup.targetGroupArn to be set." + fi + if [[ "${BYO_TG_ARN}" != arn:aws:elasticloadbalancing:* ]]; then + err "byoTargetGroup.targetGroupArn must look like 'arn:aws:elasticloadbalancing:::targetgroup//' (got: ${BYO_TG_ARN})." + fi + if [[ -z "${BYO_TG_SG_ID:-}" || "${BYO_TG_SG_ID}" == "null" ]]; then + err "byoTargetGroup.enabled=true requires byoTargetGroup.securityGroupId (the customer LB's SG) so LBC opens pod-SG ingress correctly." 
+ fi + if [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]]; then + log "WARNING: byoTargetGroup.enabled=true with serviceTemplate.type=LoadBalancer creates BOTH an operator-managed LB AND a TargetGroupBinding. Set serviceTemplate.type=ClusterIP for pure BYO." >&2 + fi +} + +# Apply a TargetGroupBinding CR pointing at the customer's pre-provisioned +# target group. AWS LBC reads this CR and registers the SAIA Service's pod +# IPs (targetType: ip) into the customer's TG, then deregisters them on pod +# rotation. The networking.ingress block has LBC open the pod SG to the LB's +# SG only — never 0.0.0.0/0 (codeguard-0-iac-security). +apply_byo_target_group_binding() { + local platform_name="${1:-${AI_PLATFORM_NAME}}" + local svc_name + svc_name="$(saia_aiservice_name "${platform_name}")-saia-service" + + byo_target_group_enabled || return 0 + + log "Applying TargetGroupBinding for BYO target group ${BYO_TG_ARN}..." + cat </dev/null || true)" + [[ "${svc_type}" != "LoadBalancer" ]] && return 0 + + log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..." + kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{ + "spec": { + "allocateLoadBalancerNodePorts": false, + "externalTrafficPolicy": "Local" + } +}' >/dev/null + log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local" +} + patch_saia_public_service_workaround() { local platform_name="${1:-${AI_PLATFORM_NAME}}" - local aiservice_name public_svc_name + local aiservice_name public_svc_name effective_type aiservice_name="$(saia_aiservice_name "${platform_name}")" public_svc_name="${aiservice_name}-saia-service" wait_for_aiservice_exists "${aiservice_name}" - if saia_service_template_enabled; then - log "Patching AIService/${aiservice_name} with SAIA public exposure settings..." 
- if [[ "${SAIA_SERVICE_TYPE}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + # In BYO mode the customer owns the LB; force the SAIA Service to ClusterIP + # regardless of what serviceTemplate.type says — TargetGroupBinding wires + # everything else. + if byo_target_group_enabled; then + effective_type="ClusterIP" + else + effective_type="${SAIA_SERVICE_TYPE}" + fi + + if [[ -n "${effective_type:-}" && "${effective_type}" != "null" ]]; then + log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${effective_type})..." + if [[ "${effective_type}" == "NodePort" && -n "${SAIA_SERVICE_NODE_PORT:-}" && "${SAIA_SERVICE_NODE_PORT}" != "null" ]]; then + log "WARNING: NodePort exposure is discouraged; consider Mode 1 (LoadBalancer + LBC) or Mode 2 (BYO target group) instead." >&2 kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ \"spec\": { \"serviceTemplate\": { @@ -2399,7 +2510,7 @@ patch_saia_public_service_workaround() { \"spec\": { \"serviceTemplate\": { \"spec\": { - \"type\": \"${SAIA_SERVICE_TYPE}\" + \"type\": \"${effective_type}\" } } } @@ -2411,11 +2522,16 @@ patch_saia_public_service_workaround() { kubectl -n "${AI_NS}" annotate aiservice "${aiservice_name}" script-reconcile-ts="$(date +%s)" --overwrite >/dev/null - if saia_service_template_enabled; then + if [[ -n "${effective_type:-}" && "${effective_type}" != "null" && "${effective_type}" != "ClusterIP" ]]; then log "Recreating SAIA public Service to ensure patched settings take effect..." kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true wait_resource_exists "${AI_NS}" service "${public_svc_name}" 300 fi + + # NodePort-free hardening: disable kube-proxy NodePort allocation on + # LoadBalancer Services and apply BYO TargetGroupBinding if configured. 
+ patch_saia_service_disable_nodeport "${platform_name}" + apply_byo_target_group_binding "${platform_name}" } wait_for_saia_load_balancer() { @@ -2423,6 +2539,13 @@ wait_for_saia_load_balancer() { local svc_name hostname="" svc_name="$(saia_aiservice_name "${platform_name}")-saia-service" + # In BYO mode the Service is ClusterIP and the customer's LB DNS is not + # surfaced via .status.loadBalancer; skip the wait. Mode 1 (operator- + # managed NLB) still gates on SAIA_SERVICE_TYPE=LoadBalancer. + if byo_target_group_enabled; then + log "byoTargetGroup.enabled=true — skipping wait for operator-managed LB hostname (LB is customer-managed)." + return 0 + fi [[ "${SAIA_SERVICE_TYPE:-}" == "LoadBalancer" ]] || return 0 log "Waiting for SAIA LoadBalancer Service ${AI_NS}/${svc_name} to receive an external hostname..." @@ -3512,18 +3635,23 @@ reconcile_flow() { fi install_kube_prometheus install_cert_manager - # AWS Load Balancer Controller (LBC) — only install when the operator itself - # needs to provision NLBs/ALBs (Service type=LoadBalancer with the - # `aws-load-balancer-type: external` annotation) or when binding k8s Services - # to customer-managed target groups via TargetGroupBinding CRs. Customers who - # bring their own LB and point it at NodePort (Path A) should leave this off. + # Validate BYO target-group config before any side-effecting calls. Fail + # fast if the customer set byoTargetGroup.enabled=true without LBC or + # required ARN/SG fields — better an early error than a silently-broken + # data path. + validate_byo_target_group_config + # AWS Load Balancer Controller (LBC) — required when the operator provisions + # NLBs/ALBs (Mode 1: Service type=LoadBalancer + `aws-load-balancer-type: + # external` annotation) or when binding the SAIA Service to a customer- + # managed target group via TargetGroupBinding (Mode 2: byoTargetGroup + # enabled). Off-AWS deployments leave this false. 
if [[ "${INSTALL_LBC}" == "true" ]]; then log "aiPlatform.awsLoadBalancerController.install=true — installing AWS Load Balancer Controller" tag_lbc_subnets ensure_lbc_irsa install_aws_load_balancer_controller else - log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install (bring-your-own-LB / NodePort path)" + log "aiPlatform.awsLoadBalancerController.install=false — skipping LBC install" fi ensure_s3compat_credentials install_otel_operator_and_contrib_collector diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 3935404..e107334 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -13,7 +13,7 @@ # ---------- Cluster Configuration ---------- cluster: name: airgap-cluster - # region: us-east-2 # Ignored for on-prem, but required in config + region: us-east-2 # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes sshKeyPath: /Users/kiran/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key @@ -31,31 +31,76 @@ nodes: - 18.191.19.128 - 3.137.209.219 # ---------- Storage Configuration ---------- -# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). -# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). +# Object storage choices (`storage.objectStore.type`): +# * aws — real AWS S3 (this template's default). SAIA pods authenticate +# via the AWS SDK default credential chain (see prerequisite +# below). No in-cluster Secret is created. +# * minio — external MinIO (AWS-spec compliant). Provide endpoint + auth. +# * seaweedfs — external SeaweedFS S3 gateway. Provide endpoint + auth. +# * s3compat — generic S3-compatible. Provide endpoint + auth. 
storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws - storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) - vectorDbSize: "50Gi" # VectorDB persistent volume size + # --------------------------------------------------------------------- + # Real AWS S3 on k0s — via s3compat with AWS S3 endpoint + # --------------------------------------------------------------------- + # WHY NOT type=aws? + # The k0s install script and operator do NOT actually support + # objectStore.type=aws today. When set, the script silently: + # * installs an in-cluster MinIO into the minio-system namespace, + # * points SAIA's S3COMPAT_OBJECT_STORE_ENDPOINT_URL at that MinIO + # (http://minio.minio-system.svc.cluster.local:9000), + # * ignores AIPlatform.spec.objectStorage.path = s3://... + # Result: SAIA always uses the in-cluster MinIO regardless of the + # objectStore.type setting. This was verified by inspecting pod env + # vars after a clean install with type=aws — see operator bug filed + # for proper k0s AWS S3 support. + # + # WORKAROUND — use the S3-compatible path (set objectStore.type: "minio", not "s3compat") with the AWS S3 regional endpoint: + # The s3compat adapter is just boto3 with explicit endpoint_url + + # credentials. AWS S3 IS S3-compatible, so the same code path works + # when pointed at https://s3.<region>.amazonaws.com with a real AWS + # access-key/secret pair. SAIA pods authenticate with the static + # AWS keys you put in auth.rootUser / auth.rootPassword, the + # installer creates a Kubernetes Secret named "minio-credentials" + # with those keys, and SAIA's storage_adapters/factory.py uses them + # to sign SigV4 requests to AWS S3 — which AWS accepts as valid.
+ # + # SECURITY NOTES (codeguard-1-hardcoded-credentials, + # codeguard-0-additional-cryptography): + # * The AWS access key below MUST come from a dedicated IAM USER (not + # a root account) with a least-privilege policy scoped to the + # bucket only: + # s3:GetObject, s3:PutObject, s3:DeleteObject, + # s3:GetObjectTagging, s3:PutObjectTagging + # on arn:aws:s3:::<bucket>/* + # s3:ListBucket, s3:GetBucketLocation + # on arn:aws:s3:::<bucket> + # * Rotate the access key every 90 days at most; deactivate the + # prior key after rollout. + # * Do NOT commit these values to source control — populate from a + # real secrets manager (Vault / AWS Secrets Manager / sops) at + # deploy time, not from this YAML file. + # * `endpoint` MUST be HTTPS — never use plaintext for S3 traffic. + # + # When the operator is fixed to support k0s+IRSA-style or k0s+IMDS + # auth properly, switch back to type=aws and remove the auth block. + # --------------------------------------------------------------------- + s3Bucket: "ai-platform-bucket-us-east-2" # CHANGE THIS — must match objectStore.bucket below + storageClass: "local-path" # k0s default storage class (NOT "gp3" — gp3 is EKS-only) + vectorDbSize: "50Gi" # VectorDB persistent volume size objectStore: - # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns - # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a - # missing key. The SAIA v2 S3ConversationStore (added by Tony in - # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002) - # calls GetObjectTagging on the conversation key *before* the first - # PutObject, so every brand-new draft: conversation hit a 502 from the - # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and - # hosts the same bucket name at :9000, so swapping the endpoint is - # sufficient. Fallback: flip back by setting type: "seaweedfs" and - # endpoint to :8333 (but note the 502 on every draft conversation).
- type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - bucket: "ai-platform-bucket-minio-us-east-2" - # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) - endpoint: "http://13.59.216.105:9000" # MinIO (AWS-spec compliant GetObjectTagging semantics) + # type=minio (NOT s3compat) is required: the AIPlatform CRD pattern + # only accepts ^(s3|gs|azure|minio)://... — `s3compat://` is rejected by + # the API server. The path scheme is purely a label for the CRD; the + # runtime endpoint below decides which backend SAIA actually talks to. + # Pointing endpoint at AWS S3 with real AWS keys makes this configuration + # use real AWS S3, not MinIO, despite the type label. + type: "minio" + bucket: "ai-platform-bucket-us-east-2" # CHANGE THIS — must match storage.s3Bucket above + endpoint: "https://s3.us-east-2.amazonaws.com" # CHANGE THIS — AWS regional S3 endpoint, MUST be HTTPS auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" + rootUser: "" # CHANGE THIS — AWS_ACCESS_KEY_ID (AKIA...) + rootPassword: "" # CHANGE THIS — never ship a real key in this file # ---------- Container Images Configuration ---------- images: @@ -190,37 +235,36 @@ aiPlatform: workerGroupConfig: imageRegistry: "" - # ---------- SAIA public exposure (OPTIONAL) ---------- - # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) - # defaults to ClusterIP, meaning it is only reachable from inside the cluster. - # - # Two call patterns hit this Service: - # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) - # (B) End user's browser → saia-service (needs external exposure) + # ---------- SAIA public exposure (NodePort-free) ---------- + # The SAIA "public" Service (nginx reverse proxy in front of v1 + v2 API + # pods) defaults to ClusterIP — only reachable from inside the cluster. 
Two + # call patterns hit it: + # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) + # (B) End user's browser → saia-service (needs external exposure) # # Pattern B is used by the v2 chat UI (/query streaming, conversations, # feedback, admin endpoints). Without external exposure the v2 chat UI - # breaks for users, even though v1 one-shot SPL features still work. + # breaks for users; v1 one-shot SPL still works. # - # To DISABLE external exposure (use ClusterIP only), either: - # * Delete / comment-out the entire `serviceTemplate:` block below, OR - # * Set `type: ClusterIP` explicitly. - # Either is treated identically — the installer skips emitting serviceTemplate - # into the AIPlatform CR and the operator falls through to the ClusterIP - # default in reconcileSAIAService(). + # The supported on-prem path is `type: LoadBalancer` backed by MetalLB + # (allocates a routable VIP from a pool you provide; ARP / BGP-announces it + # on your network). NodePort is intentionally avoided so we never open + # 30000-32767 on every worker node. # - # To ENABLE external exposure for on-prem / airgap customers, NodePort is the - # recommended default: any k8s node IP + the configured nodePort yields a - # reachable endpoint from VPN-connected users. No cloud LB / cert-manager - # needed. Use LoadBalancer only if the cluster has MetalLB/cloud LB support. + # The installer: + # * Installs MetalLB (set metallb.install: true below). + # * Applies an IPAddressPool + L2Advertisement (or BGPAdvertisement) from + # the metallb config below. + # * Renders the SAIA Service as type: LoadBalancer; MetalLB allocates a + # VIP from the pool and announces it. + # * Patches the Service with `allocateLoadBalancerNodePorts: false` and + # `externalTrafficPolicy: Local` so kube-proxy does not open a NodePort. # - # Note: the current operator image preserves serviceTemplate.spec.type, but - # not nested serviceTemplate.metadata.annotations. 
The k0s installer applies - # any annotations below directly to the generated AIService after creation, - # which the current operator already copies onto the rendered Service. + # To DISABLE external exposure (ClusterIP only), comment out the whole + # serviceTemplate block AND set metallb.install: false. serviceTemplate: - type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) - nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. + type: LoadBalancer # ClusterIP | LoadBalancer (NodePort is not used on k0s) + # No nodePort field — explicitly NodePort-free. features: - name: "saia" @@ -238,6 +282,35 @@ aiPlatform: value: "true" effect: "NoSchedule" +# ---------- MetalLB (k0s LoadBalancer provider) ---------- +# Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal +# / k0s cluster. Pinned chart version for supply-chain reproducibility +# (codeguard-0-supply-chain-security). +metallb: + install: true # set false if MetalLB is already installed or not needed + chartVersion: "0.14.8" # metallb/metallb Helm chart (matches MetalLB v0.14.8) + namespace: "metallb-system" + + # Address pool — a range of IPs MetalLB can hand out to LoadBalancer + # Services. Must be routable from clients (VPN-connected users) to your k0s + # workers. Use IPs that are NOT used elsewhere on the LAN. + pool: + name: "saia-pool" + addresses: + - "10.20.30.100-10.20.30.110" # CHANGE THIS to a free range on your network + + # Advertisement mode: "layer2" works on most LANs without network gear + # changes (one elected node answers ARP for the VIP at a time; failover ~ + # seconds). Use "bgp" only if your fabric supports BGP peering — then also + # populate metallb.bgpPeers below. + mode: "layer2" # layer2 | bgp + + # Required only when mode=bgp. Leave empty for layer2. 
+ bgpPeers: [] + # - peerAddress: "10.0.0.1" + # peerASN: 65001 + # myASN: 65000 + # ---------- Image Pull Secrets ---------- imagePullSecrets: secrets: diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 915118d..c5384c0 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -3711,6 +3711,155 @@ apply_k0s_saia_service_annotations() { fi } +# ---------- MetalLB (k0s LoadBalancer provider) ---------- +# k0s ships without a Service.type=LoadBalancer provider. MetalLB fills that +# gap by allocating a VIP from a customer-provided pool and announcing it via +# Layer-2 (ARP/NDP) or BGP. We pin the chart version for supply-chain +# reproducibility (codeguard-0-supply-chain-security). + +metallb_enabled_k0s() { + local v + v="$(yq eval '.metallb.install // false' "${CONFIG_FILE}" 2>/dev/null || echo false)" + [[ "${v}" == "true" ]] +} + +install_metallb() { + metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; } + + local ns chart_version pool_name addr_count mode + ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)" + chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)" + pool_name="$(yq eval '.metallb.pool.name // "saia-pool"' "${CONFIG_FILE}" 2>/dev/null)" + addr_count="$(yq eval '.metallb.pool.addresses // [] | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)" + mode="$(yq eval '.metallb.mode // "layer2"' "${CONFIG_FILE}" 2>/dev/null)" + + if [[ "${addr_count}" == "0" ]]; then + err "metallb.install=true but metallb.pool.addresses is empty. Provide at least one IP range routable on your network." + fi + if [[ "${mode}" != "layer2" && "${mode}" != "bgp" ]]; then + err "metallb.mode must be 'layer2' or 'bgp' (got: ${mode})." + fi + + log "Installing MetalLB ${chart_version} into namespace ${ns}..." 
+ helm repo add metallb https://metallb.github.io/metallb >/dev/null 2>&1 || true + helm repo update >/dev/null 2>&1 || true + + kubectl get ns "${ns}" >/dev/null 2>&1 || kubectl create ns "${ns}" + + helm upgrade --install metallb metallb/metallb \ + --namespace "${ns}" \ + --version "${chart_version}" \ + --wait --timeout 5m + + # Wait for the controller webhook to be Ready before applying CRs, otherwise + # the IPAddressPool / L2Advertisement applies race the validating webhook. + log "Waiting for MetalLB controller to be ready..." + kubectl -n "${ns}" rollout status deploy/metallb-controller --timeout=180s + + # Render IPAddressPool with the configured address ranges. + local addresses_yaml="" + local i + local pool_count + pool_count="$(yq eval '.metallb.pool.addresses | length' "${CONFIG_FILE}" 2>/dev/null || echo 0)" + for ((i=0; i/dev/null)" + [[ -z "${addr}" || "${addr}" == "null" ]] && continue + addresses_yaml+=" - ${addr}"$'\n' + done + + log "Applying MetalLB IPAddressPool '${pool_name}' (${addr_count} range(s))..." + cat </dev/null || echo 0)" + if [[ "${peer_count}" == "0" ]]; then + err "metallb.mode=bgp requires metallb.bgpPeers to be non-empty (peerAddress, peerASN, myASN per peer)." + fi + local p + for ((p=0; p/dev/null)" + peer_asn="$(yq eval ".metallb.bgpPeers[${p}].peerASN" "${CONFIG_FILE}" 2>/dev/null)" + my_asn="$(yq eval ".metallb.bgpPeers[${p}].myASN" "${CONFIG_FILE}" 2>/dev/null)" + [[ -z "${peer_addr}" || -z "${peer_asn}" || -z "${my_asn}" ]] && \ + err "metallb.bgpPeers[${p}] missing peerAddress / peerASN / myASN." + cat </dev/null || true)" + [[ "${svc_type}" != "LoadBalancer" ]] && return 0 + + log "Patching Service ${AI_NS}/${svc_name} to disable NodePort allocation..." 
+ kubectl -n "${AI_NS}" patch svc "${svc_name}" --type=merge -p '{ + "spec": { + "allocateLoadBalancerNodePorts": false, + "externalTrafficPolicy": "Local" + } +}' >/dev/null + log "✓ Service ${AI_NS}/${svc_name}: allocateLoadBalancerNodePorts=false, externalTrafficPolicy=Local" +} + patch_k0s_saia_public_service_workaround() { local platform_name="${CLUSTER_NAME}-ai-platform" local aiservice_name="${platform_name}-saia" @@ -3723,8 +3872,9 @@ patch_k0s_saia_public_service_workaround() { wait_for_k0s_aiservice_exists "${aiservice_name}" if saia_service_template_enabled_k0s; then - log "Patching AIService/${aiservice_name} with SAIA public exposure settings..." + log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..." if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then + log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true." >&2 kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ \"spec\": { \"serviceTemplate\": { @@ -3762,7 +3912,17 @@ patch_k0s_saia_public_service_workaround() { if saia_service_template_enabled_k0s; then log "Recreating SAIA public Service to ensure patched settings take effect..." kubectl -n "${AI_NS}" delete svc "${public_svc_name}" --ignore-not-found >/dev/null 2>&1 || true + # Wait briefly for the operator to recreate it before patching NodePort + # allocation off; if it doesn't come back the patch will be a no-op. + local waited=0 + while ! 
kubectl -n "${AI_NS}" get svc "${public_svc_name}" >/dev/null 2>&1; do + [[ ${waited} -ge 300 ]] && break + sleep 5 + waited=$((waited + 5)) + done fi + + patch_k0s_saia_service_disable_nodeport } # ====== INSTALL FULL STACK ====== @@ -3857,6 +4017,14 @@ install_ai_platform_stack() { # Apply Splunk Standalone CR (non-blocking — pod boots in background) install_splunk_standalone + # MetalLB must be installed BEFORE the AIPlatform CR is reconciled — the + # operator renders a Service.type=LoadBalancer for SAIA and we need a + # provider in the cluster to allocate a VIP, otherwise the Service is + # stuck in EXTERNAL-IP= indefinitely. No-op when + # metallb.install=false (e.g., user is bringing their own MetalLB or wants + # ClusterIP only). + install_metallb + # Install AI Platform operator and CR while Splunk Standalone boots install_splunk_ai_operator install_ai_platform_cr @@ -3880,7 +4048,12 @@ check_platform_health() { # Check 1: Cluster nodes log "Checking cluster nodes..." local not_ready - not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l || echo "0") + # `wc -l` on macOS returns " N" with leading whitespace and the `|| + # echo` fallback can append a second value, so the resulting string was + # tripping the `[[ -gt 0 ]]` test ("[[: 0\n0: syntax error"). Strip + # whitespace and default to 0 if grep returns 1 (no matches). 
+ not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l | tr -d '[:space:]') + not_ready="${not_ready:-0}" if [[ "${not_ready}" -gt 0 ]]; then warn "Found ${not_ready} node(s) not in Ready state" kubectl get nodes From 293cffb3daa70c6fa35496e63e86d4a82cccbd95 Mon Sep 17 00:00:00 2001 From: kbhos Date: Thu, 30 Apr 2026 01:57:17 +0530 Subject: [PATCH 3/5] metalLB changes --- tools/cluster_setup/eks_cluster_with_stack.sh | 20 ++++++- tools/cluster_setup/k0s-cluster-config.yaml | 3 + tools/cluster_setup/k0s_cluster_with_stack.sh | 60 +++++++++++++++---- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 9c815d8..eef3d51 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -947,6 +947,11 @@ ${public_subnets}" fi else log "No subnets specified - eksctl will create new subnets automatically" + # One NAT gateway => one Elastic IP. HighlyAvailable uses one NAT per AZ + # (often 3 EIPs) and commonly trips the default regional EIP quota (5). 
+ vpc_config="vpc: + nat: + gateway: Single" fi cat < eks-cluster-config.yaml @@ -956,6 +961,8 @@ metadata: name: ${CLUSTER_NAME} region: ${REGION} version: "${K8S_VERSION}" +autoModeConfig: + enabled: false iam: withOIDC: true addons: @@ -3343,7 +3350,18 @@ preflight_env() { fi fi if [[ $subnet_count -eq 0 ]]; then - pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically" + pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically (NAT mode: Single = 1 Elastic IP)" + pf_header "Elastic IP headroom (new VPC)" + local eip_cnt + eip_cnt="$(aws ec2 describe-addresses --region "${REGION}" --query 'length(Addresses)' --output text 2>/dev/null || true)" + if [[ -n "${eip_cnt}" && "${eip_cnt}" =~ ^[0-9]+$ ]]; then + pf_ok "Allocated Elastic IPs in ${REGION}: ${eip_cnt}" + if (( eip_cnt >= 5 )); then + pf_warn "Typical default EIP quota is 5 per region. At ${eip_cnt}+ addresses, NAT gateway EIP allocation may fail (you saw: maximum number of addresses). Release unused EIPs in EC2 → Elastic IPs or request a quota increase before create cluster." + fi + else + pf_warn "Could not list Elastic IPs (aws ec2 describe-addresses). If create fails on NAT/EIP, check quotas and unused addresses." + fi else local all_subnets=("${PRIVATE_SUBNETS[@]}" "${PUBLIC_SUBNETS[@]}") local vpc_id="" diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index bcb37b9..e796987 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -313,6 +313,9 @@ aiPlatform: # Required when aiPlatform.serviceTemplate.type=LoadBalancer on a bare-metal # / k0s cluster. Pinned chart version for supply-chain reproducibility # (codeguard-0-supply-chain-security). +# +# If serviceTemplate.type=NodePort, the installer skips MetalLB entirely even +# when metallb.install=true (NodePort does not use a LoadBalancer provider). 
metallb: install: true # set false if MetalLB is already installed or not needed chartVersion: "0.14.8" # metallb/metallb Helm chart (matches MetalLB v0.14.8) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index e1f5683..56b635a 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -388,6 +388,17 @@ configure_images() { log "✓ All images configured successfully" } +# True if objectStore.auth values are still obvious template text. Non-empty +# placeholders otherwise pass the length preflight and get applied into +# minio-credentials, which makes SAIA fail at startup with InvalidAccessKeyId. +object_store_auth_looks_like_placeholder() { + case "${MINIO_ROOT_USER}${MINIO_ROOT_PASSWORD}" in + *\<*|*\>*) return 0 ;; + *CHANGEME*|*changeme*) return 0 ;; + esac + return 1 +} + # ====== PREFLIGHT CHECKS ====== preflight_checks() { pf_header "Required tools" @@ -423,6 +434,9 @@ preflight_checks() { [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)" + if object_store_auth_looks_like_placeholder; then + pf_fail "objectStore.auth still contains template placeholders (e.g. <...> or CHANGEME). Replace with a real access key and secret in your config (keep secrets in a Git-ignored file such as tools/cluster_setup/k0s-config.local.yaml)." + fi pf_header "Infrastructure mode" pf_ok "Using existing infrastructure (on-prem/baremetal)" @@ -1118,6 +1132,10 @@ ensure_namespace() { # the Kubernetes credentials secret so the operator and workloads can auth. ensure_s3compat_credentials() { log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..." 
+ if object_store_auth_looks_like_placeholder; then + err "Refusing to create minio-credentials: objectStore.auth contains template placeholders; fix ${CONFIG_FILE}" + return 1 + fi if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" return 1 @@ -2637,15 +2655,23 @@ install_ai_platform_cr() { # Ensure object storage credentials secret exists in AI namespace log "Creating/updating S3-compatible credentials secret (minio-credentials) in ${AI_NS}..." - kubectl -n "${AI_NS}" create secret generic minio-credentials \ - --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ - --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ - --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ - --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ - --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ - --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - log "✓ Object storage credentials secret ready" + if object_store_auth_looks_like_placeholder; then + if kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then + warn "Skipping minio-credentials apply: auth in ${CONFIG_FILE} still looks like a template (e.g. contains '<'). Preserving existing secret." + else + err "minio-credentials missing and cannot be created: fix objectStore.auth in ${CONFIG_FILE} (remove <...> placeholders)." 
+ fi + else + kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ Object storage credentials secret ready" + fi # Build imagePullSecrets YAML from created secrets local image_pull_secrets="" @@ -2825,6 +2851,14 @@ saia_service_template_enabled_k0s() { [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]] } +# True when SAIA public Service is explicitly NodePort. MetalLB is not used in +# that mode, so install_metallb skips the Helm install even if metallb.install=true. +k0s_saia_service_template_is_nodeport() { + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ "${svc_type}" == "NodePort" ]] +} + wait_for_k0s_aiservice_exists() { local name="$1" timeout="${2:-600}" waited=0 while ! kubectl -n "${AI_NS}" get aiservice "${name}" >/dev/null 2>&1; do @@ -2870,6 +2904,12 @@ metallb_enabled_k0s() { install_metallb() { metallb_enabled_k0s || { log "metallb.install != true — skipping MetalLB install"; return 0; } + if k0s_saia_service_template_is_nodeport; then + log "Skipping MetalLB install: aiPlatform.serviceTemplate.type=NodePort (LoadBalancer provider not used for SAIA)." + log "NOTE: metallb.install=true has no effect while SAIA uses NodePort. Set metallb.install=false to match config, or use type=LoadBalancer to install MetalLB." 
+ return 0 + fi + local ns chart_version pool_name addr_count mode ns="$(yq eval '.metallb.namespace // "metallb-system"' "${CONFIG_FILE}" 2>/dev/null)" chart_version="$(yq eval '.metallb.chartVersion // "0.14.8"' "${CONFIG_FILE}" 2>/dev/null)" @@ -3018,7 +3058,7 @@ patch_k0s_saia_public_service_workaround() { if saia_service_template_enabled_k0s; then log "Patching AIService/${aiservice_name} with SAIA public exposure settings (type=${svc_type})..." if [[ "${svc_type}" == "NodePort" && -n "${svc_node_port}" && "${svc_node_port}" != "null" ]]; then - log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true." >&2 + log "WARNING: NodePort exposure is discouraged on k0s. Prefer type=LoadBalancer with metallb.install=true (MetalLB install is skipped automatically when type=NodePort)." >&2 kubectl -n "${AI_NS}" patch aiservice "${aiservice_name}" --type merge -p "{ \"spec\": { \"serviceTemplate\": { From 7b100b797cbb3ffb4da8d70af52e4152283062cc Mon Sep 17 00:00:00 2001 From: kbhos-splunk Date: Thu, 30 Apr 2026 02:26:59 +0530 Subject: [PATCH 4/5] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tools/cluster_setup/k0s_cluster_with_stack.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index ee0e3aa..b4d508f 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -3222,12 +3222,10 @@ check_platform_health() { # Check 1: Cluster nodes log "Checking cluster nodes..." local not_ready - # `wc -l` on macOS returns " N" with leading whitespace and the `|| - # echo` fallback can append a second value, so the resulting string was - # tripping the `[[ -gt 0 ]]` test ("[[: 0\n0: syntax error"). Strip - # whitespace and default to 0 if grep returns 1 (no matches). 
- not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l | tr -d '[:space:]') - not_ready="${not_ready:-0}" + # Count nodes whose status is not Ready without relying on grep exit codes. + # This avoids `set -euo pipefail` aborting the script when all nodes are + # Ready, while still producing a whitespace-free numeric result. + not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk 'index($0, " Ready ") == 0 { count++ } END { print count+0 }') if [[ "${not_ready}" -gt 0 ]]; then warn "Found ${not_ready} node(s) not in Ready state" kubectl get nodes From bfc43001b5fa4163e49267783c105d79537e0874 Mon Sep 17 00:00:00 2001 From: kbhos-splunk Date: Thu, 30 Apr 2026 02:27:06 +0530 Subject: [PATCH 5/5] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tools/cluster_setup/cluster-config.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index ebd7b82..6109331 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -271,9 +271,11 @@ aiPlatform: # Public SAIA exposure (NodePort-free) # --------------------------------------------------------------------------- # The operator renders a public Kubernetes Service named - # `-saia-service` whose endpoints are the in-cluster nginx - # pods (nginx terminates path routing to saia v1 / v2). The install script - # then configures HOW that Service is reached from outside the cluster. + # `<aiservice-name>-saia-service`; because the AIService is typically named + # `<aiplatform-name>-saia`, the resulting Service is usually + # `<aiplatform-name>-saia-saia-service`. Its endpoints are the in-cluster + # nginx pods (nginx terminates path routing to saia v1 / v2). The install + # script then configures HOW that Service is reached from outside the cluster.
# # IMPORTANT: this template intentionally does NOT use Service.type=NodePort. # Many enterprise security policies prohibit opening 30000-32767 on every