From 9f357e5c0421c034b3f74a9b43616d052a709ab3 Mon Sep 17 00:00:00 2001
From: Mohammed Arif <marif@splunk.com>
Date: Thu, 7 May 2026 20:22:31 +0530
Subject: [PATCH 1/3] feat: Update concise doc from bug bash feedback

---
 tools/cluster_setup/K0S_QUICKSTART.md | 371 ++++++++++++++++++--------
 1 file changed, 261 insertions(+), 110 deletions(-)

diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md
index ab67e91..a9cfbec 100644
--- a/tools/cluster_setup/K0S_QUICKSTART.md
+++ b/tools/cluster_setup/K0S_QUICKSTART.md
@@ -10,15 +10,89 @@ Deploys the complete Splunk AI Platform stack on k0s Kubernetes using pre-provis
 
 **Nodes (all):** RHEL 9 · passwordless SSH + sudo · Python 3.8+
 
-| Node Type | Min CPU | Min RAM | Min Disk | Notes |
-|-----------|---------|---------|----------|-------|
-| Controller | 4 | 8 GB | 100 GB | API server, etcd, scheduler |
-| CPU Worker | 8 | 32 GB | 200 GB | Weaviate, Ray head, Splunk |
-| GPU Worker | 8 | 32 GB | 500 GB | NVIDIA GPU required (3 * H100, 3 * L40S) |
+| Node Type  | Min CPU | Min RAM (per node) | Min Disk | Notes                                                    |
+| ---------- | ------- | ------------------ | -------- | -------------------------------------------------------- |
+| Controller | 4       | 8 GB               | 100 GB   | API server, etcd, scheduler                              |
+| CPU Worker | 8       | 32 GB              | 200 GB   | Weaviate, Ray head, Splunk, SAIA API/v2, Data Loader     |
+| GPU Worker | 16      | 384 GB             | 500 GB   | NVIDIA GPU required (3 × H100 80GB **or** 6 × L40S 48GB) |
 
-**Ports between nodes:** 22 (SSH), 6443 (API), 2380 (etcd), 10250 (kubelet), 8132 (konnectivity), 4789/UDP (VXLAN), 179 (Calico BGP)
 
-**External storage:** Any S3-compatible endpoint (SeaweedFS, MinIO, AWS S3). Not deployed by the script.
+**Ports between nodes:** 22 (SSH), 6443 (API), 2380 (etcd), 10250 (kubelet), 8132 (konnectivity), 4789/UDP (VXLAN), 179 (Calico BGP). Best practice is to allow all ports between nodes.
+
+**External S3-compatible storage:** Any S3-compatible endpoint (SeaweedFS, MinIO, AWS S3). The storage is customer-managed: it is not deployed by the installer and must be provisioned **before** running it.
+
+The S3 bucket must be pre-populated with the following directories before installation:
+
+
+| Directory          | Required | Description                                                                                                                                                   |
+| ------------------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model_artifacts/` | **Yes**  | Pre-trained model weights. Must be uploaded before install; Ray workers download models from here at startup. Without these, AI inference services will fail. |
+
+
+**Required models in `model_artifacts/`:**
+
+
+| Model                             | Purpose                                         |
+| --------------------------------- | ----------------------------------------------- |
+| `gpt-oss-120b`                    | Primary LLM for chat, SPL generation, reasoning |
+| `gpt-oss-20b`                     | Field descriptions, conversation titles         |
+| `all-minilm-l6-v2`                | Sentence embeddings (data loader, SAIA)         |
+| `bi-encoder`                      | Semantic search ranking                         |
+| `cross-encoder`                   | Re-ranking search results                       |
+| `uae-large`                       | Embedding model                                 |
+| `e5-language-classifier`          | Language detection                              |
+| `xlm-roberta-language-classifier` | Multilingual language classification            |
+| `pii-classifier`                  | PII detection                                   |
+| `mbart-translator`                | Translation                                     |
+
+
+### Downloading and Uploading Model Artifacts
+
+Helper scripts in `tools/artifacts_download_upload_scripts/` automate downloading models from Hugging Face and uploading them to S3-compatible storage.
+
+**Step 1 — Download models from Hugging Face:**
+
+```bash
+cd tools/artifacts_download_upload_scripts
+# Edit model_artifacts_configs.yaml if you need to add/remove models or set HF credentials for gated models
+./download_from_huggingface.sh
+```
+
+Downloads all configured models into `./model_artifacts/`. Auto-installs dependencies (`wget`, `yq`, `git-lfs`). Supports gated models via `hf-token` / `hf-username` in `model_artifacts_configs.yaml`.
+
+**Step 2 — Upload to your object store:**
+
+
+| Storage Type          | Script                   | Key Environment Variables                                                                                                                |
+| --------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| MinIO / S3-compatible | `upload_to_minio.sh`     | `OBJECT_STORE_ENDPOINT`, `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY`                                     |
+| SeaweedFS             | `upload_to_seaweedfs.sh` | `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` |
+| AWS S3                | `upload_to_s3.sh`        | `S3_BUCKET`, `S3_REGION` (requires AWS CLI credentials)                                                                                  |
+| MinIO via AWS CLI     | `upload_to_minio_aws.sh` | `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` |
+
+
+Example (SeaweedFS):
+
+```bash
+S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 \
+S3COMPAT_OBJECT_STORE_BUCKET=ai-platform-bucket \
+S3COMPAT_OBJECT_STORE_ACCESS_KEY=minioadmin \
+S3COMPAT_OBJECT_STORE_SECRET_KEY=minioadmin \
+./upload_to_seaweedfs.sh
+```
+
+**Additional utilities:**
+
+
+| Script                         | Purpose                                      |
+| ------------------------------ | -------------------------------------------- |
+| `test_minio_connection.sh`     | Diagnose S3-compatible endpoint connectivity |
+| `create_seaweedfs_folders.sh`  | Create standard bucket folder structure      |
+| `install_seaweedfs_systemd.sh` | Install SeaweedFS as a systemd service       |
+| `install_minio_ec2.sh`         | Install MinIO on an EC2 instance             |
+
+
+> See `tools/artifacts_download_upload_scripts/README.md` for full usage details.
 
 ## 2. Quick Start
 
@@ -38,12 +112,11 @@ kubectl get aiplatform -n ai-platform
 
 ## 3. Commands
 
-| Command | Description |
-|---------|-------------|
-| `install` | Create k0s cluster + deploy full AI Platform stack |
-| `delete` | Stop k0s, remove services |
-| `clean-all` | Stop + reset + wipe all k0s state from every node |
-| `join-workers` | Add or rejoin worker nodes to an existing cluster |
+| Command        | Description                                        |
+| -------------- | -------------------------------------------------- |
+| `install`      | Create k0s cluster + deploy full AI Platform stack |
+| `clean-all`    | Stop + reset + wipe all k0s state from every node  |
+| `join-workers` | Add or rejoin worker nodes to an existing cluster  |
 
 ```bash
 CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh <command>
@@ -51,12 +124,13 @@ CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh <command>
 
 **Environment variables:**
 
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `CONFIG_FILE` | `./k0s-cluster-config.yaml` | Config file path |
-| `AUTO_APPROVE` | `false` | Skip confirmation prompts |
-| `USE_EXISTING` | from config | Override `cluster.useExisting` |
-| `LOG_DIR` | `./logs` | Session log directory |
+
+| Variable       | Default                     | Description                    |
+| -------------- | --------------------------- | ------------------------------ |
+| `CONFIG_FILE`  | `./k0s-cluster-config.yaml` | Config file path               |
+| `USE_EXISTING` | from config                 | Override `cluster.useExisting` |
+| `LOG_DIR`      | `./logs`                    | Session log directory          |
+
 
 ## 4. What `install` Does
 
@@ -71,7 +145,7 @@ CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh <command>
 8. Health checks → access info
 ```
 
-**Safety gate:** If the controller already has Ready nodes, `install` refuses to wipe. Use `useExisting: auto` or run `delete` first.
+**Safety gate:** If the controller already has Ready nodes, `install` refuses to wipe. Use `useExisting: auto` or run `clean-all` first.
 
 **Session logging:** All output → `logs/k0s-install-YYYY-MM-DD_HH-MM-SS.log`
 
@@ -81,140 +155,211 @@ The config template is `k0s-cluster-config.yaml`. Copy it and edit. Key sections
 
 ### cluster
 
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `name` | Yes | — | Cluster name (kubeconfig, labels) |
-| `useExisting` | No | `never` | `auto` / `force` / `never` |
-| `sshUser` | Yes | `ubuntu` | SSH user for all nodes |
-| `sshKeyPath` | Yes | — | SSH private key path |
+
+| Field         | Required | Default | Description                       |
+| ------------- | -------- | ------- | --------------------------------- |
+| `name`        | Yes      | —       | Cluster name (kubeconfig, labels) |
+| `useExisting` | No       | `never` | `auto` / `force` / `never`        |
+| `sshUser`     | Yes      | `root`  | SSH user for all nodes            |
+| `sshKeyPath`  | Yes      | —       | SSH private key path              |
+
 
 ### nodes
 
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `controllers` | **Yes** | `1` | Controller count (1 or 3 for HA) |
-| `cpuWorkers` | **Yes** | `2` | First N workers labeled CPU |
-| `gpuWorkers` | **Yes** | `1` | Remaining workers labeled GPU |
-| `existingIPs.controllers` | **Yes** | — | Controller IP list |
-| `existingIPs.workers` | **Yes** | — | Worker IP list |
+
+| Field                     | Required | Default | Description                      |
+| ------------------------- | -------- | ------- | -------------------------------- |
+| `controllers`             | **Yes**  | —       | Controller count (1 or 3 for HA) |
+| `cpuWorkers`              | **Yes**  | —       | First N workers labeled CPU      |
+| `gpuWorkers`              | **Yes**  | —       | Remaining workers labeled GPU    |
+| `existingIPs.controllers` | **Yes**  | —       | Controller IP list               |
+| `existingIPs.workers`     | **Yes**  | —       | Worker IP list                   |
+
 
 ### storage
 
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `storageClass` | **Yes** | `local-path` | StorageClass for PVCs |
-| `vectorDbSize` | **Yes** | `50Gi` | Weaviate PV size |
-| `minimumDiskSpace.controller` | No | `100` | Preflight disk check (GB) |
-| `minimumDiskSpace.cpuWorker` | No | `200` | Preflight disk check (GB) |
-| `minimumDiskSpace.gpuWorker` | No | `500` | Preflight disk check (GB) |
-| `objectStore.type` | **Yes** | `minio` | `aws` / `s3compat` / `minio` / `seaweedfs` |
-| `objectStore.bucket` | **Yes** | `ai-platform-data` | Bucket name |
-| `objectStore.endpoint` | **Yes** | — | S3 endpoint (*required for non-AWS) |
-| `objectStore.auth.rootUser` | Yes | — | Access key |
-| `objectStore.auth.rootPassword` | Yes | — | Secret key |
+
+| Field                           | Required | Default            | Description                                |
+| ------------------------------- | -------- | ------------------ | ------------------------------------------ |
+| `storageClass`                  | **Yes**  | `local-path`       | StorageClass for PVCs                      |
+| `vectorDbSize`                  | **Yes**  | `50Gi`             | Weaviate PV size                           |
+| `minimumDiskSpace.controller`   | No       | `100`              | Preflight disk check (GB)                  |
+| `minimumDiskSpace.cpuWorker`    | No       | `200`              | Preflight disk check (GB)                  |
+| `minimumDiskSpace.gpuWorker`    | No       | `500`              | Preflight disk check (GB)                  |
+| `objectStore.type`              | **Yes**  | `minio`            | `aws` / `s3compat` / `minio` / `seaweedfs` |
+| `objectStore.bucket`            | **Yes**  | `ai-platform-data` | Bucket name                                |
+| `objectStore.endpoint`          | **Yes**  | —                  | S3 endpoint (*required for non-AWS)        |
+| `objectStore.auth.rootUser`     | Yes      | —                  | Access key                                 |
+| `objectStore.auth.rootPassword` | Yes      | —                  | Secret key                                 |
+
+
+#### S3 Bucket Directory Layout
+
+The S3 bucket serves as the shared storage layer for both pre-staged artifacts and runtime data. Apart from `model_artifacts/`, which the admin must upload before install, the directories below are created and managed automatically by the platform:
+
+**Pre-staged (must exist before install):**
+
+
+| Directory          | Owner              | Description                                                |
+| ------------------ | ------------------ | ---------------------------------------------------------- |
+| `model_artifacts/` | Admin (pre-staged) | Pre-trained model weights loaded by Ray workers at startup |
+
+
+**Created at runtime by SAIA services:**
+
+
+| Directory                | Owner                | Description                                                              |
+| ------------------------ | -------------------- | ------------------------------------------------------------------------ |
+| `conversations/`         | SAIA v2 API          | Conversation history per tenant                                          |
+| `config/`                | SAIA v2 API / Worker | Tenant data configuration (`config/tenant_data_config/{tenant}.yaml`)    |
+| `storage_queue/`         | SAIA v2 Worker       | S3-backed task queue for async ingestion (`urgent/`, `batch/`, `locks/`) |
+| `ingestion/tenant_data/` | SAIA v2 Worker       | Temporary ingestion payload storage during processing                    |
+| `field_counts/`          | SAIA v2 Worker       | Cached field count statistics per tenant/index/sourcetype                |
+| `admin/preferences/`     | SAIA v2 API          | Admin-curated markdown preferences per tenant                            |
+| `job_groups/`            | SAIA v1 API          | Background job group state for data upload tasks                         |
+
+
+**Created at runtime by other platform components:**
+
+
+| Directory    | Owner             | Description          |
+| ------------ | ----------------- | -------------------- |
+| `artifacts/` | AI Operator       | Deployment artifacts |
+| `tasks/`     | AI Operator / Ray | Task execution state |
+
+
+> **Note:** Do not manually delete runtime directories (`conversations/`, `config/`, `storage_queue/`) as they contain active state. The one exception is `storage_queue/locks/`, which may need to be deleted to clear stale distributed locks after a non-graceful pod restart.
 
 ### images
 
 Short paths auto-prefixed with `images.registry`. All marked **Yes** are required; others have defaults.
 
-| Field | Req | Default |
-|-------|-----|---------|
-| `registry` | No | `""` |
-| `operator.image` | **Yes** | — |
-| `splunk.image` | **Yes** | — |
-| `splunk.operatorImage` | No | `docker.io/splunk/splunk-operator:3.0.0` |
-| `ray.headImage` | **Yes** | — |
-| `ray.workerImage` | **Yes** | — |
-| `weaviate.image` | **Yes** | — |
-| `saia.apiImage` | **Yes** | — |
-| `saia.apiV2Image` | **Yes** | — |
-| `saia.dataLoaderImage` | **Yes** | — |
-| `nginx.image` | No | `docker.io/library/nginx:1.27-alpine` |
-| `fluentBit.image` | No | `fluent/fluent-bit:1.9.6` |
-| `otelCollector.image` | No | `otel/opentelemetry-collector-contrib:0.122.1` |
+**Image sources:**
+
+
+| Source      | Images                                                                                                                     |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------- |
+| VOC Portal  | `operator.image`, `ray.headImage`, `ray.workerImage`, `saia.apiImage`, `saia.apiV2Image`, `saia.dataLoaderImage`, `splunk.image` |
+| `docker.io` | `splunk.operatorImage`, `weaviate.image`, `nginx.image`, `fluentBit.image`, `otelCollector.image`                          |
+| `quay.io`   | KubeRay Operator (deployed via Helm, not in this config)                                                                   |
+
+
+> **Note:** VOC Portal images must be pushed to a registry (e.g., ECR, ACR, GCR, private registry) accessible by the cluster. Set `images.registry` to that registry; short paths like `ml-platform/ray/ray-head:tag` are auto-prefixed with it.
+
+
+| Field                  | Req     | Default                                        |
+| ---------------------- | ------- | ---------------------------------------------- |
+| `registry`             | **Yes** | `""`                                           |
+| `operator.image`       | **Yes** | —                                              |
+| `splunk.image`         | **Yes** | —                                              |
+| `splunk.operatorImage` | No      | `docker.io/splunk/splunk-operator:3.0.0`       |
+| `ray.headImage`        | **Yes** | —                                              |
+| `ray.workerImage`      | **Yes** | —                                              |
+| `weaviate.image`       | **Yes** | —                                              |
+| `saia.apiImage`        | **Yes** | —                                              |
+| `saia.apiV2Image`      | **Yes** | —                                              |
+| `saia.dataLoaderImage` | **Yes** | —                                              |
+| `nginx.image`          | No      | `docker.io/library/nginx:1.27-alpine`          |
+| `fluentBit.image`      | No      | `fluent/fluent-bit:1.9.6`                      |
+| `otelCollector.image`  | No      | `otel/opentelemetry-collector-contrib:0.122.1` |
+
 
 ### aiPlatform
 
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `name` | **Yes** | `${CLUSTER_NAME}-ai-platform` | CR name |
-| `defaultAcceleratorType` | **Yes** | `""` | `L40S` / `H100` / empty |
-| `workerGroupConfig.imageRegistry` | No | `""` | Ray worker image override |
-| `features[].name` | Yes | — | Feature name (e.g., `saia`) |
-| `features[].version` | Yes | — | Feature version |
-| `cpuScheduling` | No | auto | `nodeSelector` + `tolerations` for CPU pods |
-| `gpuScheduling` | No | auto | `nodeSelector` + `tolerations` for GPU pods |
-| `serviceTemplate.type` | **Yes** | — | `NodePort` / `LoadBalancer` for SAIA exposure |
-| `serviceTemplate.nodePort` | **Yes** | — | Port number (NodePort only) |
+
+| Field                             | Required | Default                       | Description                                   |
+| --------------------------------- | -------- | ----------------------------- | --------------------------------------------- |
+| `name`                            | **Yes**  | `${CLUSTER_NAME}-ai-platform` | CR name                                       |
+| `defaultAcceleratorType`          | **Yes**  | `""`                          | `L40S` / `H100`                               |
+| `workerGroupConfig.imageRegistry` | No       | `""`                          | Ray worker image override                     |
+| `features[].name`                 | Yes      | —                             | Feature name (e.g., `saia`)                   |
+| `features[].version`              | Yes      | —                             | Feature version                               |
+| `cpuScheduling`                   | No       | auto                          | `nodeSelector` + `tolerations` for CPU pods   |
+| `gpuScheduling`                   | No       | auto                          | `nodeSelector` + `tolerations` for GPU pods   |
+| `serviceTemplate.type`            | **Yes**  | —                             | `NodePort` / `LoadBalancer` for SAIA exposure |
+| `serviceTemplate.nodePort`        | **Yes**  | —                             | Port number (NodePort only)                   |
+
 
 ### imagePullSecrets
 
 The `secrets[]` list is **not consumed**. The script auto-detects secrets by checking hardcoded names (`ecr-registry-secret`, `docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`).
 
-| Field | Description |
-|-------|-------------|
-| `autoCreateECR` | Create ECR secret from AWS creds |
-| `dockerHub.enabled` | Create Docker Hub secret |
-| `gcr.enabled` | Create GCR secret |
-| `acr.enabled` | Create ACR secret |
-| `custom.enabled` | Create custom registry secret |
+
+| Field               | Description                      |
+| ------------------- | -------------------------------- |
+| `autoCreateECR`     | Create ECR secret from AWS creds |
+| `dockerHub.enabled` | Create Docker Hub secret         |
+| `gcr.enabled`       | Create GCR secret                |
+| `acr.enabled`       | Create ACR secret                |
+| `custom.enabled`    | Create custom registry secret    |
+
 
 ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh.
 
 ### ecr
 
-| Field | Description |
-|-------|-------------|
+
+| Field     | Description    |
+| --------- | -------------- |
 | `account` | AWS account ID |
-| `region` | ECR region |
+| `region`  | ECR region     |
+
 
 ## 6. Node Labels & GPU
 
 The script auto-labels nodes:
 
-| Node type | Key labels |
-|-----------|------------|
-| Controller | `splunk.ai/workload-type: control-plane` |
-| CPU Worker | `splunk.ai/workload-type: cpu`, `splunk.ai/instance-type: cpu-worker` |
+
+| Node type  | Key labels                                                                                       |
+| ---------- | ------------------------------------------------------------------------------------------------ |
+| Controller | `splunk.ai/workload-type: control-plane`                                                         |
+| CPU Worker | `splunk.ai/workload-type: cpu`, `splunk.ai/instance-type: cpu-worker`                            |
 | GPU Worker | `splunk.ai/workload-type: gpu`, `nvidia.com/gpu: "true"`, taint `nvidia.com/gpu=true:NoSchedule` |
 
+
 **NVIDIA drivers** are installed directly on GPU nodes (not GPU Operator). Supported: RHEL 9 currently. The script installs kernel headers, CUDA repo, `cuda-drivers`, NVIDIA Container Toolkit, then verifies with `nvidia-smi`.
 
 ## 7. Troubleshooting
 
 **SSH failures:**
+
 ```bash
 ssh -i ~/.ssh/key.pem user@node-ip hostname   # test connectivity
 chmod 600 ~/.ssh/key.pem                       # fix permissions
 ```
 
 **Safety gate ("refusing to wipe"):**
-Set `useExisting: auto` in config, or run `delete` then `install`.
+Set `useExisting: auto` in config, or run `clean-all` then `install`.
 
 **k0s issues:**
+
 ```bash
 ssh user@controller-ip "sudo k0s status"
 ssh user@controller-ip "sudo journalctl -u k0scontroller -f"
 ```
 
 **Worker join failures:**
+
 ```bash
 CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers
 ```
 
 **GPU not detected:**
+
 ```bash
 kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds
 ssh user@gpu-node nvidia-smi
 ```
 
 **AIPlatform not ready:**
+
 ```bash
 kubectl describe aiplatform -n ai-platform
 kubectl logs -n splunk-ai-operator-system deployment/splunk-ai-operator-controller-manager
 ```
 
 **Session logs:**
+
 ```bash
 ls -lt tools/cluster_setup/logs/
 tail -f tools/cluster_setup/logs/k0s-install-*.log
@@ -231,36 +376,42 @@ tail -f tools/cluster_setup/logs/k0s-install-*.log
 
 **Binaries/charts downloaded by the script:**
 
-| What | Source |
-|------|--------|
-| k0s binary | `https://get.k0s.sh` |
-| cert-manager v1.13.0 | `github.com/cert-manager/cert-manager` |
-| kube-prometheus-stack | `prometheus-community` Helm repo |
-| opentelemetry-operator | `open-telemetry` Helm repo |
-| kuberay-operator v1.2.2 | `ray-project` Helm repo |
-| NVIDIA device plugin | `github.com/NVIDIA/k8s-device-plugin` |
-| local-path-provisioner | `github.com/rancher/local-path-provisioner` |
+
+| What                    | Source                                      |
+| ----------------------- | ------------------------------------------- |
+| k0s binary              | `https://get.k0s.sh`                        |
+| cert-manager v1.13.0    | `github.com/cert-manager/cert-manager`      |
+| kube-prometheus-stack   | `prometheus-community` Helm repo            |
+| opentelemetry-operator  | `open-telemetry` Helm repo                  |
+| kuberay-operator v1.2.2 | `ray-project` Helm repo                     |
+| NVIDIA device plugin    | `github.com/NVIDIA/k8s-device-plugin`       |
+| local-path-provisioner  | `github.com/rancher/local-path-provisioner` |
+
 
 **Container images pulled at runtime:**
 
-| Image | Default Source |
-|-------|---------------|
-| Splunk AI Operator, Ray Head/Worker, SAIA API v1/v2, Data Loader, Splunk Enterprise | ECR or configured registry |
-| Weaviate | `docker.io/semitechnologies/weaviate` |
-| Nginx | `docker.io/library/nginx:1.27-alpine` |
-| Fluent Bit | `docker.io/fluent/fluent-bit:1.9.6` |
-| OTel Collector | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` |
-| Splunk Operator | `docker.io/splunk/splunk-operator:3.0.0` |
-| KubeRay Operator | `quay.io/kuberay/operator:v1.2.2` |
-| Prometheus, Grafana, cert-manager, NVIDIA plugin, local-path | Pulled by their respective Helm charts/manifests |
+
+| Image                                                                               | Default Source                                           |
+| ----------------------------------------------------------------------------------- | -------------------------------------------------------- |
+| Splunk AI Operator, Ray Head/Worker, SAIA API v1/v2, Data Loader, Splunk Enterprise | ECR or configured registry                               |
+| Weaviate                                                                            | `docker.io/semitechnologies/weaviate`                    |
+| Nginx                                                                               | `docker.io/library/nginx:1.27-alpine`                    |
+| Fluent Bit                                                                          | `docker.io/fluent/fluent-bit:1.9.6`                      |
+| OTel Collector                                                                      | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` |
+| Splunk Operator                                                                     | `docker.io/splunk/splunk-operator:3.0.0`                 |
+| KubeRay Operator                                                                    | `quay.io/kuberay/operator:v1.2.2`                        |
+| Prometheus, Grafana, cert-manager, NVIDIA plugin, local-path                        | Pulled by their respective Helm charts/manifests         |
+
 
 **NVIDIA packages on GPU nodes (RHEL 9):**
 
-| Package | Source |
-|---------|--------|
-| CUDA drivers | `developer.download.nvidia.com/compute/cuda/repos/` |
-| Container Toolkit | `nvidia.github.io/libnvidia-container/` |
-| EPEL (RHEL 10 only) | `dl.fedoraproject.org/pub/epel/` |
+
+| Package             | Source                                              |
+| ------------------- | --------------------------------------------------- |
+| CUDA drivers        | `developer.download.nvidia.com/compute/cuda/repos/` |
+| Container Toolkit   | `nvidia.github.io/libnvidia-container/`             |
+| EPEL (RHEL 10 only) | `dl.fedoraproject.org/pub/epel/`                    |
+
 
 ## 9. Architecture
 

From 38bc50d3fcf0974254718da8d11f07ca548d5ca2 Mon Sep 17 00:00:00 2001
From: Mohammed Arif <marif@splunk.com>
Date: Thu, 7 May 2026 22:41:04 +0530
Subject: [PATCH 2/3] fix: addressing review comments

---
 tools/cluster_setup/K0S_QUICKSTART.md | 45 +++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md
index a9cfbec..97db8f2 100644
--- a/tools/cluster_setup/K0S_QUICKSTART.md
+++ b/tools/cluster_setup/K0S_QUICKSTART.md
@@ -277,7 +277,7 @@ Short paths auto-prefixed with `images.registry`. All marked **Yes** are require
 | `cpuScheduling`                   | No       | auto                          | `nodeSelector` + `tolerations` for CPU pods   |
 | `gpuScheduling`                   | No       | auto                          | `nodeSelector` + `tolerations` for GPU pods   |
 | `serviceTemplate.type`            | **Yes**  | —                             | `NodePort` / `LoadBalancer` for SAIA exposure |
-| `serviceTemplate.nodePort`        | **Yes**  | —                             | Port number (NodePort only)                   |
+| `serviceTemplate.nodePort`        | No       | —                             | Port number (used only when `serviceTemplate.type` is `NodePort`) |
 
 
 ### imagePullSecrets
@@ -294,7 +294,7 @@ The `secrets[]` list is **not consumed**. The script auto-detects secrets by che
 | `custom.enabled`    | Create custom registry secret    |
 
 
-ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh.
+ECR tokens usually expire after 12 hours. Re-run install or set up a CronJob to refresh.
 
 ### ecr
 
@@ -305,6 +305,47 @@ ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh.
 | `region`  | ECR region     |
 
 
+### metallb
+
+k0s has no built-in `LoadBalancer` provider. When `aiPlatform.serviceTemplate.type=LoadBalancer` (the recommended SAIA exposure path), the installer deploys MetalLB to allocate a VIP from a pool you provide. Skipped automatically when `type=NodePort`.
+
+
+| Field            | Required                | Default          | Description                                              |
+| ---------------- | ----------------------- | ---------------- | -------------------------------------------------------- |
+| `install`        | **Yes**                 | `false`          | Set `true` to install MetalLB                            |
+| `chartVersion`   | No                      | `0.14.8`         | `metallb/metallb` Helm chart version                     |
+| `namespace`      | No                      | `metallb-system` | MetalLB install namespace                                |
+| `pool.name`      | No                      | `saia-pool`      | Name of the `IPAddressPool`                              |
+| `pool.addresses` | **Yes**                 | —                | Free, routable IP range(s) on your LAN                   |
+| `mode`           | No                      | `layer2`         | `layer2` (most LANs) or `bgp` (data-center fabric)       |
+| `bgpPeers`       | **Yes when `mode=bgp`** | `[]`             | List of `{peerAddress, peerASN, myASN}` for BGP upstream |
+
+
+**Minimal config (Layer-2):**
+
+```yaml
+metallb:
+  install: true
+  pool:
+    addresses:
+      - "10.20.30.100-10.20.30.110"   # free range on the worker LAN
+  mode: "layer2"
+```
+
+**Verify MetalLB after install:**
+
+```bash
+# MetalLB controller and speakers
+kubectl -n metallb-system get deploy,ds
+
+# Address pool and advertisement
+kubectl -n metallb-system get ipaddresspool,l2advertisement,bgppeer,bgpadvertisement
+
+# SAIA service should have an EXTERNAL-IP from the pool
+kubectl -n ai-platform get svc -l app.kubernetes.io/component=saia
+```
+
+
 ## 6. Node Labels & GPU
 
 The script auto-labels nodes:

From 48b06753a1131ef76a383d756220c152d833d4e7 Mon Sep 17 00:00:00 2001
From: Mohammed Arif <marif@splunk.com>
Date: Thu, 7 May 2026 22:43:01 +0530
Subject: [PATCH 3/3] fix: address review comments

---
 tools/cluster_setup/k0s-cluster-config.yaml   | 2 +-
 tools/cluster_setup/k0s_cluster_with_stack.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml
index 7e733af..a7b859c 100644
--- a/tools/cluster_setup/k0s-cluster-config.yaml
+++ b/tools/cluster_setup/k0s-cluster-config.yaml
@@ -14,8 +14,8 @@
 cluster:
   name: airgap-cluster
   region: us-east-2                       # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem
-  sshUser: ec2-user                       # CHANGE THIS: SSH user for remote nodes
   sshKeyPath: ~/.ssh/id_rsa                  # CHANGE THIS: Path to SSH private key
+  sshUser: root                       # CHANGE THIS: SSH user for remote nodes
 
 # ---------- Node Configuration ----------
 nodes:
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index b4d508f..17defca 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -112,7 +112,7 @@ load_config() {
   # Node IPs (for existing infrastructure)
   EXISTING_CONTROLLER_IPS=$(yq eval '.nodes.existingIPs.controllers[]' "${CONFIG_FILE}" 2>/dev/null | tr '\n' ' ' || echo "")
   EXISTING_WORKER_IPS=$(yq eval '.nodes.existingIPs.workers[]' "${CONFIG_FILE}" 2>/dev/null | tr '\n' ' ' || echo "")
-  SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "ubuntu")
+  SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "root")
   SSH_KEY_PATH=$(yq eval '.cluster.sshKeyPath' "${CONFIG_FILE}" 2>/dev/null || echo "")
 
   # Validate existingIPs are provided (mandatory for on-prem)