From 9f357e5c0421c034b3f74a9b43616d052a709ab3 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Thu, 7 May 2026 20:22:31 +0530 Subject: [PATCH 1/3] feat: Update concise doc from bug bash feedback --- tools/cluster_setup/K0S_QUICKSTART.md | 371 ++++++++++++++++++-------- 1 file changed, 261 insertions(+), 110 deletions(-) diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md index ab67e91..a9cfbec 100644 --- a/tools/cluster_setup/K0S_QUICKSTART.md +++ b/tools/cluster_setup/K0S_QUICKSTART.md @@ -10,15 +10,89 @@ Deploys the complete Splunk AI Platform stack on k0s Kubernetes using pre-provis **Nodes (all):** RHEL 9 · passwordless SSH + sudo · Python 3.8+ -| Node Type | Min CPU | Min RAM | Min Disk | Notes | -|-----------|---------|---------|----------|-------| -| Controller | 4 | 8 GB | 100 GB | API server, etcd, scheduler | -| CPU Worker | 8 | 32 GB | 200 GB | Weaviate, Ray head, Splunk | -| GPU Worker | 8 | 32 GB | 500 GB | NVIDIA GPU required (3 * H100, 3 * L40S) | +| Node Type | Min CPU | Min RAM (per node) | Min Disk | Notes | +| ---------- | ------- | ------------------ | -------- | -------------------------------------------------------- | +| Controller | 4 | 8 GB | 100 GB | API server, etcd, scheduler | +| CPU Worker | 8 | 32 GB | 200 GB | Weaviate, Ray head, Splunk, SAIA API/v2, Data Loader | +| GPU Worker | 16 | 384 GB | 500 GB | NVIDIA GPU required (3 × H100 80GB **or** 6 × L40S 48GB) | -**Ports between nodes:** 22 (SSH), 6443 (API), 2380 (etcd), 10250 (kubelet), 8132 (konnectivity), 4789/UDP (VXLAN), 179 (Calico BGP) -**External storage:** Any S3-compatible endpoint (SeaweedFS, MinIO, AWS S3). Not deployed by the script. +**Ports between nodes:** 22 (SSH), 6443 (API), 2380 (etcd), 10250 (kubelet), 8132 (konnectivity), 4789/UDP (VXLAN), 179 (Calico BGP). Best practice is to allow all ports between nodes. + +**External S3-compatible storage:** Any S3-compatible endpoint (SeaweedFS, MinIO, AWS S3). 
Must be provisioned **before** running the installer. Customer managed. + +The S3 bucket must be pre-populated with the following directories before installation: + + +| Directory | Required | Description | +| ------------------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model_artifacts/` | **Yes** | Pre-trained model weights. Must be uploaded before install; Ray workers download models from here at startup. Without these, AI inference services will fail. | + + +**Required models in `model_artifacts/`:** + + +| Model | Purpose | +| --------------------------------- | ----------------------------------------------- | +| `gpt-oss-120b` | Primary LLM for chat, SPL generation, reasoning | +| `gpt-oss-20b` | Field descriptions, conversation titles | +| `all-minilm-l6-v2` | Sentence embeddings (data loader, SAIA) | +| `bi-encoder` | Semantic search ranking | +| `cross-encoder` | Re-ranking search results | +| `uae-large` | Embedding model | +| `e5-language-classifier` | Language detection | +| `xlm-roberta-language-classifier` | Multilingual language classification | +| `pii-classifier` | PII detection | +| `mbart-translator` | Translation | + + +### Downloading and Uploading Model Artifacts + +Helper scripts in `tools/artifacts_download_upload_scripts/` automate downloading models from Hugging Face and uploading them to S3-compatible storage. + +**Step 1 — Download models from Hugging Face:** + +```bash +cd tools/artifacts_download_upload_scripts +# Edit model_artifacts_configs.yaml if you need to add/remove models or set HF credentials for gated models +./download_from_huggingface.sh +``` + +Downloads all configured models into `./model_artifacts/`. Auto-installs dependencies (`wget`, `yq`, `git-lfs`). Supports gated models via `hf-token` / `hf-username` in `model_artifacts_configs.yaml`. 
+ +**Step 2 — Upload to your object store:** + + +| Storage Type | Script | Key Environment Variables | +| --------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| MinIO / S3-compatible | `upload_to_minio.sh` | `OBJECT_STORE_ENDPOINT`, `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY` | +| SeaweedFS | `upload_to_seaweedfs.sh` | `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` | +| AWS S3 | `upload_to_s3.sh` | `S3_BUCKET`, `S3_REGION` (requires AWS CLI credentials) | +| MinIO via AWS CLI | `upload_to_minio_aws.sh` | `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` | + + +Example (SeaweedFS): + +```bash +S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 \ +S3COMPAT_OBJECT_STORE_BUCKET=ai-platform-bucket \ +S3COMPAT_OBJECT_STORE_ACCESS_KEY=minioadmin \ +S3COMPAT_OBJECT_STORE_SECRET_KEY=minioadmin \ +./upload_to_seaweedfs.sh +``` + +**Additional utilities:** + + +| Script | Purpose | +| ------------------------------ | -------------------------------------------- | +| `test_minio_connection.sh` | Diagnose S3-compatible endpoint connectivity | +| `create_seaweedfs_folders.sh` | Create standard bucket folder structure | +| `install_seaweedfs_systemd.sh` | Install SeaweedFS as a systemd service | +| `install_minio_ec2.sh` | Install MinIO on an EC2 instance | + + +> See `tools/artifacts_download_upload_scripts/README.md` for full usage details. ## 2. Quick Start @@ -38,12 +112,11 @@ kubectl get aiplatform -n ai-platform ## 3. 
Commands -| Command | Description | -|---------|-------------| -| `install` | Create k0s cluster + deploy full AI Platform stack | -| `delete` | Stop k0s, remove services | -| `clean-all` | Stop + reset + wipe all k0s state from every node | -| `join-workers` | Add or rejoin worker nodes to an existing cluster | +| Command | Description | +| -------------- | -------------------------------------------------- | +| `install` | Create k0s cluster + deploy full AI Platform stack | +| `clean-all` | Stop + reset + wipe all k0s state from every node | +| `join-workers` | Add or rejoin worker nodes to an existing cluster | ```bash CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh @@ -51,12 +124,13 @@ CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh **Environment variables:** -| Variable | Default | Description | -|----------|---------|-------------| -| `CONFIG_FILE` | `./k0s-cluster-config.yaml` | Config file path | -| `AUTO_APPROVE` | `false` | Skip confirmation prompts | -| `USE_EXISTING` | from config | Override `cluster.useExisting` | -| `LOG_DIR` | `./logs` | Session log directory | + +| Variable | Default | Description | +| -------------- | --------------------------- | ------------------------------ | +| `CONFIG_FILE` | `./k0s-cluster-config.yaml` | Config file path | +| `USE_EXISTING` | from config | Override `cluster.useExisting` | +| `LOG_DIR` | `./logs` | Session log directory | + ## 4. What `install` Does @@ -71,7 +145,7 @@ CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh 8. Health checks → access info ``` -**Safety gate:** If the controller already has Ready nodes, `install` refuses to wipe. Use `useExisting: auto` or run `delete` first. +**Safety gate:** If the controller already has Ready nodes, `install` refuses to wipe. Use `useExisting: auto` or run `clean-all` first. **Session logging:** All output → `logs/k0s-install-YYYY-MM-DD_HH-MM-SS.log` @@ -81,140 +155,211 @@ The config template is `k0s-cluster-config.yaml`. 
Copy it and edit. Key sections ### cluster -| Field | Required | Default | Description | -|-------|----------|---------|-------------| -| `name` | Yes | — | Cluster name (kubeconfig, labels) | -| `useExisting` | No | `never` | `auto` / `force` / `never` | -| `sshUser` | Yes | `ubuntu` | SSH user for all nodes | -| `sshKeyPath` | Yes | — | SSH private key path | + +| Field | Required | Default | Description | +| ------------- | -------- | ------- | --------------------------------- | +| `name` | Yes | — | Cluster name (kubeconfig, labels) | +| `useExisting` | No | `never` | `auto` / `force` / `never` | +| `sshUser` | Yes | `root` | SSH user for all nodes | +| `sshKeyPath` | Yes | — | SSH private key path | + ### nodes -| Field | Required | Default | Description | -|-------|----------|---------|-------------| -| `controllers` | **Yes** | `1` | Controller count (1 or 3 for HA) | -| `cpuWorkers` | **Yes** | `2` | First N workers labeled CPU | -| `gpuWorkers` | **Yes** | `1` | Remaining workers labeled GPU | -| `existingIPs.controllers` | **Yes** | — | Controller IP list | -| `existingIPs.workers` | **Yes** | — | Worker IP list | + +| Field | Required | Default | Description | +| ------------------------- | -------- | ------- | -------------------------------- | +| `controllers` | **Yes** | — | Controller count (1 or 3 for HA) | +| `cpuWorkers` | **Yes** | — | First N workers labeled CPU | +| `gpuWorkers` | **Yes** | — | Remaining workers labeled GPU | +| `existingIPs.controllers` | **Yes** | — | Controller IP list | +| `existingIPs.workers` | **Yes** | — | Worker IP list | + ### storage -| Field | Required | Default | Description | -|-------|----------|---------|-------------| -| `storageClass` | **Yes** | `local-path` | StorageClass for PVCs | -| `vectorDbSize` | **Yes** | `50Gi` | Weaviate PV size | -| `minimumDiskSpace.controller` | No | `100` | Preflight disk check (GB) | -| `minimumDiskSpace.cpuWorker` | No | `200` | Preflight disk check (GB) | -| 
`minimumDiskSpace.gpuWorker` | No | `500` | Preflight disk check (GB) | -| `objectStore.type` | **Yes** | `minio` | `aws` / `s3compat` / `minio` / `seaweedfs` | -| `objectStore.bucket` | **Yes** | `ai-platform-data` | Bucket name | -| `objectStore.endpoint` | **Yes** | — | S3 endpoint (*required for non-AWS) | -| `objectStore.auth.rootUser` | Yes | — | Access key | -| `objectStore.auth.rootPassword` | Yes | — | Secret key | + +| Field | Required | Default | Description | +| ------------------------------- | -------- | ------------------ | ------------------------------------------ | +| `storageClass` | **Yes** | `local-path` | StorageClass for PVCs | +| `vectorDbSize` | **Yes** | `50Gi` | Weaviate PV size | +| `minimumDiskSpace.controller` | No | `100` | Preflight disk check (GB) | +| `minimumDiskSpace.cpuWorker` | No | `200` | Preflight disk check (GB) | +| `minimumDiskSpace.gpuWorker` | No | `500` | Preflight disk check (GB) | +| `objectStore.type` | **Yes** | `minio` | `aws` / `s3compat` / `minio` / `seaweedfs` | +| `objectStore.bucket` | **Yes** | `ai-platform-data` | Bucket name | +| `objectStore.endpoint` | **Yes** | — | S3 endpoint (*required for non-AWS) | +| `objectStore.auth.rootUser` | Yes | — | Access key | +| `objectStore.auth.rootPassword` | Yes | — | Secret key | + + +#### S3 Bucket Directory Layout + +The S3 bucket serves as the shared storage layer for both pre-staged artifacts and runtime data. 
The directories below make up the bucket layout; `model_artifacts/` must be pre-staged by an administrator, while the rest are created and managed automatically by the platform: + +**Pre-staged (must exist before install):** + + +| Directory | Owner | Description | +| ------------------ | ------------------ | ---------------------------------------------------------- | +| `model_artifacts/` | Admin (pre-staged) | Pre-trained model weights loaded by Ray workers at startup | + + +**Created at runtime by SAIA services:** + + +| Directory | Owner | Description | +| ------------------------ | -------------------- | ------------------------------------------------------------------------ | +| `conversations/` | SAIA v2 API | Conversation history per tenant | +| `config/` | SAIA v2 API / Worker | Tenant data configuration (`config/tenant_data_config/{tenant}.yaml`) | +| `storage_queue/` | SAIA v2 Worker | S3-backed task queue for async ingestion (`urgent/`, `batch/`, `locks/`) | +| `ingestion/tenant_data/` | SAIA v2 Worker | Temporary ingestion payload storage during processing | +| `field_counts/` | SAIA v2 Worker | Cached field count statistics per tenant/index/sourcetype | +| `admin/preferences/` | SAIA v2 API | Admin-curated markdown preferences per tenant | +| `job_groups/` | SAIA v1 API | Background job group state for data upload tasks | + + +**Created at runtime by other platform components:** + + +| Directory | Owner | Description | +| ------------ | ----------------- | -------------------- | +| `artifacts/` | AI Operator | Deployment artifacts | +| `tasks/` | AI Operator / Ray | Task execution state | + + +> **Note:** Do not manually delete runtime directories (`conversations/`, `config/`, `storage_queue/`) as they contain active state. The one exception is `storage_queue/locks/`, which may need to be deleted to clear stale distributed locks after a non-graceful pod restart. ### images Short paths auto-prefixed with `images.registry`. All marked **Yes** are required; others have defaults.
-| Field | Req | Default | -|-------|-----|---------| -| `registry` | No | `""` | -| `operator.image` | **Yes** | — | -| `splunk.image` | **Yes** | — | -| `splunk.operatorImage` | No | `docker.io/splunk/splunk-operator:3.0.0` | -| `ray.headImage` | **Yes** | — | -| `ray.workerImage` | **Yes** | — | -| `weaviate.image` | **Yes** | — | -| `saia.apiImage` | **Yes** | — | -| `saia.apiV2Image` | **Yes** | — | -| `saia.dataLoaderImage` | **Yes** | — | -| `nginx.image` | No | `docker.io/library/nginx:1.27-alpine` | -| `fluentBit.image` | No | `fluent/fluent-bit:1.9.6` | -| `otelCollector.image` | No | `otel/opentelemetry-collector-contrib:0.122.1` | +**Image sources:** + + +| Source | Images | +| ----------- | -------------------------------------------------------------------------------------------------------------------------- | +| VOC Portal | `operator`, `ray.headImage`, `ray.workerImage`, `saia.apiImage`, `saia.apiV2Image`, `saia.dataLoaderImage`, `splunk.image` | +| `docker.io` | `splunk.operatorImage`, `weaviate.image`, `nginx.image`, `fluentBit.image`, `otelCollector.image` | +| `quay.io` | KubeRay Operator (deployed via Helm, not in this config) | + + +> **Note:** VOC Portal images must be pushed to a registry (e.g., ECR, ACR, GCR, private registry) accessible by the cluster. Set `images.registry` to that registry; short paths like `ml-platform/ray/ray-head:tag` are auto-prefixed with it. 
+ + +| Field | Req | Default | +| ---------------------- | ------- | ---------------------------------------------- | +| `registry` | **Yes** | `""` | +| `operator.image` | **Yes** | — | +| `splunk.image` | **Yes** | — | +| `splunk.operatorImage` | No | `docker.io/splunk/splunk-operator:3.0.0` | +| `ray.headImage` | **Yes** | — | +| `ray.workerImage` | **Yes** | — | +| `weaviate.image` | **Yes** | — | +| `saia.apiImage` | **Yes** | — | +| `saia.apiV2Image` | **Yes** | — | +| `saia.dataLoaderImage` | **Yes** | — | +| `nginx.image` | No | `docker.io/library/nginx:1.27-alpine` | +| `fluentBit.image` | No | `fluent/fluent-bit:1.9.6` | +| `otelCollector.image` | No | `otel/opentelemetry-collector-contrib:0.122.1` | + ### aiPlatform -| Field | Required | Default | Description | -|-------|----------|---------|-------------| -| `name` | **Yes** | `${CLUSTER_NAME}-ai-platform` | CR name | -| `defaultAcceleratorType` | **Yes** | `""` | `L40S` / `H100` / empty | -| `workerGroupConfig.imageRegistry` | No | `""` | Ray worker image override | -| `features[].name` | Yes | — | Feature name (e.g., `saia`) | -| `features[].version` | Yes | — | Feature version | -| `cpuScheduling` | No | auto | `nodeSelector` + `tolerations` for CPU pods | -| `gpuScheduling` | No | auto | `nodeSelector` + `tolerations` for GPU pods | -| `serviceTemplate.type` | **Yes** | — | `NodePort` / `LoadBalancer` for SAIA exposure | -| `serviceTemplate.nodePort` | **Yes** | — | Port number (NodePort only) | + +| Field | Required | Default | Description | +| --------------------------------- | -------- | ----------------------------- | --------------------------------------------- | +| `name` | **Yes** | `${CLUSTER_NAME}-ai-platform` | CR name | +| `defaultAcceleratorType` | **Yes** | `""` | `L40S` / `H100` | +| `workerGroupConfig.imageRegistry` | No | `""` | Ray worker image override | +| `features[].name` | Yes | — | Feature name (e.g., `saia`) | +| `features[].version` | Yes | — | Feature version | +| 
`cpuScheduling` | No | auto | `nodeSelector` + `tolerations` for CPU pods | +| `gpuScheduling` | No | auto | `nodeSelector` + `tolerations` for GPU pods | +| `serviceTemplate.type` | **Yes** | — | `NodePort` / `LoadBalancer` for SAIA exposure | +| `serviceTemplate.nodePort` | **Yes** | — | Port number (NodePort only) | + ### imagePullSecrets The `secrets[]` list is **not consumed**. The script auto-detects secrets by checking hardcoded names (`ecr-registry-secret`, `docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`). -| Field | Description | -|-------|-------------| -| `autoCreateECR` | Create ECR secret from AWS creds | -| `dockerHub.enabled` | Create Docker Hub secret | -| `gcr.enabled` | Create GCR secret | -| `acr.enabled` | Create ACR secret | -| `custom.enabled` | Create custom registry secret | + +| Field | Description | +| ------------------- | -------------------------------- | +| `autoCreateECR` | Create ECR secret from AWS creds | +| `dockerHub.enabled` | Create Docker Hub secret | +| `gcr.enabled` | Create GCR secret | +| `acr.enabled` | Create ACR secret | +| `custom.enabled` | Create custom registry secret | + ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh. ### ecr -| Field | Description | -|-------|-------------| + +| Field | Description | +| --------- | -------------- | | `account` | AWS account ID | -| `region` | ECR region | +| `region` | ECR region | + ## 6. 
Node Labels & GPU The script auto-labels nodes: -| Node type | Key labels | -|-----------|------------| -| Controller | `splunk.ai/workload-type: control-plane` | -| CPU Worker | `splunk.ai/workload-type: cpu`, `splunk.ai/instance-type: cpu-worker` | + +| Node type | Key labels | +| ---------- | ------------------------------------------------------------------------------------------------ | +| Controller | `splunk.ai/workload-type: control-plane` | +| CPU Worker | `splunk.ai/workload-type: cpu`, `splunk.ai/instance-type: cpu-worker` | | GPU Worker | `splunk.ai/workload-type: gpu`, `nvidia.com/gpu: "true"`, taint `nvidia.com/gpu=true:NoSchedule` | + **NVIDIA drivers** are installed directly on GPU nodes (not GPU Operator). Supported: RHEL 9 currently. The script installs kernel headers, CUDA repo, `cuda-drivers`, NVIDIA Container Toolkit, then verifies with `nvidia-smi`. ## 7. Troubleshooting **SSH failures:** + ```bash ssh -i ~/.ssh/key.pem user@node-ip hostname # test connectivity chmod 600 ~/.ssh/key.pem # fix permissions ``` **Safety gate ("refusing to wipe"):** -Set `useExisting: auto` in config, or run `delete` then `install`. +Set `useExisting: auto` in config, or run `clean-all` then `install`. 
**k0s issues:** + ```bash ssh user@controller-ip "sudo k0s status" ssh user@controller-ip "sudo journalctl -u k0scontroller -f" ``` **Worker join failures:** + ```bash CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` **GPU not detected:** + ```bash kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds ssh user@gpu-node nvidia-smi ``` **AIPlatform not ready:** + ```bash kubectl describe aiplatform -n ai-platform kubectl logs -n splunk-ai-operator-system deployment/splunk-ai-operator-controller-manager ``` **Session logs:** + ```bash ls -lt tools/cluster_setup/logs/ tail -f tools/cluster_setup/logs/k0s-install-*.log @@ -231,36 +376,42 @@ tail -f tools/cluster_setup/logs/k0s-install-*.log **Binaries/charts downloaded by the script:** -| What | Source | -|------|--------| -| k0s binary | `https://get.k0s.sh` | -| cert-manager v1.13.0 | `github.com/cert-manager/cert-manager` | -| kube-prometheus-stack | `prometheus-community` Helm repo | -| opentelemetry-operator | `open-telemetry` Helm repo | -| kuberay-operator v1.2.2 | `ray-project` Helm repo | -| NVIDIA device plugin | `github.com/NVIDIA/k8s-device-plugin` | -| local-path-provisioner | `github.com/rancher/local-path-provisioner` | + +| What | Source | +| ----------------------- | ------------------------------------------- | +| k0s binary | `https://get.k0s.sh` | +| cert-manager v1.13.0 | `github.com/cert-manager/cert-manager` | +| kube-prometheus-stack | `prometheus-community` Helm repo | +| opentelemetry-operator | `open-telemetry` Helm repo | +| kuberay-operator v1.2.2 | `ray-project` Helm repo | +| NVIDIA device plugin | `github.com/NVIDIA/k8s-device-plugin` | +| local-path-provisioner | `github.com/rancher/local-path-provisioner` | + **Container images pulled at runtime:** -| Image | Default Source | -|-------|---------------| -| Splunk AI Operator, Ray Head/Worker, SAIA API v1/v2, Data Loader, Splunk Enterprise | ECR or configured registry | -| Weaviate | 
`docker.io/semitechnologies/weaviate` | -| Nginx | `docker.io/library/nginx:1.27-alpine` | -| Fluent Bit | `docker.io/fluent/fluent-bit:1.9.6` | -| OTel Collector | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` | -| Splunk Operator | `docker.io/splunk/splunk-operator:3.0.0` | -| KubeRay Operator | `quay.io/kuberay/operator:v1.2.2` | -| Prometheus, Grafana, cert-manager, NVIDIA plugin, local-path | Pulled by their respective Helm charts/manifests | + +| Image | Default Source | +| ----------------------------------------------------------------------------------- | -------------------------------------------------------- | +| Splunk AI Operator, Ray Head/Worker, SAIA API v1/v2, Data Loader, Splunk Enterprise | ECR or configured registry | +| Weaviate | `docker.io/semitechnologies/weaviate` | +| Nginx | `docker.io/library/nginx:1.27-alpine` | +| Fluent Bit | `docker.io/fluent/fluent-bit:1.9.6` | +| OTel Collector | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` | +| Splunk Operator | `docker.io/splunk/splunk-operator:3.0.0` | +| KubeRay Operator | `quay.io/kuberay/operator:v1.2.2` | +| Prometheus, Grafana, cert-manager, NVIDIA plugin, local-path | Pulled by their respective Helm charts/manifests | + **NVIDIA packages on GPU nodes (RHEL 9):** -| Package | Source | -|---------|--------| -| CUDA drivers | `developer.download.nvidia.com/compute/cuda/repos/` | -| Container Toolkit | `nvidia.github.io/libnvidia-container/` | -| EPEL (RHEL 10 only) | `dl.fedoraproject.org/pub/epel/` | + +| Package | Source | +| ------------------- | --------------------------------------------------- | +| CUDA drivers | `developer.download.nvidia.com/compute/cuda/repos/` | +| Container Toolkit | `nvidia.github.io/libnvidia-container/` | +| EPEL (RHEL 10 only) | `dl.fedoraproject.org/pub/epel/` | + ## 9. 
Architecture From 38bc50d3fcf0974254718da8d11f07ca548d5ca2 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Thu, 7 May 2026 22:41:04 +0530 Subject: [PATCH 2/3] fix: addressing review comments --- tools/cluster_setup/K0S_QUICKSTART.md | 45 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md index a9cfbec..97db8f2 100644 --- a/tools/cluster_setup/K0S_QUICKSTART.md +++ b/tools/cluster_setup/K0S_QUICKSTART.md @@ -277,7 +277,7 @@ Short paths auto-prefixed with `images.registry`. All marked **Yes** are require | `cpuScheduling` | No | auto | `nodeSelector` + `tolerations` for CPU pods | | `gpuScheduling` | No | auto | `nodeSelector` + `tolerations` for GPU pods | | `serviceTemplate.type` | **Yes** | — | `NodePort` / `LoadBalancer` for SAIA exposure | -| `serviceTemplate.nodePort` | **Yes** | — | Port number (NodePort only) | +| `serviceTemplate.nodePort` | No | — | Port number (used only when `serviceTemplate.type` is `NodePort`) | ### imagePullSecrets @@ -294,7 +294,7 @@ The `secrets[]` list is **not consumed**. The script auto-detects secrets by che | `custom.enabled` | Create custom registry secret | -ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh. +ECR tokens usually expire after 12 hours. Re-run install or set up a CronJob to refresh. ### ecr @@ -305,6 +305,47 @@ ECR tokens expire after 12 hours. Re-run install or set up a CronJob to refresh. | `region` | ECR region | +### metallb + +k0s has no built-in `LoadBalancer` provider. When `aiPlatform.serviceTemplate.type=LoadBalancer` (the recommended SAIA exposure path), the installer deploys MetalLB to allocate a VIP from a pool you provide. Skipped automatically when `type=NodePort`.
+ + +| Field | Required | Default | Description | +| ---------------- | ----------------------- | ---------------- | -------------------------------------------------------- | +| `install` | **Yes** | `false` | Set `true` to install MetalLB | +| `chartVersion` | No | `0.14.8` | `metallb/metallb` Helm chart version | +| `namespace` | No | `metallb-system` | MetalLB install namespace | +| `pool.name` | No | `saia-pool` | Name of the `IPAddressPool` | +| `pool.addresses` | **Yes** | — | Free, routable IP range(s) on your LAN | +| `mode` | No | `layer2` | `layer2` (most LANs) or `bgp` (data-center fabric) | +| `bgpPeers` | **Yes when `mode=bgp`** | `[]` | List of `{peerAddress, peerASN, myASN}` for BGP upstream | + + +**Minimal config (Layer-2):** + +```yaml +metallb: + install: true + pool: + addresses: + - "10.20.30.100-10.20.30.110" # free range on the worker LAN + mode: "layer2" +``` + +**Verify MetalLB after install:** + +```bash +# MetalLB controller and speakers +kubectl -n metallb-system get deploy,ds + +# Address pool and advertisement +kubectl -n metallb-system get ipaddresspool,l2advertisement,bgppeer,bgpadvertisement + +# SAIA service should have an EXTERNAL-IP from the pool +kubectl -n ai-platform get svc -l app.kubernetes.io/component=saia +``` + + ## 6. 
Node Labels & GPU The script auto-labels nodes: From 48b06753a1131ef76a383d756220c152d833d4e7 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Thu, 7 May 2026 22:43:01 +0530 Subject: [PATCH 3/3] fix: address review comments --- tools/cluster_setup/k0s-cluster-config.yaml | 2 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 7e733af..a7b859c 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -14,8 +14,8 @@ cluster: name: airgap-cluster region: us-east-2 # CHANGE THIS — required when storage.objectStore.type=aws (region of the S3 bucket); ignored for true on-prem - sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key + sshUser: root # CHANGE THIS: SSH user for remote nodes # ---------- Node Configuration ---------- nodes: diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index b4d508f..17defca 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -112,7 +112,7 @@ load_config() { # Node IPs (for existing infrastructure) EXISTING_CONTROLLER_IPS=$(yq eval '.nodes.existingIPs.controllers[]' "${CONFIG_FILE}" 2>/dev/null | tr '\n' ' ' || echo "") EXISTING_WORKER_IPS=$(yq eval '.nodes.existingIPs.workers[]' "${CONFIG_FILE}" 2>/dev/null | tr '\n' ' ' || echo "") - SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "ubuntu") + SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "root") SSH_KEY_PATH=$(yq eval '.cluster.sshKeyPath' "${CONFIG_FILE}" 2>/dev/null || echo "") # Validate existingIPs are provided (mandatory for on-prem)