From f17b874591201f9156cc83e982865dea05b0ed6c Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Fri, 20 Feb 2026 17:12:56 +0530 Subject: [PATCH 01/55] changes for supporting minio in operator and script --- docs/troubleshooting.md | 161 ++++++++ pkg/ai/raybuilder/builder.go | 88 ++++- pkg/storage/azure.go | 3 + pkg/storage/storageclient.go | 12 + pkg/storage/storageclient_test.go | 22 ++ tools/cluster_setup/EKS_README.md | 152 +++++++- tools/cluster_setup/cluster-config.yaml | 86 +++-- tools/cluster_setup/eks_cluster_with_stack.sh | 364 ++++++++++++++++-- 8 files changed, 802 insertions(+), 86 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 871dbc7..4f85bdb 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -196,6 +196,167 @@ kubectl logs -l ray.io/node-type=worker -n | grep - `CUDA_VISIBLE_DEVICES is set to empty string` → GPU configuration issue - `RuntimeError: CUDA out of memory` → Increase GPU resources +#### "Invalid repository ID or local directory" (e.g. Llama31Instruct / VLLMTextGenModel) + +If you see a validation error like: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file. + - For Hugging Face models: ensure the presence of a 'config.json'. +``` + +the model loader is trying to use a **local path** where the model should have been downloaded from object storage (S3/MinIO). That path is either missing or does not contain the required files (e.g. `config.json`). Common causes: + +1. **Model not in object storage** + The prefix `model_artifacts/llama31-8b-instruct` must exist in your bucket with a full Hugging Face–style layout (including `config.json` and weight files). 
+ - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` + - Upload to MinIO: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` as in the [artifacts README](../tools/artifacts_download_upload_scripts/README.md)). + +2. **Ray workers cannot reach MinIO/S3** + - For **external MinIO** (e.g. EC2): ensure the MinIO endpoint in `cluster-config.yaml` (`storage.minio.endpoint`) is reachable from EKS (security groups, VPC, and if using a public IP, that nodes can egress to it). + - From a Ray worker pod: + `kubectl exec -it <ray-worker-pod> -n <namespace> -- env | grep -E 'MINIO|ARTIFACTS|S3'` + then test connectivity (e.g. curl to the MinIO endpoint or use the same client the SDK uses). + +3. **Wrong or missing credentials** + AIPlatform must have `objectStorage.secretRef` pointing to a secret with `s3_access_key` and `s3_secret_key` (and the operator passes these as `MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY` to Ray). Verify the secret exists and matches the MinIO/S3 account that can read the bucket: + - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'` + +4. **Bucket/prefix mismatch** + The bucket name in AIPlatform `objectStorage.path` (e.g. `minio://<bucket-name>`) and the prefix in the application config (`model_artifacts/llama31-8b-instruct`) must match where you uploaded the model. + +**Quick checks:** + +- List objects in MinIO for the model prefix (from a host with `mc` or AWS CLI configured for MinIO): + - `mc ls myminio/<bucket-name>/model_artifacts/llama31-8b-instruct/` + You should see at least `config.json` and the model weight files. +- From a Ray worker pod, confirm that the artifact cache path exists and is populated: + - `kubectl exec -it <ray-worker-pod> -n <namespace> -- ls -la /home/ray/.cache/s3/artifacts/model_artifacts/ 2>/dev/null || echo "path missing or empty"` + If the directory is missing or empty, the download from object storage failed (network, credentials, or missing objects).
+ +**Full reset when the deployment keeps failing (e.g. Llama31Instruct / LLMDeploymentL40S):** + +If the model is correct in MinIO and credentials are in the serve config but the replica still fails with "Invalid repository ID or local directory", clear the artifact cache and restart Ray so replicas run a fresh download and load. + +1. **Clear the artifact cache on all workers** + Either remove only the failing model prefix or the entire `model_artifacts` tree (more thorough): + + ```bash + export AI_NS="${AI_NS:-ai-platform}" + + # Option A: clear only the failing model (e.g. llama31-8b-instruct) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct + done + + # Option B: clear entire model_artifacts (use if multiple models or unknown state) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts + done + ``` + +2. **Restart worker pods** so new replicas run and download from MinIO: + + ```bash + kubectl delete pods -n "$AI_NS" -l ray.io/node-type=worker + ``` + +3. **Optional: restart the Ray head** to force a full Ray Serve redeploy (new replica placement and startup): + + ```bash + kubectl delete pod -n "$AI_NS" -l ray.io/node-type=head + ``` + +4. **Wait 10–15 minutes** for workers (and head) to be Running and for the deployment replica to download the model and start. The first download can be large (e.g. ~16 GB for Llama 3.1 8B); if the replica is restarted too soon (e.g. after a few quick failures), the download may never complete. + +5. 
**Verify** the deployment status and, if needed, that a worker has the model: + + ```bash + kubectl get rayservice -n "$AI_NS" -o yaml | grep -A 30 'Llama31Instruct:' + WORKER=$(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[0].metadata.name}') + kubectl exec -n "$AI_NS" "$WORKER" -c ray-worker -- sh -c 'ls /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct/*.safetensors 2>/dev/null || echo "No safetensors"' + ``` + +### MinIO credentials and serve config verification + +When using MinIO, the operator injects credentials from the object storage secret into the Ray Serve config so replicas can download model artifacts. Use these steps to verify the secret and that the updated serve config is applied. + +**1. Check that the AIPlatform object storage secret exists and has the required keys** + +Replace `<namespace>` with your AIPlatform namespace (e.g. `ai-platform`) and `<secret-name>` with the value of `spec.objectStorage.secretRef` from your AIPlatform (e.g. `minio-credentials`). + +```bash +# Get AIPlatform namespace and secretRef (optional: discover from the CR) +kubectl get aiplatform -A -o custom-columns=NAME:.metadata.name,NS:.metadata.namespace,SECRET:.spec.objectStorage.secretRef + +# Confirm the secret exists in the same namespace as the AIPlatform +kubectl get secret <secret-name> -n <namespace> + +# List secret keys (names only; values are base64-encoded and must not be logged) +kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]' + +# Verify required keys are present (expect s3_access_key and s3_secret_key) +kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' +``` + +If either `s3_access_key` or `s3_secret_key` is missing, create or update the secret, for example: + +```bash +kubectl -n <namespace> create secret generic <secret-name> \ + --from-literal=s3_access_key="<access-key>" \ + --from-literal=s3_secret_key="<secret-key>" \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +**2. 
Reconcile or restart the operator with the new image** + +After updating the operator image (with the change that injects MinIO credentials into the serve config), either trigger a reconcile or restart the operator so it rewrites `RayService.spec.serveConfigV2`. + +- **Option A – Restart the operator deployment** (simplest; causes one reconcile when the pod comes back): + + ```bash + # Replace <operator-namespace> with the namespace where the operator runs (e.g. splunk-ai-operator-system) + kubectl rollout restart deployment splunk-ai-operator-controller-manager -n <operator-namespace> + kubectl rollout status deployment splunk-ai-operator-controller-manager -n <operator-namespace> + ``` + +- **Option B – Trigger reconcile by touching the AIPlatform** (no operator restart): + + ```bash + kubectl annotate aiplatform <aiplatform-name> -n <namespace> \ + reconcile-$(date +%s)=triggered --overwrite + ``` + + The operator will reconcile and regenerate the RayService; ensure the operator is already running the new image before doing this. + +**3. Confirm RayService.spec.serveConfigV2 includes MINIO_ACCESS_KEY and MINIO_SECRET_KEY** + +The serve config is a JSON string in `RayService.spec.serveConfigV2`. Check that it contains the MinIO env vars for the apps (e.g. after the operator has reconciled). + +```bash +# Set your AIPlatform namespace and RayService name (often the same as AIPlatform name, e.g. 
splunk-ai-stack) +NAMESPACE="<namespace>" +RAY_SERVICE_NAME="<rayservice-name>" + +# Count occurrences of MINIO_ACCESS_KEY in the serve config (expect > 0 when using MinIO) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | jq -Rs 'split("MINIO_ACCESS_KEY") | length - 1' + +# Show a snippet to confirm the keys are present (WARNING: the matched text can include the credential value; do not paste this output into logs or tickets) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"MINIO_ACCESS_KEY"[^,]*' | head -1 +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"MINIO_SECRET_KEY"[^,]*' | head -1 +``` + +If the count is 0, the operator may not be using the new image, or `objectStorage.secretRef` may be unset. Ensure: + +- The AIPlatform has `spec.objectStorage.path` with scheme `minio://...` and `spec.objectStorage.secretRef` set to the secret name. +- The secret exists in the AIPlatform namespace and contains `s3_access_key` and `s3_secret_key`. +- The operator deployment has been restarted (or reconciled) with the image that injects MinIO credentials into the applications template. + +After confirming, restart Ray workers if needed so they pick up the new env (e.g. scale down and up the Ray cluster or wait for rolling restart), then re-check replica logs and the cache path `/home/ray/.cache/s3/artifacts/model_artifacts/...`.
+ ### Weaviate Errors ```bash diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index e29a1a7..79dd5c1 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -45,7 +45,11 @@ type Builder struct { type ApplicationParams struct { ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` + ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` CloudProvider string `yaml:"CLOUD_PROVIDER"` + MinioEndpointUrl string `yaml:"MINIO_ENDPOINT_URL"` + MinioAccessKey string `yaml:"MINIO_ACCESS_KEY"` + MinioSecretKey string `yaml:"MINIO_SECRET_KEY"` Replicas map[string]int32 `yaml:"REPLICAS"` } @@ -89,15 +93,25 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl return err } - // Set CloudProvider based on URL scheme - var cloudProvider string + // Set CloudProvider and artifacts provider/bucket from URL scheme (for SDK model loaders). + // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio -> "s3", gs/gcs -> "gcs", azure -> "azure". 
+ var cloudProvider, artifactsProvider string switch u.Scheme { case "s3": cloudProvider = "aws" - case "gs": + artifactsProvider = "s3" + case "minio": + cloudProvider = "minio" + artifactsProvider = "s3" // MinIO is S3-compatible; SDK uses s3 client + case "gs", "gcs": cloudProvider = "gcp" + artifactsProvider = "gcs" + case "azure": + cloudProvider = "azure" + artifactsProvider = "azure" default: - cloudProvider = "azure" // TODO: FIX THIS, need to support minio + cloudProvider = "azure" + artifactsProvider = "azure" } // Initialize the replicas map by iterating through features @@ -135,9 +149,34 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + minioEndpoint := "" + if u.Scheme == "minio" && p.Spec.ObjectStorage.Endpoint != "" { + minioEndpoint = p.Spec.ObjectStorage.Endpoint + } + + var minioAccessKey, minioSecretKey string + if u.Scheme == "minio" && p.Spec.ObjectStorage.SecretRef != "" { + var secret corev1.Secret + secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} + if err := b.Get(ctx, secretRef, &secret); err != nil { + logger.Error(err, "Failed to get object storage secret for MinIO credentials", "secret", p.Spec.ObjectStorage.SecretRef) + return err + } + if raw, ok := secret.Data["s3_access_key"]; ok { + minioAccessKey = string(raw) + } + if raw, ok := secret.Data["s3_secret_key"]; ok { + minioSecretKey = string(raw) + } + } + param := ApplicationParams{ ArtifactBucketName: u.Host, + ArtifactsProvider: artifactsProvider, CloudProvider: cloudProvider, + MinioEndpointUrl: minioEndpoint, + MinioAccessKey: minioAccessKey, + MinioSecretKey: minioSecretKey, Replicas: replicasMap, } @@ -670,7 +709,41 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec }, nil } +// objectStorageSecretEnv returns env vars for MINIO_ACCESS_KEY and MINIO_SECRET_KEY from +// the objectStorage secret (s3_access_key/s3_secret_key) so models and SAIA can access 
MinIO/S3. +func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return nil + } + secretName := b.ai.Spec.ObjectStorage.SecretRef + return []corev1.EnvVar{ + { + Name: "MINIO_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_access_key", + }, + }, + }, + { + Name: "MINIO_SECRET_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_secret_key", + }, + }, + }, + } +} + func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { + headEnv := []corev1.EnvVar{ + {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, + {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME + } + headEnv = append(headEnv, b.objectStorageSecretEnv()...) spec := corev1.PodSpec{ Containers: []corev1.Container{{ Name: "ray-head", @@ -684,10 +757,7 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { "-lc", "--", }, - Env: []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, - {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME - }, + Env: headEnv, Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ @@ -783,6 +853,8 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec combinedEnv = append(combinedEnv, corev1.EnvVar{Name: key, Value: value}) } } + // MinIO/S3 credentials for models and SAIA (MINIO_ACCESS_KEY, MINIO_SECRET_KEY) + combinedEnv = append(combinedEnv, b.objectStorageSecretEnv()...) 
rayCommand := fmt.Sprintf(`echo %s worker; ulimit -n 65536; export PATH="/home/ray/anaconda3/bin:$PATH"; diff --git a/pkg/storage/azure.go b/pkg/storage/azure.go index fa5f0ba..abbde0c 100644 --- a/pkg/storage/azure.go +++ b/pkg/storage/azure.go @@ -31,6 +31,9 @@ func NewAzureClient( namespace, container, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { + if container == "" { + return nil, fmt.Errorf("Azure Blob storage requires a container name; use path format azure://container-name/prefix (e.g. azure://my-container/model_artifacts). Without it, model deployments fail with 'Please specify a container name'") + } var cred azcore.TokenCredential var err error diff --git a/pkg/storage/storageclient.go b/pkg/storage/storageclient.go index 7dbea32..1935616 100644 --- a/pkg/storage/storageclient.go +++ b/pkg/storage/storageclient.go @@ -43,12 +43,24 @@ func NewStorageClient( switch u.Scheme { case "s3": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3 path must include bucket name (e.g. s3://bucket-name/prefix)", vs.Path) + } return NewS3Client(ctx, k8sClient, namespace, u.Host, prefix, vs) case "gs", "gcs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: GCS path must include bucket name (e.g. gs://bucket-name/prefix)", vs.Path) + } return NewGCSClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "azure": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: Azure path must include container name (e.g. azure://container-name/prefix). Without it, model deployments fail with 'Please specify a container name'", vs.Path) + } return NewAzureClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "minio": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: MinIO path must include bucket name (e.g. minio://bucket-name/prefix)", vs.Path) + } // everything after "//" is host (bucket) and path. 
We treat u.Host as bucket, // vs.Endpoint *must* be set to our MinIO URL for this case. return NewMinioClient(ctx, k8sClient, namespace, u.Host, prefix, vs) diff --git a/pkg/storage/storageclient_test.go b/pkg/storage/storageclient_test.go index c97dcc2..e395b4d 100644 --- a/pkg/storage/storageclient_test.go +++ b/pkg/storage/storageclient_test.go @@ -117,6 +117,28 @@ func TestNewStorageClient(t *testing.T) { return fake.NewClientBuilder().WithScheme(s) }, }, + { + name: "Azure path without container name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "azure:///model_artifacts", + Region: "eastus", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3:///prefix", + Region: "us-west-2", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, } for _, tt := range tests { diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index f160c1a..54893ba 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -53,13 +53,14 @@ The script installs everything needed for the AI Platform: 4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS 5. **Cluster Autoscaler** - Automatic node scaling based on demand 6. **Cert-Manager** - Automated certificate management -7. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -8. **OpenTelemetry Operator** - Distributed tracing and telemetry -9. **NVIDIA Device Plugin** - GPU support for AI workloads -10. **KubeRay Operator** - Ray cluster management for distributed AI -11. **Splunk Operator** - Splunk Enterprise management -12. **Splunk AI Platform Operator** - AI platform orchestration -13. **AI Platform CR** - Complete AI deployment with features +7. 
**MinIO (optional)** - S3-compatible object storage in-cluster when `storage.minio.enabled: true` +8. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana +9. **OpenTelemetry Operator** - Distributed tracing and telemetry +10. **NVIDIA Device Plugin** - GPU support for AI workloads +11. **KubeRay Operator** - Ray cluster management for distributed AI +12. **Splunk Operator** - Splunk Enterprise management +13. **Splunk AI Platform Operator** - AI platform orchestration +14. **AI Platform CR** - Complete AI deployment with features ### AWS Integration Features @@ -539,10 +540,18 @@ storage: # (3-63 chars, lowercase, numbers, hyphens) ``` +**Optional: MinIO (in-cluster or external EC2)** +- **In-cluster:** Set `storage.minio.enabled: true`. The script deploys MinIO via Helm and configures the AIPlatform CR. +- **External (e.g. EC2):** Set `storage.minio.enabled: true`, `storage.minio.external: true`, and `storage.minio.endpoint: "http://<minio-host>:9000"` (and matching `bucket`/`auth`). Use the companion script to install MinIO on an EC2 instance in the same VPC: `CONFIG_FILE=./cluster-config.yaml ./install_minio_ec2.sh --launch-ec2` launches an EC2 instance in the EKS VPC; then SSH to it and run `./install_minio_ec2.sh --bucket ai-platform --user minioadmin --password '<password>'`. Pre-populate artifacts in MinIO before cluster setup. If you use MinIO, the Splunk app (when using `splunkStandalone.localAppPath`) is not uploaded automatically; upload it to MinIO at `apps/` via MinIO console or `mc`/`aws s3 --endpoint-url`. + +**Idempotency and existing VPC** +- The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). Set `cluster.useExisting: true` to require an existing cluster (script fails if the cluster is not found). +- **Use an existing VPC:** Provide `cluster.subnets` (private and public subnet IDs and AZs). eksctl will use that VPC and will not create a new one.
+ **Important Notes:** - **Cluster Name**: Must be DNS-1123 compliant (lowercase letters, numbers, hyphens; start/end with alphanumeric) -- **S3 Bucket**: Must be globally unique across all AWS accounts -- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist +- **S3 Bucket**: Must be globally unique across all AWS accounts (ignored when MinIO is enabled) +- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist; cluster uses this existing VPC - **Subnets**: Leave empty or comment out to let eksctl create a new VPC automatically **What each section configures:** @@ -551,8 +560,10 @@ storage: |---------|--------------|------------------| | `cluster.name` | EKS cluster name | ✅ **REQUIRED:** Change to your cluster name | | `cluster.region` | AWS region | ✅ **REQUIRED:** Change to your region | -| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs | -| `storage.s3Bucket` | S3 bucket for AI artifacts | ✅ **REQUIRED:** Choose unique name | +| `cluster.useExisting` | Use existing cluster only (do not create) | ⚙️ Set `true` to skip cluster creation; script fails if cluster not found | +| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs to use existing VPC | +| `storage.s3Bucket` | S3 bucket for AI artifacts (used when MinIO is disabled) | ✅ **REQUIRED** if not using MinIO | +| `storage.minio` | MinIO (in-cluster or external) | ⚙️ `enabled: true`; for EC2 set `external: true` and `endpoint: "http://:9000"` | | `images.registry` | Container registry URL | ✅ **REQUIRED:** Your ECR/Docker registry | | `images.*` | All container images | ✅ **REQUIRED:** Configure all image paths | | `nodeGroups.cpu` | CPU node group settings | ⚙️ Optional: adjust size/type | @@ -723,18 +734,93 @@ CONFIG_FILE=./my-cluster-config.yaml ./eks_cluster_with_stack.sh install ### 4. 
Verify Installation +After running `eks_cluster_with_stack.sh install` (or upgrade) with the latest operator image, use the commands below to verify the setup. Default namespace and AIPlatform name come from `cluster-config.yaml` (`aiPlatform.namespace` and `aiPlatform.name`); if you use a custom config, set `AI_NS` and `AI_PLATFORM_NAME` accordingly. + ```bash # Set kubeconfig (done automatically by script) export KUBECONFIG=~/.kube/config -# Check cluster +# ----- Optional: load namespace/name from your config ----- +# CONFIG_FILE="${CONFIG_FILE:-./cluster-config.yaml}" +# AI_NS="$(yq eval '.aiPlatform.namespace' "$CONFIG_FILE")" +# AI_PLATFORM_NAME="$(yq eval '.aiPlatform.name' "$CONFIG_FILE")" +# Or use defaults: +export AI_NS="${AI_NS:-ai-platform}" +export AI_PLATFORM_NAME="${AI_PLATFORM_NAME:-splunk-ai-stack}" +export SPLUNK_AI_NS="${SPLUNK_AI_NS:-splunk-ai-operator-system}" +``` + +**1. Cluster and nodes** + +```bash kubectl get nodes +kubectl get nodes -o wide +``` -# Check AI Platform -kubectl get aiplatform -n ai-platform +**2. Splunk AI Operator (confirm it is running the image you deployed)** -# Check all pods -kubectl get pods --all-namespaces +```bash +kubectl get deploy -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator -o wide +kubectl get pods -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator +# Show operator image (replace deployment name if different) +kubectl get deploy -n "$SPLUNK_AI_NS" -o jsonpath='{.items[0].spec.template.spec.containers[0].image}'; echo +``` + +**3. AIPlatform CR and status** + +```bash +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[*].type}{"\n"}{.status.conditions[*].status}'; echo +# Detailed readiness (expect Ready=True when healthy) +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")]}' | jq . +``` + +**4. 
Object storage secret (MinIO/S3 credentials for serve config)** + +```bash +# Secret name comes from AIPlatform spec.objectStorage.secretRef +SECRET_NAME="$(kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.objectStorage.secretRef}')" +echo "SecretRef: ${SECRET_NAME:-}" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" 2>/dev/null && echo "✓ Secret exists" || echo "✗ Secret missing" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" -o jsonpath='{.data}' 2>/dev/null | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' && echo "✓ Required keys present" || echo "✗ Check s3_access_key / s3_secret_key" +``` + +**5. RayService and serve config (MinIO credentials in apps)** + +```bash +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +# Count MINIO_ACCESS_KEY in serve config (expect > 0 when using MinIO) +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.serveConfigV2}' | grep -o 'MINIO_ACCESS_KEY' | wc -l +``` + +**6. Ray and application pods** + +```bash +kubectl get pods -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get pods -n "$AI_NS" -l ai.splunk.com/platform="$AI_PLATFORM_NAME" +``` + +**7. Services (Ray Serve, Weaviate)** + +```bash +kubectl get svc -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get svc -n "$AI_NS" | grep -E "ray|weaviate" +``` + +**8. 
Events (recent issues)** + +```bash +kubectl get events -n "$AI_NS" --sort-by='.lastTimestamp' | tail -30 +kubectl describe aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" | tail -40 +``` + +**Quick one-liner summary** + +```bash +echo "--- Operator ---"; kubectl get deploy -n "$SPLUNK_AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.readyReplicas,IMAGE:.spec.template.spec.containers[0].image' +echo "--- AIPlatform ---"; kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status' +echo "--- RayService ---"; kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +echo "--- Pods ---"; kubectl get pods -n "$AI_NS" --no-headers | wc -l; kubectl get pods -n "$AI_NS" | head -20 ``` --- @@ -2136,6 +2222,40 @@ EOF ## Troubleshooting +### Ray / AI model deployment: "Invalid repository ID or local directory" + +If a Ray Serve replica (e.g. `Llama31Instruct:LLMDeploymentL40S`) fails with: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file (e.g. config.json). +``` + +the model is loaded from object storage (S3/MinIO) into that path inside the pod. The path is missing or incomplete because the download from object storage failed or the model was never uploaded. + +**Checklist:** + +1. 
**Model is in MinIO/S3** + Upload the model so the bucket has the prefix `model_artifacts/llama31-8b-instruct/` with at least `config.json` and the model weights (see [artifacts README](../artifacts_download_upload_scripts/README.md)): + - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` + - Upload: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `MINIO_ENDPOINT`, `MINIO_BUCKET`, and credentials to match your `cluster-config.yaml`). + +2. **External MinIO reachable from EKS** + If using external MinIO (e.g. EC2), ensure: + - `storage.minio.endpoint` in `cluster-config.yaml` is correct (e.g. `http://<minio-host>:9000`). + - The EC2 security group allows **inbound TCP 9000** from your EKS node security group or VPC CIDR (see `install_minio_ec2.sh` output). + - From a Ray worker pod: + `kubectl exec -it <ray-worker-pod> -n <namespace> -- curl -s -o /dev/null -w "%{http_code}" http://<minio-host>:9000/minio/health/live` + +3. **Credentials secret** + AIPlatform must have `objectStorage.secretRef` set (e.g. `minio-credentials`). The secret must contain `s3_access_key` and `s3_secret_key` matching the MinIO user that can read the bucket: + - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'` + +4. **Full troubleshooting steps** + See [Troubleshooting: Invalid repository ID or local directory](../../docs/troubleshooting.md) in the main docs for verification commands and details.
+ ### Script Execution Issues #### Issue: Script Exits Silently Without Error Message diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index d3738db..f3d49f3 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -13,27 +13,32 @@ # ---------- Cluster Configuration ---------- cluster: - useExisting: false - name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) - region: "us-west-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) + useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) + name: "ai-tier-sok-test-east2" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) + region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) + # When true: require subnets (existing VPC). On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC). + preserveVpcOnDelete: false # Set true to keep VPC on delete and redeploy without recreating VPC - # If you donot provide any subnet information, eksctl will create a new VPC with public and private subnets automatically. + # To use an EXISTING VPC: provide subnets below; eksctl will not create a new VPC. Idempotent: cluster is only created if it does not exist. + # If you do not provide subnets, eksctl creates a new VPC and subnets automatically. 
# VPC Subnets - CHANGE ALL OF THESE to your actual subnet IDs # Find your subnets: aws ec2 describe-subnets --filters "Name=vpc-id,Values=vpc-xxxxx" --region us-west-2 - #subnets: - # private: # Private subnets (at least 2 in different AZs) - # - id: "subnet-1a2b3c4d5e6f7g8h" # CHANGE THIS: Your private subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-9h8g7f6e5d4c3b2a" # CHANGE THIS: Your private subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # public: # Public subnets (at least 2 in different AZs) - # - id: "subnet-a1b2c3d4e5f6g7h8" # CHANGE THIS: Your public subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-h8g7f6e5d4c3b2a1" # CHANGE THIS: Your public subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # - id: "subnet-1h2g3f4e5d6c7b8a" # OPTIONAL: Additional public subnet for HA - # az: "us-west-2c" # OPTIONAL: Third availability zone +# subnets: +# private: # Private subnets (at least 2 in different AZs) +# - id: "subnet-02734905b10e7ad5a" # CHANGE THIS: Your private subnet 1 +# az: "us-east-2b" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-0c1d7dc49788d11dc" # CHANGE THIS: Your private subnet 2 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-0f8f94998d65dfcd2" # OPTIONAL: Additional private subnet for HA +# az: "us-east-2a" # OPTIONAL: Third availability zone +# public: # Public subnets (at least 2 in different AZs) +# - id: "subnet-0f0ea3b190a618540" # CHANGE THIS: Your public subnet 1 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-02b736130e7c2a787" # CHANGE THIS: Your public subnet 2 +# az: "us-east-2a" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-02c35a8cd0b5d90a5" # OPTIONAL: Additional public subnet for HA +# az: "us-east-2b" # OPTIONAL: Third availability zone # ---------- Node Groups ---------- nodeGroups: @@ -57,10 +62,25 @@ nodeGroups: #
---------- Storage Configuration ---------- storage: - s3Bucket: "my-company-ai-platform-bucket" # CHANGE THIS: Globally unique S3 bucket name + s3Bucket: "ai-platform-bucket-minio-us-east-2" # CHANGE THIS: Globally unique S3 bucket name (used when minio.enabled is false) storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size + # Optional: MinIO (S3-compatible object storage). Use in-cluster or external (e.g. EC2). + minio: + enabled: true # Set true to use MinIO for object storage + external: true # true = MinIO runs outside cluster (e.g. EC2); set endpoint below + endpoint: "http://13.59.216.105:9000" # When external=true: e.g. "http://10.0.1.50:9000" (EC2 private IP or hostname) + namespace: "minio" # Namespace for in-cluster MinIO (ignored when external=true) + bucket: "ai-platform-bucket-minio-us-east-2" # Bucket name (must exist on MinIO) + replicas: 1 # In-cluster only: number of MinIO replicas + persistence: + size: "150Gi" + storageClass: "" + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" # Leave empty for in-cluster auto-generate; required for external + # ---------- Container Images Configuration ---------- images: # ================================================================================== @@ -82,7 +102,7 @@ images: # # REQUIRED: Specify your private registry URL for custom images # Leave empty to use Docker Hub defaults for all images - registry: "1234567890.dkr.ecr.us-west-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry # ================================================================================== # CONTAINER IMAGES - Specify paths (registry prefix auto-applied if needed) @@ -97,18 +117,15 @@ images: # Option 2: Full path (ignores registry prefix) # image: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Result: 
"docker.io/myorg/splunk-ai-operator:v1.0.0" - image: "docker.io/splunk/splunk-ai-operator:0.1.0" + image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.1" # Splunk Enterprise Images splunk: - # Option 1: Relative path (uses registry prefix) - # image: "splunk/splunk:10.2.0" - # Result: "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" - # - # Option 2: Full path (ignores registry prefix) - # image: "docker.io/myorg/splunk:10.2.0" - # Result: "docker.io/myorg/splunk:10.2.0" - image: "splunk/splunk:10-2-ai-custom" + # Splunk Enterprise image + # Default behavior: If no registry in path, uses Docker Hub + # "splunk/splunk:10.2.0" → Docker Hub + # "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" → ECR + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" # Splunk Operator image (optional - has default) # Default: "docker.io/splunk/splunk-operator:3.0.0" @@ -123,8 +140,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-17" - workerImage: "ml-platform/ray/ray-worker-gpu:build-17" + headImage: "ml-platform/ray/ray-head:build-v1alpha1" + workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # Weaviate Vector Database weaviate: @@ -136,8 +153,8 @@ images: # SAIA (Splunk AI Assistant) Images saia: # Relative paths - registry prefix auto-applied - apiImage: "ml-platform/saia/saia-api:build-1" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-1" + apiImage: "ml-platform/saia/saia-api:build-v1alpha1" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # Supporting Images fluentBit: @@ -146,6 +163,11 @@ images: # image: "fluent-bit:1.9.6" → uses registry prefix image: "docker.io/fluent/fluent-bit:1.9.6" + # OpenTelemetry Collector (use full URL so it is not rewritten to ECR) + otelCollector: + # Public image - full path so registry 
prefix is NOT applied; validation checks this URL + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # ---------- Operator Versions ---------- operators: ray: diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 62e64ee..2d4f100 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -29,6 +29,7 @@ load_config() { CLUSTER_NAME="$(yq eval '.cluster.name' "$cfg")" REGION="$(yq eval '.cluster.region' "$cfg")" K8S_VERSION="$(yq eval '.cluster.k8sVersion' "$cfg")" + USE_EXISTING_CLUSTER="$(yq eval '.cluster.useExisting // false' "$cfg")" # Node groups ENABLE_CPU="$(yq eval '.nodeGroups.cpu.enabled' "$cfg")" @@ -47,10 +48,24 @@ load_config() { GPU_VOLUME_SIZE="$(yq eval '.nodeGroups.gpu.volumeSize' "$cfg")" GPU_VOLUME_TYPE="$(yq eval '.nodeGroups.gpu.volumeType' "$cfg")" + # Cluster options + PRESERVE_VPC_ON_DELETE="$(yq eval '.cluster.preserveVpcOnDelete // false' "$cfg")" + # Storage S3_BUCKET="$(yq eval '.storage.s3Bucket' "$cfg")" STORAGE_CLASS="$(yq eval '.storage.storageClass' "$cfg")" VECTORDB_SIZE="$(yq eval '.storage.vectorDbSize' "$cfg")" + # MinIO (optional S3-compatible object storage) + MINIO_ENABLED="$(yq eval '.storage.minio.enabled // false' "$cfg")" + MINIO_EXTERNAL="$(yq eval '.storage.minio.external // false' "$cfg")" + MINIO_ENDPOINT="$(yq eval '.storage.minio.endpoint // ""' "$cfg")" + MINIO_NS="$(yq eval '.storage.minio.namespace // "minio"' "$cfg")" + MINIO_BUCKET="$(yq eval '.storage.minio.bucket // "ai-platform"' "$cfg")" + MINIO_REPLICAS="$(yq eval '.storage.minio.replicas // 1' "$cfg")" + MINIO_PVC_SIZE="$(yq eval '.storage.minio.persistence.size // "100Gi"' "$cfg")" + MINIO_PVC_STORAGE_CLASS="$(yq eval '.storage.minio.persistence.storageClass // ""' "$cfg")" + MINIO_ROOT_USER="$(yq eval '.storage.minio.auth.rootUser // "minioadmin"' "$cfg")" + MINIO_ROOT_PASSWORD="$(yq eval '.storage.minio.auth.rootPassword // 
""' "$cfg")" # AI Platform AI_NS="$(yq eval '.aiPlatform.namespace' "$cfg")" @@ -93,32 +108,44 @@ load_config() { FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")" - # Subnets - read as arrays (Bash 3.2 compatible) + # Subnets - read as arrays (support both cluster.subnets and top-level subnets) PRIVATE_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PRIVATE_SUBNETS+=("$subnet") - done < <(yq eval '.cluster.subnets.private[].id' "$cfg") + done < <(yq eval '.cluster.subnets.private[].id // .subnets.private[].id' "$cfg") PRIVATE_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PRIVATE_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.private[].az' "$cfg") + done < <(yq eval '.cluster.subnets.private[].az // .subnets.private[].az' "$cfg") PUBLIC_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PUBLIC_SUBNETS+=("$subnet") - done < <(yq eval '.cluster.subnets.public[].id' "$cfg") + done < <(yq eval '.cluster.subnets.public[].id // .subnets.public[].id' "$cfg") PUBLIC_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PUBLIC_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.public[].az' "$cfg") + done < <(yq eval '.cluster.subnets.public[].az // .subnets.public[].az' "$cfg") else # Fallback: simple grep-based parsing (less robust but works without yq) CLUSTER_NAME="$(grep 'name:' "$cfg" | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" REGION="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" K8S_VERSION="$(grep 'k8sVersion:' "$cfg" | sed 's/.*k8sVersion: *"\(.*\)".*/\1/')" + USE_EXISTING_CLUSTER="false" + PRESERVE_VPC_ON_DELETE="false" S3_BUCKET="$(grep 's3Bucket:' "$cfg" | sed 's/.*s3Bucket: *"\(.*\)".*/\1/')" + MINIO_ENABLED="false" + MINIO_EXTERNAL="false" + MINIO_ENDPOINT="" + MINIO_NS="minio" + MINIO_BUCKET="ai-platform" + MINIO_REPLICAS="1" + MINIO_PVC_SIZE="150Gi" + MINIO_PVC_STORAGE_CLASS="" + 
MINIO_ROOT_USER="minioadmin" + MINIO_ROOT_PASSWORD="AAnwWE2sLfFduYTpPy4v7PcyczSHGrVM" AI_NS="$(grep 'namespace:' "$cfg" | grep -A2 'aiPlatform:' | tail -1 | sed 's/.*namespace: *"\(.*\)".*/\1/')" AI_PLATFORM_NAME="splunk-ai-stack" AI_STANDALONE_NAME="splunk-standalone" @@ -163,6 +190,7 @@ load_config() { ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" S3_PREFIXES=("artifacts/" "apps/" "tasks/") AI_BUCKET_POLICY_NAME="S3Access-${CLUSTER_NAME}-ai-platform" + AI_ECR_ONLY_POLICY_NAME="ECRAccess-${CLUSTER_NAME}-ai-platform" # IRSA for EBS CSI EBS_IRSA_ROLE_NAME="EBSCSIDriverRole-${CLUSTER_NAME}" @@ -1134,6 +1162,109 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } +# ---------- MinIO (optional S3-compatible object storage) ---------- +install_minio() { + if [[ "${MINIO_ENABLED}" != "true" ]]; then + log "MinIO is disabled (storage.minio.enabled != true); skipping." + return 0 + fi + + # External MinIO (e.g. on EC2): only create credentials secret; no in-cluster install + if [[ "${MINIO_EXTERNAL}" == "true" ]]; then + log "Using external MinIO (storage.minio.external=true); skipping in-cluster install." + if [[ -z "${MINIO_ENDPOINT}" ]]; then + warn "storage.minio.endpoint is empty; set it to the MinIO URL (e.g. http://:9000) for AIPlatform to use external MinIO." + fi + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "External MinIO requires storage.minio.auth.rootPassword to be set (same as on the MinIO server)." 
+ return 1 + fi + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ External MinIO credentials secret ${AI_NS}/${secret_name} ready" + return 0 + fi + + log "Installing MinIO in ${MINIO_NS}..." + ensure_namespace "${MINIO_NS}" + + # Auto-generate root password if not set + local minio_password="${MINIO_ROOT_PASSWORD}" + if [[ -z "$minio_password" ]]; then + minio_password="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" + MINIO_ROOT_PASSWORD="$minio_password" + log "Generated MinIO root password (saved for secret creation)" + fi + + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo update + + local helm_args=( + --namespace "${MINIO_NS}" + --set auth.rootUser="${MINIO_ROOT_USER}" + --set auth.rootPassword="${MINIO_ROOT_PASSWORD}" + --set defaultBuckets="${MINIO_BUCKET}" + --set persistence.size="${MINIO_PVC_SIZE}" + --set replicas="${MINIO_REPLICAS}" + ) + [[ -n "${MINIO_PVC_STORAGE_CLASS}" ]] && helm_args+=(--set persistence.storageClass="${MINIO_PVC_STORAGE_CLASS}") + + helm_retry 5 upgrade --install minio bitnami/minio "${helm_args[@]}" --wait --timeout 10m + + # Wait for MinIO deployment to be ready + local minio_deploy="minio" + kubectl -n "${MINIO_NS}" rollout status deployment/"${minio_deploy}" --timeout=300s 2>/dev/null || true + + # Create credentials secret in AI platform namespace for AIPlatform CR (objectStorage.secretRef). 
+ # SAIA and pkg/storage expect s3_access_key/s3_secret_key; models/SAIA expect MINIO_ACCESS_KEY/MINIO_SECRET_KEY. + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + + # Create prefix "folders" in MinIO bucket (artifacts/, apps/, tasks/) via placeholder objects + log "Creating MinIO bucket prefixes (artifacts/, apps/, tasks/)..." + cat </dev/null || true + + log "✓ MinIO installed; bucket=${MINIO_BUCKET}; credentials secret ${AI_NS}/${secret_name}" +} + # ---------- OTEL Operator + contrib collector (idempotent) ---------- install_otel_operator_and_contrib_collector() { log "Installing OpenTelemetry Operator (Helm)..." 
@@ -1328,6 +1459,62 @@ EOF printf "%s" "$arn" } +# ECR-only policy for IRSA when using MinIO (no S3) - allows pulling images from ECR +ensure_ecr_only_policy() { + local name="${AI_ECR_ONLY_POLICY_NAME}" + local expected_arn="arn:aws:iam::${ACCOUNT_ID}:policy/${name}" + if aws iam get-policy --policy-arn "$expected_arn" >/dev/null 2>&1; then + printf "%s" "$expected_arn" + return 0 + fi + local arn + arn="$(get_policy_arn_by_name "$name")" + if [[ -z "$arn" ]]; then + log "Creating IAM policy ${name} (ECR read-only, for MinIO-only mode)" + local pd; pd="$(mktemp)"; TMP_FILES+=("$pd") + cat > "$pd" <<'ECRPOL' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ECRAuth", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*" + }, + { + "Sid": "ECRPull", + "Effect": "Allow", + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage" + ], + "Resource": "arn:aws:ecr:*:*:repository/*" + } + ] +} +ECRPOL + local create_out rc + set +e + create_out="$(aws iam create-policy --policy-name "${name}" --policy-document "file://${pd}" --query 'Policy.Arn' --output text 2>&1)" + rc=$? + set -e + if (( rc == 0 )); then + arn="$(normalize_arn "$create_out")" + else + if grep -qi 'EntityAlreadyExists' <<<"$create_out"; then + arn="$(get_policy_arn_by_name "$name")" + else + err "Failed to create IAM policy ${name}: $create_out" + fi + fi + fi + arn="$(normalize_arn "$arn")" + [[ -z "$arn" ]] && err "Failed to resolve ARN for policy ${name}" + printf "%s" "$arn" +} + # ------- IRSA helpers: ensure & validate ------- generate_irsa_trust_policy() { local ns="$1" sa="$2" @@ -1386,6 +1573,18 @@ ensure_irsa_for_sa() { local sa="$1" ns="$2" policy_arn_raw="${3:-}" local role="IRSA-${CLUSTER_NAME}-${sa}" + # Fail fast if kubectl cannot reach the cluster (e.g. 
wrong KUBECONFIG or context) + local kerr + kerr="$(kubectl get ns "${ns}" 2>&1)" || true + if echo "${kerr}" | grep -q "connection refused\|localhost:8080\|dial tcp.*8080"; then + err "kubectl cannot reach the cluster (API server connection refused). \ +Fix: run 'aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}' and ensure KUBECONFIG (if set) points to that file. \ +Then re-run this script." + fi + if ! kubectl get ns "${ns}" >/dev/null 2>&1; then + err "Cannot access namespace ${ns} (kubectl get ns failed). Ensure the cluster is reachable and the namespace exists." + fi + # Resolve/repair policy ARN if invalid local policy_arn; policy_arn="$(normalize_arn "$policy_arn_raw")" if [[ -z "$policy_arn" || $policy_arn != arn:aws:iam::* ]]; then @@ -1454,28 +1653,34 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create IRSA for Splunk Standalone (recommended approach) + # IRSA for Splunk Standalone: S3 bucket policy when using S3, ECR-only when using MinIO log "Setting up IRSA for Splunk Standalone service account..." - local policy_arn; policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + local policy_arn + if [[ "${MINIO_ENABLED}" == "true" ]]; then + policy_arn="$(ensure_ecr_only_policy)" + else + policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + fi ensure_irsa_for_sa "${STANDALONE_SA}" "${AI_NS}" "${policy_arn}" - # DEPRECATED: Create s3-secret using AWS credentials - # This is legacy approach - IRSA above is preferred, but Splunk Operator may still require the secret - log "Creating s3-secret for Splunk Standalone (fallback if IRSA not fully supported)..." 
- if resolve_aws_creds_for_secret 2>/dev/null; then - local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" - if [[ -n "$ak" && -n "$sk" ]]; then - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${ak}" \ - --from-literal=s3_secret_key="${sk}" \ - $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ - --dry-run=client -o yaml | kubectl apply -f - - log "✓ Created s3-secret with explicit credentials" + if [[ "${MINIO_ENABLED}" != "true" ]]; then + # Create s3-secret for Standalone when using S3 (fallback if IRSA not fully supported) + log "Creating s3-secret for Splunk Standalone (S3 mode)..." + if resolve_aws_creds_for_secret 2>/dev/null; then + local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" + if [[ -n "$ak" && -n "$sk" ]]; then + kubectl -n "${AI_NS}" create secret generic s3-secret \ + --from-literal=s3_access_key="${ak}" \ + --from-literal=s3_secret_key="${sk}" \ + $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ + --dry-run=client -o yaml | kubectl apply -f - + log "✓ Created s3-secret with explicit credentials" + else + warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." + fi else - warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." + warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." fi - else - warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." 
fi cat <<'YAML' | kubectl -n "${AI_NS}" apply -f - @@ -1497,7 +1702,48 @@ data: sslPassword: password YAML - cat < Date: Wed, 25 Feb 2026 11:01:03 +0530 Subject: [PATCH 02/55] generi object storage changes --- docs/configuration/object-storage.md | 106 ++++++ docs/configuration/storage-artifacts.md | 9 +- .../SEAWEEDFS_SYSTEMD.md | 134 +++++++ .../create_seaweedfs_folders.sh | 60 ++++ .../install_minio_ec2.sh | 335 ++++++++++++++++++ .../install_seaweedfs_systemd.sh | 58 +++ .../seaweedfs.service | 39 ++ .../test_minio_connection.sh | 6 +- .../upload_to_minio.sh | 27 +- .../upload_to_seaweedfs.sh | 264 ++++++++++++++ tools/cluster_setup/EKS_README.md | 49 ++- tools/cluster_setup/K0S_README.md | 2 +- tools/cluster_setup/cluster-config.yaml | 26 +- tools/cluster_setup/eks_cluster_with_stack.sh | 213 +++++------ tools/cluster_setup/k0s_cluster_with_stack.sh | 8 +- 15 files changed, 1163 insertions(+), 173 deletions(-) create mode 100644 docs/configuration/object-storage.md create mode 100644 tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md create mode 100755 tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh create mode 100755 tools/artifacts_download_upload_scripts/install_minio_ec2.sh create mode 100755 tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh create mode 100644 tools/artifacts_download_upload_scripts/seaweedfs.service create mode 100644 tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh diff --git a/docs/configuration/object-storage.md b/docs/configuration/object-storage.md new file mode 100644 index 0000000..873632b --- /dev/null +++ b/docs/configuration/object-storage.md @@ -0,0 +1,106 @@ +# Object Storage Selection + +This document describes how the Splunk AI Operator chooses the object storage backend and how to configure AWS S3, MinIO, SeaweedFS, or any S3-compatible storage. 
+ +## How the operator decides the backend + +The operator selects the storage backend **only by the path scheme** in `spec.objectStorage.path`: + +| Path scheme | Backend behavior | cloudProvider | artifactsProvider | +|-----------------|-------------------------------------|---------------|-------------------| +| `s3://` | **AWS S3** (region, IRSA, no custom endpoint) | `aws` | `s3` | +| `s3compat://` | **S3-compatible** (generic; requires endpoint + secretRef) | `s3compat` | `s3` | +| `minio://` | **MinIO** (alias for S3-compatible) | `s3compat` | `s3` | +| `seaweedfs://` | **SeaweedFS** (alias for S3-compatible) | `s3compat` | `s3` | +| `gs://` / `gcs://` | **GCP Cloud Storage** | `gcp` | `gcs` | +| `azure://` | **Azure Blob Storage** | `azure` | `azure` | + +- **Path scheme** is the only decision input; there is no separate "provider type" switch in the operator logic. +- For **S3-compatible** backends (MinIO, SeaweedFS, Ceph, or any custom S3 API), use **`s3compat://bucket/prefix`** with `endpoint` and `secretRef` set. You can also use `minio://` or `seaweedfs://` as aliases; all use the same implementation (AWS S3 SDK with custom endpoint and path-style). + +## cloudProvider vs artifactsProvider + +- **cloudProvider**: Identifies the *platform* (e.g. `aws` for native AWS S3, `s3compat` for MinIO/SeaweedFS/other S3-compatible). Used for telemetry and any logic that needs to distinguish "real AWS" from "custom S3-compatible". +- **artifactsProvider**: The *protocol* used to access artifacts. For all S3 API backends (AWS S3, MinIO, SeaweedFS) the protocol is the S3 API, so `artifactsProvider` is always `s3` for those. Only GCS and Azure use different protocols (`gcs`, `azure`). + +## Path schemes and required fields + +- **`s3://bucket/prefix`** + - Use for **AWS S3** only. + - Set `region`. Optionally use `secretRef` for static credentials; otherwise IRSA or default AWS credential chain is used. Do **not** set `endpoint` for native S3. 
+ +- **`s3compat://bucket/prefix`** + - Use for **any S3-compatible** backend (MinIO, SeaweedFS, Ceph, etc.). + - **Required:** `endpoint` (e.g. `http://minio.namespace.svc:9000` or `http://seaweedfs-s3:8333`), `region` (any value), `secretRef` with `s3_access_key` and `s3_secret_key`. + +- **`minio://bucket/prefix`** + - Alias for S3-compatible; use for **MinIO** (in-cluster or external). Same requirements as `s3compat://`. + +- **`seaweedfs://bucket/prefix`** + - Alias for S3-compatible; use for **SeaweedFS** (bring your own). Same requirements as `s3compat://`. + +## Optional provider field + +`spec.objectStorage.provider` is an optional hint for documentation and tooling. Allowed values: `aws`, `minio`, `seaweedfs`, `s3compat`, `gcs`, `azure`. The operator **does not** use this field to select the backend; behavior is derived only from the path scheme (and for `s3://`, absence of endpoint). Use it for clarity in manifests or scripts. + +## YAML examples + +### AWS S3 + +```yaml +spec: + objectStorage: + path: s3://my-ai-bucket/artifacts + region: us-east-2 + # secretRef optional when using IRSA +``` + +### MinIO (in-cluster) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://minio.minio.svc.cluster.local:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### MinIO (external, e.g. EC2) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://10.0.1.50:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### SeaweedFS + +```yaml +spec: + objectStorage: + path: seaweedfs://my-bucket/artifacts + endpoint: http://seaweedfs-s3.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +### Generic S3-compatible (e.g. 
Ceph, custom endpoint) + +```yaml +spec: + objectStorage: + path: s3compat://my-bucket/artifacts + endpoint: http://s3-gateway.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +The same Kubernetes secret format is used for all S3-compatible backends: keys `s3_access_key` and `s3_secret_key`. Pods receive these as `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY`; when `endpoint` is set, pods also receive it as `MINIO_ENDPOINT_URL`. + +## Adding new S3-compatible backends + +Any storage that exposes an S3-compatible API (e.g. Ceph, DigitalOcean Spaces) can be used via **`s3compat://bucket`** with the appropriate `endpoint` and `secretRef`. No new client code or scheme is required; `minio://` and `seaweedfs://` remain as optional aliases for clarity. diff --git a/docs/configuration/storage-artifacts.md b/docs/configuration/storage-artifacts.md index 58ae8f9..4584e28 100644 --- a/docs/configuration/storage-artifacts.md +++ b/docs/configuration/storage-artifacts.md @@ -6,10 +6,17 @@ The Splunk AI team has provided global artifact storage in a publicly readable S ## Prerequisites Utilizing the AI Platform requires one of the following remote storage providers: - * An Amazon S3 or S3-API-compliant remote object storage location + * **AWS S3** – Native Amazon S3 (use path scheme `s3://`) + * **MinIO** – S3-compatible, in-cluster or external (use path scheme `s3compat://` or `minio://` with endpoint and credentials) + * **SeaweedFS** – S3-compatible (use path scheme `s3compat://` or `seaweedfs://` with endpoint and credentials) + * Any other **S3-API-compatible** storage (use `s3compat://` with endpoint and secretRef; `minio://` and `seaweedfs://` are optional aliases) + * Azure blob storage * GCP Cloud Storage +### Object storage selection + +The operator chooses the backend **by the path scheme** in `spec.objectStorage.path`.
Use `s3://` for AWS S3 only; use `s3compat://` (or `minio://` / `seaweedfs://` as aliases) with `endpoint` and `secretRef` for MinIO, SeaweedFS, or any S3-compatible backend. See [Object Storage Selection](object-storage.md) for the full decision table, path schemes, and YAML examples. + ### Prerequisites common to all remote storage providers * Read-write access to the path used to host the files. * Connections to the remote object storage endpoint need to be secured using a minimum version of TLS 1.2. diff --git a/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md new file mode 100644 index 0000000..a4b9caa --- /dev/null +++ b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md @@ -0,0 +1,134 @@ +# SeaweedFS as a systemd service + +Run SeaweedFS as a systemd service so it **restarts on failure** and **starts on boot**. + +## Prerequisites + +- **weed** binary at `/usr/local/bin/weed`. If missing, run the upload script once from the artifacts directory (it installs weed), or [download a release](https://github.com/seaweedfs/seaweedfs/releases) and copy `weed` to `/usr/local/bin/`. +- **Root/sudo** on the host to install the service. + +## Quick install (EC2 or single host) + +On the host where SeaweedFS should run: + +```bash +cd /path/to/splunk-ai-operator/tools/artifacts_download_upload_scripts +sudo ./install_seaweedfs_systemd.sh +``` + +This copies `seaweedfs.service` to `/etc/systemd/system/`, enables and starts the service. + +## Manual install + +1. Copy the unit file: + ```bash + sudo cp tools/artifacts_download_upload_scripts/seaweedfs.service /etc/systemd/system/ + sudo systemctl daemon-reload + ``` + +2.
Optionally override credentials or data dir via a drop-in or env file: + ```bash + sudo mkdir -p /etc/systemd/system/seaweedfs.service.d + echo -e '[Service]\nEnvironment="AWS_ACCESS_KEY_ID=mykey"\nEnvironment="AWS_SECRET_ACCESS_KEY=mysecret"' | sudo tee /etc/systemd/system/seaweedfs.service.d/override.conf + sudo systemctl daemon-reload + ``` + +3. Enable and start: + ```bash + sudo systemctl enable seaweedfs + sudo systemctl start seaweedfs + ``` + +## Service details + +- **User:** `ec2-user` (change in the unit if needed). +- **Data dir:** `/home/ec2-user/data` (hardcoded in `ExecStart`; override via a systemd drop-in that replaces `ExecStart` if needed). +- **Volume max:** `100` in `ExecStart` (override via drop-in if needed). +- **S3 credentials:** `minioadmin` / `minioadmin` by default; override with `Environment=` or `EnvironmentFile=-/etc/default/seaweedfs` in a drop-in. +- **Restart:** `on-failure` with 5s delay. +- **Logs:** `journalctl -u seaweedfs -f` + +## Useful commands + +| Command | Description | +|--------|-------------| +| `sudo systemctl status seaweedfs` | Show status | +| `journalctl -u seaweedfs -f` | Follow logs | +| `sudo systemctl restart seaweedfs` | Restart | +| `sudo systemctl stop seaweedfs` | Stop | +| `sudo systemctl disable seaweedfs` | Disable start on boot | + +## After install + +- S3 endpoint: **http://127.0.0.1:8333** (or the host’s IP if accessing remotely). +- Use the same credentials in the upload script or set `OBJECT_STORE_ACCESS_KEY` / `OBJECT_STORE_SECRET_KEY` to match the service. + +## Troubleshooting: "0 node candidates" / "Not enough data nodes found" + +When the Master has no writable volume servers, uploads fail with those errors. Common causes and fixes: + +| Cause | Fix | +|-------|-----| +| **1. Max volumes reached** | Volume server default `-max` is often 7–8. The unit sets `SEAWEEDFS_VOLUME_MAX=100`. To increase: add `Environment="SEAWEEDFS_VOLUME_MAX=200"` in a drop-in and restart. | +| **2. 
Disk space** | At ~95% usage the volume server reports read-only. Check `df -h` on the host; free space or add storage. | +| **3. Heartbeat / gRPC timeouts** | Under heavy load the volume server may miss heartbeats and be marked dead. Check `journalctl -u seaweedfs` for "heartbeat" or "connection refused" around the failure time. | +| **4. OOM** | On small instances the process may be killed. Run `dmesg -T \| grep -i oom` on the host. | + +**When the error is happening, run:** + +```bash +# Master's view of nodes (look for empty Nodes or IsReadOnly: true) +curl -s http://localhost:9333/cluster/status | jq + +# Volume server status (check if Max and Count are equal = full) +curl -s http://127.0.0.1:8080/status | jq +``` + +If `Max == Count` on the volume server, increase `SEAWEEDFS_VOLUME_MAX` and restart the service. + +### "Permission denied" when starting the service (status=203/EXEC) + +The service runs as `ec2-user`. Common causes: + +1. **File permissions** – Ensure the binary is executable by all: + ```bash + sudo chmod 755 /usr/local/bin/weed + ``` + +2. **SELinux (Enforcing)** – On RHEL/Amazon Linux, SELinux can block execution. Fix by labeling the binary: + ```bash + sudo chcon -t bin_t /usr/local/bin/weed + sudo systemctl restart seaweedfs + ``` + To confirm SELinux is the cause: `sudo setenforce 0`, restart the service; if it then runs, re-enable with `sudo setenforce 1` and apply the `chcon` above. + +The install script runs `chmod 755` and, when SELinux is Enforcing, `chcon -t bin_t` automatically. + +### Connect timeout from EKS / Ray pods (Connection to <host> timed out) + +Ray workers (and other pods) in the cluster need to reach the SeaweedFS S3 endpoint to download model artifacts. If you see: + +- `Connect timeout on endpoint URL: "http://<host>:8333/..."` +- `Connection to <host> timed out. (connect timeout=60)` + +then **pods cannot reach the SeaweedFS host** on port 8333. + +**Fix:** + +1.
**Security group on the SeaweedFS EC2** + Allow **inbound TCP port 8333** from the EKS cluster: + - **Option A:** From the **EKS worker node security group** (so any pod on those nodes can reach SeaweedFS). + - **Option B:** From the **VPC CIDR** (e.g. `10.0.0.0/16` or `192.168.0.0/16`) so all pods in the VPC can reach SeaweedFS. + + In AWS Console: EC2 → Security Groups → select the security group attached to the SeaweedFS instance → Edit inbound rules → Add rule: Type = Custom TCP, Port = 8333, Source = node SG or VPC CIDR. + +2. **Prefer private IP when in the same VPC** + If SeaweedFS and EKS are in the same VPC, set `storage.objectStore.endpoint` in `cluster-config.yaml` to the **private IP** and port (e.g. `http://172.31.23.74:8333`). Then: + - Traffic stays inside the VPC (no internet path). + - The security group still must allow 8333 from the node SG or VPC CIDR as above. + +3. **Verify from a pod** (optional): + ```bash + kubectl run -it --rm curl --image=curlimages/curl --restart=Never -- curl -s -o /dev/null -w "%{http_code}" http://<host>:8333 + ``` + Replace `<host>` with the same IP (public or private) as in your config, and use the same port. A 200/403/400 means the pod can reach SeaweedFS. diff --git a/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh new file mode 100755 index 0000000..823c7eb --- /dev/null +++ b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Create standard folder prefixes in SeaweedFS (S3-compatible). Uses the same +# OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh. Run after +# SeaweedFS is up (e.g. systemd service or upload script has started it).
+ +set -e + +# Same endpoint/credentials as upload_to_seaweedfs.sh +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# Standard folders expected by the platform (create by uploading .keep) +FOLDERS=(apps artifacts config job_groups model_artifacts tasks) + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +# Install mc if needed +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..." 
+ OS="$(uname -s)" + ARCH="$(uname -m)" + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +echo "Creating folders in ${OBJECT_STORE_BUCKET}: ${FOLDERS[*]}" +for dir in "${FOLDERS[@]}"; do + echo "placeholder" | mc pipe "${MC_ALIAS}/${OBJECT_STORE_BUCKET}/${dir}/.keep" 2>/dev/null || true + echo " ${dir}/" +done +echo "Done. 
Folders: apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/" diff --git a/tools/artifacts_download_upload_scripts/install_minio_ec2.sh b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh new file mode 100755 index 0000000..fcf93a1 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh @@ -0,0 +1,335 @@ +#!/usr/bin/env bash +# ----------------------------------------------------------------------------- +# MinIO on EC2 for Splunk AI Platform (EKS) +# +# Mode 1 - Install on this machine (run ON the EC2 instance after SSH, as root): +# sudo ./install_minio_ec2.sh [--bucket NAME] [--user USER] [--password PASSWORD] +# +# Mode 2 - Launch EC2 in same VPC as EKS, then install MinIO (run from laptop): +# CONFIG_FILE=./cluster-config.yaml ./install_minio_ec2.sh --launch-ec2 +# Then SSH to the instance and run: ./install_minio_ec2.sh (with same bucket/user/password) +# +# Prerequisites: aws CLI, same VPC as EKS (or provide VPC/subnet). For --launch-ec2: jq, yq (optional). 
+# ----------------------------------------------------------------------------- +set -euo pipefail + +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform}" +MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" +MINIO_DATA_DIR="${MINIO_DATA_DIR:-/data/minio}" +MINIO_PORT="${MINIO_PORT:-9000}" + +# Launch-EC2 options (when --launch-ec2) +MINIO_EC2_INSTANCE_TYPE="${MINIO_EC2_INSTANCE_TYPE:-t3.xlarge}" +MINIO_EC2_AMI_QUERY="${MINIO_EC2_AMI_QUERY:-Amazon Linux 2023}" +MINIO_EC2_KEY_NAME="${MINIO_EC2_KEY_NAME:-}" +MINIO_EC2_VOLUME_SIZE="${MINIO_EC2_VOLUME_SIZE:-150}" + +log() { echo "[minio-ec2] $*"; } +err() { echo "[minio-ec2] ERROR: $*" >&2; } + +# ---------- Parse args ---------- +LAUNCH_EC2=false +while [[ $# -gt 0 ]]; do + case "$1" in + --launch-ec2) LAUNCH_EC2=true; shift ;; + --bucket) MINIO_BUCKET="$2"; shift 2 ;; + --user) MINIO_ROOT_USER="$2"; shift 2 ;; + --password) MINIO_ROOT_PASSWORD="$2"; shift 2 ;; + --data-dir) MINIO_DATA_DIR="$2"; shift 2 ;; + --port) MINIO_PORT="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# ---------- Mode 2: Launch EC2 in EKS VPC ---------- +launch_ec2_in_eks_vpc() { + need_file "${CONFIG_FILE:-}" + local cfg="${CONFIG_FILE}" + local cluster_name region vpc_id subnet_id sg_id instance_id private_ip + + if command -v yq &>/dev/null; then + cluster_name="$(yq eval '.cluster.name' "$cfg")" + region="$(yq eval '.cluster.region' "$cfg")" + else + cluster_name="$(grep -A1 'cluster:' "$cfg" | grep 'name:' | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" + region="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" + fi + [[ -z "$cluster_name" || -z "$region" ]] && { err "Could not read cluster.name and cluster.region from $cfg"; exit 1; } + + log "Cluster: $cluster_name, Region: $region" + if ! aws eks describe-cluster --name "$cluster_name" --region "$region" &>/dev/null; then + err "EKS cluster '$cluster_name' not found. 
Create the cluster first or provide VPC/subnet via MINIO_EC2_VPC_ID and MINIO_EC2_SUBNET_ID." + exit 1 + fi + + vpc_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.vpcId' --output text)" + # Prefer private subnet for MinIO + subnet_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.subnetIds[0]' --output text)" + [[ -z "$vpc_id" || "$vpc_id" == "None" ]] && { err "No VPC from cluster"; exit 1; } + [[ -z "$subnet_id" || "$subnet_id" == "None" ]] && { err "No subnet from cluster"; exit 1; } + + local vpc_cidr + vpc_cidr="$(aws ec2 describe-vpcs --vpc-ids "$vpc_id" --region "$region" --query 'Vpcs[0].CidrBlock' --output text 2>/dev/null || echo "10.0.0.0/8")" + + log "VPC: $vpc_id, Subnet: $subnet_id, CIDR: $vpc_cidr" + + # Security group: SSH (22) from anywhere; MinIO (9000) from VPC (reuse if exists) + local sg_name="minio-ec2-${cluster_name}" + sg_id="$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$sg_name" "Name=vpc-id,Values=$vpc_id" --region "$region" --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null)" + if [[ -z "$sg_id" || "$sg_id" == "None" ]]; then + sg_id="$(aws ec2 create-security-group --group-name "$sg_name" --description "MinIO EC2 for EKS" --vpc-id "$vpc_id" --region "$region" --query 'GroupId' --output text)" + fi + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region "$region" 2>/dev/null || true + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port "$MINIO_PORT" --cidr "$vpc_cidr" --region "$region" 2>/dev/null || true + log "Security group: $sg_id (22 from 0.0.0.0/0, ${MINIO_PORT} from $vpc_cidr)" + + # Key pair: use existing or create (idempotent: reuse same key name per cluster) + local key_name="$MINIO_EC2_KEY_NAME" + local key_file="" + if [[ -z "$key_name" ]]; then + key_name="minio-ec2-${cluster_name}" + 
key_file="/tmp/minio-ec2-${cluster_name}.pem" + if aws ec2 describe-key-pairs --key-names "$key_name" --region "$region" &>/dev/null; then + log "Using existing key pair: $key_name (if you lost the .pem, set MINIO_EC2_KEY_NAME to another key)" + elif aws ec2 create-key-pair --key-name "$key_name" --query 'KeyMaterial' --output text --region "$region" > "$key_file" 2>/dev/null; then + chmod 600 "$key_file" + log "Key pair created: $key_name (saved to $key_file)" + else + err "Create key pair failed. Set MINIO_EC2_KEY_NAME to an existing key name in this region." + exit 1 + fi + fi + + # AMI: Amazon Linux 2023 + local ami_id + ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=al2023-ami-*-x86_64" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + [[ -z "$ami_id" || "$ami_id" == "None" ]] && ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=amzn2-ami-hvm-*-x86_64-gp2" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + + instance_id="$(aws ec2 run-instances \ + --image-id "$ami_id" \ + --instance-type "$MINIO_EC2_INSTANCE_TYPE" \ + --subnet-id "$subnet_id" \ + --security-group-ids "$sg_id" \ + --key-name "$key_name" \ + --block-device-mappings "[{\"DeviceName\":\"/dev/xvda\",\"Ebs\":{\"VolumeSize\":${MINIO_EC2_VOLUME_SIZE},\"VolumeType\":\"gp3\"}}]" \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=minio-ec2-${cluster_name}},{Key=Cluster,Value=${cluster_name}}]" \ + --region "$region" \ + --query 'Instances[0].InstanceId' --output text)" + log "Launched instance: $instance_id (key: $key_name)" + + log "Waiting for instance to get private IP..." 
+ aws ec2 wait instance-running --instance-ids "$instance_id" --region "$region" + private_ip="$(aws ec2 describe-instances --instance-ids "$instance_id" --region "$region" --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text)" + [[ -z "$private_ip" || "$private_ip" == "None" ]] && private_ip="(check console)" + + echo "" + log "=== MinIO EC2 instance ready ===" + echo " Instance ID: $instance_id" + echo " Private IP: $private_ip" + echo " Region: $region" + echo " Key name: $key_name" + [[ -n "$key_file" && -f "$key_file" ]] && echo " Key file: $key_file" + echo "" + echo "Next steps:" + echo " 1. SSH to the instance: ssh -i ${key_file:-/path/to/$key_name.pem} ec2-user@${private_ip}" + echo " 2. On the instance, copy and run this script (install-only mode, requires sudo):" + echo " sudo ./install_minio_ec2.sh --bucket ${MINIO_BUCKET} --user ${MINIO_ROOT_USER} --password ''" + echo " 3. Add to cluster-config.yaml (storage.minio):" + echo " enabled: true" + echo " external: true" + echo " endpoint: \"http://${private_ip}:${MINIO_PORT}\"" + echo " bucket: \"${MINIO_BUCKET}\"" + echo " auth: { rootUser: \"${MINIO_ROOT_USER}\", rootPassword: \"\" }" + echo "" +} + +need_file() { [[ -n "${1:-}" && -f "${1}" ]] || { err "File required: $1"; exit 1; }; } + +# ---------- Entry ---------- +if [[ "$LAUNCH_EC2" == "true" ]]; then + launch_ec2_in_eks_vpc + exit 0 +fi + +# ---------- Mode 1: Install MinIO on this machine ---------- +# Require root (for /usr/local/bin, /etc/default/minio, systemd) +if [[ "$(id -u)" -ne 0 ]]; then + err "This script must be run as root (or with sudo)." 
+ err "Run: sudo $0 ${*:-}" + exit 1 +fi + +# Generate password if not set +if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + MINIO_ROOT_PASSWORD="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" + log "Generated MINIO_ROOT_PASSWORD (save it for cluster-config.yaml)" +fi + +# Install MinIO binary (use stable "latest" URL; archive URLs can 404 and return HTML) +install_minio_binary() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) err "Unsupported arch: $arch"; exit 1 ;; + esac + local url="https://dl.min.io/server/minio/release/linux-${arch}/minio" + local tmp="/tmp/minio.$$" + log "Downloading MinIO (linux-${arch})..." + if ! curl -sSL -o "$tmp" "$url"; then + err "Download failed. Check network or try: curl -sSL -o /tmp/minio '$url'" + rm -f "$tmp" + exit 1 + fi + # Reject HTML/error pages (e.g. 404); binary should not start with < or "Not" + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "Download returned HTML/error instead of binary. URL may be wrong or blocked." + head -1 "$tmp" + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/minio + minio --version +} + +install_mc() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) arch=amd64 ;; + esac + local tmp="/tmp/mc.$$" + log "Downloading MinIO Client (mc)..." + if ! curl -sSL -o "$tmp" "https://dl.min.io/client/mc/release/linux-${arch}/mc"; then + err "Download failed for mc." + rm -f "$tmp" + exit 1 + fi + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "mc download returned HTML/error instead of binary." + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/mc + mc --version +} + +# Stop MinIO so we can replace the binary without restart loop (e.g. after wrong-arch fix). +systemctl stop minio 2>/dev/null || true +# Always (re)install MinIO binary so we get the correct architecture for this host. 
+# A wrong-arch binary (e.g. amd64 on arm64 EC2) causes "Exec format error" and crash-loop. +install_minio_binary +if ! command -v mc &>/dev/null; then + install_mc +else + log "mc already present: $(mc --version 2>/dev/null || true)" +fi + +mkdir -p "$MINIO_DATA_DIR" +chmod 755 "$MINIO_DATA_DIR" +ENV_FILE="/etc/default/minio" +cat > "$ENV_FILE" < /etc/systemd/system/minio.service </dev/null | grep -q 200; then + minio_ok=true + break + fi + sleep 2 +done +if [[ "$minio_ok" != "true" ]]; then + err "MinIO did not respond on port ${MINIO_PORT} within 60s. Service may be failing or crash-looping." + echo "" >&2 + systemctl status minio --no-pager 2>&1 || true + echo "" >&2 + journalctl -u minio -n 30 --no-pager 2>&1 || true + exit 1 +fi +# Verify port is actually listening +if ! ( ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null ) | grep -qE "[.:]${MINIO_PORT}([^0-9]|$)"; then + err "MinIO health passed but port ${MINIO_PORT} is not listening. Showing service status:" + systemctl status minio --no-pager 2>&1 || true + exit 1 +fi +sleep 2 + +export MC_HOST_local="http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@127.0.0.1:${MINIO_PORT}" +mc mb "local/${MINIO_BUCKET}" --ignore-existing 2>/dev/null || true +for prefix in apps artifacts config job_groups model_artifacts tasks; do + echo -n | mc pipe "local/${MINIO_BUCKET}/${prefix}/.keep" 2>/dev/null || true +done +log "Bucket '${MINIO_BUCKET}' and prefixes apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/ ready" + +if command -v firewall-cmd &>/dev/null && systemctl is-active --quiet firewalld 2>/dev/null; then + firewall-cmd --permanent --add-port="${MINIO_PORT}/tcp" 2>/dev/null || true + firewall-cmd --reload 2>/dev/null || true +elif command -v ufw &>/dev/null && ufw status 2>/dev/null | grep -q "Status: active"; then + ufw allow "${MINIO_PORT}/tcp" 2>/dev/null || true + ufw reload 2>/dev/null || true +fi + +PRIVATE_IP="" +if command -v hostname &>/dev/null; then + PRIVATE_IP="$(hostname -I 
2>/dev/null | awk '{print $1}')" +fi +[[ -z "$PRIVATE_IP" ]] && PRIVATE_IP="$(curl -s --connect-timeout 2 http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || echo 'MINIO_EC2_PRIVATE_IP')" +ENDPOINT="http://${PRIVATE_IP}:${MINIO_PORT}" + +echo "" +log "=== MinIO on EC2 is ready ===" +echo " Endpoint: ${ENDPOINT}" +echo " Bucket: ${MINIO_BUCKET}" +echo " Root user: ${MINIO_ROOT_USER}" +echo " Root pass: ${MINIO_ROOT_PASSWORD}" +echo "" +echo "Add to cluster-config.yaml (storage.minio):" +echo " minio:" +echo " enabled: true" +echo " external: true" +echo " endpoint: \"${ENDPOINT}\"" +echo " bucket: \"${MINIO_BUCKET}\"" +echo " auth:" +echo " rootUser: \"${MINIO_ROOT_USER}\"" +echo " rootPassword: \"${MINIO_ROOT_PASSWORD}\"" +echo "" +echo "Ensure EC2 security group allows inbound TCP ${MINIO_PORT} from your EKS node security group or VPC CIDR." +echo "" +echo "If MinIO is not reachable, check: systemctl status minio && ss -tlnp | grep ${MINIO_PORT}" +echo "" diff --git a/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh new file mode 100755 index 0000000..2f21090 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Install SeaweedFS as a systemd service (restart on failure, start on boot). +# Run with sudo on the host where SeaweedFS should run (e.g. EC2). +# Prereqs: weed binary at /usr/local/bin/weed (run upload_to_seaweedfs.sh once to install, or install manually). + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVICE_NAME="seaweedfs" +UNIT_FILE="${SCRIPT_DIR}/seaweedfs.service" + +if [[ "$(id -u)" -ne 0 ]]; then + echo "Run with sudo to install the systemd service." + exit 1 +fi + +if [[ ! -f /usr/local/bin/weed ]]; then + echo "weed not found at /usr/local/bin/weed. 
Install it first, e.g.:" + echo " Run ./upload_to_seaweedfs.sh once (it will install weed), or" + echo " download from https://github.com/seaweedfs/seaweedfs/releases and extract weed to /usr/local/bin/" + exit 1 +fi + +# Service runs as ec2-user; ensure the binary is executable by that user (fixes "Permission denied" on EXEC). +chmod 755 /usr/local/bin/weed +# On SELinux systems (e.g. RHEL, Amazon Linux), label the binary so the service can execute it. +if command -v getenforce &>/dev/null && [[ "$(getenforce 2>/dev/null)" == "Enforcing" ]]; then + if command -v chcon &>/dev/null; then + chcon -t bin_t /usr/local/bin/weed 2>/dev/null || true + fi +fi + +echo "Installing ${SERVICE_NAME}.service..." +cp "$UNIT_FILE" /etc/systemd/system/"${SERVICE_NAME}.service" +chmod 644 /etc/systemd/system/"${SERVICE_NAME}.service" +systemctl daemon-reload + +echo "Enabling ${SERVICE_NAME} to start on boot..." +systemctl enable "${SERVICE_NAME}" + +echo "Starting ${SERVICE_NAME} now..." +systemctl start "${SERVICE_NAME}" + +sleep 2 +if ! systemctl is-active --quiet "${SERVICE_NAME}"; then + echo "Warning: ${SERVICE_NAME} did not stay running. Check: sudo systemctl status ${SERVICE_NAME} && journalctl -u ${SERVICE_NAME} -n 30" + exit 1 +fi + +echo "" +echo "SeaweedFS is running as a systemd service." 
+echo " status: sudo systemctl status ${SERVICE_NAME}" +echo " logs: journalctl -u ${SERVICE_NAME} -f" +echo " stop: sudo systemctl stop ${SERVICE_NAME}" +echo " restart: sudo systemctl restart ${SERVICE_NAME}" +echo "" +echo "S3 endpoint: http://127.0.0.1:8333 (default credentials minioadmin/minioadmin)" +echo "Data dir: /home/ec2-user/data (edit SEAWEEDFS_DIR in the unit to change)" diff --git a/tools/artifacts_download_upload_scripts/seaweedfs.service b/tools/artifacts_download_upload_scripts/seaweedfs.service new file mode 100644 index 0000000..1dc4079 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/seaweedfs.service @@ -0,0 +1,39 @@ +# SeaweedFS all-in-one server (master, volume, filer, S3). +# Install: see tools/cluster_setup/SEAWEEDFS_SYSTEMD.md or run install_seaweedfs_systemd.sh +# Credentials: set in /etc/default/seaweedfs or use the drop-in below. + +[Unit] +Description=SeaweedFS server (master, volume, filer, S3) +Documentation=https://github.com/seaweedfs/seaweedfs +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ec2-user +Group=ec2-user +# Data directory (must exist and be writable by User). Override via drop-in if needed. +Environment="SEAWEEDFS_DIR=/home/ec2-user/data" +# Max volumes per volume server. Override via drop-in if needed. +Environment="SEAWEEDFS_VOLUME_MAX=100" +# S3 credentials (must match upload script / mc alias) +Environment="AWS_ACCESS_KEY_ID=minioadmin" +Environment="AWS_SECRET_ACCESS_KEY=minioadmin" +# Override with /etc/default/seaweedfs or systemd drop-in if needed: +# EnvironmentFile=-/etc/default/seaweedfs + +# Use explicit paths so ExecStart works even if env expansion is not applied (e.g. after copy from Windows). 
+ExecStart=/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100 +WorkingDirectory=/home/ec2-user +Restart=on-failure +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=seaweedfs + +# Security: no new privileges, restrict to usual caps +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/tools/artifacts_download_upload_scripts/test_minio_connection.sh b/tools/artifacts_download_upload_scripts/test_minio_connection.sh index 6d90525..9f1baf2 100755 --- a/tools/artifacts_download_upload_scripts/test_minio_connection.sh +++ b/tools/artifacts_download_upload_scripts/test_minio_connection.sh @@ -1,10 +1,10 @@ #!/bin/bash # Test script to diagnose MinIO connectivity and bucket creation issues -MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" +MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" -MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" -MINIO_BUCKET="${MINIO_BUCKET:-personal}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform-bucket-us-east-2}" echo "==========================================" echo "MinIO Connection Test" diff --git a/tools/artifacts_download_upload_scripts/upload_to_minio.sh b/tools/artifacts_download_upload_scripts/upload_to_minio.sh index 826e275..3b314ac 100755 --- a/tools/artifacts_download_upload_scripts/upload_to_minio.sh +++ b/tools/artifacts_download_upload_scripts/upload_to_minio.sh @@ -1,12 +1,18 @@ #!/bin/bash -# Script to upload model artifacts to MinIO +# Script to upload model artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). +# Prefer generic env vars; MINIO_* are accepted for backward compatibility. SOURCE_DIR="./model_artifacts" -MINIO_ENDPOINT="http://127.0.0.1:9000" -# Change the bucket name to the one you want to use. It will be created if it doesn't exist.
-MINIO_BUCKET="ai-platform-artifacts-bucket" -MINIO_ROOT_USER="minioadmin" -MINIO_ROOT_PASSWORD="minioadmin" +# Generic names (preferred); fallback to MINIO_* for backward compatibility +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${MINIO_ENDPOINT:-http://127.0.0.1:9000}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${MINIO_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${MINIO_ROOT_USER:-${MINIO_ACCESS_KEY:-minioadmin}}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${MINIO_ROOT_PASSWORD:-${MINIO_SECRET_KEY:-minioadmin}}}" +# Internal use (script uses one set) +MINIO_ENDPOINT="${OBJECT_STORE_ENDPOINT}" +MINIO_BUCKET="${OBJECT_STORE_BUCKET}" +MINIO_ROOT_USER="${OBJECT_STORE_ACCESS_KEY}" +MINIO_ROOT_PASSWORD="${OBJECT_STORE_SECRET_KEY}" # Convert bucket name to lowercase (S3/MinIO requirement) ORIGINAL_BUCKET="$MINIO_BUCKET" @@ -176,7 +182,7 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo "" # Check for specific error types - if echo "$CONNECTION_TEST" | grep -q "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch"; then + if echo "$CONNECTION_TEST" | grep -qi "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch\|signature.*does not match"; then echo "Error: Authentication failed - Invalid credentials" echo "" echo "Current configuration:" @@ -189,7 +195,8 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo " 3. Default MinIO credentials are usually:" echo " - Username: minioadmin" echo " - Password: minioadmin" - echo " 4. If you changed MinIO credentials, update them in this script" + echo " 4. If you installed MinIO with a custom password (e.g. 
install_minio_ec2.sh --password 'xxx'), run:" + echo " MINIO_ROOT_PASSWORD='your-password' ./upload_to_minio.sh" elif echo "$CONNECTION_TEST" | grep -q "dial tcp\|connection refused\|no such host"; then echo "Error: Cannot reach MinIO endpoint" echo "" @@ -252,10 +259,10 @@ for artifact_path in "$SOURCE_DIR"/*; do echo "Processing: $id" if [[ -d "$artifact_path" ]]; then - # It's a directory - upload recursively + # It's a directory - upload recursively (trailing slash on source = copy contents, not directory as single object) echo "Uploading directory to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id/" - mc cp --recursive "$artifact_path" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" + mc cp --recursive "$artifact_path/" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" else # It's a file - upload directly echo "Uploading file to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id" diff --git a/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh new file mode 100644 index 0000000..8f4bf08 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# Upload model artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running, +# the script can install and start it (weed binary, no Docker). Creates configured +# buckets and uploads from ./model_artifacts. Use OBJECT_STORE_* or SEAWEEDFS_* env vars. + +set -e + +SOURCE_DIR="./model_artifacts" +SEAWEEDFS_PORT="${SEAWEEDFS_PORT:-8333}" + +# Endpoint and credentials (prefer generic OBJECT_STORE_*, then SEAWEEDFS_*). +# SeaweedFS S3 has no built-in users: if the server is started with credentials (env or -config), +# they must match these values. This script sets them when it auto-starts SeaweedFS. 
+OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" +# Bucket list to create (comma-separated). If unset, only primary bucket is created. +SEAWEEDFS_BUCKETS="${SEAWEEDFS_BUCKETS:-$OBJECT_STORE_BUCKET}" +# Set to 1 to skip auto-install and only fail if SeaweedFS is not reachable. +SEAWEEDFS_SKIP_INSTALL="${SEAWEEDFS_SKIP_INSTALL:-0}" +# Retries for each artifact upload (large files can trigger transient "internal error"). +SEAWEEDFS_UPLOAD_RETRIES="${SEAWEEDFS_UPLOAD_RETRIES:-3}" +SEAWEEDFS_UPLOAD_RETRY_DELAY="${SEAWEEDFS_UPLOAD_RETRY_DELAY:-15}" +# Max concurrent uploads (1 = sequential). +SEAWEEDFS_PARALLEL_JOBS="${SEAWEEDFS_PARALLEL_JOBS:-1}" +# Path to log failed artifact ids and messages (appended to on failure). +SEAWEEDFS_ERROR_LOG="${SEAWEEDFS_ERROR_LOG:-./seaweedfs_upload_errors.log}" +# Set to 1 to skip uploading a file if it already exists at destination (avoids re-uploading on script re-runs). +SEAWEEDFS_SKIP_EXISTING="${SEAWEEDFS_SKIP_EXISTING:-0}" +# Wait up to this many seconds for a volume server to appear in the cluster before uploading (avoids "0 node candidates"). +# Set to 0 to skip. Only used when endpoint is local and weed is available. +SEAWEEDFS_WAIT_VOLUME_SERVER="${SEAWEEDFS_WAIT_VOLUME_SERVER:-60}" +# Master address for cluster.ps (default: host from endpoint with port 9333). +SEAWEEDFS_MASTER="${SEAWEEDFS_MASTER:-}" +# Max volumes per volume server (default 100; 0 = auto from disk). Avoids "0 node candidates" when default (e.g. 7) is reached. 
+SEAWEEDFS_VOLUME_MAX="${SEAWEEDFS_VOLUME_MAX:-100}" + +# Normalize primary bucket to lowercase +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# ---- Check SeaweedFS is reachable ---- +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] && return 0 + return 1 +} + +# ---- Install and start SeaweedFS (weed binary from GitHub releases) ---- +install_and_start_seaweedfs() { + local os arch tag asset url tmpdir bindir + os="$(uname -s)" + arch="$(uname -m)" + case "$os" in + Linux) case "$arch" in x86_64|amd64) asset="linux_amd64.tar.gz";; aarch64|arm64) asset="linux_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + Darwin) case "$arch" in x86_64|amd64) asset="darwin_amd64.tar.gz";; arm64) asset="darwin_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + *) echo "Unsupported OS: $os"; return 1 ;; + esac + echo "Installing SeaweedFS (weed) for $os $arch..." + tag=$(curl -sL https://api.github.com/repos/seaweedfs/seaweedfs/releases/latest | grep '"tag_name":' | sed -E 's/.*"tag_name":\s*"([^"]+)".*/\1/') + [[ -z "$tag" ]] && { echo "Could not get latest SeaweedFS release tag."; return 1; } + url="https://github.com/seaweedfs/seaweedfs/releases/download/${tag}/${asset}" + tmpdir="$(mktemp -d)" + if ! curl -sSL -o "$tmpdir/weed.tar.gz" "$url"; then + echo "Download failed: $url"; rm -rf "$tmpdir"; return 1 + fi + tar -xzf "$tmpdir/weed.tar.gz" -C "$tmpdir" + [[ ! 
-f "$tmpdir/weed" ]] && { echo "weed binary not found in archive."; rm -rf "$tmpdir"; return 1; } + chmod +x "$tmpdir/weed" + if [[ "$(id -u)" -eq 0 ]] && [[ -d /usr/local/bin ]]; then + mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + elif command -v sudo &>/dev/null && [[ -d /usr/local/bin ]]; then + sudo mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + else + mkdir -p ~/.local/bin + mv "$tmpdir/weed" ~/.local/bin/weed + bindir="$HOME/.local/bin" + export PATH="$PATH:$bindir" + echo "Note: weed installed to $bindir (ensure it is in your PATH)" + fi + rm -rf "$tmpdir" + echo "Installed: $bindir/weed" + "$bindir/weed" version 2>/dev/null || true + echo "Starting SeaweedFS (master, volume, filer, S3 on port ${SEAWEEDFS_PORT}, volume.max=${SEAWEEDFS_VOLUME_MAX})..." + # SeaweedFS S3 validates credentials when provided; use script defaults so mc alias works. + export AWS_ACCESS_KEY_ID="${OBJECT_STORE_ACCESS_KEY:-minioadmin}" + export AWS_SECRET_ACCESS_KEY="${OBJECT_STORE_SECRET_KEY:-minioadmin}" + nohup env AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" "$bindir/weed" server -s3 -ip.bind=0.0.0.0 -volume.max="$SEAWEEDFS_VOLUME_MAX" > /tmp/seaweedfs.log 2>&1 & + echo $! > /tmp/seaweedfs.pid + local i + for i in {1..30}; do + sleep 2 + if seaweedfs_ok; then echo "SeaweedFS is up."; return 0; fi + done + echo "Timeout waiting for SeaweedFS. Check /tmp/seaweedfs.log" + return 1 +} + +if ! seaweedfs_ok; then + if [[ "$SEAWEEDFS_SKIP_INSTALL" == "1" ]]; then + echo "Error: SeaweedFS S3 gateway is not reachable at $OBJECT_STORE_ENDPOINT" + echo "Set OBJECT_STORE_ENDPOINT or start SeaweedFS manually (weed server -s3)." 
+ exit 1 + fi + # Only auto-install when endpoint is local (otherwise we'd start local server while user meant a remote one) + if [[ "$OBJECT_STORE_ENDPOINT" != *"127.0.0.1"* ]] && [[ "$OBJECT_STORE_ENDPOINT" != *"localhost"* ]]; then + echo "Error: SeaweedFS is not reachable at $OBJECT_STORE_ENDPOINT" + echo "For a remote endpoint, start SeaweedFS on that host or set OBJECT_STORE_ENDPOINT=http://127.0.0.1:8333 and run again to install locally." + exit 1 + fi + echo "SeaweedFS not reachable at $OBJECT_STORE_ENDPOINT. Attempting to install and start..." + if ! install_and_start_seaweedfs; then + echo "" + echo "Install failed or SeaweedFS did not start. You can:" + echo " 1. Install manually: https://github.com/seaweedfs/seaweedfs/releases" + echo " 2. Run: weed server -s3" + echo " 3. Or set OBJECT_STORE_ENDPOINT=http://:8333 if SeaweedFS runs elsewhere" + exit 1 + fi +fi +echo "SeaweedFS reachable at $OBJECT_STORE_ENDPOINT" + +# ---- Wait for volume server (avoids "Not enough data nodes found" right after restart) ---- +if [[ "$SEAWEEDFS_WAIT_VOLUME_SERVER" -gt 0 ]] && command -v weed &>/dev/null; then + if [[ "$OBJECT_STORE_ENDPOINT" == *"127.0.0.1"* ]] || [[ "$OBJECT_STORE_ENDPOINT" == *"localhost"* ]]; then + master="${SEAWEEDFS_MASTER}" + [[ -z "$master" ]] && master="127.0.0.1:9333" + echo "Waiting up to ${SEAWEEDFS_WAIT_VOLUME_SERVER}s for a volume server in the cluster..." + waited=0 + while [[ $waited -lt "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; do + out=$(echo -e "cluster.ps\nexit" | weed shell -master="$master" 2>/dev/null) || true + if echo "$out" | grep -q "volume servers" && echo "$out" | grep -q ":8080"; then + echo "Volume server is ready." + break + fi + sleep 2 + waited=$((waited + 2)) + done + if [[ $waited -ge "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; then + echo "Warning: no volume server seen after ${SEAWEEDFS_WAIT_VOLUME_SERVER}s. Upload may fail with 'Not enough data nodes'. Wait longer and re-run, or set SEAWEEDFS_WAIT_VOLUME_SERVER=0 to skip." 
+ fi + fi +fi +echo "" + +# ---- Install mc if needed (same pattern as upload_to_minio.sh) ---- +OS="$(uname -s)" +ARCH="$(uname -m)" +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..." + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -fL -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc || { echo "Error: failed to download or install mc from $MC_URL"; exit 1; } # -f: an HTTP error fails the download instead of saving the error page as the binary + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -fL -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc || { echo "Error: failed to download mc from $MC_URL"; exit 1; } # stop here so a stale or partial /tmp/mc is never installed below + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi +mc --version +echo "" + +# ---- Source dir and count ---- +[[ ! -d "$SOURCE_DIR" ]] && { echo "Error: $SOURCE_DIR not found. Run ./download_from_huggingface.sh first."; exit 1; } +artifact_count=$(find "$SOURCE_DIR" -mindepth 1 -maxdepth 1 | wc -l | tr -d ' ') +[[ "$artifact_count" -eq 0 ]] && { echo "No artifacts in $SOURCE_DIR."; exit 1; } +echo "Found $artifact_count artifacts to upload."
+echo "" + +# ---- Configure mc alias ---- +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 + +# ---- Create buckets (from list + primary) ---- +for b in $(echo "$SEAWEEDFS_BUCKETS" | tr ',' '\n'); do + b=$(echo "$b" | tr '[:upper:]' '[:lower:]' | tr -d ' ') + [[ -z "$b" ]] && continue + mc mb "${MC_ALIAS}/${b}" --ignore-existing 2>/dev/null || true +done +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true +echo "" + +# ---- Upload with retries (single file; large files can trigger "internal error"). Args: src dest. Returns 0 on success (or when skipped as already present), 1 after SEAWEEDFS_UPLOAD_RETRIES failed attempts ---- +do_upload_file() { + local src="$1" dest="$2" attempt=1 + if [[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]]; then + mc stat "$dest" &>/dev/null && return 0 + fi + while [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]]; do + mc cp "$src" "$dest" && return 0 + echo "Attempt $attempt/$SEAWEEDFS_UPLOAD_RETRIES failed." + attempt=$((attempt + 1)) + [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]] && { echo "Retrying in ${SEAWEEDFS_UPLOAD_RETRY_DELAY}s..."; sleep "$SEAWEEDFS_UPLOAD_RETRY_DELAY"; } + done + return 1 +} + +# Upload a directory artifact file-by-file (per-file retries; one failed file doesn't re-upload the rest). +upload_artifact_dir() { + local artifact_path="$1" dest_base="$2" id="$3" failed=0 f rel + while IFS= read -r -d '' f; do + rel="${f#"${artifact_path}"/}" # inner quotes: strip the prefix literally, not as a glob pattern + if !
do_upload_file "$f" "${dest_base}/${rel}"; then + echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id $rel" >> "$SEAWEEDFS_ERROR_LOG" + failed=1 + fi + done < <(find "$artifact_path" -type f -print0) + return $failed +} + +# Clear error log from previous runs +: > "$SEAWEEDFS_ERROR_LOG" + +# Build list of artifacts for parallel upload +artifact_paths=() +for artifact_path in "$SOURCE_DIR"/*; do + [[ -e "$artifact_path" ]] || continue + artifact_paths+=("$artifact_path") +done + +parallel_jobs="$SEAWEEDFS_PARALLEL_JOBS" +[[ "$parallel_jobs" -lt 1 ]] && parallel_jobs=1 +idx=0 +total=${#artifact_paths[@]} +echo "Uploading $total artifacts (per-file) with up to $parallel_jobs parallel job(s). Errors logged to: $SEAWEEDFS_ERROR_LOG" +[[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]] && echo "Skip-existing is ON: files already present at destination will be skipped." +echo "" + +while [[ $idx -lt $total ]]; do + batch=0 + while [[ $batch -lt $parallel_jobs && $idx -lt $total ]]; do + artifact_path="${artifact_paths[$idx]}" + id=$(basename "$artifact_path") + dest_base="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/model_artifacts/$id" + ( + if [[ -d "$artifact_path" ]]; then + upload_artifact_dir "$artifact_path" "$dest_base" "$id" || exit 1 + else + do_upload_file "$artifact_path" "$dest_base" || { echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id" >> "$SEAWEEDFS_ERROR_LOG"; exit 1; } + fi + echo "Completed: $id" + ) & + batch=$((batch + 1)) + idx=$((idx + 1)) + done + wait || true +done + +if [[ -s "$SEAWEEDFS_ERROR_LOG" ]]; then + echo "" + echo "One or more artifacts failed. See $SEAWEEDFS_ERROR_LOG:" + cat "$SEAWEEDFS_ERROR_LOG" + exit 1 +fi +echo "Upload complete. 
Uploaded $artifact_count artifacts to ${OBJECT_STORE_ENDPOINT}/${OBJECT_STORE_BUCKET}/model_artifacts/" \ No newline at end of file diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index 54893ba..0a4e464 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -53,7 +53,7 @@ The script installs everything needed for the AI Platform: 4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS 5. **Cluster Autoscaler** - Automatic node scaling based on demand 6. **Cert-Manager** - Automated certificate management -7. **MinIO (optional)** - S3-compatible object storage in-cluster when `storage.minio.enabled: true` +7. **Object storage** - AWS S3 or external S3-compatible only (MinIO, SeaweedFS, etc.; no in-cluster MinIO install) 8. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana 9. **OpenTelemetry Operator** - Distributed tracing and telemetry 10. **NVIDIA Device Plugin** - GPU support for AI workloads @@ -540,9 +540,44 @@ storage: # (3-63 chars, lowercase, numbers, hyphens) ``` -**Optional: MinIO (in-cluster or external EC2)** -- **In-cluster:** Set `storage.minio.enabled: true`. The script deploys MinIO via Helm and configures the AIPlatform CR. -- **External (e.g. EC2):** Set `storage.minio.enabled: true`, `storage.minio.external: true`, and `storage.minio.endpoint: "http://:9000"` (and matching `bucket`/`auth`). Use the companion script to install MinIO on an EC2 instance in the same VPC: `CONFIG_FILE=./cluster-config.yaml ./install_minio_ec2.sh --launch-ec2` launches an EC2 in the EKS VPC; then SSH to it and run `./install_minio_ec2.sh --bucket ai-platform --user minioadmin --password ''`. Pre-populate artifacts in MinIO before cluster setup. If you use MinIO, the Splunk app (when using `splunkStandalone.localAppPath`) is not uploaded automatically; upload it to MinIO at `apps/` via MinIO console or `mc`/`aws s3 --endpoint-url`. 
+**Generic object store (`storage.objectStore.type`)** +Only **AWS S3** or **external S3-compatible** storage is supported (no in-cluster MinIO install). Set `storage.objectStore.type` to `aws`, `s3compat`, `minio`, or `seaweedfs` (default is `aws` when unset). The script sets the AIPlatform `objectStorage.path` and creates a credentials secret for s3compat/minio/seaweedfs; you must provide `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). + +**External S3-compatible (MinIO, SeaweedFS, etc.)** +Set `storage.objectStore.type` to `minio`, `s3compat`, or `seaweedfs`, and set `storage.objectStore.endpoint` (e.g. `http://:9000` for MinIO) and credentials. You can run MinIO or SeaweedFS on EC2 or elsewhere; use `install_minio_ec2.sh` to install MinIO on an EC2 in the same VPC if desired. Pre-populate artifacts before cluster setup. The Splunk app (when using `splunkStandalone.localAppPath`) is not uploaded to external object storage automatically; upload it to your bucket at `apps/` via console or `mc`/`aws s3 --endpoint-url`. + +**S3-compatible / SeaweedFS (bring your own)** +- **Generic (`s3compat`):** Set `storage.objectStore.type: s3compat`, `storage.objectStore.endpoint`, `storage.objectStore.bucket`, and credentials. The script creates the credentials secret and sets the path to `s3compat://bucket`; it does not install any storage. Use for any S3-compatible backend (Ceph, custom gateway, etc.). +- **SeaweedFS:** Set `storage.objectStore.type: seaweedfs`, `storage.objectStore.endpoint` (e.g. `http://seaweedfs-s3:8333`), `storage.objectStore.bucket`, and credentials (env `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD` or `objectStore.auth`). The script does not install SeaweedFS; it only creates the credentials secret and sets the AIPlatform path to `seaweedfs://bucket`. Ensure your SeaweedFS S3 gateway is reachable from the cluster. 
+ +**Ensuring SeaweedFS is used (not MinIO)** +To force the stack to use SeaweedFS instead of MinIO: + +1. **Config:** In `cluster-config.yaml` set `storage.objectStore.type: "seaweedfs"` and `storage.objectStore.endpoint` to your SeaweedFS S3 URL with **port 8333** (e.g. `http://3.144.157.201:8333`). MinIO uses port 9000; using 8333 avoids pointing at MinIO by mistake. +2. **Preflight:** When you run the install script, preflight prints `Object storage: external S3-compatible (seaweedfs)` and `SeaweedFS endpoint: ...`. If the endpoint shows `:9000`, the script warns you to use `:8333` for SeaweedFS. +3. **After install:** Confirm the AIPlatform CR uses SeaweedFS: + ```bash + kubectl -n ai-platform get aiplatform -o yaml | grep -A6 objectStorage + ``` + You should see `path: seaweedfs://` and `endpoint: "http://...:8333"`. The secret name remains `minio-credentials` (used for any S3-compatible store). + +**Secure MinIO credentials (recommended)** +The script reads MinIO credentials in this order: **environment variables first**, then config file. Prefer not storing passwords in `cluster-config.yaml` (e.g. to avoid committing secrets to Git). + +| Approach | How | When to use | +|----------|-----|-------------| +| **Environment variables** | Export before running the script: `export MINIO_ROOT_USER=minioadmin` and `export MINIO_ROOT_PASSWORD=''`. You can leave `storage.objectStore.auth.rootUser` / `rootPassword` empty or omit them in config; env takes precedence. | Local runs, CI/CD (set secrets in pipeline), one-off setups. | +| **Config file only** | Set `storage.objectStore.auth.rootUser` and `storage.objectStore.auth.rootPassword` in `cluster-config.yaml`. | Quick testing only; avoid if the file is in version control. | +| **Pre-created Kubernetes Secret** | Create the secret yourself (e.g. from Vault or AWS Secrets Manager) in the AI platform namespace as `minio-credentials` with keys `s3_access_key` and `s3_secret_key`. 
Note that when an external S3-compatible store is configured, the script's credentials step requires a password from env/config and applies (overwrites) `minio-credentials` with those values, so export matching `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD` before running; for stricter control, use a separate flow that only references the existing secret. | GitOps, when you already have a secrets pipeline. | +| **External secret manager** | Store credentials in AWS Secrets Manager, HashiCorp Vault, or similar. Before running the script, fetch the secret and set `MINIO_ROOT_USER` and `MINIO_ROOT_PASSWORD` (e.g. via a wrapper or CI step). Do not put the password in config. | Production; keeps secrets out of config and Git. | + +Example (MinIO credentials from environment only; no secrets in config): + +```bash +export MINIO_ROOT_USER=minioadmin +export MINIO_ROOT_PASSWORD='your-secure-password' +CONFIG_FILE=./cluster-config.yaml ./eks_cluster_with_stack.sh install +``` **Idempotency and existing VPC** - The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). Set `cluster.useExisting: true` to require an existing cluster (script fails if the cluster is not found). @@ -562,8 +597,8 @@ storage: | `cluster.region` | AWS region | ✅ **REQUIRED:** Change to your region | | `cluster.useExisting` | Use existing cluster only (do not create) | ⚙️ Set `true` to skip cluster creation; script fails if cluster not found | | `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs to use existing VPC | -| `storage.s3Bucket` | S3 bucket for AI artifacts (used when MinIO is disabled) | ✅ **REQUIRED** if not using MinIO | -| `storage.minio` | MinIO (in-cluster or external) | ⚙️ `enabled: true`; for EC2 set `external: true` and `endpoint: "http://:9000"` | +| `storage.s3Bucket` | S3 bucket for AI artifacts (used when `objectStore.type` is aws) | ✅ **REQUIRED** if not using MinIO/SeaweedFS | +| `storage.objectStore` | Object store: `type` (aws \| s3compat \| minio \| seaweedfs), `bucket`, `endpoint`, `auth`. Default type is `aws` when unset.
External only (no in-cluster install). | ⚙️ Required for s3compat/minio/seaweedfs: set `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). | | `images.registry` | Container registry URL | ✅ **REQUIRED:** Your ECR/Docker registry | | `images.*` | All container images | ✅ **REQUIRED:** Configure all image paths | | `nodeGroups.cpu` | CPU node group settings | ⚙️ Optional: adjust size/type | @@ -2244,7 +2279,7 @@ the model is loaded from object storage (S3/MinIO) into that path inside the pod 2. **External MinIO reachable from EKS** If using external MinIO (e.g. EC2), ensure: - - `storage.minio.endpoint` in `cluster-config.yaml` is correct (e.g. `http://:9000`). + - `storage.objectStore.endpoint` in `cluster-config.yaml` is correct (e.g. `http://:9000`). - The EC2 security group allows **inbound TCP 9000** from your EKS node security group or VPC CIDR (see `install_minio_ec2.sh` output). - From a Ray worker pod: `kubectl exec -it -n -- curl -s -o /dev/null -w "%{http_code}" http:///minio/health/live` diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md index 18668d5..bb1adfc 100644 --- a/tools/cluster_setup/K0S_README.md +++ b/tools/cluster_setup/K0S_README.md @@ -390,7 +390,7 @@ The script installs everything needed for the AI Platform: 1. **k0s Kubernetes Cluster** (v1.30+) - CNCF certified Kubernetes 2. **Calico CNI** - High-performance networking with VXLAN -3. **MinIO** - S3-compatible object storage (replaces AWS S3) +3. **MinIO** - S3-compatible object storage (replaces AWS S3). The AI Platform also supports SeaweedFS and other S3-compatible stores via `s3compat://`, `minio://`, or `seaweedfs://`; see [Object storage](../../docs/configuration/object-storage.md) for path schemes and configuration. 4. **Cert-Manager** - Automated certificate management 5. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana 6. 
**OpenTelemetry Operator** - Distributed tracing and telemetry diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index f3d49f3..891f170 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -61,25 +61,23 @@ nodeGroups: volumeType: "gp3" # EBS volume type # ---------- Storage Configuration ---------- +# Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # CHANGE THIS: Globally unique S3 bucket name (used when minio.enabled is false) + s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size - # Optional: MinIO (S3-compatible object storage). Use in-cluster or external (e.g. EC2). - minio: - enabled: true # Set true to use MinIO for object storage - external: true # true = MinIO runs outside cluster (e.g. EC2); set endpoint below - endpoint: "http://13.59.216.105:9000" # When external=true: e.g. "http://10.0.1.50:9000" (EC2 private IP or hostname) - namespace: "minio" # Namespace for in-cluster MinIO (ignored when external=true) - bucket: "ai-platform-bucket-minio-us-east-2" # Bucket name (must exist on MinIO) - replicas: 1 # In-cluster only: number of MinIO replicas - persistence: - size: "150Gi" - storageClass: "" + # Object store: aws (S3) or external S3-compatible (s3compat, minio, seaweedfs). No in-cluster install. + # - MinIO: endpoint port 9000 (e.g. http://host:9000) + # - SeaweedFS S3: endpoint port 8333 (e.g. 
http://host:8333); start SeaweedFS with AWS_ACCESS_KEY_ID/SECRET matching auth below + objectStore: + type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + endpoint: "http://seaweedfs.example.com:8333" # CHANGE THIS: your SeaweedFS S3 endpoint (port 8333). For MinIO use port 9000. auth: rootUser: "minioadmin" - rootPassword: "minioadmin" # Leave empty for in-cluster auto-generate; required for external + rootPassword: "" # Do not commit secrets: export MINIO_ROOT_PASSWORD instead (env overrides this value). Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root # ---------- Container Images Configuration ---------- images: @@ -117,7 +115,7 @@ images: # Option 2: Full path (ignores registry prefix) # image: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" - image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.1" + image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.3" # Splunk Enterprise Images splunk: diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 2d4f100..fe3a033 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -55,17 +55,21 @@ load_config() { S3_BUCKET="$(yq eval '.storage.s3Bucket' "$cfg")" STORAGE_CLASS="$(yq eval '.storage.storageClass' "$cfg")" VECTORDB_SIZE="$(yq eval '.storage.vectorDbSize' "$cfg")" - # MinIO (optional S3-compatible object storage) - MINIO_ENABLED="$(yq eval '.storage.minio.enabled // false' "$cfg")" - MINIO_EXTERNAL="$(yq eval '.storage.minio.external // false' "$cfg")" - MINIO_ENDPOINT="$(yq eval '.storage.minio.endpoint // ""' "$cfg")" - MINIO_NS="$(yq eval '.storage.minio.namespace // "minio"' "$cfg")" - MINIO_BUCKET="$(yq eval '.storage.minio.bucket // "ai-platform"' "$cfg")" - MINIO_REPLICAS="$(yq eval '.storage.minio.replicas // 1' "$cfg")" - MINIO_PVC_SIZE="$(yq eval '.storage.minio.persistence.size // "100Gi"' "$cfg")" - MINIO_PVC_STORAGE_CLASS="$(yq eval
'.storage.minio.persistence.storageClass // ""' "$cfg")" - MINIO_ROOT_USER="$(yq eval '.storage.minio.auth.rootUser // "minioadmin"' "$cfg")" - MINIO_ROOT_PASSWORD="$(yq eval '.storage.minio.auth.rootPassword // ""' "$cfg")" + # Object storage: objectStore.type (aws | s3compat | minio | seaweedfs); default aws when unset + OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "aws"' "$cfg")" + OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // .storage.s3Bucket // "ai-platform"' "$cfg")" + OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$cfg")" + OBJ_STORE_NS="$(yq eval '.storage.objectStore.namespace // "minio"' "$cfg")" + _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$cfg")" + _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$cfg")" + # External S3-compatible only (no in-cluster MinIO install). True when type is s3compat, minio, or seaweedfs. + USE_EXTERNAL_OBJ_STORE="false" + case "${OBJ_STORE_TYPE}" in s3compat|minio|seaweedfs) USE_EXTERNAL_OBJ_STORE="true"; esac + MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" + MINIO_NS="${OBJ_STORE_NS}" + MINIO_BUCKET="${OBJ_STORE_BUCKET}" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" # AI Platform AI_NS="$(yq eval '.aiPlatform.namespace' "$cfg")" @@ -136,16 +140,16 @@ load_config() { USE_EXISTING_CLUSTER="false" PRESERVE_VPC_ON_DELETE="false" S3_BUCKET="$(grep 's3Bucket:' "$cfg" | sed 's/.*s3Bucket: *"\(.*\)".*/\1/')" - MINIO_ENABLED="false" - MINIO_EXTERNAL="false" + OBJ_STORE_TYPE="" + OBJ_STORE_BUCKET="${S3_BUCKET}" + OBJ_STORE_ENDPOINT="" + OBJ_STORE_NS="minio" + USE_EXTERNAL_OBJ_STORE="false" MINIO_ENDPOINT="" MINIO_NS="minio" MINIO_BUCKET="ai-platform" - MINIO_REPLICAS="1" - MINIO_PVC_SIZE="150Gi" - MINIO_PVC_STORAGE_CLASS="" - MINIO_ROOT_USER="minioadmin" - MINIO_ROOT_PASSWORD="AAnwWE2sLfFduYTpPy4v7PcyczSHGrVM" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" + 
MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" AI_NS="$(grep 'namespace:' "$cfg" | grep -A2 'aiPlatform:' | tail -1 | sed 's/.*namespace: *"\(.*\)".*/\1/')" AI_PLATFORM_NAME="splunk-ai-stack" AI_STANDALONE_NAME="splunk-standalone" @@ -1162,69 +1166,22 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } -# ---------- MinIO (optional S3-compatible object storage) ---------- -install_minio() { - if [[ "${MINIO_ENABLED}" != "true" ]]; then - log "MinIO is disabled (storage.minio.enabled != true); skipping." +# ---------- External S3-compatible object storage (credentials only; no in-cluster install) ---------- +ensure_s3compat_credentials() { + # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs). + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then return 0 fi - # External MinIO (e.g. on EC2): only create credentials secret; no in-cluster install - if [[ "${MINIO_EXTERNAL}" == "true" ]]; then - log "Using external MinIO (storage.minio.external=true); skipping in-cluster install." - if [[ -z "${MINIO_ENDPOINT}" ]]; then - warn "storage.minio.endpoint is empty; set it to the MinIO URL (e.g. http://:9000) for AIPlatform to use external MinIO." - fi - if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then - err "External MinIO requires storage.minio.auth.rootPassword to be set (same as on the MinIO server)." 
- return 1 - fi - ensure_namespace "${AI_NS}" - local secret_name="minio-credentials" - kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ - --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ - --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ - --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ - --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ - --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ - --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - log "✓ External MinIO credentials secret ${AI_NS}/${secret_name} ready" - return 0 + log "Object store type is ${OBJ_STORE_TYPE}; creating credentials secret for external S3-compatible storage." + if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 fi - - log "Installing MinIO in ${MINIO_NS}..." - ensure_namespace "${MINIO_NS}" - - # Auto-generate root password if not set - local minio_password="${MINIO_ROOT_PASSWORD}" - if [[ -z "$minio_password" ]]; then - minio_password="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" - MINIO_ROOT_PASSWORD="$minio_password" - log "Generated MinIO root password (saved for secret creation)" + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "External S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + return 1 fi - - helm repo add bitnami https://charts.bitnami.com/bitnami - helm repo update - - local helm_args=( - --namespace "${MINIO_NS}" - --set auth.rootUser="${MINIO_ROOT_USER}" - --set auth.rootPassword="${MINIO_ROOT_PASSWORD}" - --set defaultBuckets="${MINIO_BUCKET}" - --set persistence.size="${MINIO_PVC_SIZE}" - --set replicas="${MINIO_REPLICAS}" - ) - [[ -n "${MINIO_PVC_STORAGE_CLASS}" ]] && helm_args+=(--set persistence.storageClass="${MINIO_PVC_STORAGE_CLASS}") - - 
helm_retry 5 upgrade --install minio bitnami/minio "${helm_args[@]}" --wait --timeout 10m - - # Wait for MinIO deployment to be ready - local minio_deploy="minio" - kubectl -n "${MINIO_NS}" rollout status deployment/"${minio_deploy}" --timeout=300s 2>/dev/null || true - - # Create credentials secret in AI platform namespace for AIPlatform CR (objectStorage.secretRef). - # SAIA and pkg/storage expect s3_access_key/s3_secret_key; models/SAIA expect MINIO_ACCESS_KEY/MINIO_SECRET_KEY. ensure_namespace "${AI_NS}" local secret_name="minio-credentials" kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ @@ -1235,34 +1192,7 @@ install_minio() { --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - - # Create prefix "folders" in MinIO bucket (artifacts/, apps/, tasks/) via placeholder objects - log "Creating MinIO bucket prefixes (artifacts/, apps/, tasks/)..." - cat </dev/null || true - - log "✓ MinIO installed; bucket=${MINIO_BUCKET}; credentials secret ${AI_NS}/${secret_name}" + log "✓ External S3-compatible credentials secret ${AI_NS}/${secret_name} ready" } # ---------- OTEL Operator + contrib collector (idempotent) ---------- @@ -1653,17 +1583,17 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # IRSA for Splunk Standalone: S3 bucket policy when using S3, ECR-only when using MinIO + # IRSA for Splunk Standalone: S3 bucket policy when using AWS S3, ECR-only when using external S3-compatible log "Setting up IRSA for Splunk Standalone service account..." 
local policy_arn - if [[ "${MINIO_ENABLED}" == "true" ]]; then + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then policy_arn="$(ensure_ecr_only_policy)" else policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" fi ensure_irsa_for_sa "${STANDALONE_SA}" "${AI_NS}" "${policy_arn}" - if [[ "${MINIO_ENABLED}" != "true" ]]; then + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then # Create s3-secret for Standalone when using S3 (fallback if IRSA not fully supported) log "Creating s3-secret for Splunk Standalone (S3 mode)..." if resolve_aws_creds_for_secret 2>/dev/null; then @@ -1702,10 +1632,9 @@ data: sslPassword: password YAML - # Standalone app repo: MinIO (S3-compatible) when storage.minio.enabled=true, else S3 - if [[ "${MINIO_ENABLED}" == "true" ]]; then - local minio_endpoint="${MINIO_ENDPOINT}" - [[ -z "$minio_endpoint" ]] && minio_endpoint="http://minio.${MINIO_NS}.svc.cluster.local:9000" + # Standalone app repo: external S3-compatible when objectStore.type is s3compat/minio/seaweedfs, else S3 + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + local minio_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" cat </dev/null || echo "m5.4xlarge") GPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.gpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "g5.2xlarge") - # MinIO configuration - MINIO_ACCESS_KEY=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin") - MINIO_SECRET_KEY=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") + # MinIO configuration: prefer environment variables (secure); fall back to config + _minio_ak=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin") + _minio_sk=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") + MINIO_ACCESS_KEY="${MINIO_ACCESS_KEY:-$_minio_ak}" + MINIO_SECRET_KEY="${MINIO_SECRET_KEY:-$_minio_sk}" MINIO_BUCKET=$(yq eval '.minio.bucket' "${CONFIG_FILE}" 2>/dev/null || echo 
"ai-platform-data") # Kubernetes namespace From c9b8de3b3441bf01d84dd299a517efe2ece8345c Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Feb 2026 22:39:13 +0530 Subject: [PATCH 03/55] changes for s3 compatable storage in operator --- api/v1/aiplatform_types.go | 16 ++- config/configs/applications.yaml | 65 ++++++++++ .../crd/bases/ai.splunk.com_aiplatforms.yaml | 29 +++-- .../crds/ai.splunk.com_aiplatforms.yaml | 29 +++-- .../templates/deployment.yaml | 2 +- internal/webhook/v1/aiplatform_webhook.go | 15 +-- internal/webhook/v1/aiservice_webhook.go | 4 +- pkg/ai/features/saia/impl.go | 10 +- pkg/ai/raybuilder/builder.go | 25 ++-- pkg/storage/minio.go | 33 +---- pkg/storage/s3compat.go | 47 +++++++ pkg/storage/storageclient.go | 14 +- pkg/storage/storageclient_test.go | 72 ++++++++++- .../README.md | 122 ++++++++++++++++-- .../upload_splunk_app_to_seaweedfs.sh | 46 +++++++ tools/cluster_setup/artifacts.yaml | 8 +- 16 files changed, 434 insertions(+), 103 deletions(-) create mode 100644 pkg/storage/s3compat.go create mode 100755 tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh diff --git a/api/v1/aiplatform_types.go b/api/v1/aiplatform_types.go index 344c6ce..3d5ba37 100644 --- a/api/v1/aiplatform_types.go +++ b/api/v1/aiplatform_types.go @@ -364,13 +364,13 @@ type SidecarSpec struct { // ObjectStorageSpec defines object storage configuration for AI artifacts, tasks, and models type ObjectStorageSpec struct { // Remote volume URI in the format s3://bucketname/, gs://bucketname/, - // azure://containername/, or minio://bucketname/ + // azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// // +kubebuilder:validation:Required - // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$` + // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$` Path string `json:"path"` - // Optional override endpoint (only needed for 
S3-compatible services like MinIO) - // Must be a valid HTTP/HTTPS URL + // Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + // Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) // +kubebuilder:validation:Optional // +kubebuilder:validation:Pattern=`^https?://.*$` Endpoint string `json:"endpoint,omitempty"` @@ -380,11 +380,17 @@ type ObjectStorageSpec struct { // +kubebuilder:validation:MinLength=1 Region string `json:"region"` - // Secret name containing storage credentials + // Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible backends) // +kubebuilder:validation:Optional // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 SecretRef string `json:"secretRef,omitempty"` + + // Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + // Values: aws, minio, seaweedfs, s3compat, gcs, azure + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Enum=aws;minio;seaweedfs;s3compat;gcs;azure + Provider string `json:"provider,omitempty"` } // IngressSpec defines Ingress configuration for external access to platform services diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index fe8f28d..91999de 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -7,7 +7,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: entrypoint ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -57,7 +62,12 @@ applications: API_VERSION: "v1" 
APPLICATION_NAME: uae_large ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -101,7 +111,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -145,7 +160,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: bi_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -185,7 +205,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: mbart_translator ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -236,7 +261,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + 
S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -256,7 +286,12 @@ applications: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -301,7 +336,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -372,7 +412,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: llama31_instruct ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -424,7 +469,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" 
CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -504,7 +554,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: llama31_70b_instruct_awq ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -545,7 +600,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -571,7 +631,12 @@ applications: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" + MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" + MINIO_SECRET_KEY: "{{.MinioSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/config/crd/bases/ai.splunk.com_aiplatforms.yaml b/config/crd/bases/ai.splunk.com_aiplatforms.yaml index 98675dc..25bf11b 100644 --- a/config/crd/bases/ai.splunk.com_aiplatforms.yaml +++ b/config/crd/bases/ai.splunk.com_aiplatforms.yaml @@ -2222,20 +2222,33 @@ spec: 
type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models - Supported providers: S3, GCS, Azure Blob Storage, MinIO + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + Supported: AWS S3, MinIO, SeaweedFS, any S3-compatible (s3:// + endpoint), GCS, Azure Blob. + Backend is selected by path scheme; when endpoint is set with s3://, backend is S3-compatible. properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS). + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible. pattern: ^https?://.*$ type: string path: description: |- - Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + Remote volume URI: s3://bucket/prefix, gs://bucket/prefix, azure://container/prefix, + minio://bucket/prefix, or seaweedfs://bucket/prefix + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2256,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. 
s3_access_key, s3_secret_key for S3-compatible) maxLength: 253 minLength: 1 type: string diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml index 98675dc..25bf11b 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml @@ -2222,20 +2222,33 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models - Supported providers: S3, GCS, Azure Blob Storage, MinIO + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + Supported: AWS S3, MinIO, SeaweedFS, any S3-compatible (s3:// + endpoint), GCS, Azure Blob. + Backend is selected by path scheme; when endpoint is set with s3://, backend is S3-compatible. properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS). + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible. pattern: ^https?://.*$ type: string path: description: |- - Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + Remote volume URI: s3://bucket/prefix, gs://bucket/prefix, azure://container/prefix, + minio://bucket/prefix, or seaweedfs://bucket/prefix + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. 
+ Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2256,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible) maxLength: 253 minLength: 1 type: string diff --git a/helm-chart/splunk-ai-operator/templates/deployment.yaml b/helm-chart/splunk-ai-operator/templates/deployment.yaml index 579e800..34ed56a 100644 --- a/helm-chart/splunk-ai-operator/templates/deployment.yaml +++ b/helm-chart/splunk-ai-operator/templates/deployment.yaml @@ -40,7 +40,7 @@ spec: {{- toYaml .Values.securityContext | nindent 8 }} containers: - name: manager - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}" + image: "{{ if .Values.image.digest }}{{ .Values.image.repository }}@{{ .Values.image.digest }}{{ else }}{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}{{ end }}" imagePullPolicy: {{ .Values.image.pullPolicy }} args: - --metrics-bind-address=:8443 diff --git a/internal/webhook/v1/aiplatform_webhook.go b/internal/webhook/v1/aiplatform_webhook.go index 7e8ceb2..674471f 100644 --- a/internal/webhook/v1/aiplatform_webhook.go +++ b/internal/webhook/v1/aiplatform_webhook.go @@ -195,14 +195,7 @@ func (v *AIPlatformCustomValidator) ValidateUpdate(ctx context.Context, oldObj, warnings = append(warnings, createWarnings...) 
} - // Validate immutable fields - if oldPlatform.Spec.ObjectStorage.Path != aiplatform.Spec.ObjectStorage.Path { - allErrs = append(allErrs, field.Forbidden( - field.NewPath("spec").Child("objectStorage").Child("path"), - "objectStorage.path is immutable", - )) - } - + // Validate immutable fields (path is mutable to allow switching storage backends, e.g. MinIO to SeaweedFS) if oldPlatform.Spec.ObjectStorage.Region != aiplatform.Spec.ObjectStorage.Region { allErrs = append(allErrs, field.Forbidden( field.NewPath("spec").Child("objectStorage").Child("region"), @@ -237,8 +230,8 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec if objStorage.Path == "" { allErrs = append(allErrs, field.Required(fldPath.Child("path"), "objectStorage.path must be specified")) } else { - // Validate path format (s3://, gs://, azure://, minio://) - validPrefixes := []string{"s3://", "gs://", "azure://", "minio://"} + // Validate path format (s3://, gs://, azure://, s3compat://, minio://, seaweedfs://) + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(objStorage.Path, prefix) { @@ -250,7 +243,7 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec allErrs = append(allErrs, field.Invalid( fldPath.Child("path"), objStorage.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } } diff --git a/internal/webhook/v1/aiservice_webhook.go b/internal/webhook/v1/aiservice_webhook.go index 69a0f46..7d81d77 100644 --- a/internal/webhook/v1/aiservice_webhook.go +++ b/internal/webhook/v1/aiservice_webhook.go @@ -275,7 +275,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto } else { // Validate path format /* - validPrefixes := []string{"s3://", "gs://", "azure://", 
"minio://"} + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(taskVolume.Path, prefix) { @@ -287,7 +287,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto allErrs = append(allErrs, field.Invalid( fldPath.Child("path"), taskVolume.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } */ diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index b3106b1..9a0fa29 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -619,8 +619,8 @@ func (r *SaiaReconciler) reconcileSAIADeployment( {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, } - // MinIO support: Add MinIO-specific environment variables if endpoint is configured - if strings.HasPrefix(ai.Spec.TaskVolume.Path, "minio") && ai.Spec.TaskVolume.Endpoint != "" { + // MinIO/S3-compatible: SAIA service expects MINIO_ENDPOINT_URL when using custom endpoint (MinIO or S3-compatible) + if ai.Spec.TaskVolume.Endpoint != "" { env = append(env, corev1.EnvVar{Name: "MINIO_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) } @@ -932,14 +932,16 @@ func (r *SaiaReconciler) createOrUpdateConfigMap( } // extractBucketName extracts the bucket name from an object storage path. -// Supports s3://, minio://, gs://, and azure:// prefixes. +// Supports s3://, s3compat://, minio://, seaweedfs://, gs://, and azure:// prefixes. 
// Examples: // - "s3://my-bucket/path/to/dir" -> "my-bucket" +// - "s3compat://bucket-name" -> "bucket-name" // - "minio://bucket-name" -> "bucket-name" +// - "seaweedfs://my-bucket/prefix" -> "my-bucket" // - "gs://my-bucket" -> "my-bucket" func extractBucketName(path string) string { // Remove supported prefixes - prefixes := []string{"s3://", "minio://", "gs://", "azure://"} + prefixes := []string{"s3://", "s3compat://", "minio://", "seaweedfs://", "gs://", "azure://"} for _, prefix := range prefixes { if strings.HasPrefix(path, prefix) { path = strings.TrimPrefix(path, prefix) diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 79dd5c1..f9c64a0 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -94,15 +94,20 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } // Set CloudProvider and artifacts provider/bucket from URL scheme (for SDK model loaders). - // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio -> "s3", gs/gcs -> "gcs", azure -> "azure". + // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio/seaweedfs/s3compat -> "s3", gs/gcs -> "gcs", azure -> "azure". + // S3 (AWS) uses cloudProvider "aws" when no custom endpoint; s3compat/minio/seaweedfs use "s3compat". 
var cloudProvider, artifactsProvider string switch u.Scheme { case "s3": - cloudProvider = "aws" + if p.Spec.ObjectStorage.Endpoint != "" { + cloudProvider = "s3compat" + } else { + cloudProvider = "aws" + } + artifactsProvider = "s3" + case "s3compat", "minio", "seaweedfs": + cloudProvider = "s3compat" artifactsProvider = "s3" - case "minio": - cloudProvider = "minio" - artifactsProvider = "s3" // MinIO is S3-compatible; SDK uses s3 client case "gs", "gcs": cloudProvider = "gcp" artifactsProvider = "gcs" @@ -149,17 +154,19 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + // S3-compatible backends (s3compat, MinIO, SeaweedFS) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. + s3CompatScheme := (u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs") minioEndpoint := "" - if u.Scheme == "minio" && p.Spec.ObjectStorage.Endpoint != "" { + if s3CompatScheme && p.Spec.ObjectStorage.Endpoint != "" { minioEndpoint = p.Spec.ObjectStorage.Endpoint } var minioAccessKey, minioSecretKey string - if u.Scheme == "minio" && p.Spec.ObjectStorage.SecretRef != "" { + if p.Spec.ObjectStorage.SecretRef != "" && s3CompatScheme { var secret corev1.Secret secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} if err := b.Get(ctx, secretRef, &secret); err != nil { - logger.Error(err, "Failed to get object storage secret for MinIO credentials", "secret", p.Spec.ObjectStorage.SecretRef) + logger.Error(err, "Failed to get object storage secret for S3-compatible credentials", "secret", p.Spec.ObjectStorage.SecretRef) return err } if raw, ok := secret.Data["s3_access_key"]; ok { @@ -859,7 +866,7 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec ulimit -n 65536; export PATH="/home/ray/anaconda3/bin:$PATH"; KUBERAY_GEN_RAY_START_CMD=$(echo $KUBERAY_GEN_RAY_START_CMD | sed -e 's/"{/{/g' -e 's/}"/}/g' -e 's/\\\"/"/g'); - 
$KUBERAY_GEN_RAY_START_CMD;`, cfg.Tier) + $KUBERAY_GEN_RAY_START_CMD`, cfg.Tier) spec := corev1.PodSpec{ Affinity: b.ai.Spec.GPUSchedulingSpec.Affinity, Tolerations: b.ai.Spec.GPUSchedulingSpec.Tolerations, diff --git a/pkg/storage/minio.go b/pkg/storage/minio.go index f55a4ba..d8a2abd 100644 --- a/pkg/storage/minio.go +++ b/pkg/storage/minio.go @@ -3,44 +3,17 @@ package storage import ( "context" - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/s3" ai "github.com/splunk/splunk-ai-operator/api/v1" - corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) +// NewMinioClient creates a StorageClient for MinIO (S3-compatible). It delegates to NewS3CompatibleClient. +// Deprecated: Prefer NewS3CompatibleClient for MinIO, SeaweedFS, or any S3-compatible backend. func NewMinioClient( ctx context.Context, k8sClient client.Client, namespace, bucket, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { - awsCfg := &aws.Config{ - Endpoint: aws.String(vs.Endpoint), - Region: aws.String(vs.Region), - S3ForcePathStyle: aws.Bool(true), - } - if vs.SecretRef != "" { - secret := &corev1.Secret{} - if err := k8sClient.Get(ctx, - client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, - secret, - ); err != nil { - return nil, err - } - awsCfg.Credentials = credentials.NewStaticCredentials( - string(secret.Data["s3_access_key"]), - string(secret.Data["s3_secret_key"]), - "", - ) - } - // no SecretRef → AWS SDK default chain (IRSA, env, etc) - sess, err := session.NewSession(awsCfg) - if err != nil { - return nil, err - } - return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil + return NewS3CompatibleClient(ctx, k8sClient, namespace, bucket, prefix, vs) } diff --git a/pkg/storage/s3compat.go b/pkg/storage/s3compat.go new file mode 100644 index 0000000..b50a735 --- /dev/null +++ b/pkg/storage/s3compat.go @@ -0,0 +1,47 @@ 
+package storage + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + ai "github.com/splunk/splunk-ai-operator/api/v1" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// NewS3CompatibleClient creates a StorageClient for any S3-compatible backend (MinIO, SeaweedFS, etc.). +// Endpoint must be set on vs; credentials come from vs.SecretRef (s3_access_key, s3_secret_key) if set. +func NewS3CompatibleClient( + ctx context.Context, + k8sClient client.Client, + namespace, bucket, prefix string, + vs ai.ObjectStorageSpec, +) (StorageClient, error) { + awsCfg := &aws.Config{ + Endpoint: aws.String(vs.Endpoint), + Region: aws.String(vs.Region), + S3ForcePathStyle: aws.Bool(true), + } + if vs.SecretRef != "" { + secret := &corev1.Secret{} + if err := k8sClient.Get(ctx, + client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, + secret, + ); err != nil { + return nil, err + } + awsCfg.Credentials = credentials.NewStaticCredentials( + string(secret.Data["s3_access_key"]), + string(secret.Data["s3_secret_key"]), + "", + ) + } + sess, err := session.NewSession(awsCfg) + if err != nil { + return nil, err + } + return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil +} diff --git a/pkg/storage/storageclient.go b/pkg/storage/storageclient.go index 1935616..9f73b95 100644 --- a/pkg/storage/storageclient.go +++ b/pkg/storage/storageclient.go @@ -57,13 +57,21 @@ func NewStorageClient( return nil, fmt.Errorf("invalid volume URI %q: Azure path must include container name (e.g. azure://container-name/prefix). Without it, model deployments fail with 'Please specify a container name'", vs.Path) } return NewAzureClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "s3compat": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3-compatible path must include bucket name (e.g. 
s3compat://bucket-name/prefix)", vs.Path) + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "minio": if u.Host == "" { return nil, fmt.Errorf("invalid volume URI %q: MinIO path must include bucket name (e.g. minio://bucket-name/prefix)", vs.Path) } - // everything after "//" is host (bucket) and path. We treat u.Host as bucket, - // vs.Endpoint *must* be set to our MinIO URL for this case. - return NewMinioClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "seaweedfs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: SeaweedFS path must include bucket name (e.g. seaweedfs://bucket-name/prefix)", vs.Path) + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "fixture": // fixture:// is a special scheme for testing purposes, using a fake client. // It does not require any credentials or endpoint. diff --git a/pkg/storage/storageclient_test.go b/pkg/storage/storageclient_test.go index e395b4d..87742d2 100644 --- a/pkg/storage/storageclient_test.go +++ b/pkg/storage/storageclient_test.go @@ -75,12 +75,52 @@ func TestNewStorageClient(t *testing.T) { }, }, { - name: "MinIO storage", + name: "MinIO storage (S3-compatible)", volumeSpec: ai.ObjectStorageSpec{ Path: "minio://my-bucket/prefix", Endpoint: "http://minio.default.svc:9000", + Region: "us-east-1", }, - wantType: "minio", + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3-compatible storage (generic s3compat scheme)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat://my-bucket/prefix", + Endpoint: "http://s3compat.default.svc:9000", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS storage 
(S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "seaweedfs://my-bucket/prefix", + Endpoint: "http://seaweedfs.default.svc:8333", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 with custom endpoint (S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3://my-bucket/prefix", + Endpoint: "http://custom-s3.example.com:9000", + Region: "us-east-1", + }, + wantType: "s3", wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -91,7 +131,7 @@ func TestNewStorageClient(t *testing.T) { volumeSpec: ai.ObjectStorageSpec{ Path: "fixture://test-bucket/prefix", }, - wantType: "fixture", + wantType: "s3", // fixtureClient.GetProvider() returns "s3" for artifact compatibility wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -139,6 +179,30 @@ func TestNewStorageClient(t *testing.T) { return fake.NewClientBuilder().WithScheme(s) }, }, + { + name: "S3-compatible path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat:///prefix", + Endpoint: "http://s3compat:9000", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "seaweedfs:///prefix", + Endpoint: "http://seaweedfs:8333", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, } for _, tt := range tests { @@ -156,7 +220,7 @@ func TestNewStorageClient(t *testing.T) { // Verify provider matches expected type provider := client.GetProvider() - assert.NotEmpty(t, provider) + assert.Equal(t, tt.wantType, provider, "GetProvider() should match wantType") // Verify bucket/container 
is extracted bucket := client.GetBucket() diff --git a/tools/artifacts_download_upload_scripts/README.md b/tools/artifacts_download_upload_scripts/README.md index 98a5ce6..e8b7c3d 100755 --- a/tools/artifacts_download_upload_scripts/README.md +++ b/tools/artifacts_download_upload_scripts/README.md @@ -70,13 +70,14 @@ sudo ./download_from_huggingface.sh - Script returns non-zero exit code on failure (suitable for CI/CD pipelines) ### 2. `upload_to_minio.sh` -Uploads downloaded artifacts to MinIO storage. +Uploads downloaded artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). **Features:** - Automatically uploads **all artifacts** from `./model_artifacts/` directory - No config file needed - just uploads everything found - **Auto-creates bucket** if it doesn't exist - Uses native MinIO Client (mc) for optimal performance +- Works with **MinIO, SeaweedFS, or any S3-compatible** backend; set endpoint and credentials to match your store. - Comprehensive dependency installation: - MinIO Client via **Homebrew on macOS** or **direct download on Linux** - Supports macOS (Intel & Apple Silicon) and Linux (amd64 & arm64) @@ -92,16 +93,109 @@ Or with sudo if dependency installation fails: sudo ./upload_to_minio.sh ``` +**Environment variables (S3-compatible target):** +Preferred generic names; `MINIO_*` are accepted for backward compatibility. + +| Preferred (generic) | Fallback | Description | +|---------------------|----------|-------------| +| `OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. 
http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | +| `OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | +| `OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | +| `OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | + +Example for SeaweedFS: `OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` + **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing MinIO Client (mc) -- Configure MinIO settings in the script or use environment variables: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: personal) - - `MINIO_ROOT_USER` (default: minioadmin) - - `MINIO_ROOT_PASSWORD` (default: minioadmin) +- Set endpoint, bucket, and credentials via the env vars above (defaults point to a local MinIO). + +### 3. `upload_to_seaweedfs.sh` +Uploads downloaded artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running at the endpoint, the script can **install and start it** (downloads the `weed` binary from GitHub releases, no Docker). If you run SeaweedFS via **systemd** (see **§4 `install_seaweedfs_systemd.sh`** below), ensure the service is up (`sudo systemctl start seaweedfs`) before running the upload script so the script doesn’t start a second instance. + +**Features:** +- **Auto-install SeaweedFS** when not reachable: downloads latest `weed` for Linux/macOS (amd64/arm64), installs to `/usr/local/bin` or `~/.local/bin`, and starts `weed server -s3` in the background (S3 gateway on port 8333). +- Auto-install only runs when the endpoint is local (`127.0.0.1` or `localhost`). For remote endpoints, SeaweedFS must already be running. +- Creates configured buckets (from `SEAWEEDFS_BUCKETS` or primary bucket), then uploads all of `./model_artifacts/` to the primary bucket. +- Uses MinIO Client (mc); installs mc if missing. 
+ +**Usage:** +```bash +./upload_to_seaweedfs.sh +``` + +With a remote SeaweedFS: +```bash +OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_seaweedfs.sh +``` + +To skip auto-install and only fail if unreachable: +```bash +SEAWEEDFS_SKIP_INSTALL=1 ./upload_to_seaweedfs.sh +``` + +**Volume limit:** When the script starts SeaweedFS it uses `-volume.max=100` (set `SEAWEEDFS_VOLUME_MAX`; use `0` for auto). The SeaweedFS built-in default (~7), which applies when the flag is not passed, can cause "0 node candidates" once the volume server is "full." + +**Environment variables:** `OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:8333), `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY`, `SEAWEEDFS_BUCKETS`, `SEAWEEDFS_SKIP_INSTALL`, `SEAWEEDFS_UPLOAD_RETRIES`, `SEAWEEDFS_UPLOAD_RETRY_DELAY`, `SEAWEEDFS_PARALLEL_JOBS`, `SEAWEEDFS_ERROR_LOG`, `SEAWEEDFS_SKIP_EXISTING`, `SEAWEEDFS_WAIT_VOLUME_SERVER`, `SEAWEEDFS_MASTER`, `SEAWEEDFS_VOLUME_MAX` (default 100). -### 3. `upload_to_minio_aws.sh` +**SeaweedFS credentials:** SeaweedFS S3 has no built-in users (unlike MinIO’s default `minioadmin`). If you start SeaweedFS yourself, it must be configured to accept the same access key/secret the script uses (defaults: `minioadmin`/`minioadmin`). Options: (1) Start with env vars: `AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin weed server -s3`; (2) Use a JSON config file with `weed s3 -config=/path/to/s3.json` (see [SeaweedFS S3 Credentials](https://github.com/seaweedfs/seaweedfs/wiki/S3-Credentials)). If you see *"The access key ID you provided does not exist in our records"*, restart SeaweedFS with the same credentials as `OBJECT_STORE_ACCESS_KEY`/`OBJECT_STORE_SECRET_KEY` (or set those env vars to match your SeaweedFS config). + +**Volume server readiness:** After SeaweedFS has just started (or restarted), the master may not see a volume server yet, so uploads can fail with "Not enough data nodes found".
The script can **wait for a volume server** (when endpoint is local and `weed` is available): it polls `weed shell -master=... cluster.ps` for up to `SEAWEEDFS_WAIT_VOLUME_SERVER` seconds (default 60) before starting uploads. Set `SEAWEEDFS_WAIT_VOLUME_SERVER=0` to skip. + +**Parallel uploads and error log:** Uploads run in parallel (up to `SEAWEEDFS_PARALLEL_JOBS` at a time, default 3). Directory artifacts are uploaded **file-by-file** with per-file retries, so one failed file (e.g. a single `.safetensors` shard) only retries that file, not the whole artifact. Failed files/artifacts are appended to `SEAWEEDFS_ERROR_LOG` (default `./seaweedfs_upload_errors.log`) with artifact id and relative path; at the end the script prints that file and exits with code 1 if any failed. + +**Large artifacts (e.g. LLaMA 70B):** Uploads of very large files (multi-GB `.safetensors` shards) can fail with *"We encountered an internal error, please try again"*. The script retries each failed file up to `SEAWEEDFS_UPLOAD_RETRIES` (default 3) with `SEAWEEDFS_UPLOAD_RETRY_DELAY` seconds between attempts. If failures persist, check SeaweedFS host memory and disk (`/tmp/seaweedfs.log` or volume server logs), ensure enough free space for the full object, and consider increasing retries: `SEAWEEDFS_UPLOAD_RETRIES=5 SEAWEEDFS_UPLOAD_RETRY_DELAY=30 ./upload_to_seaweedfs.sh`. + +**"0 node candidates" / "Not enough data nodes":** Usually caused by the volume server hitting its max volume count (SeaweedFS default ~7), a nearly full disk (volumes become read-only), heartbeat timeouts, or OOM. The script and systemd unit use `-volume.max=100` by default. When the error happens: `curl -s http://localhost:9333/cluster/status | jq` (master view); `curl -s http://127.0.0.1:8080/status | jq` (volume server; if Max==Count, increase `SEAWEEDFS_VOLUME_MAX`). See `tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md` for full troubleshooting.
+ +**Prerequisites:** +- Run `download_from_huggingface.sh` first to download artifacts +- For auto-install: curl, tar; optional sudo for `/usr/local/bin` +- No Docker required + +**Create standard folders:** To create the platform folders (`apps/`, `artifacts/`, `config/`, `job_groups/`, `model_artifacts/`, `tasks/`) in SeaweedFS, run `./create_seaweedfs_folders.sh` after SeaweedFS is up. It uses the same endpoint and credentials as `upload_to_seaweedfs.sh`. + +**Upload Splunk AI Assistant app:** To upload `Splunk_AI_Assistant_Cloud.tgz` to `bucket/apps/`, run `./upload_splunk_app_to_seaweedfs.sh`. Put the .tgz in the current directory or set `SPLUNK_APP_LOCAL_PATH=/path/to/Splunk_AI_Assistant_Cloud.tgz`. Same endpoint/credentials as above. + +### 4. `install_seaweedfs_systemd.sh` +Installs SeaweedFS as a **systemd service** so it starts on boot and restarts on failure. Run this on the host where SeaweedFS should run (e.g. EC2), after the `weed` binary is installed. + +**Features:** +- Copies `seaweedfs.service` from this directory into `/etc/systemd/system/` +- Enables and starts the `seaweedfs` service (master, volume, filer, S3 gateway) +- Service runs as `ec2-user` (configurable in the unit file); data directory is `/home/ec2-user/data` by default +- Handles SELinux: on Enforcing systems, labels `/usr/local/bin/weed` so the service can execute it +- Requires the `weed` binary at `/usr/local/bin/weed` (install it first via `upload_to_seaweedfs.sh` or manually from [SeaweedFS releases](https://github.com/seaweedfs/seaweedfs/releases)) + +**Usage:** +```bash +# 1. Install weed first (e.g. run upload_to_seaweedfs.sh once, or download weed and put it in /usr/local/bin) +# 2. 
Then install the systemd service (requires sudo) +sudo ./install_seaweedfs_systemd.sh +``` + +**Prerequisites:** +- `weed` at `/usr/local/bin/weed` (run `./upload_to_seaweedfs.sh` once to auto-install it, or download and extract from GitHub releases) +- Run the script as root: `sudo ./install_seaweedfs_systemd.sh` +- The `seaweedfs.service` unit file must be in the same directory as the script + +**After install:** +- **Status:** `sudo systemctl status seaweedfs` +- **Logs:** `journalctl -u seaweedfs -f` +- **Stop:** `sudo systemctl stop seaweedfs` +- **Restart:** `sudo systemctl restart seaweedfs` +- **S3 endpoint:** http://127.0.0.1:8333 (default credentials: minioadmin/minioadmin) +- **Data directory:** `/home/ec2-user/data` (edit the unit file or use a drop-in to change) + +**Unit file details (`seaweedfs.service`):** +- `ExecStart`: `/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100` +- `Restart=on-failure`, `RestartSec=5` +- S3 credentials are set via `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in the unit (default minioadmin/minioadmin); override with `/etc/default/seaweedfs` or a systemd drop-in if needed +- To use a different user or data dir, copy the unit to a drop-in or edit `/etc/systemd/system/seaweedfs.service` after install + +**Troubleshooting:** If the service fails to start, check `sudo systemctl status seaweedfs` and `journalctl -u seaweedfs -n 50`. Ensure `/home/ec2-user/data` exists and is writable by `ec2-user`, and that `/usr/local/bin/weed` is executable. On SELinux systems, the script runs `chcon -t bin_t /usr/local/bin/weed` to allow execution. + +### 5. `upload_to_minio_aws.sh` Uploads downloaded artifacts to MinIO using AWS CLI (S3-compatible API). **Features:** @@ -139,7 +233,7 @@ sudo ./upload_to_minio_aws.sh - Use this if you already have AWS CLI installed - Use `upload_to_minio.sh` for better MinIO native support -### 4. `upload_to_s3.sh` +### 6. 
`upload_to_s3.sh` Uploads downloaded artifacts to AWS S3 storage. **Features:** @@ -181,7 +275,7 @@ sudo S3_BUCKET=your-bucket-name ./upload_to_s3.sh - Set `S3_BUCKET` environment variable - Optional: Set `S3_REGION` (default: us-east-1) and `S3_PREFIX` (default: model_artifacts) -### 5. `test_minio_connection.sh` +### 7. `test_minio_connection.sh` Diagnostic script to test MinIO connectivity and troubleshoot issues. **Features:** @@ -333,12 +427,12 @@ All artifacts in the list will be downloaded and uploaded automatically. ### For Download Script: - No additional environment variables needed (reads from `model_artifacts_configs.yaml`) -### For MinIO Upload Script (using mc): +### For MinIO / S3-compatible Upload Script (using mc, `upload_to_minio.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: personal) -- `MINIO_ROOT_USER`: MinIO access key (default: minioadmin) -- `MINIO_ROOT_PASSWORD`: MinIO secret key (default: minioadmin) +- Works with MinIO, SeaweedFS, or any S3-compatible backend. 
+- **Preferred (generic):** `OBJECT_STORE_ENDPOINT`, `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD` (or `MINIO_ACCESS_KEY`/`MINIO_SECRET_KEY`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-bucket-minio-us-east-2, minioadmin/minioadmin ### For MinIO Upload Script (using AWS CLI): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` diff --git a/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh new file mode 100755 index 0000000..eafda79 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Upload Splunk_AI_Assistant_Cloud.tgz to SeaweedFS at bucket/apps/Splunk_AI_Assistant_Cloud.tgz. +# Uses the same OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh and create_seaweedfs_folders.sh. + +set -e + +APP_FILENAME="${SPLUNK_APP_FILENAME:-Splunk_AI_Assistant_Cloud.tgz}" +LOCAL_PATH="${SPLUNK_APP_LOCAL_PATH:-./${APP_FILENAME}}" + +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if [[ ! 
-f "$LOCAL_PATH" ]]; then + echo "Error: App file not found: $LOCAL_PATH" + echo "Set SPLUNK_APP_LOCAL_PATH to the path of Splunk_AI_Assistant_Cloud.tgz, or put the file in the current directory." + exit 1 +fi + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +if ! command -v mc &>/dev/null; then + echo "MinIO Client (mc) is required. Install it or run create_seaweedfs_folders.sh first (it installs mc)." + exit 1 +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +DEST="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" +echo "Uploading ${LOCAL_PATH} to ${DEST}..." +mc cp "$LOCAL_PATH" "$DEST" +echo "Done. App is at ${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 39d6c5d..c44a36e 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -2237,8 +2237,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. 
Required for @@ -4873,8 +4873,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for From f529ea8d80bfe4a2676fd1094d37ca7a3cbac9a5 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Thu, 26 Feb 2026 12:46:36 +0530 Subject: [PATCH 04/55] vulnerability issue: version upgrade for opentelemetry-go from v1.33.0 to 1.40.0 --- config/configs/applications.yaml | 78 +++++++++---------- .../crd/bases/ai.splunk.com_aiplatforms.yaml | 18 ++--- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 91999de..1cebe04 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -10,9 +10,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -65,9 +65,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: 
"{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -114,9 +114,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -163,9 +163,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -208,9 +208,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -264,9 +264,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - 
MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -289,9 +289,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -339,9 +339,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -415,9 +415,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -472,9 +472,9 @@ applications: 
S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -557,9 +557,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -603,9 +603,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -634,9 +634,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - MINIO_ENDPOINT_URL: "{{.MinioEndpointUrl}}" - MINIO_ACCESS_KEY: "{{.MinioAccessKey}}" - MINIO_SECRET_KEY: "{{.MinioSecretKey}}" + OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" + OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" + 
OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/config/crd/bases/ai.splunk.com_aiplatforms.yaml b/config/crd/bases/ai.splunk.com_aiplatforms.yaml index 25bf11b..67fc505 100644 --- a/config/crd/bases/ai.splunk.com_aiplatforms.yaml +++ b/config/crd/bases/ai.splunk.com_aiplatforms.yaml @@ -2222,25 +2222,24 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. - Supported: AWS S3, MinIO, SeaweedFS, any S3-compatible (s3:// + endpoint), GCS, Azure Blob. - Backend is selected by path scheme; when endpoint is set with s3://, backend is S3-compatible. + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models + Supported providers: S3, GCS, Azure Blob Storage, MinIO, SeaweedFS, and other S3-compatible backends properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS). - Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible. + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- - Remote volume URI: s3://bucket/prefix, gs://bucket/prefix, azure://container/prefix, - minio://bucket/prefix, or seaweedfs://bucket/prefix + Remote volume URI in the format s3://bucketname/, gs://bucketname/, + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string provider: description: |- - Optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Provider is an optional hint for documentation and tooling. 
Operator derives behavior from path scheme and endpoint. Values: aws, minio, seaweedfs, s3compat, gcs, azure enum: - aws @@ -2256,7 +2255,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible) + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string From 718a31cacf09851a37bdb0ebb0b46650fce9b904 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Thu, 26 Feb 2026 13:28:55 +0530 Subject: [PATCH 05/55] s3object storage changes --- config/configs/applications.yaml | 78 +++++++++---------- .../crd/bases/ai.splunk.com_aiservices.yaml | 23 ++++-- docs/configuration/object-storage.md | 2 +- docs/troubleshooting.md | 36 ++++----- pkg/ai/features/saia/impl.go | 10 +-- pkg/ai/raybuilder/builder.go | 50 ++++++------ .../README.md | 49 ++++++------ tools/cluster_setup/EKS_README.md | 8 +- 8 files changed, 134 insertions(+), 122 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 1cebe04..22ecbfb 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -10,9 +10,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -65,9 +65,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: 
"{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -114,9 +114,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -163,9 +163,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -208,9 +208,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: 
"{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -264,9 +264,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -289,9 +289,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -339,9 +339,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + 
S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -415,9 +415,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -472,9 +472,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -557,9 +557,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: 
"{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -603,9 +603,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -634,9 +634,9 @@ applications: S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" - OBJECT_STORE_ENDPOINT_URL: "{{.ObjectStoreEndpointUrl}}" - OBJECT_STORE_ACCESS_KEY: "{{.ObjectStoreAccessKey}}" - OBJECT_STORE_SECRET_KEY: "{{.ObjectStoreSecretKey}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/config/crd/bases/ai.splunk.com_aiservices.yaml b/config/crd/bases/ai.splunk.com_aiservices.yaml index f9c3493..5bce496 100644 --- a/config/crd/bases/ai.splunk.com_aiservices.yaml +++ b/config/crd/bases/ai.splunk.com_aiservices.yaml @@ -1818,15 +1818,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid 
HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -1834,7 +1846,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/docs/configuration/object-storage.md b/docs/configuration/object-storage.md index 873632b..70a1f7a 100644 --- a/docs/configuration/object-storage.md +++ b/docs/configuration/object-storage.md @@ -99,7 +99,7 @@ spec: secretRef: minio-credentials ``` -The same Kubernetes secret format is used for all S3-compatible backends: keys `s3_access_key` and `s3_secret_key`. Pods receive these as `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and `MINIO_ENDPOINT_URL` (when endpoint is set). +The same Kubernetes secret format is used for all S3-compatible backends: keys `s3_access_key` and `s3_secret_key`. Pods receive **`S3COMPAT_OBJECT_STORE_ENDPOINT_URL`** (when endpoint is set), **`S3COMPAT_OBJECT_STORE_ACCESS_KEY`**, and **`S3COMPAT_OBJECT_STORE_SECRET_KEY`** from the operator. 
## Adding new S3-compatible backends diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 4f85bdb..57854d5 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -213,16 +213,16 @@ the model loader is trying to use a **local path** where the model should have b 1. **Model not in object storage** The prefix `model_artifacts/llama31-8b-instruct` must exist in your bucket with a full Hugging Face–style layout (including `config.json` and weight files). - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` - - Upload to MinIO: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` as in the [artifacts README](../tools/artifacts_download_upload_scripts/README.md)). + - Upload to MinIO/S3-compatible: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials as in the [artifacts README](../tools/artifacts_download_upload_scripts/README.md); `MINIO_*` env vars are also accepted). 2. **Ray workers cannot reach MinIO/S3** - For **external MinIO** (e.g. EC2): ensure the MinIO endpoint in `cluster-config.yaml` (`storage.minio.endpoint`) is reachable from EKS (security groups, VPC, and if using a public IP, that nodes can egress to it). - From a Ray worker pod: - `kubectl exec -it -n -- env | grep -E 'MINIO|ARTIFACTS|S3'` - then test connectivity (e.g. curl to the MinIO endpoint or use the same client the SDK uses). + `kubectl exec -it -n -- env | grep -E 'OBJECT_STORE|ARTIFACTS|S3'` + then test connectivity (e.g. curl to the object store endpoint or use the same client the SDK uses). 3. **Wrong or missing credentials** - AIPlatform must have `objectStorage.secretRef` pointing to a secret with `s3_access_key` and `s3_secret_key` (and the operator passes these as `MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY` to Ray). 
Verify the secret exists and matches the MinIO/S3 account that can read the bucket: + AIPlatform must have `objectStorage.secretRef` pointing to a secret with `s3_access_key` and `s3_secret_key` (the operator passes these as `S3COMPAT_OBJECT_STORE_ACCESS_KEY` / `S3COMPAT_OBJECT_STORE_SECRET_KEY` to Ray). Verify the secret exists and matches the S3-compatible account that can read the bucket: - `kubectl get secret minio-credentials -n -o jsonpath='{.data}'` 4. **Bucket/prefix mismatch** @@ -230,7 +230,7 @@ the model loader is trying to use a **local path** where the model should have b **Quick checks:** -- List objects in MinIO for the model prefix (from a host with `mc` or AWS CLI configured for MinIO): +- List objects in the object store for the model prefix (from a host with `mc` or AWS CLI configured): - `mc ls myminio//model_artifacts/llama31-8b-instruct/` You should see at least `config.json` and the model weight files. - From a Ray worker pod, confirm env vars and that the path is writable: @@ -239,7 +239,7 @@ the model loader is trying to use a **local path** where the model should have b **Full reset when the deployment keeps failing (e.g. Llama31Instruct / LLMDeploymentL40S):** -If the model is correct in MinIO and credentials are in the serve config but the replica still fails with "Invalid repository ID or local directory", clear the artifact cache and restart Ray so replicas run a fresh download and load. +If the model is correct in object storage and credentials are in the serve config but the replica still fails with "Invalid repository ID or local directory", clear the artifact cache and restart Ray so replicas run a fresh download and load. 1. **Clear the artifact cache on all workers** Either remove only the failing model prefix or the entire `model_artifacts` tree (more thorough): @@ -258,7 +258,7 @@ If the model is correct in MinIO and credentials are in the serve config but the done ``` -2. 
**Restart worker pods** so new replicas run and download from MinIO: +2. **Restart worker pods** so new replicas run and download from object storage: ```bash kubectl delete pods -n "$AI_NS" -l ray.io/node-type=worker @@ -280,9 +280,9 @@ If the model is correct in MinIO and credentials are in the serve config but the kubectl exec -n "$AI_NS" "$WORKER" -c ray-worker -- sh -c 'ls /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct/*.safetensors 2>/dev/null || echo "No safetensors"' ``` -### MinIO credentials and serve config verification +### Object store credentials and serve config verification -When using MinIO, the operator injects credentials from the object storage secret into the Ray Serve config so replicas can download model artifacts. Use these steps to verify the secret and that the updated serve config is applied. +When using S3-compatible object storage (MinIO, SeaweedFS, etc.), the operator injects credentials from the object storage secret into the Ray Serve config so replicas can download model artifacts. Use these steps to verify the secret and that the updated serve config is applied. **1. Check that the AIPlatform object storage secret exists and has the required keys** @@ -313,7 +313,7 @@ kubectl -n create secret generic \ **2. Reconcile or restart the operator with the new image** -After updating the operator image (with the change that injects MinIO credentials into the serve config), either trigger a reconcile or restart the operator so it rewrites `RayService.spec.serveConfigV2`. +After updating the operator image (with the change that injects object store credentials into the serve config), either trigger a reconcile or restart the operator so it rewrites `RayService.spec.serveConfigV2`. 
- **Option A – Restart the operator deployment** (simplest; causes one reconcile when the pod comes back): @@ -332,28 +332,28 @@ After updating the operator image (with the change that injects MinIO credential The operator will reconcile and regenerate the RayService; ensure the operator is already running the new image before doing this. -**3. Confirm RayService.spec.serveConfigV2 includes MINIO_ACCESS_KEY and MINIO_SECRET_KEY** +**3. Confirm RayService.spec.serveConfigV2 includes S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY** -The serve config is a JSON string in `RayService.spec.serveConfigV2`. Check that it contains the MinIO env vars for the apps (e.g. after the operator has reconciled). +The serve config is a JSON string in `RayService.spec.serveConfigV2`. Check that it contains the object store env vars for the apps (e.g. after the operator has reconciled). ```bash # Set your AIPlatform namespace and RayService name (often the same as AIPlatform name, e.g. 
splunk-ai-stack) NAMESPACE="" RAY_SERVICE_NAME="" -# Count occurrences of MINIO_ACCESS_KEY in the serve config (expect > 0 when using MinIO) -kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | jq -Rs 'split("MINIO_ACCESS_KEY") | length - 1' +# Count occurrences of S3COMPAT_OBJECT_STORE_ACCESS_KEY in the serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | jq -Rs 'split("S3COMPAT_OBJECT_STORE_ACCESS_KEY") | length - 1' # Show a snippet to confirm the keys are present (WARNING: this prints the credential values in plain text; do not share the output) -kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"MINIO_ACCESS_KEY"[^,]*' | head -1 -kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"MINIO_SECRET_KEY"[^,]*' | head -1 +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_ACCESS_KEY"[^,]*' | head -1 +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_SECRET_KEY"[^,]*' | head -1 ``` If the count is 0, the operator may not be using the new image, or `objectStorage.secretRef` may be unset. Ensure: -- The AIPlatform has `spec.objectStorage.path` with scheme `minio://...` and `spec.objectStorage.secretRef` set to the secret name. +- The AIPlatform has `spec.objectStorage.path` with scheme `s3compat://`, `minio://`, or `seaweedfs://` and `spec.objectStorage.secretRef` set to the secret name. - The secret exists in the AIPlatform namespace and contains `s3_access_key` and `s3_secret_key`. -- The operator deployment has been restarted (or reconciled) with the image that injects MinIO credentials into the applications template. 
+- The operator deployment has been restarted (or reconciled) with the image that injects object store credentials into the applications template. After confirming, restart Ray workers if needed so they pick up the new env (e.g. scale down and up the Ray cluster or wait for rolling restart), then re-check replica logs and the cache path `/home/ray/.cache/s3/artifacts/model_artifacts/...`. diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 9a0fa29..1f0138d 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -619,16 +619,16 @@ func (r *SaiaReconciler) reconcileSAIADeployment( {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, } - // MinIO/S3-compatible: SAIA service expects MINIO_ENDPOINT_URL when using custom endpoint (MinIO or S3-compatible) + // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL for custom endpoint (MinIO, SeaweedFS, etc.). if ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, corev1.EnvVar{Name: "MINIO_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) + env = append(env, corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) } - // MinIO credentials: If secretRef is provided, add MINIO_ACCESS_KEY and MINIO_SECRET_KEY from secret + // S3-compatible object store credentials from secretRef (S3COMPAT_OBJECT_STORE_ACCESS_KEY, S3COMPAT_OBJECT_STORE_SECRET_KEY). 
if ai.Spec.TaskVolume.SecretRef != "" { env = append(env, corev1.EnvVar{ - Name: "MINIO_ACCESS_KEY", + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, @@ -637,7 +637,7 @@ func (r *SaiaReconciler) reconcileSAIADeployment( }, }, corev1.EnvVar{ - Name: "MINIO_SECRET_KEY", + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index f9c64a0..b45a859 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -44,13 +44,13 @@ type Builder struct { } type ApplicationParams struct { - ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` - ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` - CloudProvider string `yaml:"CLOUD_PROVIDER"` - MinioEndpointUrl string `yaml:"MINIO_ENDPOINT_URL"` - MinioAccessKey string `yaml:"MINIO_ACCESS_KEY"` - MinioSecretKey string `yaml:"MINIO_SECRET_KEY"` - Replicas map[string]int32 `yaml:"REPLICAS"` + ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` + ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` + CloudProvider string `yaml:"CLOUD_PROVIDER"` + S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` + S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` + S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` + Replicas map[string]int32 `yaml:"REPLICAS"` } type WorkerConfigs map[string][]InstanceDetail @@ -154,14 +154,14 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // S3-compatible backends (s3compat, MinIO, SeaweedFS) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. 
+ // S3-compatible backends (s3compat, minio, seaweedfs) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. s3CompatScheme := (u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs") - minioEndpoint := "" + s3CompatObjectStoreEndpoint := "" if s3CompatScheme && p.Spec.ObjectStorage.Endpoint != "" { - minioEndpoint = p.Spec.ObjectStorage.Endpoint + s3CompatObjectStoreEndpoint = p.Spec.ObjectStorage.Endpoint } - var minioAccessKey, minioSecretKey string + var s3CompatObjectStoreAccessKey, s3CompatObjectStoreSecretKey string if p.Spec.ObjectStorage.SecretRef != "" && s3CompatScheme { var secret corev1.Secret secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} @@ -170,21 +170,21 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl return err } if raw, ok := secret.Data["s3_access_key"]; ok { - minioAccessKey = string(raw) + s3CompatObjectStoreAccessKey = string(raw) } if raw, ok := secret.Data["s3_secret_key"]; ok { - minioSecretKey = string(raw) + s3CompatObjectStoreSecretKey = string(raw) } } param := ApplicationParams{ - ArtifactBucketName: u.Host, - ArtifactsProvider: artifactsProvider, - CloudProvider: cloudProvider, - MinioEndpointUrl: minioEndpoint, - MinioAccessKey: minioAccessKey, - MinioSecretKey: minioSecretKey, - Replicas: replicasMap, + ArtifactBucketName: u.Host, + ArtifactsProvider: artifactsProvider, + CloudProvider: cloudProvider, + S3CompatObjectStoreEndpointUrl: s3CompatObjectStoreEndpoint, + S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, + S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, + Replicas: replicasMap, } // Use embedded applications.yaml content @@ -716,8 +716,8 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec }, nil } -// objectStorageSecretEnv returns env vars for MINIO_ACCESS_KEY and MINIO_SECRET_KEY from -// the objectStorage secret (s3_access_key/s3_secret_key) so 
models and SAIA can access MinIO/S3. +// objectStorageSecretEnv returns env vars for S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY from +// the objectStorage secret (s3_access_key/s3_secret_key) for S3-compatible object storage. func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { if b.ai.Spec.ObjectStorage.SecretRef == "" { return nil @@ -725,7 +725,7 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { secretName := b.ai.Spec.ObjectStorage.SecretRef return []corev1.EnvVar{ { - Name: "MINIO_ACCESS_KEY", + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, @@ -734,7 +734,7 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { }, }, { - Name: "MINIO_SECRET_KEY", + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, @@ -860,7 +860,7 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec combinedEnv = append(combinedEnv, corev1.EnvVar{Name: key, Value: value}) } } - // MinIO/S3 credentials for models and SAIA (MINIO_ACCESS_KEY, MINIO_SECRET_KEY) + // S3-compatible object store credentials for models and SAIA (S3COMPAT_OBJECT_STORE_*) combinedEnv = append(combinedEnv, b.objectStorageSecretEnv()...) rayCommand := fmt.Sprintf(`echo %s worker; ulimit -n 65536; diff --git a/tools/artifacts_download_upload_scripts/README.md b/tools/artifacts_download_upload_scripts/README.md index e8b7c3d..3f47a7f 100755 --- a/tools/artifacts_download_upload_scripts/README.md +++ b/tools/artifacts_download_upload_scripts/README.md @@ -98,12 +98,12 @@ Preferred generic names; `MINIO_*` are accepted for backward compatibility. 
| Preferred (generic) | Fallback | Description | |---------------------|----------|-------------| -| `OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | -| `OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | -| `OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | -| `OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | +| `S3COMPAT_OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | +| `S3COMPAT_OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | +| `S3COMPAT_OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | +| `S3COMPAT_OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | -Example for SeaweedFS: `OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` +Example for SeaweedFS: `S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts @@ -126,7 +126,7 @@ Uploads downloaded artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not r With a remote SeaweedFS: ```bash -OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_seaweedfs.sh +S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_seaweedfs.sh ``` To skip auto-install and only fail if unreachable: @@ -136,9 +136,9 @@ SEAWEEDFS_SKIP_INSTALL=1 ./upload_to_seaweedfs.sh **Volume limit:** When the script starts SeaweedFS it uses `-volume.max=100` (set `SEAWEEDFS_VOLUME_MAX`; use `0` for auto). The default (~7) can cause "0 node candidates" once the volume server is "full." 
-**Environment variables:** `OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:8333), `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY`, `SEAWEEDFS_BUCKETS`, `SEAWEEDFS_SKIP_INSTALL`, `SEAWEEDFS_UPLOAD_RETRIES`, `SEAWEEDFS_UPLOAD_RETRY_DELAY`, `SEAWEEDFS_PARALLEL_JOBS`, `SEAWEEDFS_ERROR_LOG`, `SEAWEEDFS_SKIP_EXISTING`, `SEAWEEDFS_WAIT_VOLUME_SERVER`, `SEAWEEDFS_MASTER`, `SEAWEEDFS_VOLUME_MAX` (default 100). +**Environment variables:** `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:8333), `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY`, `SEAWEEDFS_BUCKETS`, `SEAWEEDFS_SKIP_INSTALL`, `SEAWEEDFS_UPLOAD_RETRIES`, `SEAWEEDFS_UPLOAD_RETRY_DELAY`, `SEAWEEDFS_PARALLEL_JOBS`, `SEAWEEDFS_ERROR_LOG`, `SEAWEEDFS_SKIP_EXISTING`, `SEAWEEDFS_WAIT_VOLUME_SERVER`, `SEAWEEDFS_MASTER`, `SEAWEEDFS_VOLUME_MAX` (default 100). -**SeaweedFS credentials:** SeaweedFS S3 has no built-in users (unlike MinIO’s default `minioadmin`). If you start SeaweedFS yourself, it must be configured to accept the same access key/secret the script uses (defaults: `minioadmin`/`minioadmin`). Options: (1) Start with env vars: `AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin weed server -s3`; (2) Use a JSON config file with `weed s3 -config=/path/to/s3.json` (see [SeaweedFS S3 Credentials](https://github.com/seaweedfs/seaweedfs/wiki/S3-Credentials)). If you see *"The access key ID you provided does not exist in our records"*, restart SeaweedFS with the same credentials as `OBJECT_STORE_ACCESS_KEY`/`OBJECT_STORE_SECRET_KEY` (or set those env vars to match your SeaweedFS config). +**SeaweedFS credentials:** SeaweedFS S3 has no built-in users (unlike MinIO’s default `minioadmin`). If you start SeaweedFS yourself, it must be configured to accept the same access key/secret the script uses (defaults: `minioadmin`/`minioadmin`). 
Options: (1) Start with env vars: `AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin weed server -s3`; (2) Use a JSON config file with `weed s3 -config=/path/to/s3.json` (see [SeaweedFS S3 Credentials](https://github.com/seaweedfs/seaweedfs/wiki/S3-Credentials)). If you see *"The access key ID you provided does not exist in our records"*, restart SeaweedFS with the same credentials as `S3COMPAT_OBJECT_STORE_ACCESS_KEY`/`S3COMPAT_OBJECT_STORE_SECRET_KEY` (or set those env vars to match your SeaweedFS config). **Volume server readiness:** After SeaweedFS has just started (or restarted), the master may not see a volume server yet, so uploads can fail with "Not enough data nodes found". The script can **wait for a volume server** (when endpoint is local and `weed` is available): it polls `weed shell -master=... cluster.ps` for up to `SEAWEEDFS_WAIT_VOLUME_SERVER` seconds (default 60) before starting uploads. Set `SEAWEEDFS_WAIT_VOLUME_SERVER=0` to skip. @@ -222,11 +222,11 @@ sudo ./upload_to_minio_aws.sh **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing AWS CLI -- Configure MinIO settings in the script: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: ml-platform-artifacts) - - `MINIO_ACCESS_KEY` (default: minioadmin) - - `MINIO_SECRET_KEY` (default: minioadmin) +- Use generic env vars (MINIO_* accepted for backward compatibility): + - `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:9000) + - `S3COMPAT_OBJECT_STORE_BUCKET` (default: ai-platform-artifacts-bucket) + - `S3COMPAT_OBJECT_STORE_ACCESS_KEY` (default: minioadmin) + - `S3COMPAT_OBJECT_STORE_SECRET_KEY` (default: minioadmin) **When to use this vs `upload_to_minio.sh`:** - Use this if you prefer AWS CLI over MinIO Client (mc) @@ -276,11 +276,11 @@ sudo S3_BUCKET=your-bucket-name ./upload_to_s3.sh - Optional: Set `S3_REGION` (default: us-east-1) and `S3_PREFIX` (default: model_artifacts) 
### 7. `test_minio_connection.sh` -Diagnostic script to test MinIO connectivity and troubleshoot issues. +Diagnostic script to test S3-compatible object store connectivity (MinIO, SeaweedFS, etc.) and troubleshoot issues. **Features:** - Tests MinIO Client (mc) installation -- Verifies MinIO endpoint connectivity +- Verifies endpoint connectivity - Tests authentication with credentials - Lists all existing buckets - Tests bucket creation permissions @@ -291,9 +291,9 @@ Diagnostic script to test MinIO connectivity and troubleshoot issues. ./test_minio_connection.sh ``` -Or with custom settings: +Or with custom settings (use generic names; MINIO_* also accepted): ```bash -MINIO_ENDPOINT=http://localhost:9000 MINIO_BUCKET=nexus ./test_minio_connection.sh +S3COMPAT_OBJECT_STORE_ENDPOINT=http://localhost:9000 S3COMPAT_OBJECT_STORE_BUCKET=nexus ./test_minio_connection.sh ``` Or with sudo if dependency installation fails: @@ -307,7 +307,7 @@ sudo ./test_minio_connection.sh **When to use:** - Before running upload scripts for the first time - When bucket creation fails -- To diagnose MinIO connectivity issues +- To diagnose object store connectivity issues - To verify credentials and permissions ## Configuration @@ -430,16 +430,15 @@ All artifacts in the list will be downloaded and uploaded automatically. ### For MinIO / S3-compatible Upload Script (using mc, `upload_to_minio.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` - Works with MinIO, SeaweedFS, or any S3-compatible backend. 
-- **Preferred (generic):** `OBJECT_STORE_ENDPOINT`, `OBJECT_STORE_BUCKET`, `OBJECT_STORE_ACCESS_KEY`, `OBJECT_STORE_SECRET_KEY` +- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` - **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD` (or `MINIO_ACCESS_KEY`/`MINIO_SECRET_KEY`) -- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-bucket-minio-us-east-2, minioadmin/minioadmin +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-bucket, minioadmin/minioadmin -### For MinIO Upload Script (using AWS CLI): +### For S3-compatible Upload Script (using AWS CLI, `upload_to_minio_aws.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: ml-platform-artifacts) -- `MINIO_ACCESS_KEY`: MinIO access key (default: minioadmin) -- `MINIO_SECRET_KEY`: MinIO secret key (default: minioadmin) +- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` (or `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-artifacts-bucket, minioadmin/minioadmin ### For S3 Upload Script: - No config file needed - automatically uploads all artifacts from `./model_artifacts/` diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index 0a4e464..c65c94e 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -820,12 +820,12 @@ kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" 2>/dev/null & kubectl get secret 
"${SECRET_NAME:-minio-credentials}" -n "$AI_NS" -o jsonpath='{.data}' 2>/dev/null | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' && echo "✓ Required keys present" || echo "✗ Check s3_access_key / s3_secret_key" ``` -**5. RayService and serve config (MinIO credentials in apps)** +**5. RayService and serve config (object store credentials in apps)** ```bash kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -# Count MINIO_ACCESS_KEY in serve config (expect > 0 when using MinIO) -kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.serveConfigV2}' | grep -o 'MINIO_ACCESS_KEY' | wc -l +# Count S3COMPAT_OBJECT_STORE_ACCESS_KEY in serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.serveConfigV2}' | grep -o 'S3COMPAT_OBJECT_STORE_ACCESS_KEY' | wc -l ``` **6. Ray and application pods** @@ -2275,7 +2275,7 @@ the model is loaded from object storage (S3/MinIO) into that path inside the pod 1. **Model is in MinIO/S3** Upload the model so the bucket has the prefix `model_artifacts/llama31-8b-instruct/` with at least `config.json` and the model weights (see [artifacts README](../artifacts_download_upload_scripts/README.md)): - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` - - Upload: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `MINIO_ENDPOINT`, `MINIO_BUCKET`, and credentials to match your `cluster-config.yaml`). + - Upload: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials to match your `cluster-config.yaml`; `MINIO_*` env vars are also accepted). 2. **External MinIO reachable from EKS** If using external MinIO (e.g. 
EC2), ensure: From 6bb3a603377c600811fc734415f9ed045a856fd2 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sat, 21 Mar 2026 00:25:21 +0530 Subject: [PATCH 06/55] fix: bump splunk-operator helm dependency from 3.0.0 to 3.1.0 Version 3.0.0 does not exist in the splunk helm repo; 3.1.0 is the latest available. Also regenerates Chart.lock with correct digest. --- helm-chart/splunk-ai-operator/Chart.lock | 6 +++--- helm-chart/splunk-ai-operator/Chart.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm-chart/splunk-ai-operator/Chart.lock b/helm-chart/splunk-ai-operator/Chart.lock index 0136098..e80b049 100644 --- a/helm-chart/splunk-ai-operator/Chart.lock +++ b/helm-chart/splunk-ai-operator/Chart.lock @@ -13,6 +13,6 @@ dependencies: version: 72.4.0 - name: splunk-operator repository: https://splunk.github.io/splunk-operator - version: 3.0.0 -digest: sha256:41032e66994109109208bc66b07b6f10890c9c8dafe019aa480d73d4effe915a -generated: "2025-12-11T11:23:06.233099-08:00" + version: 3.1.0 +digest: sha256:bc5e962d5c6b465b26a13a91660d7fa45687c394e124abe2beb96e4a2e3760df +generated: "2026-03-21T00:24:00.448397+05:30" diff --git a/helm-chart/splunk-ai-operator/Chart.yaml b/helm-chart/splunk-ai-operator/Chart.yaml index 782cb8e..6101ae9 100644 --- a/helm-chart/splunk-ai-operator/Chart.yaml +++ b/helm-chart/splunk-ai-operator/Chart.yaml @@ -86,6 +86,6 @@ dependencies: # Splunk Operator - Required for managing Splunk Enterprise instances - name: splunk-operator - version: "3.0.0" + version: "3.1.0" repository: "https://splunk.github.io/splunk-operator" condition: splunk-operator.enabled From fd6727c29d99a2449f7e5717ac8cc494b56c4f13 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sat, 21 Mar 2026 23:07:45 +0530 Subject: [PATCH 07/55] fix: update Ray serve import paths to remove splunkai_models_apps prefix The splunkai_models_apps package no longer exists in ai-platform-models. 
The ray applications are now resolved relative to their working_dir zip, so import paths should be bare module names (main:SERVE_APP / main:create_serve_app). --- config/configs/applications.yaml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 22ecbfb..f809a83 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -1,6 +1,6 @@ applications: - name: Entrypoint - import_path: splunkai_models_apps.custom.deployments.entrypoint.main:SERVE_APP + import_path: main:SERVE_APP route_prefix: / runtime_env: env_vars: @@ -55,7 +55,7 @@ applications: object_storage: prefix: model_artifacts/uae-large name: UaeLarge - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /uae_large runtime_env: env_vars: @@ -104,7 +104,7 @@ applications: object_storage: prefix: model_artifacts/all-minilm-l6-v2 name: AllMinilmL6V2 - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: env_vars: @@ -153,7 +153,7 @@ applications: object_storage: prefix: model_artifacts/bi-encoder name: BiEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: env_vars: @@ -198,7 +198,7 @@ applications: object_storage: prefix: model_artifacts/mbart-translator name: MbartTranslator - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: env_vars: @@ -254,7 +254,7 @@ applications: object_storage: prefix: model_artifacts/xlm-roberta-language-classifier name: XlmRobertaLanguageClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: env_vars: @@ -279,7 +279,7 
@@ applications: custom_deployment_import_path: prompt_injection_tfidf:PromptInjectionTfidfDeployment deployment_type: custom_deployment name: PromptInjectionTfidf - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: env_vars: @@ -329,7 +329,7 @@ applications: prefix: model_artifacts/cross-encoder model_type: vllm_scoring_model name: CrossEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: env_vars: @@ -405,7 +405,7 @@ applications: - tokenizer.json prefix: model_artifacts/llama31-8b-instruct name: Llama31Instruct - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /llama31_instruct runtime_env: env_vars: @@ -462,7 +462,7 @@ applications: object_storage: prefix: model_artifacts/e5-language-classifier name: E5LanguageClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: env_vars: @@ -547,7 +547,7 @@ applications: - tokenizer.json prefix: model_artifacts/llama31-70b-instruct-awq name: Llama3170bInstructAwq - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /llama31_70b_instruct_awq runtime_env: env_vars: @@ -593,7 +593,7 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-cross-encoder-1114 model_type: sentence_transformer_cross_encoder name: PromptInjectionCrossEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: env_vars: @@ -624,7 +624,7 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-classifier-01052025 model_type: custom_model name: PromptInjectionClassifier - import_path: 
splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: env_vars: From 8d4f721ac188118c19f6a65647538694f756996a Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sat, 21 Mar 2026 23:46:20 +0530 Subject: [PATCH 08/55] fix: add working_dir to Ray serve apps and wire WorkingDirBase/ModelVersion into ApplicationParams Without working_dir, Ray has no zip to load main from and fails with 'No module named main'. Added WorkingDirBase and ModelVersion fields to ApplicationParams, computed from object storage path and MODEL_VERSION env var, and templated working_dir into all 13 app entries in applications.yaml. --- config/configs/applications.yaml | 13 ++++++++ pkg/ai/raybuilder/builder.go | 53 +++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index f809a83..5f01e69 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -3,6 +3,7 @@ applications: import_path: main:SERVE_APP route_prefix: / runtime_env: + working_dir: "{{.WorkingDirBase}}/Entrypoint-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint @@ -58,6 +59,7 @@ applications: import_path: main:create_serve_app route_prefix: /uae_large runtime_env: + working_dir: "{{.WorkingDirBase}}/UaeLarge-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large @@ -107,6 +109,7 @@ applications: import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: + working_dir: "{{.WorkingDirBase}}/AllMinilmL6V2-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 @@ -156,6 +159,7 @@ applications: import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: + working_dir: "{{.WorkingDirBase}}/BiEncoder-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder @@ -201,6 +205,7 @@ 
applications: import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: + working_dir: "{{.WorkingDirBase}}/MbartTranslator-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator @@ -257,6 +262,7 @@ applications: import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: + working_dir: "{{.WorkingDirBase}}/XlmRobertaLanguageClassifier-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier @@ -282,6 +288,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: + working_dir: "{{.WorkingDirBase}}/PromptInjectionTfidf-{{.ModelVersion}}.zip" env_vars: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" @@ -332,6 +339,7 @@ applications: import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: + working_dir: "{{.WorkingDirBase}}/CrossEncoder-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder @@ -408,6 +416,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_instruct runtime_env: + working_dir: "{{.WorkingDirBase}}/Llama31Instruct-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_instruct @@ -465,6 +474,7 @@ applications: import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: + working_dir: "{{.WorkingDirBase}}/E5LanguageClassifier-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier @@ -550,6 +560,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_70b_instruct_awq runtime_env: + working_dir: "{{.WorkingDirBase}}/Llama3170bInstructAwq-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_70b_instruct_awq @@ -596,6 +607,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: + working_dir: 
"{{.WorkingDirBase}}/PromptInjectionCrossEncoder-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder @@ -627,6 +639,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: + working_dir: "{{.WorkingDirBase}}/PromptInjectionClassifier-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index b45a859..5d795c2 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -44,13 +44,15 @@ type Builder struct { } type ApplicationParams struct { - ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` - ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` - CloudProvider string `yaml:"CLOUD_PROVIDER"` - S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` - S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` - S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` - Replicas map[string]int32 `yaml:"REPLICAS"` + ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` + ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` + CloudProvider string `yaml:"CLOUD_PROVIDER"` + S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` + S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` + S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` + Replicas map[string]int32 `yaml:"REPLICAS"` + WorkingDirBase string `yaml:"WORKING_DIR_BASE"` + ModelVersion string `yaml:"MODEL_VERSION"` } type WorkerConfigs map[string][]InstanceDetail @@ -77,6 +79,14 @@ func New(ai *enterpriseApi.AIPlatform, client client.Client, scheme *runtime.Sch } } +// effectiveAcceleratorType returns spec.defaultAcceleratorType or L40S when unset, matching instance.yaml keys (L40S, H100_NVL). 
+func (b *Builder) effectiveAcceleratorType() string { + if s := strings.TrimSpace(b.ai.Spec.DefaultAcceleratorType); s != "" { + return s + } + return "L40S" +} + // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { logger := log.FromContext(ctx) // Define logger @@ -177,6 +187,10 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + // Build working_dir base: {scheme}://{bucket}/ray-services/ai-platform/applications + // Apps append "/{AppName}-{ModelVersion}.zip" to this in the template. + workingDirBase := fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", u.Scheme, u.Host) + param := ApplicationParams{ ArtifactBucketName: u.Host, ArtifactsProvider: artifactsProvider, @@ -185,6 +199,8 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, Replicas: replicasMap, + WorkingDirBase: workingDirBase, + ModelVersion: os.Getenv("MODEL_VERSION"), } // Use embedded applications.yaml content @@ -624,6 +640,7 @@ func (b *Builder) Build(ctx context.Context) (*rayv1.RayService, error) { } func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec, error) { + acceleratorType := b.effectiveAcceleratorType() annotations, labels := buildHeadAnnotationsAndLabels(b.ai) head := rayv1.HeadGroupSpec{ RayStartParams: map[string]string{ @@ -674,7 +691,7 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec if err != nil { return nil, fmt.Errorf("failed to parse feature YAML file %s: %v", fileName, err) } - for k, val := range featureConfig.InstanceScale[b.ai.Spec.DefaultAcceleratorType] { + for k, val := range featureConfig.InstanceScale[acceleratorType] { old_val, ok := instanceScale[k] if ok { instanceScale[k] = old_val + 
val @@ -685,17 +702,23 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec } var workers []rayv1.WorkerGroupSpec - var gpuConfigs = instanceMap[b.ai.Spec.DefaultAcceleratorType] + gpuConfigs := instanceMap[acceleratorType] + if len(gpuConfigs) == 0 { + return nil, fmt.Errorf("instance.yaml has no worker tiers for defaultAcceleratorType %q; keys must match exactly (e.g. L40S, H100_NVL)", acceleratorType) + } for _, cfg := range gpuConfigs { annotations, labels := buildWorkerAnnotationsAndLabels(b.ai, cfg) cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] + replicas := instanceScale[cfg.Tier] wg := rayv1.WorkerGroupSpec{ - GroupName: cfg.Tier, - Replicas: int32Ptr(instanceScale[cfg.Tier]), + GroupName: cfg.Tier, + Replicas: int32Ptr(replicas), + MinReplicas: int32Ptr(replicas), + MaxReplicas: int32Ptr(replicas + 5), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), - "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, b.ai.Spec.DefaultAcceleratorType, cfg.GPUsPerPod), + "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -747,7 +770,7 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { headEnv := []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME } headEnv = append(headEnv, b.objectStorageSecretEnv()...) 
@@ -833,13 +856,13 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec { defaultEnv := []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, {Name: "RAY_HEAD_SERVICE_HOST", Value: fmt.Sprintf("%s.%s.svc.%s", b.ai.Name+"-head-svc", b.ai.Namespace, os.Getenv("CLUSTER_DOMAIN"))}, {Name: "SERVICE_NAME", Value: b.ai.Name}, {Name: "SERVICE_INTERNAL_NAME", Value: b.ai.Name}, {Name: "USE_SYSTEM_PERMISSIONS", Value: "true"}, {Name: "GPG_PUBLICKEY_PATH", Value: "kv-splunk/al-platform.ray-worker-sa/gpgkey"}, // FIXME - {Name: "GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, // FIXME + {Name: "GPU_TYPE", Value: b.effectiveAcceleratorType()}, // FIXME } // Combine defaultEnv with cfg.Env to create combinedEnv From be671052ece7e62b5f6f72a804de8018afa160f3 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sun, 22 Mar 2026 21:23:22 +0530 Subject: [PATCH 09/55] fix: path-style addressing for MinIO and rename object_storage to blob_storage Two bugs causing NoSuchBucket when Ray downloads working_dir zips: 1. rayS3DownloadEnv() was missing AWS_S3_ADDRESSING_STYLE=path. Boto3 defaults to virtual-hosted style (bucket.endpoint) for custom endpoints, which fails DNS resolution with MinIO. Path-style (endpoint/bucket/key) is required for all S3-compatible stores. 2. applications.yaml used 'object_storage' as the model_loader sub-field but ModelLoader in model_definition.py defines it as 'blob_storage' (renamed in commit e62d93da). Pydantic silently ignored the unknown key, leaving blob_storage=None and causing a model validation error at startup. 
--- config/configs/applications.yaml | 22 +++++----- pkg/ai/raybuilder/builder.go | 75 ++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 5f01e69..a22bff4 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -53,7 +53,7 @@ applications: tensor_parallel_size: 1 model_id: uae_large model_loader: - object_storage: + blob_storage: prefix: model_artifacts/uae-large name: UaeLarge import_path: main:create_serve_app @@ -103,7 +103,7 @@ applications: tensor_parallel_size: 1 model_id: all_minilm_l6_v2 model_loader: - object_storage: + blob_storage: prefix: model_artifacts/all-minilm-l6-v2 name: AllMinilmL6V2 import_path: main:create_serve_app @@ -153,7 +153,7 @@ applications: tensor_parallel_size: 1 model_id: bi_encoder model_loader: - object_storage: + blob_storage: prefix: model_artifacts/bi-encoder name: BiEncoder import_path: main:create_serve_app @@ -199,7 +199,7 @@ applications: model_definition: model_id: mbart_translator model_loader: - object_storage: + blob_storage: prefix: model_artifacts/mbart-translator name: MbartTranslator import_path: main:create_serve_app @@ -256,7 +256,7 @@ applications: tensor_parallel_size: 1 model_id: xlm_roberta_language_classifier model_loader: - object_storage: + blob_storage: prefix: model_artifacts/xlm-roberta-language-classifier name: XlmRobertaLanguageClassifier import_path: main:create_serve_app @@ -332,7 +332,7 @@ applications: tensor_parallel_size: 1 model_id: cross_encoder model_loader: - object_storage: + blob_storage: prefix: model_artifacts/cross-encoder model_type: vllm_scoring_model name: CrossEncoder @@ -401,12 +401,12 @@ applications: tensor_parallel_size: 4 model_id: llama31_instruct model_loader: - object_storage: + blob_storage: prefix: model_artifacts/llama31-8b-instruct tokenizer_definition: model_id: llama31_instruct model_loader: - object_storage: + 
blob_storage: artifacts_list: - config.json - tokenizer_config.json @@ -468,7 +468,7 @@ applications: tensor_parallel_size: 1 model_id: e5_language_classifier model_loader: - object_storage: + blob_storage: prefix: model_artifacts/e5-language-classifier name: E5LanguageClassifier import_path: main:create_serve_app @@ -545,12 +545,12 @@ applications: tensor_parallel_size: 8 model_id: llama31_70b_instruct_awq model_loader: - object_storage: + blob_storage: prefix: model_artifacts/llama31-70b-instruct-awq tokenizer_definition: model_id: llama31_70b_instruct_awq model_loader: - object_storage: + blob_storage: artifacts_list: - config.json - tokenizer_config.json diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 5d795c2..0a7775d 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -87,6 +87,21 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } +// rayRuntimeWorkingDirScheme maps AIPlatform object storage for runtime_env.working_dir URIs. +// Ray Serve's ServeDeploySchema only allows specific protocols (e.g. S3, HTTPS, GCS)—not minio:// or s3compat://. +// All S3 API backends (AWS, MinIO, s3compat, SeaweedFS) therefore use s3:// here; non-AWS endpoints use +// S3COMPAT_OBJECT_STORE_ENDPOINT_URL and optional keys in runtime_env. 
+func rayRuntimeWorkingDirScheme(scheme string) string { + switch strings.ToLower(scheme) { + case "s3", "s3compat", "minio", "seaweedfs": + return "s3" + case "gcs": + return "gs" + default: + return scheme + } +} + // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { logger := log.FromContext(ctx) // Define logger @@ -187,9 +202,8 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // Build working_dir base: {scheme}://{bucket}/ray-services/ai-platform/applications - // Apps append "/{AppName}-{ModelVersion}.zip" to this in the template. - workingDirBase := fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", u.Scheme, u.Host) + // Build working_dir base (always s3:// for S3 API stores—required by Ray Serve; see rayRuntimeWorkingDirScheme). + workingDirBase := fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", rayRuntimeWorkingDirScheme(u.Scheme), u.Host) param := ApplicationParams{ ArtifactBucketName: u.Host, @@ -768,11 +782,63 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { } } +// rayS3DownloadEnv sets AWS_* variables so Ray's runtime_env agent (boto3/smart_open) resolves s3:// working_dir +// URIs against the configured S3-compatible endpoint. S3COMPAT_OBJECT_STORE_* is for application code only. 
+func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { + u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) + if err != nil { + return nil + } + endpoint := strings.TrimSpace(b.ai.Spec.ObjectStorage.Endpoint) + s3CompatScheme := u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs" + s3WithCustomEndpoint := u.Scheme == "s3" && endpoint != "" + if (!s3CompatScheme && !s3WithCustomEndpoint) || endpoint == "" { + return nil + } + var out []corev1.EnvVar + out = append(out, corev1.EnvVar{Name: "AWS_ENDPOINT_URL", Value: endpoint}) + // MinIO and other S3-compatible stores require path-style addressing (endpoint/bucket/key). + // Without this, boto3 defaults to virtual-hosted style (bucket.endpoint) which fails DNS resolution. + out = append(out, corev1.EnvVar{Name: "AWS_S3_ADDRESSING_STYLE", Value: "path"}) + if r := strings.TrimSpace(b.ai.Spec.ObjectStorage.Region); r != "" { + out = append(out, + corev1.EnvVar{Name: "AWS_DEFAULT_REGION", Value: r}, + corev1.EnvVar{Name: "AWS_REGION", Value: r}, + ) + } + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return out + } + sn := b.ai.Spec.ObjectStorage.SecretRef + out = append(out, + corev1.EnvVar{ + Name: "AWS_ACCESS_KEY_ID", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_access_key", + }, + }, + }, + corev1.EnvVar{ + Name: "AWS_SECRET_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_secret_key", + }, + }, + }, + ) + return out +} + func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { headEnv := []corev1.EnvVar{ {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME } + headEnv = append(headEnv, b.rayS3DownloadEnv()...) headEnv = append(headEnv, b.objectStorageSecretEnv()...) 
spec := corev1.PodSpec{ Containers: []corev1.Container{{ @@ -883,7 +949,8 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec combinedEnv = append(combinedEnv, corev1.EnvVar{Name: key, Value: value}) } } - // S3-compatible object store credentials for models and SAIA (S3COMPAT_OBJECT_STORE_*) + // S3-compatible: boto3 for Ray runtime_env working_dir + app-level S3COMPAT_* keys + combinedEnv = append(combinedEnv, b.rayS3DownloadEnv()...) combinedEnv = append(combinedEnv, b.objectStorageSecretEnv()...) rayCommand := fmt.Sprintf(`echo %s worker; ulimit -n 65536; From b9e10d96e2f580fbbf70d06b4a2ba409a6224cb3 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sun, 22 Mar 2026 22:31:04 +0530 Subject: [PATCH 10/55] fix: use MinIO HTTP endpoint for working_dir instead of broken s3:// handler Ray's s3:// protocol handler (protocol.py _handle_s3_protocol) creates a plain boto3.Session().client('s3') with no endpoint_url, so it always hits AWS S3 regardless of AWS_ENDPOINT_URL set on the pod. This causes NoSuchBucket when the bucket only exists in MinIO. Replace rayRuntimeWorkingDirScheme() with rayWorkingDirBase() which, for S3-compatible stores with a custom endpoint, builds the working_dir as a direct HTTP URL to MinIO (endpoint/bucket/path). Ray's https handler uses urllib which simply fetches the URL without any S3-specific boto3 logic. Also remove the ineffective AWS_S3_ADDRESSING_STYLE env var added in the previous commit. 
--- pkg/ai/raybuilder/builder.go | 46 +++++++++++++++++++++---------- pkg/ai/raybuilder/builder_test.go | 30 ++++++++++++++++++++ 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 0a7775d..b4f042c 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -87,18 +87,35 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } -// rayRuntimeWorkingDirScheme maps AIPlatform object storage for runtime_env.working_dir URIs. -// Ray Serve's ServeDeploySchema only allows specific protocols (e.g. S3, HTTPS, GCS)—not minio:// or s3compat://. -// All S3 API backends (AWS, MinIO, s3compat, SeaweedFS) therefore use s3:// here; non-AWS endpoints use -// S3COMPAT_OBJECT_STORE_ENDPOINT_URL and optional keys in runtime_env. -func rayRuntimeWorkingDirScheme(scheme string) string { +// rayWorkingDirBase builds the base URL for runtime_env.working_dir zip files. +// +// Ray's S3 protocol handler (protocol.py) creates a plain boto3 client with no endpoint_url, +// so it always hits AWS S3 regardless of AWS_ENDPOINT_URL. For S3-compatible stores (MinIO, +// SeaweedFS, s3compat) we therefore use the MinIO HTTP endpoint directly as an https:// URL: +// +// https:////ray-services/ai-platform/applications +// +// Ray's https handler uses urllib which respects no special AWS config and works fine for +// publicly-accessible or pre-signed URLs. If the bucket is private, the zips must be made +// publicly readable or the MinIO endpoint must be accessible without auth (internal cluster). +// +// For plain AWS S3 (no custom endpoint) we keep s3:// so Ray uses its normal AWS credential chain. +// For GCS we use gs://. 
+func rayWorkingDirBase(scheme, bucket, endpoint string) string { + s3CompatScheme := scheme == "s3compat" || scheme == "minio" || scheme == "seaweedfs" + s3WithEndpoint := scheme == "s3" && endpoint != "" + if (s3CompatScheme || s3WithEndpoint) && endpoint != "" { + // Strip trailing slash from endpoint, then append bucket and path. + ep := strings.TrimRight(endpoint, "/") + return fmt.Sprintf("%s/%s/ray-services/ai-platform/applications", ep, bucket) + } switch strings.ToLower(scheme) { case "s3", "s3compat", "minio", "seaweedfs": - return "s3" + return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) case "gcs": - return "gs" + return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) default: - return scheme + return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) } } @@ -202,8 +219,9 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // Build working_dir base (always s3:// for S3 API stores—required by Ray Serve; see rayRuntimeWorkingDirScheme). - workingDirBase := fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", rayRuntimeWorkingDirScheme(u.Scheme), u.Host) + // Build working_dir base. For S3-compatible stores we use the MinIO HTTP endpoint directly + // (https://endpoint/bucket/path) because Ray's s3:// handler ignores AWS_ENDPOINT_URL. + workingDirBase := rayWorkingDirBase(u.Scheme, u.Host, strings.TrimSpace(p.Spec.ObjectStorage.Endpoint)) param := ApplicationParams{ ArtifactBucketName: u.Host, @@ -782,8 +800,9 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { } } -// rayS3DownloadEnv sets AWS_* variables so Ray's runtime_env agent (boto3/smart_open) resolves s3:// working_dir -// URIs against the configured S3-compatible endpoint. S3COMPAT_OBJECT_STORE_* is for application code only. +// rayS3DownloadEnv sets AWS_* variables so application code (boto3) can reach S3-compatible stores. 
+// Note: Ray's runtime_env s3:// handler ignores AWS_ENDPOINT_URL (creates a bare boto3 client with no endpoint_url), +// so working_dir uses the MinIO HTTP endpoint directly instead — see rayWorkingDirBase. func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) if err != nil { @@ -797,9 +816,6 @@ func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { } var out []corev1.EnvVar out = append(out, corev1.EnvVar{Name: "AWS_ENDPOINT_URL", Value: endpoint}) - // MinIO and other S3-compatible stores require path-style addressing (endpoint/bucket/key). - // Without this, boto3 defaults to virtual-hosted style (bucket.endpoint) which fails DNS resolution. - out = append(out, corev1.EnvVar{Name: "AWS_S3_ADDRESSING_STYLE", Value: "path"}) if r := strings.TrimSpace(b.ai.Spec.ObjectStorage.Region); r != "" { out = append(out, corev1.EnvVar{Name: "AWS_DEFAULT_REGION", Value: r}, diff --git a/pkg/ai/raybuilder/builder_test.go b/pkg/ai/raybuilder/builder_test.go index 394d700..9a983bb 100644 --- a/pkg/ai/raybuilder/builder_test.go +++ b/pkg/ai/raybuilder/builder_test.go @@ -498,3 +498,33 @@ func TestSetImageRegistry(t *testing.T) { }) } } + +func TestRayWorkingDirBase(t *testing.T) { + const bucket = "my-bucket" + const ep = "http://minio:9000" + tests := []struct { + name string + scheme string + endpoint string + want string + }{ + // S3-compatible with endpoint → use HTTP endpoint directly (Ray s3:// ignores AWS_ENDPOINT_URL) + {name: "minio with endpoint", scheme: "minio", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, + {name: "s3compat with endpoint", scheme: "s3compat", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, + {name: "seaweedfs with endpoint", scheme: "seaweedfs", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, + {name: "s3 with custom endpoint", scheme: "s3", endpoint: ep, want: ep + "/" + bucket + 
"/ray-services/ai-platform/applications"}, + // endpoint trailing slash stripped + {name: "endpoint trailing slash", scheme: "minio", endpoint: ep + "/", want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, + // Plain AWS S3 (no endpoint) → s3:// + {name: "s3 aws no endpoint", scheme: "s3", endpoint: "", want: "s3://" + bucket + "/ray-services/ai-platform/applications"}, + // GCS → gs:// + {name: "gcs", scheme: "gcs", endpoint: "", want: "gs://" + bucket + "/ray-services/ai-platform/applications"}, + // Azure → azure:// + {name: "azure", scheme: "azure", endpoint: "", want: "azure://" + bucket + "/ray-services/ai-platform/applications"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, rayWorkingDirBase(tt.scheme, bucket, tt.endpoint)) + }) + } +} From 858c14834bb99691ce738024c03e37fcb6bf39d9 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sun, 22 Mar 2026 22:41:38 +0530 Subject: [PATCH 11/55] fix: bundle app code into image via file:// working_dir instead of MinIO zips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ray's s3:// protocol handler creates a bare boto3.Session().client('s3') with no endpoint_url, so it always hits AWS S3 regardless of any custom endpoint config. Rather than fighting Ray internals, switch to file:// working_dir pointing to app source baked into the Ray image. 
- applications.yaml: replace all 'minio-zip' working_dir templates with file:///home/ray/ray/applications/entrypoint (Entrypoint) and file:///home/ray/ray/applications/generic_application (all other apps) - builder.go: remove WorkingDirBase, ModelVersion fields and rayWorkingDirBase() function — no longer needed since working_dir is a static file:// path - builder_test.go: remove TestRayWorkingDirBase test for deleted function --- config/configs/applications.yaml | 26 ++++++++++---------- pkg/ai/raybuilder/builder.go | 41 ------------------------------- pkg/ai/raybuilder/builder_test.go | 29 ---------------------- 3 files changed, 13 insertions(+), 83 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index a22bff4..8c2b266 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -3,7 +3,7 @@ applications: import_path: main:SERVE_APP route_prefix: / runtime_env: - working_dir: "{{.WorkingDirBase}}/Entrypoint-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/entrypoint" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint @@ -59,7 +59,7 @@ applications: import_path: main:create_serve_app route_prefix: /uae_large runtime_env: - working_dir: "{{.WorkingDirBase}}/UaeLarge-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large @@ -109,7 +109,7 @@ applications: import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: - working_dir: "{{.WorkingDirBase}}/AllMinilmL6V2-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 @@ -159,7 +159,7 @@ applications: import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: - working_dir: "{{.WorkingDirBase}}/BiEncoder-{{.ModelVersion}}.zip" + working_dir: 
"file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder @@ -205,7 +205,7 @@ applications: import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: - working_dir: "{{.WorkingDirBase}}/MbartTranslator-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator @@ -262,7 +262,7 @@ applications: import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: - working_dir: "{{.WorkingDirBase}}/XlmRobertaLanguageClassifier-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier @@ -288,7 +288,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: - working_dir: "{{.WorkingDirBase}}/PromptInjectionTfidf-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" @@ -339,7 +339,7 @@ applications: import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: - working_dir: "{{.WorkingDirBase}}/CrossEncoder-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder @@ -416,7 +416,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_instruct runtime_env: - working_dir: "{{.WorkingDirBase}}/Llama31Instruct-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_instruct @@ -474,7 +474,7 @@ applications: import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: - working_dir: "{{.WorkingDirBase}}/E5LanguageClassifier-{{.ModelVersion}}.zip" + working_dir: 
"file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier @@ -560,7 +560,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_70b_instruct_awq runtime_env: - working_dir: "{{.WorkingDirBase}}/Llama3170bInstructAwq-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_70b_instruct_awq @@ -607,7 +607,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: - working_dir: "{{.WorkingDirBase}}/PromptInjectionCrossEncoder-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder @@ -639,7 +639,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: - working_dir: "{{.WorkingDirBase}}/PromptInjectionClassifier-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index b4f042c..60d0602 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -51,8 +51,6 @@ type ApplicationParams struct { S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` Replicas map[string]int32 `yaml:"REPLICAS"` - WorkingDirBase string `yaml:"WORKING_DIR_BASE"` - ModelVersion string `yaml:"MODEL_VERSION"` } type WorkerConfigs map[string][]InstanceDetail @@ -87,37 +85,6 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } -// rayWorkingDirBase builds the base URL for runtime_env.working_dir zip files. 
-// -// Ray's S3 protocol handler (protocol.py) creates a plain boto3 client with no endpoint_url, -// so it always hits AWS S3 regardless of AWS_ENDPOINT_URL. For S3-compatible stores (MinIO, -// SeaweedFS, s3compat) we therefore use the MinIO HTTP endpoint directly as an https:// URL: -// -// https://<endpoint>/<bucket>/ray-services/ai-platform/applications -// -// Ray's https handler uses urllib which respects no special AWS config and works fine for -// publicly-accessible or pre-signed URLs. If the bucket is private, the zips must be made -// publicly readable or the MinIO endpoint must be accessible without auth (internal cluster). -// -// For plain AWS S3 (no custom endpoint) we keep s3:// so Ray uses its normal AWS credential chain. -// For GCS we use gs://. -func rayWorkingDirBase(scheme, bucket, endpoint string) string { - s3CompatScheme := scheme == "s3compat" || scheme == "minio" || scheme == "seaweedfs" - s3WithEndpoint := scheme == "s3" && endpoint != "" - if (s3CompatScheme || s3WithEndpoint) && endpoint != "" { - // Strip trailing slash from endpoint, then append bucket and path. - ep := strings.TrimRight(endpoint, "/") - return fmt.Sprintf("%s/%s/ray-services/ai-platform/applications", ep, bucket) - } - switch strings.ToLower(scheme) { - case "s3", "s3compat", "minio", "seaweedfs": - return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) - case "gcs": - return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) - default: - return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) - } -} // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { @@ -219,10 +186,6 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // Build working_dir base.
For S3-compatible stores we use the MinIO HTTP endpoint directly - // (https://endpoint/bucket/path) because Ray's s3:// handler ignores AWS_ENDPOINT_URL. - workingDirBase := rayWorkingDirBase(u.Scheme, u.Host, strings.TrimSpace(p.Spec.ObjectStorage.Endpoint)) - param := ApplicationParams{ ArtifactBucketName: u.Host, ArtifactsProvider: artifactsProvider, @@ -231,8 +194,6 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, Replicas: replicasMap, - WorkingDirBase: workingDirBase, - ModelVersion: os.Getenv("MODEL_VERSION"), } // Use embedded applications.yaml content @@ -801,8 +762,6 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { } // rayS3DownloadEnv sets AWS_* variables so application code (boto3) can reach S3-compatible stores. -// Note: Ray's runtime_env s3:// handler ignores AWS_ENDPOINT_URL (creates a bare boto3 client with no endpoint_url), -// so working_dir uses the MinIO HTTP endpoint directly instead — see rayWorkingDirBase. 
func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) if err != nil { diff --git a/pkg/ai/raybuilder/builder_test.go b/pkg/ai/raybuilder/builder_test.go index 9a983bb..e5a1120 100644 --- a/pkg/ai/raybuilder/builder_test.go +++ b/pkg/ai/raybuilder/builder_test.go @@ -499,32 +499,3 @@ func TestSetImageRegistry(t *testing.T) { } } -func TestRayWorkingDirBase(t *testing.T) { - const bucket = "my-bucket" - const ep = "http://minio:9000" - tests := []struct { - name string - scheme string - endpoint string - want string - }{ - // S3-compatible with endpoint → use HTTP endpoint directly (Ray s3:// ignores AWS_ENDPOINT_URL) - {name: "minio with endpoint", scheme: "minio", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, - {name: "s3compat with endpoint", scheme: "s3compat", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, - {name: "seaweedfs with endpoint", scheme: "seaweedfs", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, - {name: "s3 with custom endpoint", scheme: "s3", endpoint: ep, want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, - // endpoint trailing slash stripped - {name: "endpoint trailing slash", scheme: "minio", endpoint: ep + "/", want: ep + "/" + bucket + "/ray-services/ai-platform/applications"}, - // Plain AWS S3 (no endpoint) → s3:// - {name: "s3 aws no endpoint", scheme: "s3", endpoint: "", want: "s3://" + bucket + "/ray-services/ai-platform/applications"}, - // GCS → gs:// - {name: "gcs", scheme: "gcs", endpoint: "", want: "gs://" + bucket + "/ray-services/ai-platform/applications"}, - // Azure → azure:// - {name: "azure", scheme: "azure", endpoint: "", want: "azure://" + bucket + "/ray-services/ai-platform/applications"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, rayWorkingDirBase(tt.scheme, bucket, tt.endpoint)) - }) - } -} 
From 12a8e298514dc9b033ba767c026a8778a9c31ace Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sun, 22 Mar 2026 22:45:26 +0530 Subject: [PATCH 12/55] fix: use file:// working_dir for bundled prompt injection models, remote URL for others PromptInjectionTfidf, PromptInjectionCrossEncoder, PromptInjectionClassifier are baked into the Ray worker image at /home/ray/ray/applications/generic_application, so they use file:// working_dir with no network dependency. All other apps (UaeLarge, AllMinilmL6V2, BiEncoder, MbartTranslator, etc.) continue to use {{.WorkingDirBase}}/AppName-{{.ModelVersion}}.zip resolved at runtime from the configured object storage (s3, gs, azure, or s3compat/MinIO endpoint). --- config/configs/applications.yaml | 20 ++++++++-------- pkg/ai/raybuilder/builder.go | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 8c2b266..3994833 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -3,7 +3,7 @@ applications: import_path: main:SERVE_APP route_prefix: / runtime_env: - working_dir: "file:///home/ray/ray/applications/entrypoint" + working_dir: "{{.WorkingDirBase}}/Entrypoint-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint @@ -59,7 +59,7 @@ applications: import_path: main:create_serve_app route_prefix: /uae_large runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/UaeLarge-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large @@ -109,7 +109,7 @@ applications: import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/AllMinilmL6V2-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 @@ -159,7 +159,7 @@ 
applications: import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/BiEncoder-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder @@ -205,7 +205,7 @@ applications: import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/MbartTranslator-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator @@ -262,7 +262,7 @@ applications: import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/XlmRobertaLanguageClassifier-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier @@ -339,7 +339,7 @@ applications: import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/CrossEncoder-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder @@ -416,7 +416,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_instruct runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/Llama31Instruct-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_instruct @@ -474,7 +474,7 @@ applications: import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/E5LanguageClassifier-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier @@ -560,7 +560,7 @@ applications: import_path: 
main:create_serve_app route_prefix: /llama31_70b_instruct_awq runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "{{.WorkingDirBase}}/Llama3170bInstructAwq-{{.ModelVersion}}.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_70b_instruct_awq diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 60d0602..b4f042c 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -51,6 +51,8 @@ type ApplicationParams struct { S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` Replicas map[string]int32 `yaml:"REPLICAS"` + WorkingDirBase string `yaml:"WORKING_DIR_BASE"` + ModelVersion string `yaml:"MODEL_VERSION"` } type WorkerConfigs map[string][]InstanceDetail @@ -85,6 +87,37 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } +// rayWorkingDirBase builds the base URL for runtime_env.working_dir zip files. +// +// Ray's S3 protocol handler (protocol.py) creates a plain boto3 client with no endpoint_url, +// so it always hits AWS S3 regardless of AWS_ENDPOINT_URL. For S3-compatible stores (MinIO, +// SeaweedFS, s3compat) we therefore use the MinIO HTTP endpoint directly as an https:// URL: +// +// https://<endpoint>/<bucket>/ray-services/ai-platform/applications +// +// Ray's https handler uses urllib which respects no special AWS config and works fine for +// publicly-accessible or pre-signed URLs. If the bucket is private, the zips must be made +// publicly readable or the MinIO endpoint must be accessible without auth (internal cluster). +// +// For plain AWS S3 (no custom endpoint) we keep s3:// so Ray uses its normal AWS credential chain. +// For GCS we use gs://.
+func rayWorkingDirBase(scheme, bucket, endpoint string) string { + s3CompatScheme := scheme == "s3compat" || scheme == "minio" || scheme == "seaweedfs" + s3WithEndpoint := scheme == "s3" && endpoint != "" + if (s3CompatScheme || s3WithEndpoint) && endpoint != "" { + // Strip trailing slash from endpoint, then append bucket and path. + ep := strings.TrimRight(endpoint, "/") + return fmt.Sprintf("%s/%s/ray-services/ai-platform/applications", ep, bucket) + } + switch strings.ToLower(scheme) { + case "s3", "s3compat", "minio", "seaweedfs": + return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) + case "gcs": + return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) + default: + return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) + } +} // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { @@ -186,6 +219,10 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + // Build working_dir base. For S3-compatible stores we use the MinIO HTTP endpoint directly + // (https://endpoint/bucket/path) because Ray's s3:// handler ignores AWS_ENDPOINT_URL. 
+ workingDirBase := rayWorkingDirBase(u.Scheme, u.Host, strings.TrimSpace(p.Spec.ObjectStorage.Endpoint)) + param := ApplicationParams{ ArtifactBucketName: u.Host, ArtifactsProvider: artifactsProvider, @@ -194,6 +231,8 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, Replicas: replicasMap, + WorkingDirBase: workingDirBase, + ModelVersion: os.Getenv("MODEL_VERSION"), } // Use embedded applications.yaml content @@ -762,6 +801,8 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { } // rayS3DownloadEnv sets AWS_* variables so application code (boto3) can reach S3-compatible stores. +// Note: Ray's runtime_env s3:// handler ignores AWS_ENDPOINT_URL (creates a bare boto3 client with no endpoint_url), +// so working_dir uses the MinIO HTTP endpoint directly instead — see rayWorkingDirBase. func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) if err != nil { From 969a73789ca6f29ed917ab719c10a38499389af5 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Sun, 22 Mar 2026 23:10:43 +0530 Subject: [PATCH 13/55] fix: SAIA resource defaults and preserve AIService resources on reconcile - saia/impl.go: bump default memory request 1Gi->2Gi, limits CPU 1->2 / memory 2Gi->4Gi to prevent kubelet OOMKill during SAIA startup - reconciler.go: preserve existing AIService Resources on reconcile so user-set limits are not wiped back to defaults on every AIPlatform reconcile --- pkg/ai/features/saia/impl.go | 8 ++++---- pkg/ai/reconciler.go | 13 +++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 1f0138d..80ae6fe 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -162,17 +162,17 @@ func (r *SaiaReconciler) validateAIService( return fmt.Errorf("VectorDbUrl 
must be set (either from AIPlatformRef or explicitly)") } - // Default resources + // Default resources — SAIA API needs headroom beyond 2Gi or the kubelet OOMKills during startup. if ai.Spec.Resources.Requests == nil { ai.Spec.Resources.Requests = corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1Gi"), + corev1.ResourceMemory: resource.MustParse("2Gi"), } } if ai.Spec.Resources.Limits == nil { ai.Spec.Resources.Limits = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("1"), - corev1.ResourceMemory: resource.MustParse("2Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), } } if ai.Spec.TaskVolume.Path == "" { diff --git a/pkg/ai/reconciler.go b/pkg/ai/reconciler.go index 3230af1..9e2a803 100644 --- a/pkg/ai/reconciler.go +++ b/pkg/ai/reconciler.go @@ -131,6 +131,9 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * svc.Namespace = platform.Namespace _, err := controllerutil.CreateOrUpdate(ctx, r.Client, &svc, func() error { + // After client Get, svc holds the live AIService (empty on first create). + preservedResources := svc.Spec.Resources + // Ensure ownership if err := controllerutil.SetControllerReference(platform, &svc, r.Scheme); err != nil { return err @@ -142,6 +145,12 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * // Copy desired spec svc.Spec = built.Spec + // buildAIService does not set Resources; without this, every AIPlatform reconcile + // wipes kubectl patches / user-set limits (e.g. SAIA memory) back to empty → 2Gi defaults. 
+ if resourceRequirementsNonEmpty(preservedResources) { + svc.Spec.Resources = preservedResources + } + // Merge labels if svc.Labels == nil { svc.Labels = map[string]string{} @@ -189,6 +198,10 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * return nil } +func resourceRequirementsNonEmpty(r corev1.ResourceRequirements) bool { + return len(r.Requests) > 0 || len(r.Limits) > 0 +} + func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiApi.AIPlatform, feature aiApi.FeatureSpec, name string) *aiApi.AIService { vectorDbUrl := platform.Status.VectorDbServiceName From 2da61200f4578b3482134588a28567003bb3044b Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Mon, 23 Mar 2026 23:24:02 +0530 Subject: [PATCH 14/55] fix: point file:// working_dir to .zip file not directory Ray requires file:// working_dir URIs to point to a .zip or .whl file. Update the 3 prompt injection apps to reference generic_application.zip which is built during the Docker image build in ai-platform-models. 
--- config/configs/applications.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 3994833..61d027e 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -288,7 +288,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" @@ -607,7 +607,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder @@ -639,7 +639,7 @@ applications: import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier From 4477cb9fcdc4491edbce2d9191fde3ca98003b28 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 24 Mar 2026 00:39:32 +0530 Subject: [PATCH 15/55] fix: use file:// working_dir for all apps; use s3:// for minio working_dir base All 13 Ray Serve apps now use the generic_application.zip baked into the Ray head image via file://, eliminating the need to upload versioned zips to MinIO. Also fixes rayWorkingDirBase to return s3:// for all S3-compatible backends (minio, seaweedfs, s3compat) so AWS_ENDPOINT_URL on the pods redirects boto3 to the MinIO endpoint. 
Co-Authored-By: Claude Opus 4.6 --- config/configs/applications.yaml | 20 ++++++++-------- pkg/ai/raybuilder/builder.go | 40 +++++++++++--------------------- 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 61d027e..e06fd0a 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -3,7 +3,7 @@ applications: import_path: main:SERVE_APP route_prefix: / runtime_env: - working_dir: "{{.WorkingDirBase}}/Entrypoint-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint @@ -59,7 +59,7 @@ applications: import_path: main:create_serve_app route_prefix: /uae_large runtime_env: - working_dir: "{{.WorkingDirBase}}/UaeLarge-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large @@ -109,7 +109,7 @@ applications: import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: - working_dir: "{{.WorkingDirBase}}/AllMinilmL6V2-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 @@ -159,7 +159,7 @@ applications: import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: - working_dir: "{{.WorkingDirBase}}/BiEncoder-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder @@ -205,7 +205,7 @@ applications: import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: - working_dir: "{{.WorkingDirBase}}/MbartTranslator-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator @@ -262,7 +262,7 @@ applications: import_path: 
main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: - working_dir: "{{.WorkingDirBase}}/XlmRobertaLanguageClassifier-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier @@ -339,7 +339,7 @@ applications: import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: - working_dir: "{{.WorkingDirBase}}/CrossEncoder-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder @@ -416,7 +416,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_instruct runtime_env: - working_dir: "{{.WorkingDirBase}}/Llama31Instruct-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_instruct @@ -474,7 +474,7 @@ applications: import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: - working_dir: "{{.WorkingDirBase}}/E5LanguageClassifier-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier @@ -560,7 +560,7 @@ applications: import_path: main:create_serve_app route_prefix: /llama31_70b_instruct_awq runtime_env: - working_dir: "{{.WorkingDirBase}}/Llama3170bInstructAwq-{{.ModelVersion}}.zip" + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: llama31_70b_instruct_awq diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index b4f042c..9abef70 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -87,32 +87,20 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } -// rayWorkingDirBase builds the base URL for runtime_env.working_dir zip 
files. +// rayWorkingDirBase builds the base URI for runtime_env.working_dir application zips. // -// Ray's S3 protocol handler (protocol.py) creates a plain boto3 client with no endpoint_url, -// so it always hits AWS S3 regardless of AWS_ENDPOINT_URL. For S3-compatible stores (MinIO, -// SeaweedFS, s3compat) we therefore use the MinIO HTTP endpoint directly as an https:// URL: +// Ray's Serve config rejects plain http:// for remote working_dir URIs; allowed schemes include +// s3 and https. We always use s3:// for S3 and S3-compatible backends (AWS, MinIO, SeaweedFS, etc.). +// Ray pods receive AWS_ENDPOINT_URL plus AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (when applicable) +// from rayS3DownloadEnv; modern boto3/botocore honor AWS_ENDPOINT_URL for the S3 client used to +// fetch runtime_env packages. // -// https://<endpoint>/<bucket>/ray-services/ai-platform/applications -// -// Ray's https handler uses urllib which respects no special AWS config and works fine for -// publicly-accessible or pre-signed URLs. If the bucket is private, the zips must be made -// publicly readable or the MinIO endpoint must be accessible without auth (internal cluster). -// -// For plain AWS S3 (no custom endpoint) we keep s3:// so Ray uses its normal AWS credential chain. -// For GCS we use gs://. -func rayWorkingDirBase(scheme, bucket, endpoint string) string { - s3CompatScheme := scheme == "s3compat" || scheme == "minio" || scheme == "seaweedfs" - s3WithEndpoint := scheme == "s3" && endpoint != "" - if (s3CompatScheme || s3WithEndpoint) && endpoint != "" { - // Strip trailing slash from endpoint, then append bucket and path. - ep := strings.TrimRight(endpoint, "/") - return fmt.Sprintf("%s/%s/ray-services/ai-platform/applications", ep, bucket) - } +// For GCS we use gs:// (scheme may be gs or gcs in objectStorage.path).
+func rayWorkingDirBase(scheme, bucket string) string { switch strings.ToLower(scheme) { case "s3", "s3compat", "minio", "seaweedfs": return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) - case "gcs": + case "gs", "gcs": return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) default: return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) @@ -219,9 +207,8 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // Build working_dir base. For S3-compatible stores we use the MinIO HTTP endpoint directly - // (https://endpoint/bucket/path) because Ray's s3:// handler ignores AWS_ENDPOINT_URL. - workingDirBase := rayWorkingDirBase(u.Scheme, u.Host, strings.TrimSpace(p.Spec.ObjectStorage.Endpoint)) + // Build working_dir base (s3:// or gs://; see rayWorkingDirBase). + workingDirBase := rayWorkingDirBase(u.Scheme, u.Host) param := ApplicationParams{ ArtifactBucketName: u.Host, @@ -800,9 +787,8 @@ func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { } } -// rayS3DownloadEnv sets AWS_* variables so application code (boto3) can reach S3-compatible stores. -// Note: Ray's runtime_env s3:// handler ignores AWS_ENDPOINT_URL (creates a bare boto3 client with no endpoint_url), -// so working_dir uses the MinIO HTTP endpoint directly instead — see rayWorkingDirBase. +// rayS3DownloadEnv sets AWS_* variables so application code and Ray's runtime_env S3 fetch use the +// configured S3-compatible endpoint (via AWS_ENDPOINT_URL) and credentials when present. 
func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) if err != nil { From 11798a1cbddbfcfe14b8a1bfec2f619f935f9a3c Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 24 Mar 2026 19:03:37 +0530 Subject: [PATCH 16/55] fix: use entrypoint.zip for Entrypoint app working_dir --- config/configs/applications.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index e06fd0a..2f49dec 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -3,7 +3,7 @@ applications: import_path: main:SERVE_APP route_prefix: / runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application.zip" + working_dir: "file:///home/ray/ray/applications/entrypoint.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint From f0b9785d2f2d8a2e493a5edffb4b10503bded48b Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 24 Mar 2026 22:53:46 +0530 Subject: [PATCH 17/55] fix: rename blob_storage prefix to blob_prefix to match SDK field name --- config/configs/applications.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 2f49dec..f4d85fa 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -54,7 +54,7 @@ applications: model_id: uae_large model_loader: blob_storage: - prefix: model_artifacts/uae-large + blob_prefix: model_artifacts/uae-large name: UaeLarge import_path: main:create_serve_app route_prefix: /uae_large @@ -104,7 +104,7 @@ applications: model_id: all_minilm_l6_v2 model_loader: blob_storage: - prefix: model_artifacts/all-minilm-l6-v2 + blob_prefix: model_artifacts/all-minilm-l6-v2 name: AllMinilmL6V2 import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 @@ -154,7 +154,7 @@ applications: model_id: bi_encoder model_loader: 
blob_storage: - prefix: model_artifacts/bi-encoder + blob_prefix: model_artifacts/bi-encoder name: BiEncoder import_path: main:create_serve_app route_prefix: /bi_encoder @@ -200,7 +200,7 @@ applications: model_id: mbart_translator model_loader: blob_storage: - prefix: model_artifacts/mbart-translator + blob_prefix: model_artifacts/mbart-translator name: MbartTranslator import_path: main:create_serve_app route_prefix: /mbart_translator @@ -257,7 +257,7 @@ applications: model_id: xlm_roberta_language_classifier model_loader: blob_storage: - prefix: model_artifacts/xlm-roberta-language-classifier + blob_prefix: model_artifacts/xlm-roberta-language-classifier name: XlmRobertaLanguageClassifier import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier @@ -333,7 +333,7 @@ applications: model_id: cross_encoder model_loader: blob_storage: - prefix: model_artifacts/cross-encoder + blob_prefix: model_artifacts/cross-encoder model_type: vllm_scoring_model name: CrossEncoder import_path: main:create_serve_app @@ -402,7 +402,7 @@ applications: model_id: llama31_instruct model_loader: blob_storage: - prefix: model_artifacts/llama31-8b-instruct + blob_prefix: model_artifacts/llama31-8b-instruct tokenizer_definition: model_id: llama31_instruct model_loader: @@ -411,7 +411,7 @@ applications: - config.json - tokenizer_config.json - tokenizer.json - prefix: model_artifacts/llama31-8b-instruct + blob_prefix: model_artifacts/llama31-8b-instruct name: Llama31Instruct import_path: main:create_serve_app route_prefix: /llama31_instruct @@ -469,7 +469,7 @@ applications: model_id: e5_language_classifier model_loader: blob_storage: - prefix: model_artifacts/e5-language-classifier + blob_prefix: model_artifacts/e5-language-classifier name: E5LanguageClassifier import_path: main:create_serve_app route_prefix: /e5_language_classifier @@ -546,7 +546,7 @@ applications: model_id: llama31_70b_instruct_awq model_loader: blob_storage: - prefix: 
model_artifacts/llama31-70b-instruct-awq + blob_prefix: model_artifacts/llama31-70b-instruct-awq tokenizer_definition: model_id: llama31_70b_instruct_awq model_loader: @@ -555,7 +555,7 @@ applications: - config.json - tokenizer_config.json - tokenizer.json - prefix: model_artifacts/llama31-70b-instruct-awq + blob_prefix: model_artifacts/llama31-70b-instruct-awq name: Llama3170bInstructAwq import_path: main:create_serve_app route_prefix: /llama31_70b_instruct_awq From 2d44244b6d8ce28b189be84dcd8aa567edc68f03 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 24 Mar 2026 23:21:11 +0530 Subject: [PATCH 18/55] fix: remove task: classify from engine_args (not supported in vllm 0.15.1) --- config/configs/applications.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index f4d85fa..b9cda60 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -252,7 +252,6 @@ applications: model_config: engine_args: gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: xlm_roberta_language_classifier model_loader: @@ -464,7 +463,6 @@ applications: model_config: engine_args: gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: e5_language_classifier model_loader: From 977934eadce52cd929941e36b8ceab2997e966b3 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Mar 2026 08:38:38 +0530 Subject: [PATCH 19/55] fix: remove model_definition from MbartTranslator app config MbartTranslatorDeployment hardcodes its blob_prefix and does not accept model_definition as an init arg, so passing it via .bind() caused a Ray pickle error. 
--- config/configs/applications.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index b9cda60..0056d7f 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -196,11 +196,6 @@ applications: ray_actor_options: num_gpus: 0.2 deployment_type: custom_deployment - model_definition: - model_id: mbart_translator - model_loader: - blob_storage: - blob_prefix: model_artifacts/mbart-translator name: MbartTranslator import_path: main:create_serve_app route_prefix: /mbart_translator From 2d7c3b2cf42367e9543294055e7c6b009f689c42 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Mar 2026 13:16:21 +0530 Subject: [PATCH 20/55] feat: replace llama models with gpt-oss-20b and gpt-oss-120b Replace Llama31Instruct (8b) with GptOss20b and Llama3170bInstructAwq (70b) with GptOss120b. L40S-only, tool_parser: openai, VLLM_ATTENTION_BACKEND: TRITON_ATTN, 1 GPU for 20b and 4 GPUs for 120b. --- config/configs/applications.yaml | 123 ++++++++++++------------------ config/configs/features/saia.yaml | 4 +- 2 files changed, 49 insertions(+), 78 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 0056d7f..902dce6 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -352,68 +352,58 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - args: - application_name: Llama31Instruct + application_name: GptOss20b deployment_configs: LLMDeployment: gpu_type_options_override: - A10G: - ray_actor_options: - num_gpus: 2 - H100: - ray_actor_options: - num_gpus: 0.5 L40S: ray_actor_options: num_gpus: 1 - T4: - ray_actor_options: - num_gpus: 4 runtime_env: - pip: - - triton==3.2.0 + env_vars: + VLLM_ATTENTION_BACKEND: TRITON_ATTN options: autoscaling_config: - max_replicas: {{.Replicas.Llama31Instruct}} - min_replicas: {{.Replicas.Llama31Instruct}} + max_replicas: {{.Replicas.GptOss20b}} + 
min_replicas: {{.Replicas.GptOss20b}} deployment_type: text_gen_model_deployment gpu_types: '["L40S"]' model_definition: gpu_type_model_config_override: - A10G: - engine_args: - tensor_parallel_size: 2 - H100: - engine_args: - gpu_memory_utilization: 0.5 - tensor_parallel_size: 1 L40S: engine_args: + gpu_memory_utilization: 0.90 tensor_parallel_size: 1 - T4: - engine_args: - dtype: half - tensor_parallel_size: 4 - model_id: llama31_instruct + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_20b model_loader: blob_storage: - blob_prefix: model_artifacts/llama31-8b-instruct + blob_prefix: model_artifacts/gpt-oss-20b tokenizer_definition: - model_id: llama31_instruct + model_id: gpt_oss_20b model_loader: blob_storage: artifacts_list: + - chat_template.jinja - config.json - tokenizer_config.json - tokenizer.json - blob_prefix: model_artifacts/llama31-8b-instruct - name: Llama31Instruct + blob_prefix: model_artifacts/gpt-oss-20b + name: GptOss20b import_path: main:create_serve_app - route_prefix: /llama31_instruct + route_prefix: /gpt_oss_20b runtime_env: working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" - APPLICATION_NAME: llama31_instruct + APPLICATION_NAME: gpt_oss_20b ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" @@ -486,77 +476,58 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - args: - application_name: Llama3170bInstructAwq + application_name: GptOss120b deployment_configs: LLMDeployment: gpu_type_options_override: - A100: - ray_actor_options: - num_gpus: 4 - A10G: - ray_actor_options: - num_gpus: 4 - H100: - ray_actor_options: - num_gpus: 1 L40S: ray_actor_options: - num_gpus: 2 - T4: - ray_actor_options: - num_gpus: 8 + num_gpus: 4 runtime_env: - pip: - - triton==3.2.0 + 
env_vars: + VLLM_ATTENTION_BACKEND: TRITON_ATTN options: autoscaling_config: - max_replicas: {{.Replicas.Llama3170bInstructAwq}} - min_replicas: {{.Replicas.Llama3170bInstructAwq}} - max_ongoing_requests: 4 + max_replicas: {{.Replicas.GptOss120b}} + min_replicas: {{.Replicas.GptOss120b}} deployment_type: text_gen_model_deployment - gpu_types: '["L40S"] ' + gpu_types: '["L40S"]' model_definition: gpu_type_model_config_override: - A100: - engine_args: - tensor_parallel_size: 4 - A10G: - engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 4 - H100: - engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 1 L40S: engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 2 - T4: - engine_args: - dtype: half - tensor_parallel_size: 8 - model_id: llama31_70b_instruct_awq + gpu_memory_utilization: 0.90 + tensor_parallel_size: 4 + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_120b model_loader: blob_storage: - blob_prefix: model_artifacts/llama31-70b-instruct-awq + blob_prefix: model_artifacts/gpt-oss-120b tokenizer_definition: - model_id: llama31_70b_instruct_awq + model_id: gpt_oss_120b model_loader: blob_storage: artifacts_list: + - chat_template.jinja - config.json - tokenizer_config.json - tokenizer.json - blob_prefix: model_artifacts/llama31-70b-instruct-awq - name: Llama3170bInstructAwq + blob_prefix: model_artifacts/gpt-oss-120b + name: GptOss120b import_path: main:create_serve_app - route_prefix: /llama31_70b_instruct_awq + route_prefix: /gpt_oss_120b runtime_env: working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" - APPLICATION_NAME: llama31_70b_instruct_awq + APPLICATION_NAME: gpt_oss_120b ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" diff --git 
a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 0fec5fc..4c5dc26 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,8 +4,8 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 - Llama31Instruct: 1 - Llama3170bInstructAwq: 1 + GptOss20b: 1 + GptOss120b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 PromptInjectionCrossEncoder: 1 From 0b83c1e8cb19bb3259aa1385c1edcf00370e2850 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Mar 2026 14:35:31 +0530 Subject: [PATCH 21/55] fix: move VLLM_ATTENTION_BACKEND to top-level runtime_env to prevent env override Actor-level runtime_env.env_vars in gpu_type_options_override replaces the app-level runtime_env, causing APPLICATION_NAME and other vars to be lost. Move VLLM_ATTENTION_BACKEND: TRITON_ATTN to the top-level env_vars instead. --- config/configs/applications.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 902dce6..781101e 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -359,9 +359,6 @@ applications: L40S: ray_actor_options: num_gpus: 1 - runtime_env: - env_vars: - VLLM_ATTENTION_BACKEND: TRITON_ATTN options: autoscaling_config: max_replicas: {{.Replicas.GptOss20b}} @@ -404,6 +401,7 @@ applications: env_vars: API_VERSION: "v1" APPLICATION_NAME: gpt_oss_20b + VLLM_ATTENTION_BACKEND: TRITON_ATTN ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" @@ -483,9 +481,6 @@ applications: L40S: ray_actor_options: num_gpus: 4 - runtime_env: - env_vars: - VLLM_ATTENTION_BACKEND: TRITON_ATTN options: autoscaling_config: max_replicas: {{.Replicas.GptOss120b}} @@ -528,6 +523,7 @@ applications: env_vars: API_VERSION: "v1" APPLICATION_NAME: gpt_oss_120b + VLLM_ATTENTION_BACKEND: TRITON_ATTN ARTIFACTS_S3_BUCKET: 
"{{.ArtifactBucketName}}" S3_BUCKET: "{{.ArtifactBucketName}}" ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" From a1a13fdd31703a515919a7f0d7aed204d736784c Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Mar 2026 15:58:40 +0530 Subject: [PATCH 22/55] fix: reduce GptOss120b to 1 GPU / tensor_parallel_size 1 (quantized model) --- config/configs/applications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 781101e..663d9da 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -480,7 +480,7 @@ applications: gpu_type_options_override: L40S: ray_actor_options: - num_gpus: 4 + num_gpus: 1 options: autoscaling_config: max_replicas: {{.Replicas.GptOss120b}} @@ -492,7 +492,7 @@ applications: L40S: engine_args: gpu_memory_utilization: 0.90 - tensor_parallel_size: 4 + tensor_parallel_size: 1 model_config: openai_serving_config: chat: From d056a69ecd40458c2979ef73d19d0f4ff135ff98 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Wed, 25 Mar 2026 19:02:57 +0530 Subject: [PATCH 23/55] fix: set IdleTimeoutSeconds=600 on worker groups to prevent autoscaler terminating nodes during model load Large model loading (e.g. gpt-oss-120b) takes several minutes. Without an idle timeout, the Ray autoscaler terminates worker nodes after 60s (default), killing the replica mid-load with SIGTERM. 
--- pkg/ai/raybuilder/builder.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 9abef70..de08e0f 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -731,10 +731,11 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] replicas := instanceScale[cfg.Tier] wg := rayv1.WorkerGroupSpec{ - GroupName: cfg.Tier, - Replicas: int32Ptr(replicas), - MinReplicas: int32Ptr(replicas), - MaxReplicas: int32Ptr(replicas + 5), + GroupName: cfg.Tier, + Replicas: int32Ptr(replicas), + MinReplicas: int32Ptr(replicas), + MaxReplicas: int32Ptr(replicas + 5), + IdleTimeoutSeconds: int32Ptr(600), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), From b122df7f2a7fce1368ffeed88aafc036d68121d6 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Thu, 26 Mar 2026 15:27:21 +0530 Subject: [PATCH 24/55] fix: use AutoscalerOptions.IdleTimeoutSeconds instead of WorkerGroupSpec field WorkerGroupSpec.idleTimeoutSeconds is rejected as unknown by the installed KubeRay CRD version. AutoscalerOptions.IdleTimeoutSeconds is set at the cluster level and read directly by the Ray autoscaler process, achieving the same effect without requiring a CRD upgrade. 600s idle timeout prevents the autoscaler from terminating worker nodes while large models (e.g. gpt-oss-120b) are loading. 
Co-Authored-By: Claude Opus 4.6 --- pkg/ai/raybuilder/builder.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index de08e0f..70f1e92 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -731,11 +731,10 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] replicas := instanceScale[cfg.Tier] wg := rayv1.WorkerGroupSpec{ - GroupName: cfg.Tier, - Replicas: int32Ptr(replicas), - MinReplicas: int32Ptr(replicas), - MaxReplicas: int32Ptr(replicas + 5), - IdleTimeoutSeconds: int32Ptr(600), + GroupName: cfg.Tier, + Replicas: int32Ptr(replicas), + MinReplicas: int32Ptr(replicas), + MaxReplicas: int32Ptr(replicas + 5), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), @@ -751,9 +750,11 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec workers = append(workers, wg) } + idleTimeout := int32Ptr(600) return &rayv1.RayClusterSpec{ RayVersion: os.Getenv("RAY_VERSION"), EnableInTreeAutoscaling: boolPtr(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{IdleTimeoutSeconds: idleTimeout}, HeadGroupSpec: head, WorkerGroupSpecs: workers, }, nil From 9945411fc80254dac4b9e165c4396004a115ab7b Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Thu, 26 Mar 2026 20:27:36 +0530 Subject: [PATCH 25/55] fix: increase l40s-1-gpu ephemeral storage to 200Gi and memory to 64Gi The 120b model download/load exceeds the previous 50Gi ephemeral storage limit, causing pod eviction. 200Gi matches the storage needed for large model artifacts. Memory increased from 16Gi to 64Gi to support vLLM process memory requirements for a 120b model. 
Co-Authored-By: Claude Opus 4.6 --- config/configs/instance.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index 46518de..00acba7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -18,8 +18,8 @@ L40S: cpu: "4" limits: cpu: "16" - memory: "16Gi" - ephemeral-storage: "50Gi" + memory: "64Gi" + ephemeral-storage: "200Gi" nvidia.com/gpu: "1" - tier: l40s-2-gpu gpusPerPod: 2 From c856da25f3f6d0d468142d912c8a429c30fde331 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Fri, 27 Mar 2026 11:34:39 +0530 Subject: [PATCH 26/55] fix: use 2 GPUs and tensor_parallel_size=2 for gpt-oss-120b The model is mxfp4 quantized at 65GB on disk, requiring more than a single L40S (46Gi) GPU. Switch to num_gpus=2 / tensor_parallel_size=2 to use 2x L40S = 92Gi, comfortably fitting the model at runtime. Also increase l40s-1-gpu ephemeral-storage to 200Gi and memory to 64Gi to prevent pod eviction during large model downloads. 
--- config/configs/applications.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 663d9da..9918a14 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -480,7 +480,7 @@ applications: gpu_type_options_override: L40S: ray_actor_options: - num_gpus: 1 + num_gpus: 2 options: autoscaling_config: max_replicas: {{.Replicas.GptOss120b}} @@ -492,7 +492,7 @@ applications: L40S: engine_args: gpu_memory_utilization: 0.90 - tensor_parallel_size: 1 + tensor_parallel_size: 2 model_config: openai_serving_config: chat: From 03ef451c79f2892abf46d6d78db1bdd1dd6e4fcc Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Fri, 27 Mar 2026 12:08:00 +0530 Subject: [PATCH 27/55] fix: increase l40s-2-gpu memory to 128Gi and ephemeral-storage to 200Gi for gpt-oss-120b --- config/configs/instance.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index 00acba7..a1cf4e2 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -25,11 +25,11 @@ L40S: gpusPerPod: 2 resources: requests: - cpu: "1" + cpu: "4" limits: - cpu: "2" - memory: "48Gi" - ephemeral-storage: "100Gi" + cpu: "16" + memory: "128Gi" + ephemeral-storage: "200Gi" nvidia.com/gpu: "2" H100_NVL: - tier: h100-nvl-0-gpu From 61bec1ef1144addefce77963497ba8067d256f58 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Fri, 27 Mar 2026 12:30:05 +0530 Subject: [PATCH 28/55] Revert "fix: increase l40s-2-gpu memory to 128Gi and ephemeral-storage to 200Gi for gpt-oss-120b" This reverts commit 03ef451c79f2892abf46d6d78db1bdd1dd6e4fcc. 
--- config/configs/instance.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index a1cf4e2..00acba7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -25,11 +25,11 @@ L40S: gpusPerPod: 2 resources: requests: - cpu: "4" + cpu: "1" limits: - cpu: "16" - memory: "128Gi" - ephemeral-storage: "200Gi" + cpu: "2" + memory: "48Gi" + ephemeral-storage: "100Gi" nvidia.com/gpu: "2" H100_NVL: - tier: h100-nvl-0-gpu From e9bb76a8e82133cc8cbfd254b876ab6ae2f989a9 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 31 Mar 2026 12:50:23 +0530 Subject: [PATCH 29/55] feat: add H100 support with configurable gpu_types via defaultAcceleratorType - Add H100 worker tiers to instance.yaml (h100-0-gpu, h100-1-gpu) - Add H100 instanceScale block to features/saia.yaml - Add AcceleratorType field to ApplicationParams in builder.go, populated from effectiveAcceleratorType(), so applications.yaml can template gpu_types - Template gpu_types in applications.yaml for GptOss20b and GptOss120b using {{.AcceleratorType}} instead of hardcoded ["L40S"] - Add H100 gpu_type_options_override and gpu_type_model_config_override entries for GptOss20b (0.5 GPU, tp=1) and GptOss120b (1 GPU, tp=1) - Fix UaeLarge H100 num_gpus and gpu_memory_utilization: 0.025 -> 0.0375 - Require yq in preflight checks for eks and k0s scripts (fail instead of silently falling back to fragile grep/awk parsing) Co-Authored-By: Claude Opus 4.6 --- config/configs/applications.yaml | 22 +++++++++++++---- config/configs/features/saia.yaml | 3 +++ config/configs/instance.yaml | 24 +++++++++++++++++++ pkg/ai/raybuilder/builder.go | 2 ++ tools/cluster_setup/eks_cluster_with_stack.sh | 4 ++-- tools/cluster_setup/k0s_cluster_with_stack.sh | 9 +------ 6 files changed, 50 insertions(+), 14 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 9918a14..d40e531 100644 --- 
a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -28,7 +28,7 @@ applications: gpu_type_options_override: H100: ray_actor_options: - num_gpus: 0.025 + num_gpus: 0.0375 L40S: ray_actor_options: num_gpus: 0.05 @@ -43,7 +43,7 @@ applications: gpu_type_model_config_override: H100: engine_args: - gpu_memory_utilization: 0.025 + gpu_memory_utilization: 0.0375 L40S: engine_args: gpu_memory_utilization: 0.05 @@ -356,6 +356,9 @@ applications: deployment_configs: LLMDeployment: gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 0.5 L40S: ray_actor_options: num_gpus: 1 @@ -364,9 +367,13 @@ applications: max_replicas: {{.Replicas.GptOss20b}} min_replicas: {{.Replicas.GptOss20b}} deployment_type: text_gen_model_deployment - gpu_types: '["L40S"]' + gpu_types: '["{{.AcceleratorType}}"]' model_definition: gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 1 L40S: engine_args: gpu_memory_utilization: 0.90 @@ -478,6 +485,9 @@ applications: deployment_configs: LLMDeployment: gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 1 L40S: ray_actor_options: num_gpus: 2 @@ -486,9 +496,13 @@ applications: max_replicas: {{.Replicas.GptOss120b}} min_replicas: {{.Replicas.GptOss120b}} deployment_type: text_gen_model_deployment - gpu_types: '["L40S"]' + gpu_types: '["{{.AcceleratorType}}"]' model_definition: gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 1 L40S: engine_args: gpu_memory_utilization: 0.90 diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 4c5dc26..a9192da 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -17,6 +17,9 @@ instanceScale: l40s-0-gpu: 1 l40s-1-gpu: 2 l40s-2-gpu: 1 + H100: + h100-0-gpu: 1 + h100-1-gpu: 2 H100_NVL: h100-nvl-0-gpu: 1 h100-nvl-1-gpu: 2 \ No newline at end of file diff --git 
a/config/configs/instance.yaml b/config/configs/instance.yaml index 00acba7..e704fd7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -31,6 +31,30 @@ L40S: memory: "48Gi" ephemeral-storage: "100Gi" nvidia.com/gpu: "2" +H100: + - tier: h100-0-gpu + gpusPerPod: 0 + env: + NVIDIA_VISIBLE_DEVICES: void + resources: + limits: + cpu: "16" + memory: "32Gi" + ephemeral-storage: "10Gi" + nvidia.com/gpu: "0" + requests: + cpu: "4" + - tier: h100-1-gpu + gpusPerPod: 1 + # No NVIDIA_VISIBLE_DEVICES here - GPUs must be visible for vLLM + resources: + requests: + cpu: "4" + limits: + cpu: "16" + memory: "48Gi" + ephemeral-storage: "100Gi" + nvidia.com/gpu: "1" H100_NVL: - tier: h100-nvl-0-gpu gpusPerPod: 0 diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 70f1e92..3f2ed32 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -53,6 +53,7 @@ type ApplicationParams struct { Replicas map[string]int32 `yaml:"REPLICAS"` WorkingDirBase string `yaml:"WORKING_DIR_BASE"` ModelVersion string `yaml:"MODEL_VERSION"` + AcceleratorType string `yaml:"ACCELERATOR_TYPE"` } type WorkerConfigs map[string][]InstanceDetail @@ -220,6 +221,7 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl Replicas: replicasMap, WorkingDirBase: workingDirBase, ModelVersion: os.Getenv("MODEL_VERSION"), + AcceleratorType: b.effectiveAcceleratorType(), } // Use embedded applications.yaml content diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index fe3a033..11a739f 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -2656,7 +2656,7 @@ preflight_env() { fi pf_header "Tools" - for t in aws eksctl kubectl helm git jq; do + for t in aws eksctl kubectl helm git jq yq; do if command -v "$t" >/dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi 
done @@ -2964,7 +2964,7 @@ reconcile_flow() { # ---------- MAIN ---------- main_install() { - for t in aws eksctl kubectl helm git jq; do need "$t"; done + for t in aws eksctl kubectl helm git jq yq; do need "$t"; done # Load configuration from YAML file load_config diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index eb93bfa..cf2e313 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -178,7 +178,7 @@ load_config() { # ====== PREFLIGHT CHECKS ====== preflight_checks() { pf_header "Required tools" - for tool in ssh kubectl helm git jq; do + for tool in ssh kubectl helm git jq yq; do if command -v "$tool" >/dev/null 2>&1; then pf_ok "$tool found" else @@ -186,13 +186,6 @@ preflight_checks() { fi done - # Check for yq - if command -v yq >/dev/null 2>&1; then - pf_ok "yq found" - else - pf_warn "yq not found - using fallback parsing (install yq for better results)" - fi - pf_header "Configuration" [[ -n "${CLUSTER_NAME}" ]] && pf_ok "Cluster name: ${CLUSTER_NAME}" || pf_fail "Cluster name not set" [[ -f "${SPLUNK_OPERATOR_FILE}" ]] && pf_ok "Splunk operator file: ${SPLUNK_OPERATOR_FILE}" || pf_warn "Splunk operator file not found: ${SPLUNK_OPERATOR_FILE}" From ab17c85de546102795c7443549aabe7a8d5d80ce Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 31 Mar 2026 14:03:07 +0530 Subject: [PATCH 30/55] feat: add H100/L40S cluster setup support in eks and k0s scripts eks_cluster_with_stack.sh: - Read GPU_CAPACITY_RESERVATION_ID/AZ and GPU_AVAILABILITY_ZONES from cluster-config.yaml in load_config() - generate_node_groups(): skip standard GPU node group for H100 with capacity reservation; add availabilityZones support for other types - New create_gpu_nodegroup_with_capacity_block(): CloudFormation-based H100 node group using CapacityType: CAPACITY_BLOCK, only invoked when defaultAcceleratorType=H100 and capacityReservation.id is set - 
create_cluster_flow/reconcile_flow: gate capacity block creation on DEFAULT_ACCELERATOR=H100, idempotent GPU node count check - main_install: export AWS_DEFAULT_REGION/AWS_REGION after load_config - Add missing --region flag to 3 eksctl create iamserviceaccount calls k0s_cluster_with_stack.sh: - load_config: read defaultAcceleratorType from config, default L40S cluster-config.yaml: - GPU TYPE QUICK REFERENCE comment block: L40S/H100/H100_NVL instance types, when to use capacityReservation and availabilityZones - H100-only capacityReservation and availabilityZones commented-out blocks - defaultAcceleratorType comment cross-referencing instance types k0s-cluster-config.yaml (new file): - Config template for k0s script with GPU TYPE QUICK REFERENCE - Documents L40S/H100/H100_NVL gpuWorker instance types alongside defaultAcceleratorType Co-Authored-By: Claude Opus 4.6 --- tools/cluster_setup/cluster-config.yaml | 68 +++++- tools/cluster_setup/eks_cluster_with_stack.sh | 228 +++++++++++++++++- tools/cluster_setup/k0s-cluster-config.yaml | 119 +++++++++ tools/cluster_setup/k0s_cluster_with_stack.sh | 6 +- 4 files changed, 404 insertions(+), 17 deletions(-) create mode 100644 tools/cluster_setup/k0s-cluster-config.yaml diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index 891f170..513b425 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -41,6 +41,28 @@ cluster: # az: "us-east-2b" # OPTIONAL: Third availability zone # ---------- Node Groups ---------- +# +# GPU TYPE QUICK REFERENCE — set instanceType and defaultAcceleratorType (under aiPlatform) together: +# +# L40S (default): +# instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) +# defaultAcceleratorType: L40S +# capacityReservation: not required +# availabilityZones: not required +# +# H100: +# instanceType: p5.4xlarge (1x H100 GPU, 80 GB VRAM) or p5.48xlarge (8x H100 GPUs, 80 GB VRAM each); capacity reservation required +# defaultAcceleratorType: H100 +#
capacityReservation: required — uncomment block below and set id + az +# availabilityZones: required — must match capacityReservation.az +# maxSize: must equal desiredCapacity (capacity reservations are fixed-size) +# +# H100_NVL: +# instanceType: p4de.24xlarge (NOTE: AWS documents p4de.24xlarge as 8x A100 80 GB; H100 NVL cards have 94 GB VRAM each — confirm the instance type for your environment) +# defaultAcceleratorType: H100_NVL +# capacityReservation: not required +# availabilityZones: not required +# nodeGroups: cpu: enabled: true # Set to false to skip CPU node group @@ -53,13 +75,26 @@ nodeGroups: gpu: enabled: true # Set to false to skip GPU nodes (saves cost) - instanceType: "g6e.12xlarge" # GPU instance type (g6e.12xlarge=4xL40S GPUs, g5.xlarge=1xA10G) + instanceType: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above desiredCapacity: 2 # Initial number of GPU nodes minSize: 2 # Minimum GPU nodes - maxSize: 4 # Maximum GPU nodes + maxSize: 4 # Maximum GPU nodes (set equal to desiredCapacity for H100) volumeSize: 1000 # EBS volume size per GPU node (GB) - larger for model storage volumeType: "gp3" # EBS volume type + # ── H100 ONLY ────────────────────────────────────────────────────────────── + # Capacity Reservation: required for P5/H100 instances (scarce capacity). + # Uncomment and fill in when defaultAcceleratorType is H100. + # capacityReservation: + # id: "cr-xxxxxxxxxxxxxxxxx" # CHANGE THIS: your capacity reservation ID + # az: "us-east-2c" # CHANGE THIS: AZ where the reservation exists + + # Availability Zones: lock GPU nodes to the AZ matching the capacity reservation. + # Uncomment and fill in when defaultAcceleratorType is H100. + # availabilityZones: + # - "us-east-2c" # CHANGE THIS: must match capacityReservation.az + # ─────────────────────────────────────────────────────────────────────────── + # ---------- Storage Configuration ---------- # Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install).
# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). @@ -69,12 +104,13 @@ storage: vectorDbSize: "50Gi" # VectorDB persistent volume size # Object store: aws (S3) or external S3-compatible (s3compat, minio, seaweedfs). No in-cluster install. - # - MinIO: endpoint port 9000 (e.g. http://host:9000) - # - SeaweedFS S3: endpoint port 8333 (e.g. http://host:8333); start SeaweedFS with AWS_ACCESS_KEY_ID/SECRET matching auth below + # - s3compat: generic S3 API (MinIO :9000, SeaweedFS S3 :8333, etc.) — AIPlatform path uses s3compat://bucket + # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://) + # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme) objectStore: - type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" - endpoint: "http://3.144.157.201:8333" # SeaweedFS S3 (port 8333). For MinIO use port 9000. 
+ endpoint: "http://13.59.216.105:9000" # MinIO API (9000) or SeaweedFS S3 gateway (8333) auth: rootUser: "minioadmin" rootPassword: "minioadmin" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root @@ -115,7 +151,9 @@ images: # Option 2: Full path (ignores registry prefix) # image: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" - image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.3" + # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config) + #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8" + image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31" # Splunk Enterprise Images splunk: @@ -138,8 +176,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-v1alpha1" - workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" + headImage: "ml-platform/ray/ray-head:build-008" + workerImage: "ml-platform/ray/ray-worker-gpu:build-008" # Weaviate Vector Database weaviate: @@ -151,8 +189,8 @@ images: # SAIA (Splunk AI Assistant) Images saia: # Relative paths - registry prefix auto-applied - apiImage: "ml-platform/saia/saia-api:build-v1alpha1" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" + apiImage: "ml-platform/saia/saia-api:build-005" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" # Supporting Images fluentBit: @@ -190,8 +228,12 @@ aiPlatform: rayWorker: "ray-worker-sa" # no change saiaService: "saia-service-sa" # no change - # Default accelerator type - defaultAcceleratorType: "L40S" + # Default accelerator type — must match a top-level key in instance.yaml. + # Must be changed in sync with nodeGroups.gpu.instanceType (see GPU TYPE QUICK REFERENCE above). 
+ # L40S → instanceType: g6e.12xlarge + # H100 → instanceType: p5.4xlarge (also uncomment capacityReservation + availabilityZones) + # H100_NVL → instanceType: p4de.24xlarge + defaultAcceleratorType: "L40S" # Features to enable features: # no change diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 11a739f..7426ae1 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -48,6 +48,18 @@ load_config() { GPU_VOLUME_SIZE="$(yq eval '.nodeGroups.gpu.volumeSize' "$cfg")" GPU_VOLUME_TYPE="$(yq eval '.nodeGroups.gpu.volumeType' "$cfg")" + # GPU Availability Zones (optional - for capacity-constrained instance types like P5/H100) + GPU_AVAILABILITY_ZONES=() + while IFS= read -r az; do + [[ -n "$az" ]] && GPU_AVAILABILITY_ZONES+=("$az") + done < <(yq eval '.nodeGroups.gpu.availabilityZones[]' "$cfg" 2>/dev/null) + + # Capacity Reservation (optional - for H100/P5 instances) + GPU_CAPACITY_RESERVATION_ID="$(yq eval '.nodeGroups.gpu.capacityReservation.id' "$cfg" 2>/dev/null)" + GPU_CAPACITY_RESERVATION_AZ="$(yq eval '.nodeGroups.gpu.capacityReservation.az' "$cfg" 2>/dev/null)" + [[ "$GPU_CAPACITY_RESERVATION_ID" == "null" ]] && GPU_CAPACITY_RESERVATION_ID="" + [[ "$GPU_CAPACITY_RESERVATION_AZ" == "null" ]] && GPU_CAPACITY_RESERVATION_AZ="" + # Cluster options PRESERVE_VPC_ON_DELETE="$(yq eval '.cluster.preserveVpcOnDelete // false' "$cfg")" @@ -183,6 +195,9 @@ load_config() { GPU_MAX=4 GPU_VOLUME_SIZE=1000 GPU_VOLUME_TYPE="gp3" + GPU_AVAILABILITY_ZONES=() + GPU_CAPACITY_RESERVATION_ID="" + GPU_CAPACITY_RESERVATION_AZ="" SPLUNK_APP_LOCAL_PATH="" # Hardcoded subnets for fallback @@ -794,7 +809,11 @@ generate_node_groups() { k8s.io/cluster-autoscaler/enabled: \"true\" k8s.io/cluster-autoscaler/${CLUSTER_NAME}: owned" fi - if [[ "$ENABLE_GPU" == "true" ]]; then + # H100 with capacity reservation: node group created separately via CloudFormation + # All other GPU 
types (L40S, H100_NVL): standard eksctl managed node group + if [[ "$ENABLE_GPU" == "true" && "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + log "GPU nodes will be created separately with capacity reservation ${GPU_CAPACITY_RESERVATION_ID}" + elif [[ "$ENABLE_GPU" == "true" ]]; then nodes+=" - name: gpu-nodes instanceType: ${GPU_INSTANCE_TYPE} @@ -802,7 +821,17 @@ generate_node_groups() { minSize: ${GPU_MIN} maxSize: ${GPU_MAX} volumeSize: ${GPU_VOLUME_SIZE} - volumeType: ${GPU_VOLUME_TYPE} + volumeType: ${GPU_VOLUME_TYPE}" + # Lock to specific AZ when availabilityZones are specified (e.g. for H100_NVL) + if [[ ${#GPU_AVAILABILITY_ZONES[@]} -gt 0 ]]; then + nodes+=" + availabilityZones:" + for az in "${GPU_AVAILABILITY_ZONES[@]}"; do + nodes+=" + - ${az}" + done + fi + nodes+=" tags: Name: ${CLUSTER_NAME}-gpu Environment: prod @@ -885,6 +914,174 @@ EOF create_cluster() { log "Creating EKS cluster..."; eksctl create cluster -f eks-cluster-config.yaml; ensure_kubeconfig; } +# Create GPU node group with Capacity Block using CloudFormation. +# Only called when DEFAULT_ACCELERATOR=H100 and GPU_CAPACITY_RESERVATION_ID is set. +create_gpu_nodegroup_with_capacity_block() { + if [[ "$DEFAULT_ACCELERATOR" != "H100" || -z "$GPU_CAPACITY_RESERVATION_ID" ]]; then + return 0 + fi + + log "Creating GPU node group with Capacity Block (H100)..." 
+ log " Reservation: ${GPU_CAPACITY_RESERVATION_ID} in ${GPU_CAPACITY_RESERVATION_AZ}" + + local stack_name="${CLUSTER_NAME}-gpu-capacity-block" + local cfn_template_file="/tmp/${stack_name}-template.yaml" + + # Get cluster info + local cluster_info vpc_id cluster_sg + cluster_info=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster') + vpc_id=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.vpcId') + cluster_sg=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.clusterSecurityGroupId') + log " VPC: ${vpc_id}, Security Group: ${cluster_sg}" + + # Get EKS GPU AMI + local ami_id + ami_id=$(aws ssm get-parameter \ + --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id" \ + --region "${REGION}" --query 'Parameter.Value' --output text) + log " AMI: ${ami_id}" + + # Get node IAM role created by eksctl for the CPU node group + local node_role_arn + node_role_arn=$(aws iam list-roles \ + --query "Roles[?contains(RoleName, '${CLUSTER_NAME}') && contains(RoleName, 'NodeInstanceRole')].Arn" \ + --output text | head -1) + log " Node Role: ${node_role_arn}" + + if [[ -z "$node_role_arn" || "$node_role_arn" == "None" ]]; then + err "Node role not found — ensure CPU node group was created first." 
+ fi + + # Find subnet in the capacity reservation AZ + local subnet_id + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + "Name=tag:Name,Values=*eksctl-${CLUSTER_NAME}*Private*" \ + --query 'Subnets[0].SubnetId' --output text) + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + --query 'Subnets[0].SubnetId' --output text) + fi + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + err "Subnet not found in ${GPU_CAPACITY_RESERVATION_AZ} for VPC ${vpc_id}" + fi + log " Subnet: ${subnet_id}" + + # Generate CloudFormation template + cat > "${cfn_template_file}" </dev/null || echo "NOT_EXISTS") + + if [[ "$stack_status" == "CREATE_COMPLETE" || "$stack_status" == "UPDATE_COMPLETE" ]]; then + log "GPU node group already exists and is healthy — skipping." + rm -f "${cfn_template_file}"; return 0 + elif [[ "$stack_status" != "NOT_EXISTS" ]]; then + log "Deleting ${stack_status} stack before retry..." 
+ aws cloudformation delete-stack --stack-name "${stack_name}" --region "${REGION}" + aws cloudformation wait stack-delete-complete --stack-name "${stack_name}" --region "${REGION}" || true + fi + + aws cloudformation deploy \ + --template-file "${cfn_template_file}" \ + --stack-name "${stack_name}" \ + --region "${REGION}" \ + --parameter-overrides \ + ClusterName="${CLUSTER_NAME}" \ + ReservationId="${GPU_CAPACITY_RESERVATION_ID}" \ + SubnetId="${subnet_id}" \ + NodeRoleArn="${node_role_arn}" \ + SecurityGroupId="${cluster_sg}" \ + AmiId="${ami_id}" \ + InstanceType="${GPU_INSTANCE_TYPE}" \ + VolumeSize="${GPU_VOLUME_SIZE}" \ + DesiredCapacity="${GPU_DESIRED}" \ + --capabilities CAPABILITY_IAM \ + --no-fail-on-empty-changeset + + rm -f "${cfn_template_file}" + + local final_status + final_status=$(aws cloudformation describe-stacks --stack-name "${stack_name}" --region "${REGION}" \ + --query 'Stacks[0].StackStatus' --output text) + if [[ "$final_status" != "CREATE_COMPLETE" && "$final_status" != "UPDATE_COMPLETE" ]]; then + err "CloudFormation stack failed: ${final_status}. Check: aws cloudformation describe-stack-events --stack-name ${stack_name} --region ${REGION}" + fi + + log "GPU node group with Capacity Block created successfully." + log "Waiting for nodes to join cluster..." + sleep 30 + kubectl get nodes -l nvidia.com/gpu=true 2>/dev/null || log "(Nodes may still be joining...)" +} + ensure_oidc() { log "Ensuring IAM OIDC provider is associated..." @@ -1034,6 +1231,7 @@ ensure_ebs_irsa_role() { # Create IRSA for EBS CSI using eksctl (handles role creation, trust policy, and SA annotation) eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${EBS_NS}" \ --name "${EBS_SA}" \ --role-name "${EBS_IRSA_ROLE_NAME}" \ @@ -1118,6 +1316,7 @@ install_cluster_autoscaler() { log "Installing Cluster Autoscaler with IRSA..." 
eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --name "${AUTOSCALER_SA}" \ --namespace "${AUTOSCALER_NS}" \ --role-name "${AUTOSCALER_ROLE_NAME}" \ @@ -1530,6 +1729,7 @@ Then re-run this script." log "Ensuring IRSA (role ${role}) for ${ns}/${sa} with policy ${policy_arn}" eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${ns}" \ --name "${sa}" \ --role-name "${role}" \ @@ -2939,7 +3139,14 @@ install_ai_platform_stack() { } # ---------- CREATE / RECONCILE / DELETE FLOWS ---------- -create_cluster_flow() { create_cluster_config; create_cluster; } +create_cluster_flow() { + create_cluster_config + create_cluster + # H100 with capacity reservation: eksctl cannot manage these nodes — create via CloudFormation + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + create_gpu_nodegroup_with_capacity_block + fi +} reconcile_flow() { ensure_oidc @@ -2950,6 +3157,16 @@ reconcile_flow() { install_cluster_autoscaler install_nvidia_device_plugin uncordon_ready_nodes + # H100 with capacity reservation: create GPU node group if not already present + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + local gpu_node_count + gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu=true --no-headers 2>/dev/null | wc -l | tr -d ' ') + if [[ "$gpu_node_count" -lt 1 ]]; then + create_gpu_nodegroup_with_capacity_block + else + log "Found ${gpu_node_count} H100 GPU node(s) — skipping capacity block creation." 
+ fi + fi install_kube_prometheus install_cert_manager ensure_s3compat_credentials @@ -2969,6 +3186,11 @@ main_install() { # Load configuration from YAML file load_config + # Force region for all AWS CLI and eksctl commands + export AWS_DEFAULT_REGION="${REGION}" + export AWS_REGION="${REGION}" + log "Using AWS Region: ${REGION}" + # Validate and configure container images validate_image_config configure_images diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml new file mode 100644 index 0000000..258f43a --- /dev/null +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -0,0 +1,119 @@ +# =================================================================== +# k0s Cluster Configuration Template for Splunk AI Platform +# =================================================================== +# IMPORTANT: This is a template file with placeholder values. +# Copy this file and replace ALL placeholder values with your actual resources. +# +# Quick Start: +# 1. Copy: cp k0s-cluster-config.yaml my-cluster-config.yaml +# 2. Edit: vi my-cluster-config.yaml +# 3. Replace all values marked with "CHANGE THIS" +# 4. 
Run: CONFIG_FILE=./my-cluster-config.yaml ./k0s_cluster_with_stack.sh install +# =================================================================== + +# ---------- Cluster Configuration ---------- +cluster: + name: "my-ai-cluster" # CHANGE THIS: Your cluster name + useExisting: "auto" # auto | force | never + region: "us-east-2" # CHANGE THIS: AWS region (used when creating EC2 instances) + sshUser: "ubuntu" # SSH username for nodes + sshKeyPath: "~/.ssh/id_rsa" # CHANGE THIS: Path to SSH private key + +# ---------- Node Configuration ---------- +# +# GPU TYPE QUICK REFERENCE — set gpuWorker instanceType and defaultAcceleratorType together: +# +# L40S (default): +# gpuWorker instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) +# defaultAcceleratorType: L40S +# +# H100: +# gpuWorker instanceType: p5.4xlarge (8x H100 GPUs, 80 GB VRAM each) +# defaultAcceleratorType: H100 +# +# H100_NVL: +# gpuWorker instanceType: p4de.24xlarge (8x H100 NVL GPUs, 94 GB VRAM each) +# defaultAcceleratorType: H100_NVL +# +# On-premises (existing hardware): +# Set existingIPs below — instanceTypes are ignored when IPs are provided. +# The defaultAcceleratorType must still match the physical GPU in your nodes. +# +nodes: + controllers: 1 # 1 (single) or 3 (HA) + cpuWorkers: 2 # Number of CPU worker nodes (EC2 mode only) + gpuWorkers: 1 # Number of GPU worker nodes (EC2 mode only) + + # On-premises / existing nodes: provide IPs to skip EC2 instance creation. + # Leave lists empty to create new EC2 instances automatically. + existingIPs: + controllers: [] # e.g. ["10.0.0.1"] or ["10.0.0.1", "10.0.0.2", "10.0.0.3"] for HA + workers: [] # e.g. 
["10.0.1.1", "10.0.1.2", "10.0.2.1"] + +# ---------- EC2 Instance Types (ignored when existingIPs are set) ---------- +instanceTypes: + controller: "t3.xlarge" # Controller node (4 vCPU, 16 GB RAM) + cpuWorker: "m5.4xlarge" # CPU worker (16 vCPU, 64 GB RAM) + gpuWorker: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above + +# ---------- EC2 Network (required when creating EC2 instances) ---------- +ec2: + vpcId: "" # CHANGE THIS: your VPC ID (e.g. vpc-xxxxxxxxxxxxxxxxx) + subnetId: "" # CHANGE THIS: your subnet ID (e.g. subnet-xxxxxxxxxxxxxxxxx) + keyName: "" # CHANGE THIS: your EC2 key pair name + +# ---------- MinIO Object Storage ---------- +minio: + accessKey: "minioadmin" # CHANGE THIS: MinIO admin username + secretKey: "minioadmin" # CHANGE THIS: MinIO admin password + bucket: "ai-platform-data" # MinIO bucket name + +# ---------- Kubernetes ---------- +kubernetes: + namespace: "ai-platform" # no change + +# ---------- Splunk ---------- +splunk: + standaloneName: "splunk-standalone" # no change + +# ---------- ECR (for private AWS image repositories) ---------- +ecr: + account: "" # CHANGE THIS: your AWS account ID (e.g. 
"123456789012") + # Leave empty to auto-detect from AWS CLI + +# ---------- Image Pull Secrets ---------- +imagePullSecrets: + autoCreateECR: false # Set true to auto-create ECR pull secret + dockerHub: + enabled: false # Set true if images are on Docker Hub (private) + username: "" + password: "" + gcr: + enabled: false + acr: + enabled: false + custom: + enabled: false + +# ---------- File Paths ---------- +files: + splunkOperator: "./splunk-operator-cluster.yaml" + aiPlatform: "./artifacts.yaml" + +# ---------- AI Platform Configuration ---------- +aiPlatform: + namespace: "ai-platform" # no change + name: "splunk-ai-stack" # no change + + # Service Accounts + serviceAccounts: + rayHead: "ray-head-sa" # no change + rayWorker: "ray-worker-sa" # no change + saiaService: "saia-service-sa" # no change + + # Default accelerator type — must match a top-level key in instance.yaml. + # Must be changed in sync with instanceTypes.gpuWorker (see GPU TYPE QUICK REFERENCE above). + # L40S → gpuWorker: g6e.12xlarge + # H100 → gpuWorker: p5.4xlarge + # H100_NVL → gpuWorker: p4de.24xlarge + defaultAcceleratorType: "L40S" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index cf2e313..4ac7787 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -157,7 +157,11 @@ load_config() { SPLUNK_OPERATOR_FILE=$(yq eval '.files.splunkOperator' "${CONFIG_FILE}" 2>/dev/null || echo "./splunk-operator-cluster.yaml") SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") - log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}" + # Default accelerator type (must match a key in instance.yaml: L40S | H100 | H100_NVL) + DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ "$DEFAULT_ACCELERATOR" == "null" || -z "$DEFAULT_ACCELERATOR" ]] && 
DEFAULT_ACCELERATOR="L40S" + + log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}, accelerator=${DEFAULT_ACCELERATOR}" if [[ -n "${ECR_ACCOUNT}" ]]; then log "ECR Account: ${ECR_ACCOUNT}" fi From 802e52f19d719da8c439e6a96c15f68f5625f769 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 31 Mar 2026 14:10:16 +0530 Subject: [PATCH 31/55] fix: upgrade grpc and cert-manager to patch CVE-2026-33186 and CVE-2026-25518 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - google.golang.org/grpc: v1.78.0 → v1.79.3 (Critical: CVE-2026-33186) - github.com/cert-manager/cert-manager: v1.18.0 → v1.18.5 (Medium: CVE-2026-25518) Co-Authored-By: Claude Opus 4.6 --- go.mod | 14 +++++++------- go.sum | 32 ++++++++++++++++---------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/go.mod b/go.mod index 8860ea8..e5daf45 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.1 github.com/aws/aws-sdk-go v1.55.7 - github.com/cert-manager/cert-manager v1.18.0 + github.com/cert-manager/cert-manager v1.18.5 github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.22.2 @@ -31,7 +31,7 @@ require ( ) require ( - cel.dev/expr v0.24.0 // indirect + cel.dev/expr v0.25.1 // indirect cloud.google.com/go v0.121.1 // indirect cloud.google.com/go/auth v0.16.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect @@ -48,11 +48,11 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect + github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.12.1 // indirect 
- github.com/envoyproxy/go-control-plane/envoy v1.35.0 // indirect - github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect + github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect + github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect @@ -97,7 +97,7 @@ require ( github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect + go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/otel v1.40.0 // indirect @@ -124,7 +124,7 @@ require ( google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/grpc v1.78.0 // indirect + google.golang.org/grpc v1.79.3 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index c6c9fa9..7021646 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= -cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cloud.google.com/go v0.121.1 h1:S3kTQSydxmu1JfLRLpKtxRPA7rSrYPRPEUmL/PavVUw= cloud.google.com/go v0.121.1/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= 
cloud.google.com/go/auth v0.16.1 h1:XrXauHMd30LhQYVRHLGvJiYeczweKQXZxsTbV9TiguU= @@ -54,12 +54,12 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= -github.com/cert-manager/cert-manager v1.18.0 h1:v7vxC1Mx5tkDz1oGOAktB88zA6TbGKcmpLM92+AIXRc= -github.com/cert-manager/cert-manager v1.18.0/go.mod h1:icDJx4kG9BCNpGjBvrmsFd99d+lXUvWdkkcrSSQdIiw= +github.com/cert-manager/cert-manager v1.18.5 h1:Gx4FSpSPYcSC4MQf43QjbxDfyTEbwZgfZQs5Lq9QlBs= +github.com/cert-manager/cert-manager v1.18.5/go.mod h1:HbPSO5MW/44wu19t84eY/K4c4/WwyPB4bA3uffOH92s= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -69,14 +69,14 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/emicklei/go-restful/v3 v3.12.1 
h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329 h1:K+fnvUM0VZ7ZFJf0n4L/BRlnsb9pL/GuDG6FqaH+PwM= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329/go.mod h1:Alz8LEClvR7xKsrq3qzoc4N0guvVNSS8KmSChGYr9hs= -github.com/envoyproxy/go-control-plane/envoy v1.35.0 h1:ixjkELDE+ru6idPxcHLj8LBVc2bFP7iBytj353BoHUo= -github.com/envoyproxy/go-control-plane/envoy v1.35.0/go.mod h1:09qwbGVuSWWAyN5t/b3iyVfz5+z8QWGrzkoqm/8SbEs= +github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA= +github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= +github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g= +github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= -github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= -github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= +github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= +github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -225,8 +225,8 @@ github.com/yuin/goldmark v1.1.27/go.mod 
h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0 h1:ZoYbqX7OaA/TAikspPl3ozPI6iY6LiIY9I8cUfm+pJs= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0/go.mod h1:SU+iU7nu5ud4oCb3LQOhIZ3nRLj6FNVrKgtflbaf2ts= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0 h1:kWRNZMsfBHZ+uHjiH4y7Etn2FK26LAGkNFw7RHv1DhE= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= @@ -313,8 +313,8 @@ google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1: google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I= google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 h1:H86B94AW+VfJWDqFeEbBPhEtHzJwJfTbgE2lZa54ZAQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= -google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= -google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/protobuf v1.36.11 
h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 8f56527e4363faa09c6de05d4ff0791c5b52353d Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Wed, 15 Apr 2026 18:14:24 +0530 Subject: [PATCH 32/55] all k0s changes + fixes --- Dockerfile | 4 +- Dockerfile.k0s-runner | 19 + Makefile | 4 + .../crds/ai.splunk.com_aiplatforms.yaml | 18 +- .../crds/ai.splunk.com_aiservices.yaml | 23 +- pkg/ai/features/saia/impl.go | 6 +- pkg/ai/raybuilder/builder.go | 4 +- pkg/ai/raybuilder/builder_additional_test.go | 2 +- pkg/ai/weaviate.go | 7 +- tools/cluster_setup/K0S_README.md | 762 +++++-- tools/cluster_setup/artifacts.yaml | 12 +- tools/cluster_setup/k0s-cluster-config.yaml | 219 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 1841 +++++++++++++++-- .../cluster_setup/refresh_ecr_credentials.sh | 84 + .../splunk-operator-cluster.yaml | 3 +- 15 files changed, 2456 insertions(+), 552 deletions(-) create mode 100644 Dockerfile.k0s-runner create mode 100755 tools/cluster_setup/refresh_ecr_credentials.sh diff --git a/Dockerfile b/Dockerfile index 25c47bc..67224af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,7 +43,9 @@ COPY LICENSE LICENSE-2.0.txt COPY --from=builder /certs/tls.crt /certs/tls.crt COPY --from=builder /certs/tls.key /certs/tls.key -USER 65532:65532 +# USER 65532:65532 +# GID 0 required for Red Hat / OpenShift SCC compatibility on k0s nodes +USER 1001:0 ENV INSTANCE_FILE=/instance.yaml ENV APPLICATION_FILE=/applications.yaml ENTRYPOINT ["/manager"] diff --git a/Dockerfile.k0s-runner b/Dockerfile.k0s-runner new file mode 100644 index 0000000..aa0eda2 --- /dev/null +++ b/Dockerfile.k0s-runner @@ -0,0 +1,19 @@ +FROM registry.access.redhat.com/ubi9/ubi:latest + +RUN dnf install -y --allowerasing openssh-clients git jq && dnf clean all + +ARG TARGETARCH + +# kubectl +RUN 
curl -fsSL "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/${TARGETARCH}/kubectl" \ + -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl + +# helm +RUN curl -fsSL "https://get.helm.sh/helm-v3.17.1-linux-${TARGETARCH}.tar.gz" | tar xz -C /tmp \ + && mv /tmp/linux-${TARGETARCH}/helm /usr/local/bin/helm && rm -rf /tmp/linux-${TARGETARCH} + +# yq +RUN curl -fsSL "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${TARGETARCH}" \ + -o /usr/local/bin/yq && chmod +x /usr/local/bin/yq + +WORKDIR /workspace diff --git a/Makefile b/Makefile index 203cb3d..d6a7f7b 100644 --- a/Makefile +++ b/Makefile @@ -217,6 +217,10 @@ run: manifests generate fmt vet ## Run a controller from your host. docker-build: ## Build docker image with the manager. $(CONTAINER_TOOL) build -t ${IMG} . +.PHONY: docker-build-amd64 +docker-build-amd64: ## Build docker image for linux/amd64 (e.g. for x86_64 servers/EC2). + $(CONTAINER_TOOL) build --platform=linux/amd64 -t ${IMG} . + .PHONY: docker-push docker-push: ## Push docker image with the manager. $(CONTAINER_TOOL) push ${IMG} diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml index 25bf11b..67fc505 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml @@ -2222,25 +2222,24 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. - Supported: AWS S3, MinIO, SeaweedFS, any S3-compatible (s3:// + endpoint), GCS, Azure Blob. - Backend is selected by path scheme; when endpoint is set with s3://, backend is S3-compatible. 
+ ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models + Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS). - Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible. + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- - Remote volume URI: s3://bucket/prefix, gs://bucket/prefix, azure://container/prefix, - minio://bucket/prefix, or seaweedfs://bucket/prefix + Remote volume URI in the format s3://bucketname/, gs://bucketname/, + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string provider: description: |- - Optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. Values: aws, minio, seaweedfs, s3compat, gcs, azure enum: - aws @@ -2256,7 +2255,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible) + description: Secret name containing storage credentials (e.g. 
+ s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml index f9c3493..5bce496 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml @@ -1818,15 +1818,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -1834,7 +1846,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. 
+ s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 80ae6fe..16d4642 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -312,6 +312,8 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap( "SAIA_API_VERSION": "0.3.1", // TODO make configurable "TELEMETRY_ENV": "NOTLOCAL", // TODO make configurable "LOG_LEVEL": "info", + "USE_GPT_OSS": "true", + "SCS_TOKEN": "no-auth-required", } found := &corev1.ConfigMap{} @@ -555,7 +557,7 @@ func (r *SaiaReconciler) reconcilePostInstallHook( { Name: "vector-db-setup-container", Image: hookImage, - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ {Name: "VECTOR_DB_URL", Value: uri}, {Name: "SPLUNK_AI_ASSISTANT_SERVICE_CMP", Value: "true"}, @@ -740,7 +742,7 @@ func (r *SaiaReconciler) reconcileSAIADeployment( Containers: []corev1.Container{{ Name: ai.Name, Image: os.Getenv("RELATED_IMAGE_SAIA_API"), - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Ports: ports, VolumeMounts: mounts, Resources: ai.Spec.Resources, diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 3f2ed32..c088f3c 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -850,7 +850,7 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { Containers: []corev1.Container{{ Name: "ray-head", Image: SetImageRegistry("RELATED_IMAGE_RAY_HEAD", b.ai.Spec.Images.RayHeadGroupImage), - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD", }, @@ -971,7 +971,7 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec Containers: []corev1.Container{{ Name: "ray-worker", Image: SetImageRegistry("RELATED_IMAGE_RAY_WORKER", b.ai.Spec.WorkerGroupConfig.ImageRegistry), 
- ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Command: []string{ "/bin/bash", "-lc", diff --git a/pkg/ai/raybuilder/builder_additional_test.go b/pkg/ai/raybuilder/builder_additional_test.go index 4a39746..22e0da7 100644 --- a/pkg/ai/raybuilder/builder_additional_test.go +++ b/pkg/ai/raybuilder/builder_additional_test.go @@ -543,7 +543,7 @@ func TestBuilder_makeWorkerTemplate(t *testing.T) { // Verify ray-worker container (first container is always ray-worker) rayWorker := template.Spec.Containers[0] assert.Equal(t, "ray-worker", rayWorker.Name) - assert.Equal(t, corev1.PullAlways, rayWorker.ImagePullPolicy) + assert.Equal(t, corev1.PullIfNotPresent, rayWorker.ImagePullPolicy) assert.Contains(t, rayWorker.Command, "/bin/bash") // Verify environment variables diff --git a/pkg/ai/weaviate.go b/pkg/ai/weaviate.go index 23a0007..8189c1f 100644 --- a/pkg/ai/weaviate.go +++ b/pkg/ai/weaviate.go @@ -192,9 +192,10 @@ func (r *AIPlatformReconciler) ReconcileWeaviateDatabase(ctx context.Context, in // Container definition sts.Spec.Template.Spec.Containers = []corev1.Container{{ - Name: "weaviate", - Image: weaviateImage, - Resources: resources, + Name: "weaviate", + Image: weaviateImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Resources: resources, VolumeMounts: volumeMounts, Ports: []corev1.ContainerPort{{ Name: "http", diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md index bb1adfc..a116b62 100644 --- a/tools/cluster_setup/K0S_README.md +++ b/tools/cluster_setup/K0S_README.md @@ -16,6 +16,7 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters. - [Advanced Topics](#advanced-topics) - [Troubleshooting](#troubleshooting) - [Security](#security) +- [Internet Dependencies](#internet-dependencies) - [Migration Guide](#migration-guide) --- @@ -24,10 +25,11 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters. 
The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform on k0s Kubernetes, supporting: -- **On-premises deployments** with existing hardware -- **Bare metal servers** with customer-managed infrastructure -- **AWS EC2 instances** for testing and simulation -- **Air-gapped environments** with MinIO object storage +- **Bare metal / on-premises deployments** with existing hardware and SSH access +- **AWS EC2 instances** for testing and simulation (auto-creates instances) +- **External S3-compatible object storage** (SeaweedFS, MinIO, or any S3-compatible endpoint) +- **In-cluster MinIO** as a fallback when no external storage is configured +- **Air-gapped environments** with private registries ### What is k0s? @@ -47,8 +49,8 @@ The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform o ### What Works Without AWS -✅ **Complete AI Platform Stack** - All features work in pure on-prem environments -✅ **MinIO Object Storage** - Replaces AWS S3, runs entirely in your cluster +✅ **Complete AI Platform Stack** - All features (SAIA, Slim, SECA) work in pure on-prem environments +✅ **Flexible Object Storage** - External SeaweedFS/MinIO/S3-compatible, or in-cluster MinIO ✅ **No Cloud Dependencies** - No AWS services required ✅ **Air-Gapped Support** - Can run completely disconnected from the internet ✅ **Private Registries** - Use your own container registry instead of ECR @@ -123,52 +125,74 @@ The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform o - **Pod Network (10.244.0.0/16)**: Calico VXLAN overlay network - **Service Network (10.96.0.0/16)**: Kubernetes ClusterIP services - All pod-to-pod communication happens over VXLAN (no cloud networking) -- MinIO storage is local to the cluster (no S3) +- Object storage is either external to the cluster (SeaweedFS, MinIO, or any S3-compatible endpoint) or in-cluster MinIO as a fallback ### Configuration Example (Pure On-Premises) ```yaml cluster: name: onprem-ai-cluster - region: us-west-2 # Ignored
for on-prem, but required in config sshUser: ubuntu sshKeyPath: ~/.ssh/onprem-key nodes: controllers: 1 - cpuWorkers: 0 # Not used with existingIPs - gpuWorkers: 0 # Not used with existingIPs + cpuWorkers: 2 # First 2 workers are CPU + gpuWorkers: 2 # Remaining 2 workers are GPU existingIPs: controllers: - 10.0.1.10 # Your controller server IP workers: - - 10.0.1.20 # CPU worker 1 - - 10.0.1.21 # CPU worker 2 - - 10.0.1.30 # GPU worker 1 - - 10.0.1.31 # GPU worker 2 - -minio: - accessKey: minio-admin - secretKey: SuperSecurePassword123! - bucket: ai-platform-data + - 10.0.1.20 # CPU worker 1 (index 0) + - 10.0.1.21 # CPU worker 2 (index 1) + - 10.0.1.30 # GPU worker 1 (index 2) + - 10.0.1.31 # GPU worker 2 (index 3) + +storage: + storageClass: "local-path" + vectorDbSize: "100Gi" + objectStore: + type: "minio" # External MinIO endpoint + bucket: "ai-platform-data" + endpoint: "http://10.0.1.50:9000" + auth: + rootUser: "minio-admin" + rootPassword: "SuperSecurePassword123!" + +images: + registry: "registry.yourcompany.com" + operator: + image: "registry.yourcompany.com/splunk/splunk-ai-operator:v0.1.5" + splunk: + image: "registry.yourcompany.com/splunk/splunk:latest" + operatorImage: "registry.yourcompany.com/splunk/splunk-operator:3.0.0" + ray: + headImage: "registry.yourcompany.com/ray/ray-head:build-v1alpha1" + workerImage: "registry.yourcompany.com/ray/ray-worker-gpu:build-v1alpha1" + weaviate: + image: "registry.yourcompany.com/weaviate:stable-v1.28" + saia: + apiImage: "registry.yourcompany.com/saia/saia-api:build-v1alpha1" + dataLoaderImage: "registry.yourcompany.com/saia/saia-data-loader:build-v1alpha1" + slim: + apiImage: "registry.yourcompany.com/slim/slim-api:v0.0.1" kubernetes: namespace: ai-platform imagePullSecrets: secrets: - - private-registry-secret # Your private registry - autoCreateECR: false # No AWS ECR + - private-registry-secret + autoCreateECR: false -aiplatform: - vectordb: - storageSize: "100Gi" - workers: - cpu: - maxReplicas: 4 - gpu: 
- maxReplicas: 2 +aiPlatform: + name: "onprem-ai-stack" + features: + - name: "saia" + version: "1.1.0" + - name: "slim" + version: "1.0.0" ``` ### Installation Steps (Pure On-Premises) @@ -388,31 +412,33 @@ sudo iptables -A INPUT -p tcp --dport 179 -s 10.0.0.0/16 -j ACCEPT The script installs everything needed for the AI Platform: -1. **k0s Kubernetes Cluster** (v1.30+) - CNCF certified Kubernetes +1. **k0s Kubernetes Cluster** - CNCF certified, single-binary Kubernetes 2. **Calico CNI** - High-performance networking with VXLAN -3. **MinIO** - S3-compatible object storage (replaces AWS S3). The AI Platform also supports SeaweedFS and other S3-compatible stores via `s3compat://`, `minio://`, or `seaweedfs://`; see [Object storage](../../docs/configuration/object-storage.md) for path schemes and configuration. -4. **Cert-Manager** - Automated certificate management -5. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -6. **OpenTelemetry Operator** - Distributed tracing and telemetry -7. **NVIDIA GPU Operator** - GPU support for AI workloads (optional) -8. **KubeRay Operator** - Ray cluster management for distributed AI -9. **Splunk Operator** - Splunk Enterprise management -10. **Splunk AI Platform Operator** - AI platform orchestration -11. **AI Platform CR** - Complete AI deployment with features +3. **local-path Storage Provisioner** - Default StorageClass for PVCs +4. **Object Storage** - External S3-compatible (SeaweedFS/MinIO) or in-cluster MinIO +5. **Cert-Manager v1.13.0** - Automated certificate management +6. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana +7. **OpenTelemetry Operator** - Distributed tracing and telemetry +8. **NVIDIA Host Drivers + Device Plugin** - GPU support for AI workloads (optional, bare-metal driver install) +9. **KubeRay Operator v1.0.0** - Ray cluster management for distributed AI +10. **Splunk Operator** - Splunk Enterprise management +11. 
**Splunk AI Platform Operator** - AI platform orchestration (SAIA, Slim, SECA features) +12. **AIPlatform CR** - Complete AI deployment with features, scheduling, and secrets ### Two Deployment Modes -#### Mode 1: On-Premises/Baremetal ✅ -- Provide existing IP addresses +#### Mode 1: Bare Metal / On-Premises +- Provide existing IP addresses in `nodes.existingIPs` +- Script SSHs into each node and installs k0s, NVIDIA drivers (if GPU), iptables, and PyYAML - Passwordless SSH with sudo access required - Production-ready for on-prem deployments -- Air-gapped support with MinIO +- Air-gapped support with private registries -#### Mode 2: AWS EC2 (Testing) 🧪 -- Automatically creates EC2 instances -- Simulates on-prem environment -- Quick setup for testing/validation -- Uses AWS networking +#### Mode 2: AWS EC2 (Testing / Simulation) +- Automatically creates EC2 instances (controller, CPU workers, GPU workers) +- Creates or reuses a Security Group with required k0s ports open +- User-data bootstraps nodes with `curl`, `wget`, `jq`, and k0s binary +- Quick setup for testing and validation before on-prem rollout ### Image Pull Secrets Support 🔐 @@ -488,7 +514,7 @@ Open the following ports between nodes: - Sufficient EC2 quotas: - t3.xlarge (controllers): 1+ instances - m5.4xlarge (CPU workers): 2+ instances - - g5.2xlarge (GPU workers): 1+ instances + - g5.2xlarge (GPU workers): 2+ instances **Verify AWS Access:** ```bash @@ -561,23 +587,26 @@ kubectl get pods --all-namespaces The `k0s-cluster-config.yaml` file controls all aspects of the deployment: ```yaml -cluster: # Cluster-wide settings -nodes: # Node configuration -ec2: # AWS EC2 settings (if using EC2 mode) -instanceTypes: # EC2 instance types -minio: # MinIO object storage -kubernetes: # Kubernetes settings -splunk: # Splunk configuration -ecr: # ECR configuration -imagePullSecrets: # Private registry secrets -aiplatform: # AI Platform settings +cluster: # Cluster name, useExisting, region, SSH user/key +nodes: # 
Controller/worker counts and existingIPs +storage: # storageClass, vectorDbSize, objectStore (type/endpoint/auth) +images: # registry prefix, operator, splunk, ray, weaviate, saia, slim, fluentBit, otelCollector +operators: # ray (version/modelVersion/rayVersion), certManager, nvidia devicePluginVersion +kubernetes: # namespace +files: # splunkOperator, aiPlatform manifest paths +splunk: # standaloneName +aiPlatform: # defaultAcceleratorType, workerGroupConfig, features, scheduling +imagePullSecrets: # secrets list, autoCreateECR, dockerHub, gcr, acr, custom +ecr: # account, region +ec2: # vpcId, subnetId, keyName (EC2 mode) +instanceTypes: # controller, cpuWorker, gpuWorker (EC2 mode) ``` ### Configuration Examples -#### Example 1: On-Premises Production Cluster +#### Example 1: On-Premises / Bare Metal Production Cluster -**Use Case:** Production deployment on existing hardware +**Use Case:** Production deployment on existing hardware with external object storage ```yaml cluster: @@ -587,43 +616,65 @@ cluster: nodes: controllers: 1 - cpuWorkers: 0 # Ignored when using existingIPs - gpuWorkers: 0 # Ignored when using existingIPs + cpuWorkers: 2 # First 2 workers treated as CPU + gpuWorkers: 2 # Remaining 2 workers treated as GPU existingIPs: controllers: - 10.0.1.10 # Physical server 1 workers: - - 10.0.1.20 # Physical server 2 (CPU) - - 10.0.1.21 # Physical server 3 (CPU) - - 10.0.1.22 # Physical server 4 (GPU) - - 10.0.1.23 # Physical server 5 (GPU) - -minio: - accessKey: admin - secretKey: Change-This-Strong-Password-123! 
- bucket: ai-platform-production + - 10.0.1.20 # Physical server 2 (CPU - worker index 0) + - 10.0.1.21 # Physical server 3 (CPU - worker index 1) + - 10.0.1.22 # Physical server 4 (GPU - worker index 2) + - 10.0.1.23 # Physical server 5 (GPU - worker index 3) + +storage: + storageClass: "local-path" + vectorDbSize: "200Gi" + objectStore: + type: "seaweedfs" + bucket: "ai-platform-production" + endpoint: "http://10.0.1.50:8333" + auth: + rootUser: "admin" + rootPassword: "Change-This-Strong-Password-123!" + +images: # TODO update images with released versions (from docker.io / how ?) + registry: "registry.corp.com" + operator: + image: "registry.corp.com/splunk/splunk-ai-operator:v0.1.5" + splunk: + image: "registry.corp.com/splunk/splunk:latest" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + ray: + headImage: "registry.corp.com/ray/ray-head:build-v1alpha1" + workerImage: "registry.corp.com/ray/ray-worker-gpu:build-v1alpha1" + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28" + saia: + apiImage: "registry.corp.com/saia/saia-api:build-v1alpha1" + dataLoaderImage: "registry.corp.com/saia/saia-data-loader:build-v1alpha1" + slim: + apiImage: "registry.corp.com/slim/slim-api:v0.0.1" kubernetes: namespace: ai-platform splunk: standaloneName: splunk-prod - index: ai-platform imagePullSecrets: secrets: - - ecr-registry-secret - autoCreateECR: false # Manually create in air-gapped + - private-registry-secret + autoCreateECR: false -aiplatform: - vectordb: - storageSize: "200Gi" # Large storage for production - workers: - cpu: - maxReplicas: 8 - gpu: - maxReplicas: 4 +aiPlatform: + name: "prod-ai-stack" + features: + - name: "saia" + version: "1.1.0" + - name: "slim" + version: "1.0.0" ``` #### Example 2: AWS EC2 Testing Cluster @@ -634,22 +685,20 @@ aiplatform: cluster: name: test-ai-platform region: us-west-2 - useExisting: auto - sshUser: ubuntu + sshUser: ec2-user sshKeyPath: ~/.ssh/test-key.pem nodes: controllers: 1 cpuWorkers: 2 
gpuWorkers: 1 - existingIPs: controllers: [] # Empty = auto-create EC2 workers: [] # Empty = auto-create EC2 ec2: vpcId: vpc-0123456789abcdef0 - subnetId: "" # Auto-select first available + subnetId: "" keyName: test-key instanceTypes: @@ -657,17 +706,29 @@ instanceTypes: cpuWorker: m5.2xlarge gpuWorker: g5.xlarge +storage: + storageClass: "local-path" + vectorDbSize: "50Gi" + objectStore: + type: "minio" + bucket: "ai-platform-test" + endpoint: "http://minio-host:9000" + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin123" + +images: + registry: "123456789012.dkr.ecr.us-west-2.amazonaws.com" + operator: + image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk-ai-operator:latest" + ecr: - account: "123456789012" # Your AWS account ID + account: "123456789012" # Your AWS account ID + region: us-west-2 imagePullSecrets: - secrets: [] # Auto-added when autoCreateECR=true - autoCreateECR: true # Automatically create ECR secret - -minio: - accessKey: minioadmin - secretKey: minioadmin123 - bucket: ai-platform-test + secrets: [] + autoCreateECR: true kubernetes: namespace: ai-platform @@ -686,16 +747,17 @@ cluster: nodes: controllers: 1 - cpuWorkers: 2 # Will create 2 new EC2 CPU workers - gpuWorkers: 0 # No new GPU workers + cpuWorkers: 2 # First 2 workers are CPU (on-prem), + 2 EC2 CPU workers created + gpuWorkers: 2 # Remaining 2 on-prem workers are GPU existingIPs: controllers: - 192.168.1.10 # Existing on-prem controller workers: - - 192.168.1.20 # Existing GPU worker 1 - - 192.168.1.21 # Existing GPU worker 2 - # + 2 CPU workers will be created in EC2 + - 192.168.1.20 # Existing on-prem worker (CPU - index 0) + - 192.168.1.21 # Existing on-prem worker (CPU - index 1) + - 192.168.1.30 # Existing on-prem worker (GPU - index 2) + - 192.168.1.31 # Existing on-prem worker (GPU - index 3) ec2: vpcId: vpc-0123456789abcdef0 @@ -720,8 +782,8 @@ cluster: nodes: controllers: 3 # HA setup - cpuWorkers: 0 - gpuWorkers: 0 + cpuWorkers: 2 # First 2 workers are CPU 
+ gpuWorkers: 1 # Last worker is GPU existingIPs: controllers: @@ -729,14 +791,25 @@ nodes: - 172.16.0.11 - 172.16.0.12 workers: - - 172.16.0.20 - - 172.16.0.21 - - 172.16.0.22 - -minio: - accessKey: secure-admin - secretKey: Very-Long-Secure-Password-456! - bucket: airgap-storage + - 172.16.0.20 # CPU + - 172.16.0.21 # CPU + - 172.16.0.22 # GPU + +storage: + storageClass: "local-path" + vectorDbSize: "100Gi" + objectStore: + type: "minio" + bucket: "airgap-storage" + endpoint: "http://172.16.0.50:9000" + auth: + rootUser: "secure-admin" + rootPassword: "Very-Long-Secure-Password-456!" + +images: + registry: "registry.airgap.local" + operator: + image: "registry.airgap.local/splunk-ai-operator:v0.1.5" imagePullSecrets: secrets: @@ -744,6 +817,7 @@ imagePullSecrets: autoCreateECR: false # Note: Pre-pull all images to local registry before installation +# See the "Internet Dependencies" section for the full list of images ``` ### Configuration Reference @@ -759,7 +833,9 @@ cluster: # Options: auto (detect), force (fail if not found), never (always create) useExisting: auto - # AWS region (required for EC2 mode) + # AWS region. Required for EC2 mode. Also used as fallback for ecr.region + # when pulling images from ECR (even in bare-metal mode). + # Not needed for pure on-prem with no AWS. region: us-west-2 # SSH configuration @@ -774,29 +850,152 @@ nodes: # Number of controller nodes (1 or 3 for HA) controllers: 1 - # Number of CPU worker nodes (only for EC2 mode) + # Number of CPU workers. In EC2 mode: instances to create. + # In bare-metal mode: first N entries in workers[] are CPU, rest are GPU. + # Controls node labeling, NVIDIA driver install, and GPU device plugin. cpuWorkers: 2 - # Number of GPU worker nodes (only for EC2 mode) + # Number of GPU workers. In EC2 mode: instances to create. + # In bare-metal mode: workers after the first cpuWorkers are treated as GPU. 
gpuWorkers: 1 - # Existing IP addresses (on-prem mode) + # Existing IP addresses (bare-metal / on-prem mode) existingIPs: controllers: [] # Leave empty for EC2 auto-creation workers: [] # Leave empty for EC2 auto-creation ``` +#### Storage Section + +```yaml +storage: + storageClass: "local-path" # Kubernetes StorageClass for PVCs + vectorDbSize: "50Gi" # Weaviate PersistentVolume size + + objectStore: + type: "seaweedfs" # aws | s3compat | minio | seaweedfs + bucket: "ai-platform-bucket" # S3 bucket name + endpoint: "http://host:8333" # S3-compatible endpoint URL + auth: + rootUser: "admin" # Access key / root user + rootPassword: "password" # Secret key / root password +``` + +#### Images Section + +Short image paths (without a FQDN) are automatically prefixed with `images.registry`. + +```yaml +images: + registry: "myregistry.com" # Prefix applied to short image paths + operator: + image: "myregistry.com/splunk-ai-operator:v0.1.5" + splunk: + image: "myregistry.com/splunk:latest" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + ray: + headImage: "ray/ray-head:build-v1alpha1" + workerImage: "ray/ray-worker-gpu:build-v1alpha1" + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28" + saia: + apiImage: "saia/saia-api:build-v1alpha1" + dataLoaderImage: "saia/saia-data-loader:build-v1alpha1" + slim: + apiImage: "myregistry.com/slim-api:v0.0.1" + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" +``` + +**Image patching chain:** The script reads these config values, resolves them via `build_image_url()` (prepends registry if needed), then uses `sed` to patch the corresponding `RELATED_IMAGE_*` env vars in the manifest files: + +| Config field | Env var patched | Target file | +|---|---|---| +| `images.operator.image` | Container `image:` field | `artifacts.yaml` | +| `images.splunk.image` | `RELATED_IMAGE_SPLUNK_ENTERPRISE` | 
`splunk-operator-cluster.yaml` only | +| `images.splunk.operatorImage` | Container `image:` field | `splunk-operator-cluster.yaml` | +| `images.ray.headImage` | `RELATED_IMAGE_RAY_HEAD` | `artifacts.yaml` | +| `images.ray.workerImage` | `RELATED_IMAGE_RAY_WORKER` | `artifacts.yaml` | +| `images.weaviate.image` | `RELATED_IMAGE_WEAVIATE` | `artifacts.yaml` | +| `images.saia.apiImage` | `RELATED_IMAGE_SAIA_API` | `artifacts.yaml` | +| `images.saia.dataLoaderImage` | `RELATED_IMAGE_POST_INSTALL_HOOK` | `artifacts.yaml` | +| `images.slim.apiImage` | `RELATED_IMAGE_SLIM_API` | `artifacts.yaml` | +| `images.fluentBit.image` | `RELATED_IMAGE_FLUENT_BIT` | `artifacts.yaml` | +| `images.otelCollector.image` | `RELATED_IMAGE_OTEL_COLLECTOR` | `artifacts.yaml` | +| `operators.ray.modelVersion` | `MODEL_VERSION` | `artifacts.yaml` | +| `operators.ray.rayVersion` | `RAY_VERSION` | `artifacts.yaml` | + +> **Note:** `RELATED_IMAGE_SPLUNK_ENTERPRISE` also exists in `artifacts.yaml` but is only +> patched in `splunk-operator-cluster.yaml`. `SPLUNK_METRICS_INDEX_NAME` in `artifacts.yaml` +> is not configurable from the config file. + +#### Operators Section + +```yaml +operators: + ray: + version: "v1.2.2" # KubeRay operator Helm chart version + modelVersion: "v0.3.14-36-g1549f5a" # Model version label for Ray + rayVersion: "2.44.0" # Ray runtime version + certManager: + installCRDs: true # Install cert-manager CRDs + nvidia: + devicePluginVersion: "v0.17.3" # NVIDIA k8s device plugin version +``` + +#### AI Platform Section + +```yaml +aiPlatform: + defaultAcceleratorType: "L40S" # GPU tier: L40S, H100_NVL, or "" + workerGroupConfig: + imageRegistry: "" # Override registry for Ray worker images + # Note: name, features, cpuScheduling, gpuScheduling are defined + # for reference but currently hardcoded in the script's CR template +``` + #### Image Pull Secrets Section +The `secrets` list is **not consumed** by the script. 
Instead, the script auto-detects +which secrets exist in the namespace by checking for hardcoded names: `ecr-registry-secret`, +`docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`. + ```yaml imagePullSecrets: - # List of secret names to use - secrets: + secrets: # Pre-existing secret names; NOT consumed; script auto-detects secrets in namespace - ecr-registry-secret - docker-hub-secret - - # Auto-create ECR secret - autoCreateECR: true # Requires AWS credentials + autoCreateECR: true # Auto-create ECR secret from AWS creds + + # Docker Hub (optional) + dockerHub: + enabled: false + username: "" + password: "" + email: "" + + # Google Container Registry (optional) + gcr: + enabled: false + jsonKey: "" # GCP service account JSON key + + # Azure Container Registry (optional) + acr: + enabled: false + registry: "" # e.g. myregistry.azurecr.io + username: "" + password: "" + + # Custom Docker-compatible registry (optional) + custom: + enabled: false + name: "custom-registry-secret" + server: "" + username: "" + password: "" + email: "" ``` --- @@ -812,11 +1011,11 @@ CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install # Delete entire cluster CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh delete -# Health check -CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh health +# Clean all k0s state from bare-metal nodes (stop/reset/remove) +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh clean-all -# Get cluster info -CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh info +# Join additional workers to an existing cluster +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` ### Advanced Commands @@ -825,15 +1024,8 @@ CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh info # Install without confirmation prompts AUTO_APPROVE=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install -# Skip specific components -SKIP_MINIO=true CONFIG_FILE=./my-config.yaml 
./k0s_cluster_with_stack.sh install -SKIP_GPU_OPERATOR=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install - -# Use existing cluster (skip k0s installation) +# Use existing cluster (skip k0s installation, deploy stack only) USE_EXISTING=force CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install - -# Join additional workers -CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` ### Post-Installation Tasks @@ -1018,7 +1210,7 @@ graph TB subgraph "AI Platform Namespace" AIPLATFORM[AIPlatform CR
Custom Resource] - AISERVICE[AIService CRs
saia, dspy, etc.] + AISERVICE[AIService CRs
saia, slim, seca] RAYSERVICE[RayService
Ray Serve + Cluster] RAYCLUSTER[RayCluster
Head + Workers] WEAVIATE[Weaviate
Vector Database] @@ -1198,6 +1390,7 @@ graph TB subgraph "AI Services" SAIA[AIService: saia
Splunk AI Assistant] + SLIM[AIService: slim
Slim API] end subgraph "Ray Infrastructure" @@ -1230,9 +1423,8 @@ graph TB end end - subgraph "gpu-operator Namespace" - GPUOP[NVIDIA GPU Operator] - GPUPLUGIN[NVIDIA Device Plugin] + subgraph "kube-system (GPU)" + GPUPLUGIN[NVIDIA Device Plugin
DaemonSet] end end @@ -1281,7 +1473,6 @@ graph TB RAYWORKER1 -->|sends traces| OTELCOL OTELCOL -->|forwards to| SPLUNK - GPUOP -->|installs| GPUPLUGIN GPUPLUGIN -->|provides GPUs to| RAYWORKER2 style AIOP fill:#e1f5ff,stroke:#01579b,stroke-width:3px @@ -1604,27 +1795,6 @@ EOF ### Backup and Restore -#### Backup MinIO Data - -```bash -# Install MinIO client -wget https://dl.min.io/client/mc/release/linux-amd64/mc -chmod +x mc -sudo mv mc /usr/local/bin/ - -# Configure alias -mc alias set k0s-minio \ - http://localhost:9000 \ - minioadmin \ - minioadmin123 - -# Backup bucket -mc mirror k0s-minio/ai-platform-bucket ./backup/minio-data - -# Backup configuration -kubectl get secret -n minio-system minio-creds -o yaml > backup/minio-secret.yaml -``` - #### Backup etcd ```bash @@ -1643,9 +1813,6 @@ scp ubuntu@controller-ip:/tmp/etcd-backup.db ./backup/ scp ./backup/etcd-backup.db ubuntu@controller-ip:/tmp/ ssh ubuntu@controller-ip sudo k0s etcd snapshot restore /tmp/etcd-backup.db - -# Restore MinIO data -mc mirror ./backup/minio-data k0s-minio/ai-platform-bucket ``` --- @@ -1806,13 +1973,11 @@ kubectl logs -n local-path-storage deployment/local-path-provisioner #### GPU Not Detected -```bash -# Check GPU operator pods -kubectl get pods -n gpu-operator +The script installs NVIDIA host drivers and the device plugin DaemonSet directly (not the GPU Operator). -# All pods should be Running -# If not, check logs: -kubectl logs -n gpu-operator deployment/gpu-operator +```bash +# Check NVIDIA device plugin pods +kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds # Check node GPU resources kubectl get nodes -o json | jq '.items[].status.capacity | select(.["nvidia.com/gpu"] != null)' @@ -2106,6 +2271,76 @@ EOF --- +## Internet Dependencies + +The script downloads various binaries, manifests, Helm charts, OS packages, and container images from the internet. 
This section lists every external download grouped by where it occurs, which is important for air-gapped planning and security review. + +### Downloads from the Local Machine (where the script runs) + +| What | URL / Source | +|------|-------------| +| Public IP detection | `https://checkip.amazonaws.com`, `https://ipinfo.io/ip`, `https://api.ipify.org` | +| cert-manager manifest | `https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml` | +| NVIDIA k8s device plugin | `https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin//deployments/static/nvidia-device-plugin.yml` | +| local-path-provisioner | `https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.24/deploy/local-path-storage.yaml` | +| Prometheus Helm repo | `https://prometheus-community.github.io/helm-charts` | +| kube-prometheus-stack chart | `prometheus-community/kube-prometheus-stack` (via `helm install`) | +| OpenTelemetry Helm repo | `https://open-telemetry.github.io/opentelemetry-helm-charts` | +| OpenTelemetry Operator chart | `open-telemetry/opentelemetry-operator` (via `helm install`) | +| KubeRay Helm repo | `https://ray-project.github.io/kuberay-helm/` | +| KubeRay Operator chart | `kuberay/kuberay-operator` version `1.0.0` (via `helm install`) | + +### Downloads on EC2 Nodes (user-data bootstrap, EC2 mode only) + +| What | URL / Source | +|------|-------------| +| System packages | `apt-get install -y curl wget jq` | +| k0s binary | `curl -sSLf https://get.k0s.sh \| sh` | + +### Downloads on All Nodes via SSH (bare-metal & EC2) + +| What | URL / Source | +|------|-------------| +| iptables-nft | `dnf install -y iptables-nft` (RHEL/Fedora, if missing) | +| python3-pyyaml | `dnf install -y python3-pyyaml` or `apt-get install -y python3-yaml` or `pip3 install pyyaml` | +| k0s binary | `curl -sSLf https://get.k0s.sh \| sudo sh` (if not already installed) | + +### Downloads on GPU Worker Nodes via SSH + +| What | URL / Source | 
+|------|-------------| +| Kernel headers | `dnf/yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)` or `apt-get install linux-headers-$(uname -r)` | +| NVIDIA GPU driver (AL2023) | Repo: `https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo` | +| NVIDIA GPU driver (RHEL 9/10) | Repo: `https://developer.download.nvidia.com/compute/cuda/repos/rhel{9,10}/x86_64/...` | +| NVIDIA GPU driver (Ubuntu) | `https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb` + `nvidia-driver-550` | +| EPEL for dkms (RHEL 10) | `https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm` | +| NVIDIA Container Toolkit | Repo: `https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo`, GPG: `https://nvidia.github.io/libnvidia-container/gpgkey` | + +### Container Images Pulled by Kubernetes at Runtime + +These images are pulled from registries when pods are scheduled. They can be pre-pulled for air-gapped environments. 
+ +| Image | Default Source | +|-------|---------------| +| Splunk AI Operator | ECR or configured registry | +| Ray Head / Ray Worker GPU | ECR or configured registry | +| Weaviate | `docker.io/semitechnologies/weaviate:...` | +| SAIA API / Data Loader | ECR or configured registry | +| Slim API | ECR or configured registry | +| Fluent Bit | `docker.io/fluent/fluent-bit:1.9.6` | +| OpenTelemetry Collector | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` | +| Splunk Enterprise | ECR or configured registry | +| Splunk Operator | `docker.io/splunk/splunk-operator:3.0.0` | +| MinIO + MinIO Client | `minio/minio:latest`, `minio/mc:latest` (only when in-cluster MinIO is deployed) | +| Prometheus, Grafana, Alertmanager | Pulled by kube-prometheus-stack Helm chart | +| KubeRay Operator | Pulled by kuberay Helm chart | +| OpenTelemetry Operator | Pulled by opentelemetry-operator Helm chart | +| cert-manager (controller, webhook, cainjector) | Pulled by cert-manager manifest | +| NVIDIA device plugin | Pulled by DaemonSet manifest | +| local-path-provisioner | Pulled by provisioner manifest | + +--- + ## Migration Guide ### From EKS to k0s @@ -2245,21 +2480,110 @@ See the main repository LICENSE file. cluster: name: my-cluster # Cluster identifier useExisting: auto # auto|force|never - region: us-west-2 # AWS region (EC2 mode) - sshUser: ubuntu # SSH username - sshKeyPath: ~/.ssh/key.pem # SSH private key + region: us-west-2 # EC2 mode + ECR fallback region (not needed for pure on-prem) + sshUser: ubuntu # SSH username for node access + sshKeyPath: ~/.ssh/key.pem # SSH private key path nodes: controllers: 1 # 1 or 3 for HA - cpuWorkers: 2 # For EC2 mode - gpuWorkers: 1 # For EC2 mode + cpuWorkers: 2 # EC2: create count. Bare metal: first N workers = CPU + gpuWorkers: 1 # EC2: create count. 
Bare metal: remaining workers = GPU existingIPs: - controllers: [] # Empty = create EC2 - workers: [] # Or list of IPs + controllers: [] # Empty = create EC2, or list of IPs (bare metal) + workers: [] # Empty = create EC2, or list of IPs (bare metal) + +# --- Storage --- +storage: + storageClass: "local-path" # StorageClass for PVCs + vectorDbSize: "50Gi" # Weaviate PV size + objectStore: + type: "seaweedfs" # aws | s3compat | minio | seaweedfs + bucket: "ai-platform-bucket" # S3 bucket name + endpoint: "http://host:8333" # S3-compatible endpoint + auth: + rootUser: "admin" # Access key + rootPassword: "password" # Secret key + +# --- Container Images --- +images: + registry: "myregistry.com" # Registry prefix for short image paths + operator: + image: "myregistry.com/splunk-ai-operator:v0.1.5" + splunk: + image: "myregistry.com/splunk:latest" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + ray: + headImage: "myregistry.com/ray/ray-head:build-v1alpha1" + workerImage: "myregistry.com/ray/ray-worker-gpu:build-v1alpha1" + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28" + saia: + apiImage: "myregistry.com/saia/saia-api:build-v1alpha1" + dataLoaderImage: "myregistry.com/saia/saia-data-loader:build-v1alpha1" + slim: + apiImage: "myregistry.com/slim-api:v0.0.1" + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + +# --- Operator Versions --- +operators: + ray: + version: "v1.2.2" # KubeRay operator chart version + modelVersion: "v0.3.14-36-g1549f5a" # Model version label + rayVersion: "2.44.0" # Ray runtime version + certManager: + installCRDs: true + nvidia: + devicePluginVersion: "v0.17.3" # NVIDIA k8s device plugin tag + +# --- Kubernetes --- +kubernetes: + namespace: ai-platform # AI Platform namespace + +# --- File Paths --- +files: + splunkOperator: "./splunk-operator-cluster.yaml" # Splunk Operator manifest path + aiPlatform: 
"./artifacts.yaml" # AI Operator manifest path + +# --- Splunk --- +splunk: + standaloneName: splunk-standalone # Splunk Standalone CR name + +# --- AI Platform --- +# NOTE: defaultAcceleratorType and workerGroupConfig.imageRegistry are consumed +# by the script. The remaining fields are NOT consumed and are hardcoded in +# the AIPlatform CR template inside the script: +# - name: hardcoded as "${CLUSTER_NAME}-ai-platform" +# - features: hardcoded to only "saia" (slim/seca must be added manually) +# - cpuScheduling/gpuScheduling: hardcoded with node selectors +# - objectStorage.region: hardcoded to "us-east-1" +aiPlatform: + name: "splunk-ai-stack" # Reference only; NOT consumed; CR name = ${CLUSTER_NAME}-ai-platform + defaultAcceleratorType: "L40S" # Consumed → AIPlatform CR spec + workerGroupConfig: + imageRegistry: "" # Override registry for Ray worker images + features: # Reference only (hardcoded in script) + - name: "saia" + version: "1.1.0" + - name: "slim" + version: "1.0.0" + cpuScheduling: # Reference only (hardcoded in script) + nodeSelector: {} + tolerations: [] + gpuScheduling: # Reference only (hardcoded in script) + nodeSelector: {} + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" +# --- EC2 Mode (optional) --- ec2: - vpcId: vpc-xxx # Required for EC2 - subnetId: subnet-xxx # Optional + vpcId: vpc-xxx # Required for EC2 mode + subnetId: subnet-xxx # Optional, auto-selects first available keyName: my-key # AWS key pair name instanceTypes: @@ -2267,48 +2591,45 @@ instanceTypes: cpuWorker: m5.4xlarge # 16 CPU, 64GB RAM gpuWorker: g5.2xlarge # 8 CPU, 24GB RAM, A10G GPU -minio: - accessKey: admin # MinIO admin user - secretKey: password123 # MinIO admin password - bucket: ai-platform-data # Default bucket - -kubernetes: - namespace: ai-platform # AI Platform namespace - -splunk: - standaloneName: splunk-standalone # Splunk instance name - hecEndpoint: "" # Optional external HEC - hecToken: "" # Optional 
HEC token - index: ai-platform # Splunk index name +# --- Image Pull Secrets --- +# NOTE: secrets[] list is NOT consumed by the script. The script auto-detects +# which secrets exist in the namespace by checking hardcoded names: +# ecr-registry-secret, docker-hub-secret, gcr-secret, acr-secret, custom-registry-secret. +imagePullSecrets: + secrets: [] # NOT consumed; script auto-detects in namespace + autoCreateECR: true # Consumed → creates ECR secret from AWS creds + + # Docker Hub private registry + dockerHub: + enabled: false + username: "" + password: "" # Use token, not plaintext password + email: "" + + # Google Container Registry + gcr: + enabled: false + jsonKey: "" # GCP service account JSON key + + # Azure Container Registry + acr: + enabled: false + registry: "" # e.g. myregistry.azurecr.io + username: "" + password: "" + + # Custom Docker-compatible registry + custom: + enabled: false + name: "custom-registry-secret" # Secret name to create + server: "" # Registry URL + username: "" + password: "" + email: "" ecr: account: "123456789012" # AWS account ID - -imagePullSecrets: - secrets: [] # Manual secret names - autoCreateECR: true # Auto-create ECR secret - -aiplatform: - ray: - version: "2.9.0" - image: "rayproject/ray:2.9.0" - vectordb: - image: "semitechnologies/weaviate:1.28.0" - storageSize: "50Gi" - workers: - cpu: - minReplicas: 1 - maxReplicas: 5 - resourcesPerWorker: - cpu: "4" - memory: "16Gi" - gpu: - minReplicas: 0 - maxReplicas: 2 - resourcesPerWorker: - cpu: "8" - memory: "32Gi" - nvidia.com/gpu: "1" + region: us-east-2 # ECR region ``` ### Environment Variables @@ -2320,17 +2641,8 @@ CONFIG_FILE=./my-config.yaml # Skip confirmation prompts AUTO_APPROVE=true -# Use existing cluster +# Use existing cluster (skip k0s installation) USE_EXISTING=force - -# Skip components -SKIP_MINIO=true -SKIP_GPU_OPERATOR=true -SKIP_PROMETHEUS=true -SKIP_OTEL=true - -# Debug mode -DEBUG=true ``` ### Common Recipes @@ -2361,6 +2673,6 @@ CONFIG_FILE=dev.yaml 
AUTO_APPROVE=true ./k0s_cluster_with_stack.sh install --- -**Version:** 1.0 -**Last Updated:** 2024 +**Version:** 2.0 +**Last Updated:** February 2026 **Maintainer:** Splunk AI Platform Team diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index c44a36e..70d48d1 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5523,15 +5523,15 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: splunk/ai/ray/ray-head:build-17 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:9a24502-ai-tier - name: RELATED_IMAGE_RAY_WORKER - value: splunk/ai/ray/ray-worker-gpu:build-17 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:9a24502-ai-tier - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: splunk/ai/saia/saia-api:build-1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-006 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: splunk/ai/saia/saia-data-loader:build-1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-003 - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT @@ -5541,8 +5541,8 @@ spec: - name: MODEL_VERSION value: v0.3.14-36-g1549f5a - name: RAY_VERSION - value: 2.44.0 - image: splunk/ai/splunk-ai-operator:build-v1alpha1 + value: 2.53.0 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.9 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 258f43a..03a283b 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -1,119 +1,150 @@ # =================================================================== -# k0s Cluster Configuration 
Template for Splunk AI Platform +# k0s Cluster Configuration for Splunk AI Platform # =================================================================== -# IMPORTANT: This is a template file with placeholder values. -# Copy this file and replace ALL placeholder values with your actual resources. +# Mirrors cluster-config.yaml (EKS) but adapted for k0s on bare-metal / EC2. # # Quick Start: -# 1. Copy: cp k0s-cluster-config.yaml my-cluster-config.yaml -# 2. Edit: vi my-cluster-config.yaml +# 1. Copy: cp k0s-cluster-config.yaml my-k0s-config.yaml +# 2. Edit: vi my-k0s-config.yaml # 3. Replace all values marked with "CHANGE THIS" -# 4. Run: CONFIG_FILE=./my-cluster-config.yaml ./k0s_cluster_with_stack.sh install +# 4. Run: CONFIG_FILE=./my-k0s-config.yaml ./k0s_cluster_with_stack.sh install # =================================================================== # ---------- Cluster Configuration ---------- cluster: - name: "my-ai-cluster" # CHANGE THIS: Your cluster name - useExisting: "auto" # auto | force | never - region: "us-east-2" # CHANGE THIS: AWS region (used when creating EC2 instances) - sshUser: "ubuntu" # SSH username for nodes - sshKeyPath: "~/.ssh/id_rsa" # CHANGE THIS: Path to SSH private key + name: airgap-cluster + # region: us-east-2 # EC2 mode + ECR fallback region; not needed for pure on-prem + sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes + sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- -# -# GPU TYPE QUICK REFERENCE — set gpuWorker instanceType and defaultAcceleratorType together: -# -# L40S (default): -# gpuWorker instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) -# defaultAcceleratorType: L40S -# -# H100: -# gpuWorker instanceType: p5.4xlarge (8x H100 GPUs, 80 GB VRAM each) -# defaultAcceleratorType: H100 -# -# H100_NVL: -# gpuWorker instanceType: p4de.24xlarge (8x H100 NVL GPUs, 94 GB VRAM each) -# defaultAcceleratorType: H100_NVL -# -# On-premises 
(existing hardware): -# Set existingIPs below — instanceTypes are ignored when IPs are provided. -# The defaultAcceleratorType must still match the physical GPU in your nodes. -# nodes: - controllers: 1 # 1 (single) or 3 (HA) - cpuWorkers: 2 # Number of CPU worker nodes (EC2 mode only) - gpuWorkers: 1 # Number of GPU worker nodes (EC2 mode only) + controllers: 1 + cpuWorkers: 1 # Not used with existingIPs + gpuWorkers: 2 # Not used with existingIPs - # On-premises / existing nodes: provide IPs to skip EC2 instance creation. - # Leave lists empty to create new EC2 instances automatically. existingIPs: - controllers: [] # e.g. ["10.0.0.1"] or ["10.0.0.1", "10.0.0.2", "10.0.0.3"] for HA - workers: [] # e.g. ["10.0.1.1", "10.0.1.2", "10.0.2.1"] - -# ---------- EC2 Instance Types (ignored when existingIPs are set) ---------- -instanceTypes: - controller: "t3.xlarge" # Controller node (4 vCPU, 16 GB RAM) - cpuWorker: "m5.4xlarge" # CPU worker (16 vCPU, 64 GB RAM) - gpuWorker: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above - -# ---------- EC2 Network (required when creating EC2 instances) ---------- -ec2: - vpcId: "" # CHANGE THIS: your VPC ID (e.g. vpc-xxxxxxxxxxxxxxxxx) - subnetId: "" # CHANGE THIS: your subnet ID (e.g. 
subnet-xxxxxxxxxxxxxxxxx) - keyName: "" # CHANGE THIS: your EC2 key pair name - -# ---------- MinIO Object Storage ---------- -minio: - accessKey: "minioadmin" # CHANGE THIS: MinIO admin username - secretKey: "minioadmin" # CHANGE THIS: MinIO admin password - bucket: "ai-platform-data" # MinIO bucket name + controllers: + - 3.144.14.96 # CHANGE THIS: Your controller server IP + workers: + - 3.14.134.16 # CHANGE THIS: CPU worker 1 + - 13.59.78.115 # CHANGE THIS: GPU worker 1 + - 3.15.20.136 # CHANGE THIS: GPU worker 2 -# ---------- Kubernetes ---------- -kubernetes: - namespace: "ai-platform" # no change +# ---------- Storage Configuration ---------- +# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). +storage: + s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws + storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) + vectorDbSize: "50Gi" # VectorDB persistent volume size -# ---------- Splunk ---------- -splunk: - standaloneName: "splunk-standalone" # no change + objectStore: + # type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) + type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + # endpoint: "http://13.59.216.105:9000" # MinIO port 9000. For SeaweedFS use port 8333. + endpoint: "http://3.144.157.201:8333" + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" -# ---------- ECR (for private AWS image repositories) ---------- -ecr: - account: "" # CHANGE THIS: your AWS account ID (e.g. 
"123456789012") - # Leave empty to auto-detect from AWS CLI +# ---------- Container Images Configuration ---------- +images: + # Registry prefix - applied to images without a full registry path + registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry -# ---------- Image Pull Secrets ---------- -imagePullSecrets: - autoCreateECR: false # Set true to auto-create ECR pull secret - dockerHub: - enabled: false # Set true if images are on Docker Hub (private) - username: "" - password: "" - gcr: - enabled: false - acr: - enabled: false - custom: - enabled: false + operator: + # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.9" + + splunk: + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + + ray: + # headImage: "ml-platform/ray/ray-head:build-v1alpha1" + # headImage: "ml-platform/ray/ray-head:087e40e" + # headImage: "ml-platform/ray/ray-head:build-010" + headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" + + # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" + # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" + # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" + workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" + + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" + + saia: + # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" + apiImage: "ml-platform/saia/saia-api:build-006" + + # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" + + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + +# ---------- Operator Versions ---------- +operators: + ray: + version: "v1.2.2" + modelVersion: 
"v0.3.14-36-g1549f5a" + rayVersion: "2.53.0" + + certManager: + installCRDs: true + + nvidia: + devicePluginVersion: "v0.17.3" + +# ---------- Kubernetes ---------- +kubernetes: + namespace: ai-platform # ---------- File Paths ---------- files: - splunkOperator: "./splunk-operator-cluster.yaml" - aiPlatform: "./artifacts.yaml" + splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml" + aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml" + +# ---------- Splunk Configuration ---------- +splunk: + standaloneName: splunk-standalone # ---------- AI Platform Configuration ---------- aiPlatform: - namespace: "ai-platform" # no change - name: "splunk-ai-stack" # no change - - # Service Accounts - serviceAccounts: - rayHead: "ray-head-sa" # no change - rayWorker: "ray-worker-sa" # no change - saiaService: "saia-service-sa" # no change - - # Default accelerator type — must match a top-level key in instance.yaml. - # Must be changed in sync with instanceTypes.gpuWorker (see GPU TYPE QUICK REFERENCE above). 
- # L40S → gpuWorker: g6e.12xlarge - # H100 → gpuWorker: p5.4xlarge - # H100_NVL → gpuWorker: p4de.24xlarge + name: "splunk-ai-stack" defaultAcceleratorType: "L40S" + + workerGroupConfig: + imageRegistry: "" + + features: + - name: "saia" + version: "1.1.0" + + cpuScheduling: + nodeSelector: {} + tolerations: [] + + gpuScheduling: + nodeSelector: {} + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + +# ---------- Image Pull Secrets ---------- +imagePullSecrets: + secrets: + - ecr-registry-secret + autoCreateECR: true + +ecr: + account: "658391232643" + region: us-east-2 diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 4ac7787..08017fe 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -10,8 +10,11 @@ set -euo pipefail # 2. AWS EC2: Automatically create EC2 instances for testing # ============================================================================= -# --- Unset conflicting AWS credentials --- -unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_PROFILE 2>/dev/null || true +# --- AWS credentials handling --- +# Don't unset AWS credentials - they may be needed for ECR access in on-prem/air-gapped scenarios +# The original unset was to prevent conflicts, but it breaks SSO/assumed-role credentials +# If you need to clear credentials, do it explicitly before running the script +# unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_PROFILE 2>/dev/null || true # --- Non-interactive setup --- export AWS_PAGER="" @@ -45,7 +48,7 @@ helm_retry() { set -e if (( rc == 0 )); then printf "%s\n" "$out"; return 0; fi # Check for transient errors that should be retried - if grep -qiE 'timed out|operation timed out|i/o timeout|connection reset|TLS handshake timeout|could not get information about the resource' <<<"$out"; then + if grep -qiE 'timed out|operation timed 
out|i/o timeout|connection reset|TLS handshake timeout|could not get information about the resource|context deadline exceeded|not ready' <<<"$out"; then warn "Helm transient error (attempt $i/$tries). Retrying in ${backoff}s…" warn "$out" sleep "$backoff"; backoff=$(( backoff*2 )); (( i++ )) @@ -120,12 +123,25 @@ load_config() { CPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.cpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "m5.4xlarge") GPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.gpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "g5.2xlarge") - # MinIO configuration: prefer environment variables (secure); fall back to config - _minio_ak=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin") - _minio_sk=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") - MINIO_ACCESS_KEY="${MINIO_ACCESS_KEY:-$_minio_ak}" - MINIO_SECRET_KEY="${MINIO_SECRET_KEY:-$_minio_sk}" - MINIO_BUCKET=$(yq eval '.minio.bucket' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform-data") + # Storage configuration + STORAGE_CLASS=$(yq eval '.storage.storageClass // "local-path"' "${CONFIG_FILE}" 2>/dev/null || echo "local-path") + VECTORDB_SIZE=$(yq eval '.storage.vectorDbSize // "50Gi"' "${CONFIG_FILE}" 2>/dev/null || echo "50Gi") + + # Object storage: objectStore.type (aws | s3compat | minio | seaweedfs); default minio when unset + OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "minio"' "$CONFIG_FILE" 2>/dev/null || echo "minio")" + OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // "ai-platform-data"' "$CONFIG_FILE" 2>/dev/null || echo "ai-platform-data")" + OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$CONFIG_FILE" 2>/dev/null || echo "minioadmin")" + _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + 
USE_EXTERNAL_OBJ_STORE="false" + case "${OBJ_STORE_TYPE}" in s3compat|minio|seaweedfs) USE_EXTERNAL_OBJ_STORE="true"; esac + MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" + MINIO_BUCKET="${OBJ_STORE_BUCKET}" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" + + # Legacy compat: MINIO_NS for in-cluster MinIO (unused when external) + MINIO_NS="minio-system" # Kubernetes namespace AI_NS=$(yq eval '.kubernetes.namespace' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform") @@ -133,8 +149,33 @@ load_config() { # Splunk configuration AI_STANDALONE_NAME=$(yq eval '.splunk.standaloneName' "${CONFIG_FILE}" 2>/dev/null || echo "splunk-standalone") + # Container images + IMAGE_REGISTRY="$(yq eval '.images.registry // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + OPERATOR_IMAGE="$(yq eval '.images.operator.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SPLUNK_IMAGE="$(yq eval '.images.splunk.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SPLUNK_OPERATOR_IMAGE="$(yq eval '.images.splunk.operatorImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_HEAD_IMAGE="$(yq eval '.images.ray.headImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + + # Operator versions + MODEL_VERSION="$(yq eval '.operators.ray.modelVersion // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_RUNTIME_VERSION="$(yq eval '.operators.ray.rayVersion // "2.44.0"' "$CONFIG_FILE" 
2>/dev/null || echo "2.44.0")" + + # AI Platform CR configuration + DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + WORKER_IMAGE_REGISTRY=$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + + # NVIDIA device plugin version + NVIDIA_VERSION=$(yq eval '.operators.nvidia.devicePluginVersion // "v0.17.3"' "${CONFIG_FILE}" 2>/dev/null || echo "v0.17.3") + # ECR configuration (for private image repositories) ECR_ACCOUNT=$(yq eval '.ecr.account' "${CONFIG_FILE}" 2>/dev/null || echo "") + ECR_REGION=$(yq eval '.ecr.region // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") # Get AWS account if using EC2 if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then @@ -157,11 +198,12 @@ load_config() { SPLUNK_OPERATOR_FILE=$(yq eval '.files.splunkOperator' "${CONFIG_FILE}" 2>/dev/null || echo "./splunk-operator-cluster.yaml") SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") - # Default accelerator type (must match a key in instance.yaml: L40S | H100 | H100_NVL) - DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType' "${CONFIG_FILE}" 2>/dev/null || echo "") - [[ "$DEFAULT_ACCELERATOR" == "null" || -z "$DEFAULT_ACCELERATOR" ]] && DEFAULT_ACCELERATOR="L40S" - - log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}, accelerator=${DEFAULT_ACCELERATOR}" + log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}" + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Object storage: external S3-compatible (${OBJ_STORE_TYPE}), endpoint=${OBJ_STORE_ENDPOINT:-not set}, bucket=${OBJ_STORE_BUCKET}" + else + log "Object storage: AWS S3, bucket=${OBJ_STORE_BUCKET}" + fi if [[ -n "${ECR_ACCOUNT}" ]]; then log "ECR Account: ${ECR_ACCOUNT}" fi @@ -179,10 +221,152 @@ load_config() { fi } +# ====== IMAGE HELPERS ====== +build_image_url() { + local registry="$1" + local image_path="$2" + if 
[[ "$image_path" =~ ^([a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+|localhost)(:[0-9]+)?/.*:.+ ]]; then # already fully qualified: registry host is an FQDN, IPv4 address or localhost, each with an optional :port + echo "$image_path" + return 0 + fi + if [[ -n "$registry" && "$registry" != "null" ]]; then + echo "${registry}/${image_path}" + else + echo "$image_path" + fi +} + +validate_image_config() { + log "Validating image configuration..." + + if [[ -z "$OPERATOR_IMAGE" || "$OPERATOR_IMAGE" == "null" ]]; then + err "REQUIRED: images.operator.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SPLUNK_IMAGE" || "$SPLUNK_IMAGE" == "null" ]]; then + err "REQUIRED: images.splunk.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$RAY_HEAD_IMAGE" || "$RAY_HEAD_IMAGE" == "null" ]]; then + err "REQUIRED: images.ray.headImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$RAY_WORKER_IMAGE" || "$RAY_WORKER_IMAGE" == "null" ]]; then + err "REQUIRED: images.ray.workerImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$WEAVIATE_IMAGE" || "$WEAVIATE_IMAGE" == "null" ]]; then + err "REQUIRED: images.weaviate.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SAIA_API_IMAGE" || "$SAIA_API_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.apiImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SAIA_DATALOADER_IMAGE" || "$SAIA_DATALOADER_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.dataLoaderImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SPLUNK_OPERATOR_IMAGE" || "$SPLUNK_OPERATOR_IMAGE" == "null" ]]; then + SPLUNK_OPERATOR_IMAGE="docker.io/splunk/splunk-operator:3.0.0" + log "Using default Splunk Operator image: $SPLUNK_OPERATOR_IMAGE" + fi + if [[ -z "$FLUENT_BIT_IMAGE" || "$FLUENT_BIT_IMAGE" == "null" ]]; then + FLUENT_BIT_IMAGE="fluent/fluent-bit:1.9.6" + log "Using default Fluent Bit image: $FLUENT_BIT_IMAGE" + fi + if [[ -z "$OTEL_COLLECTOR_IMAGE" || "$OTEL_COLLECTOR_IMAGE" == "null" ]]; then + 
OTEL_COLLECTOR_IMAGE="otel/opentelemetry-collector-contrib:0.122.1" + log "Using default OpenTelemetry Collector image: $OTEL_COLLECTOR_IMAGE" + fi + if [[ -z "$MODEL_VERSION" || "$MODEL_VERSION" == "null" ]]; then + MODEL_VERSION="v0.3.14-36-g1549f5a" + log "Using default Model version: $MODEL_VERSION" + fi + if [[ -z "$RAY_RUNTIME_VERSION" || "$RAY_RUNTIME_VERSION" == "null" ]]; then + RAY_RUNTIME_VERSION="2.44.0" + log "Using default Ray runtime version: $RAY_RUNTIME_VERSION" + fi + + log "✓ Image configuration validated successfully" +} + +configure_images() { + log "Configuring container images in manifest files..." + + if [[ ! -f "${SPLUNK_AI_FILE}.original" ]]; then + log "Creating backup: ${SPLUNK_AI_FILE}.original" + cp "$SPLUNK_AI_FILE" "${SPLUNK_AI_FILE}.original" + fi + if [[ ! -f "${SPLUNK_OPERATOR_FILE}.original" ]]; then + log "Creating backup: ${SPLUNK_OPERATOR_FILE}.original" + cp "$SPLUNK_OPERATOR_FILE" "${SPLUNK_OPERATOR_FILE}.original" + fi + + log "Restoring from clean originals to ensure idempotent updates..." + cp "${SPLUNK_AI_FILE}.original" "$SPLUNK_AI_FILE" + cp "${SPLUNK_OPERATOR_FILE}.original" "$SPLUNK_OPERATOR_FILE" + + log "Updating $SPLUNK_AI_FILE..." 
+ + local operator_full=$(build_image_url "$IMAGE_REGISTRY" "$OPERATOR_IMAGE") + local ray_head_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_HEAD_IMAGE") + local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE") + local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE") + local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE") + local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE") + local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE") + local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE") + + local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g') + local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g') + local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g') + local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g') + local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g') + local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g') + local otel_collector_escaped=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g') + local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g') + + SEDOPTION="-i"; local -a sed_inplace=(-i) # SEDOPTION keeps its original value for any other reader; sed_inplace is what this function passes to sed + if [[ "$OSTYPE" == "darwin"* ]]; then + SEDOPTION="-i ''"; sed_inplace=(-i '') # BSD sed needs the empty backup suffix as its own argument; an unquoted string expansion passes a literal '' instead + fi + + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: 
RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" + sed "${sed_inplace[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" + + log " ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full" + log " ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full" + log " ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full" + log " ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full" + log " ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full" + log " ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full" + log " ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full" + log " ✓ Updated operator image: $operator_full" + log " ✓ Updated MODEL_VERSION: $MODEL_VERSION" + log " ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION" + + log "Updating $SPLUNK_OPERATOR_FILE..." 
+ + local splunk_full=$(build_image_url "$IMAGE_REGISTRY" "$SPLUNK_IMAGE") + local splunk_operator_full=$(build_image_url "$IMAGE_REGISTRY" "$SPLUNK_OPERATOR_IMAGE") + + local splunk_escaped=$(echo "$splunk_full" | sed 's/[\/&]/\\&/g') + local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g') + + sed "${sed_inplace[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" + sed "${sed_inplace[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" + + log " ✓ Updated Splunk Enterprise image: $splunk_full" + log " ✓ Updated Splunk Operator image: $splunk_operator_full" + log "✓ All images configured successfully" +} + # ====== PREFLIGHT CHECKS ====== preflight_checks() { pf_header "Required tools" - for tool in ssh kubectl helm git jq yq; do + for tool in ssh kubectl helm git jq; do if command -v "$tool" >/dev/null 2>&1; then pf_ok "$tool found" else @@ -190,11 +374,35 @@ preflight_checks() { fi done + # Check for yq + if command -v yq >/dev/null 2>&1; then + pf_ok "yq found" + else + pf_warn "yq not found - using fallback parsing (install yq for better results)" + fi + pf_header "Configuration" [[ -n "${CLUSTER_NAME}" ]] && pf_ok "Cluster name: ${CLUSTER_NAME}" || pf_fail "Cluster name not set" [[ -f "${SPLUNK_OPERATOR_FILE}" ]] && pf_ok "Splunk operator file: ${SPLUNK_OPERATOR_FILE}" || pf_warn "Splunk operator file not found: ${SPLUNK_OPERATOR_FILE}" [[ -f "${SPLUNK_AI_FILE}" ]] && pf_ok "AI platform file: ${SPLUNK_AI_FILE}" || pf_warn "AI platform file not found: ${SPLUNK_AI_FILE}" + pf_header "Object storage" + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + pf_ok "Object storage: external S3-compatible (${OBJ_STORE_TYPE})" + if [[ "${OBJ_STORE_TYPE}" == "seaweedfs" ]]; then + if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then + pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS."
+ else + pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" + fi + else + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "External object store requires endpoint" + fi + [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required" + else + pf_ok "Object storage: in-cluster MinIO or AWS S3 (bucket=${OBJ_STORE_BUCKET})" + fi + pf_header "Infrastructure mode" if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then pf_ok "Using existing infrastructure (on-prem/baremetal)" @@ -544,6 +752,165 @@ EOF SSH_KEY_PATH="${HOME}/.ssh/${KEY_NAME}.pem" } +# ====== PREPARE NODES (RHEL/Fedora compatibility + k0s binary) ====== +prepare_nodes_for_k0s() { + local node_ips=("$@") + log "Preparing ${#node_ips[@]} node(s) for k0s (OS compatibility + binary)..." + for node_ip in "${node_ips[@]}"; do + log " Preparing node ${node_ip}..." + ssh_exec "${node_ip}" " + # Disable firewalld if active (blocks k0s ports: 6443, 10250, 8472, etc.) + if systemctl is-active firewalld >/dev/null 2>&1; then + echo 'Disabling firewalld...' + sudo systemctl stop firewalld + sudo systemctl disable firewalld + fi + + # Ensure iptables is available (RHEL 10+ ships only nftables) + if ! command -v iptables >/dev/null 2>&1; then + if command -v dnf >/dev/null 2>&1 && dnf list available iptables-nft 2>/dev/null | grep -q iptables-nft; then + echo 'Installing iptables-nft...' + sudo dnf install -y iptables-nft >/dev/null 2>&1 + fi + fi + + # Ensure python3 + PyYAML are available (used for k0s config generation) + if ! python3 -c 'import yaml' 2>/dev/null; then + if command -v dnf >/dev/null 2>&1; then + sudo dnf install -y python3-pyyaml 2>/dev/null || sudo pip3 install pyyaml 2>/dev/null || true + elif command -v apt-get >/dev/null 2>&1; then + sudo apt-get install -y python3-yaml 2>/dev/null || true + fi + fi + + # Install k0s binary if not present + if ! command -v k0s >/dev/null 2>&1; then + echo 'Installing k0s binary...' 
+ curl -sSLf https://get.k0s.sh | sudo sh + fi + + # Ensure k0s is in sudo secure_path + if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then + sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s + fi + " || warn " Preparation had issues on ${node_ip}" + done +} + +# ====== MOUNT NVMe INSTANCE STORE FOR EPHEMERAL STORAGE ====== +# GPU instance types (g5, g6, p4, p5) typically come with large NVMe instance +# store drives but tiny 10 GB EBS root volumes. Kubernetes counts ephemeral +# storage from the filesystem backing /var/lib/k0s/kubelet, so we mount an +# unused NVMe drive there to prevent "Insufficient ephemeral-storage" errors. +mount_nvme_instance_store() { + if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then + return 0 + fi + + # Ensure WORKER_IPS is populated + if [[ -z "${WORKER_IPS+x}" || ${#WORKER_IPS[@]} -eq 0 ]]; then + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + else + return 0 + fi + fi + + local gpu_ips=() + local idx=0 + for ip in "${WORKER_IPS[@]}"; do + if [[ ${idx} -ge ${CPU_WORKER_COUNT} ]]; then + gpu_ips+=("${ip}") + fi + idx=$((idx + 1)) + done + + if [[ ${#gpu_ips[@]} -eq 0 ]]; then + return 0 + fi + + log "Checking NVMe instance store volumes on GPU workers..." 
+ + for gpu_ip in "${gpu_ips[@]}"; do + ssh_exec "${gpu_ip}" " + # Skip if /var/lib/k0s is already on a large filesystem (>50 GB) + k0s_avail_gb=\$(df --output=avail /var/lib/k0s 2>/dev/null | tail -1 | awk '{print int(\$1/1048576)}') + if [ \"\${k0s_avail_gb:-0}\" -ge 50 ]; then + echo 'NVMe mount: /var/lib/k0s already has >=50 GB, skipping' + exit 0 + fi + + # Find the first NVMe device that is NOT the root disk and has no partitions + ROOT_DEV=\$(lsblk -no PKNAME \$(findmnt -n -o SOURCE /) 2>/dev/null | head -1) + NVME_DEV='' + for dev in /dev/nvme*n1; do + [ -b \"\$dev\" ] || continue + dev_name=\$(basename \"\$dev\") + # Skip the root device + [ \"\$dev_name\" = \"\$ROOT_DEV\" ] && continue + # Skip devices that already have partitions (they are in use) + if lsblk -n \"\$dev\" 2>/dev/null | grep -q part; then continue; fi + # Skip devices already mounted + if mount | grep -q \"\$dev\"; then continue; fi + NVME_DEV=\"\$dev\" + break + done + + if [ -z \"\$NVME_DEV\" ]; then + echo 'NVMe mount: no unused NVMe instance store found, skipping' + exit 0 + fi + + echo \"NVMe mount: formatting \$NVME_DEV and mounting to /var/lib/k0s\" + + # Format + sudo mkfs.xfs -f \"\$NVME_DEV\" >/dev/null 2>&1 + + # If k0s is running, stop it and preserve existing data + if systemctl is-active k0sworker >/dev/null 2>&1; then + sudo systemctl stop k0sworker 2>/dev/null || true + sleep 3 + sudo pkill -9 k0s 2>/dev/null || true + sudo pkill -9 containerd 2>/dev/null || true + sudo pkill -9 containerd-shim 2>/dev/null || true + sleep 2 + fi + + # Lazy unmount anything stuck under /var/lib/k0s + for mp in \$(mount | grep '/var/lib/k0s' | awk '{print \$3}' | sort -r); do + sudo umount -l \"\$mp\" 2>/dev/null || true + done + + # Copy existing data if present + if [ -d /var/lib/k0s ] && [ \"\$(ls -A /var/lib/k0s 2>/dev/null)\" ]; then + sudo mkdir -p /mnt/nvme-staging + sudo mount \"\$NVME_DEV\" /mnt/nvme-staging + sudo cp -a /var/lib/k0s/. 
/mnt/nvme-staging/ 2>/dev/null || true + sudo umount /mnt/nvme-staging + sudo rmdir /mnt/nvme-staging + fi + + # Mount + sudo rm -rf /var/lib/k0s 2>/dev/null || true + sudo mkdir -p /var/lib/k0s + sudo mount \"\$NVME_DEV\" /var/lib/k0s + + # Persist in fstab + NVME_UUID=\$(sudo blkid -s UUID -o value \"\$NVME_DEV\") + if ! grep -q \"\$NVME_UUID\" /etc/fstab 2>/dev/null; then + echo \"UUID=\$NVME_UUID /var/lib/k0s xfs defaults,nofail 0 2\" | sudo tee -a /etc/fstab >/dev/null + fi + + # Restart k0s if it was running + if systemctl is-enabled k0sworker >/dev/null 2>&1; then + sudo systemctl start k0sworker 2>/dev/null || true + fi + + echo \"NVMe mount: done — \$(df -h \$NVME_DEV | tail -1 | awk '{print \$2}') available on /var/lib/k0s\" + " 2>/dev/null || warn " NVMe mount on ${gpu_ip} had issues — may need manual setup" + done +} + # ====== K0S CLUSTER INSTALLATION ====== install_k0s_cluster() { log "Installing k0s cluster..." @@ -552,20 +919,26 @@ install_k0s_cluster() { if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + log "Using existing infrastructure - IPs from config" fi - local controller_ip="${CONTROLLER_IPS[0]}" # Public IP for SSH - local controller_private_ip="${CONTROLLER_PRIVATE_IPS[0]}" # Private IP for k0s - local controller_public_ip="${CONTROLLER_PUBLIC_IPS[0]}" # Public IP for kubectl access + local controller_ip="${CONTROLLER_IPS[0]}" - log "Primary controller - Public IP: ${controller_public_ip}, Private IP: ${controller_private_ip}" + log "Primary controller IP: ${controller_ip}" + + # Prepare all nodes (firewalld, iptables, python3) + local all_ips=("${CONTROLLER_IPS[@]}") + if [[ ${#WORKER_IPS[@]} -gt 0 ]]; then + all_ips+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_ips[@]}" # Generate k0s config log "Generating k0s configuration..." 
ssh_exec "${controller_ip}" "k0s config create > /tmp/k0s.yaml" - # Configure k0s to use private IP for internal communication, add public IP to SANs for external access - log "Configuring k0s: Private IP ${controller_private_ip} for internal, Public IP ${controller_public_ip} for external access..." + # Configure k0s API with the controller IP for SANs and externalAddress + log "Configuring k0s with controller IP ${controller_ip}..." ssh_exec "${controller_ip}" "cat > /tmp/k0s-config-update.py <<'PYSCRIPT' import yaml @@ -573,7 +946,7 @@ import yaml with open('/tmp/k0s.yaml', 'r') as f: config = yaml.safe_load(f) -# Add SANs to API section - include BOTH private and public IPs +# Add the controller IP to SANs (for kubectl access and cluster communication) if 'spec' not in config: config['spec'] = {} if 'api' not in config['spec']: @@ -581,14 +954,10 @@ if 'api' not in config['spec']: if 'sans' not in config['spec']['api']: config['spec']['api']['sans'] = [] -# Add private IP (for internal cluster communication) -config['spec']['api']['sans'].append('${controller_private_ip}') -# Add public IP (for kubectl access from outside) -config['spec']['api']['sans'].append('${controller_public_ip}') +config['spec']['api']['sans'].append('${controller_ip}') -# CRITICAL: Use public IP for externalAddress so konnectivity-agents can connect -# konnectivity-agents run in pods and need to reach API server via routable address -config['spec']['api']['externalAddress'] = '${controller_public_ip}' +# Use the same IP for externalAddress so konnectivity-agents can connect +config['spec']['api']['externalAddress'] = '${controller_ip}' # Set Calico as network provider if 'network' not in config['spec']: @@ -610,9 +979,26 @@ PYSCRIPT" ssh_exec "${controller_ip}" "python3 /tmp/k0s-config-update.py" - log "Verifying k0s configuration includes public IP..." + log "Verifying k0s configuration includes controller IP..." 
ssh_exec "${controller_ip}" "grep -A3 'api:' /tmp/k0s.yaml | head -5" + # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${controller_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true + + # Clean stale k0s state from any previous run + ssh_exec "${controller_ip}" " + sudo systemctl stop k0scontroller 2>/dev/null || true + sudo systemctl reset-failed k0scontroller 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0scontroller.service 2>/dev/null || true + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service 2>/dev/null || true + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s 2>/dev/null || true + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + # Install k0s controller log "Installing k0s controller on ${controller_ip}..." ssh_exec "${controller_ip}" "sudo k0s install controller --config /tmp/k0s.yaml --enable-worker" @@ -632,8 +1018,19 @@ PYSCRIPT" for worker_ip in "${WORKER_IPS[@]}"; do log " Installing k0s worker on ${worker_ip}..." + # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${worker_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! 
-f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true + + # Clean stale k0sworker state from any previous run (service file, data dirs, systemd failed state) + ssh_exec "${worker_ip}" " + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service 2>/dev/null || true + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s /tmp/k0s-token 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + # Write token to temp file first (stdin pipe doesn't work reliably over SSH) - # Note: Token file must remain until worker bootstraps, so we don't delete it here if ssh_exec "${worker_ip}" "echo '${worker_token}' | sudo tee /tmp/k0s-token >/dev/null && sudo k0s install worker --token-file=/tmp/k0s-token"; then log " ✓ k0s installed on ${worker_ip}" else @@ -735,9 +1132,9 @@ PYSCRIPT" mkdir -p "${HOME}/.kube" ssh_exec "${controller_ip}" "sudo cat /var/lib/k0s/pki/admin.conf" > "${HOME}/.kube/k0s-${CLUSTER_NAME}" - # Update server address to use public IP for kubectl access from local machine - log "Configuring kubeconfig to use public IP for external access..." - sed -i.bak "s|server: .*|server: https://${controller_public_ip}:6443|" "${HOME}/.kube/k0s-${CLUSTER_NAME}" + # Update server address to use the controller IP for kubectl access from local machine + log "Configuring kubeconfig to use controller IP for external access..." + sed -i.bak "s|server: .*|server: https://${controller_ip}:6443|" "${HOME}/.kube/k0s-${CLUSTER_NAME}" export KUBECONFIG="${HOME}/.kube/k0s-${CLUSTER_NAME}" @@ -748,6 +1145,18 @@ PYSCRIPT" label_nodes } +# ====== RESOLVE NODE NAME ====== +# Maps a config IP to its Kubernetes node name by SSHing to the node +# and reading its hostname (which is what k0s uses as the node name). 
+# Usage: node_name=$(resolve_node_name "1.2.3.4") +resolve_node_name() { + local ip="$1" + # SSH to the node and get the hostname that k0s registered it with + local node_name + node_name=$(ssh_exec "${ip}" "hostname -f 2>/dev/null || hostname" 2>/dev/null || echo "") + echo "${node_name}" +} + # ====== LABEL NODES FOR WORKLOAD SCHEDULING ====== label_nodes() { log "Labeling nodes for AI workload scheduling..." @@ -767,67 +1176,80 @@ label_nodes() { fi done - # Get all nodes - local all_nodes - all_nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}') - # Label controller nodes for controller_ip in "${CONTROLLER_IPS[@]}"; do - # Find node by IP local node_name - node_name=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? | select(.type==\"InternalIP\" and .address==\"${controller_ip}\")) | .metadata.name" | head -1) + node_name=$(resolve_node_name "${controller_ip}") - if [[ -n "${node_name}" ]]; then - log "Labeling controller node: ${node_name}" + if [[ -z "${node_name}" ]]; then + warn " Could not resolve hostname for controller ${controller_ip}, skipping..." + continue + fi + + # Verify this node exists in the cluster + if ! kubectl get node "${node_name}" &>/dev/null; then + warn " Node '${node_name}' (from ${controller_ip}) not found in cluster, skipping..." + continue + fi + + log "Labeling controller node: ${node_name} (${controller_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=controller \ + splunk.ai/workload-type=control-plane \ + node.kubernetes.io/role=controller \ + --overwrite + + # For single-node clusters (controller with --enable-worker), also add CPU workload labels + if [[ ${#WORKER_IPS[@]} -eq 0 ]]; then + log " → Single-node cluster detected, adding CPU workload labels to controller..." 
kubectl label nodes "${node_name}" \ - splunk.ai/node-role=controller \ - splunk.ai/workload-type=control-plane \ - node.kubernetes.io/role=controller \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ --overwrite - - # For single-node clusters (controller with --enable-worker), also add CPU workload labels - if [[ ${#WORKER_IPS[@]} -eq 0 ]]; then - log " → Single-node cluster detected, adding CPU workload labels to controller..." - kubectl label nodes "${node_name}" \ - splunk.ai/workload-type=cpu \ - node.kubernetes.io/workload=ai-cpu \ - splunk.ai/instance-type=cpu-worker \ - --overwrite - log " ✓ CPU workload labels added to controller node" - fi + log " ✓ CPU workload labels added to controller node" fi done # Label worker nodes based on their configuration local worker_index=0 for worker_ip in "${WORKER_IPS[@]}"; do - # Find node by IP local node_name - node_name=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? | select(.type==\"InternalIP\" and .address==\"${worker_ip}\")) | .metadata.name" | head -1) - - if [[ -n "${node_name}" ]]; then - # Determine if this is a GPU or CPU worker based on index - # First CPU_WORKER_COUNT workers are CPU, rest are GPU - if [[ ${worker_index} -lt ${CPU_WORKER_COUNT} ]]; then - log "Labeling CPU worker node: ${node_name}" - kubectl label nodes "${node_name}" \ - splunk.ai/node-role=worker \ - splunk.ai/workload-type=cpu \ - node.kubernetes.io/workload=ai-cpu \ - splunk.ai/instance-type=cpu-worker \ - --overwrite - else - log "Labeling GPU worker node: ${node_name}" - kubectl label nodes "${node_name}" \ - splunk.ai/node-role=worker \ - splunk.ai/workload-type=gpu \ - node.kubernetes.io/workload=ai-gpu \ - splunk.ai/instance-type=gpu-worker \ - nvidia.com/gpu=true \ - --overwrite - fi + node_name=$(resolve_node_name "${worker_ip}") + + if [[ -z "${node_name}" ]]; then + warn " Could not resolve hostname for worker ${worker_ip}, skipping..." 
worker_index=$((worker_index + 1)) + continue fi + + if ! kubectl get node "${node_name}" &>/dev/null; then + warn " Node '${node_name}' (from ${worker_ip}) not found in cluster, skipping..." + worker_index=$((worker_index + 1)) + continue + fi + + # Determine if this is a GPU or CPU worker based on index + # First CPU_WORKER_COUNT workers are CPU, rest are GPU + if [[ ${worker_index} -lt ${CPU_WORKER_COUNT} ]]; then + log "Labeling CPU worker node: ${node_name} (${worker_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ + --overwrite + else + log "Labeling GPU worker node: ${node_name} (${worker_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=gpu \ + node.kubernetes.io/workload=ai-gpu \ + splunk.ai/instance-type=gpu-worker \ + nvidia.com/gpu=true \ + --overwrite + fi + worker_index=$((worker_index + 1)) done # Add taints to GPU nodes to prevent non-GPU workloads from scheduling there @@ -868,16 +1290,30 @@ ensure_namespace() { } # ====== INSTALL MINIO ====== +# TODO remove install_minio() { - log "Installing MinIO..." + # When using external S3-compatible storage, skip in-cluster MinIO; credentials + # are created by ensure_s3compat_credentials() instead. + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Using external S3-compatible storage (${OBJ_STORE_TYPE}); skipping in-cluster MinIO install." + return 0 + fi - ensure_namespace "minio-system" + # Auto-generate root password if not set + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + MINIO_ROOT_PASSWORD="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" + log "Generated MinIO root password (saved for secret creation)" + fi + + # In-cluster MinIO installation + log "Installing MinIO in ${MINIO_NS}..." 
+ ensure_namespace "${MINIO_NS}" # Create MinIO secret kubectl create secret generic minio-creds \ - --namespace=minio-system \ - --from-literal=accesskey="${MINIO_ACCESS_KEY}" \ - --from-literal=secretkey="${MINIO_SECRET_KEY}" \ + --namespace="${MINIO_NS}" \ + --from-literal=accesskey="${MINIO_ROOT_USER}" \ + --from-literal=secretkey="${MINIO_ROOT_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - # Deploy MinIO @@ -886,9 +1322,9 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: minio-pvc - namespace: minio-system + namespace: ${MINIO_NS} spec: - storageClassName: local-path + storageClassName: ${STORAGE_CLASS} accessModes: - ReadWriteOnce resources: @@ -899,7 +1335,7 @@ apiVersion: v1 kind: Service metadata: name: minio - namespace: minio-system + namespace: ${MINIO_NS} spec: type: ClusterIP ports: @@ -916,7 +1352,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: minio - namespace: minio-system + namespace: ${MINIO_NS} spec: replicas: 1 selector: @@ -968,13 +1404,26 @@ spec: EOF log "Waiting for MinIO to be ready..." - kubectl wait --for=condition=ready pod -l app=minio -n minio-system --timeout=300s + kubectl wait --for=condition=ready pod -l app=minio -n "${MINIO_NS}" --timeout=300s + + # Create credentials secret in AI platform namespace + # SAIA and pkg/storage expect s3_access_key/s3_secret_key; models/SAIA expect MINIO_ACCESS_KEY/MINIO_SECRET_KEY. 
+ ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - # Create bucket and directories using a job log "Verifying MinIO bucket: ${MINIO_BUCKET}..." # Delete existing job if it exists (Jobs are immutable, can't be updated) - kubectl delete job minio-create-bucket -n minio-system --ignore-not-found=true 2>/dev/null || true + kubectl delete job minio-create-bucket -n "${MINIO_NS}" --ignore-not-found=true 2>/dev/null || true sleep 2 cat </dev/null || true + kubectl logs -n "${MINIO_NS}" job/minio-create-bucket --tail=20 2>/dev/null || true else warn "Bucket verification job did not complete in time, checking status..." - kubectl describe job/minio-create-bucket -n minio-system || true - kubectl logs -n minio-system job/minio-create-bucket --tail=50 || true + kubectl describe job/minio-create-bucket -n "${MINIO_NS}" || true + kubectl logs -n "${MINIO_NS}" job/minio-create-bucket --tail=50 || true + fi + + log "✓ MinIO installed; bucket=${MINIO_BUCKET}; credentials secret ${AI_NS}/${secret_name}" +} + +# ====== External S3-compatible object storage (credentials only; no in-cluster install) ====== +ensure_s3compat_credentials() { + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then + return 0 + fi + + log "Object store type is ${OBJ_STORE_TYPE}; creating credentials secret for external S3-compatible storage." 
+ if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 + fi + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "External S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + return 1 fi + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ External S3-compatible credentials secret ${AI_NS}/${secret_name} ready" } # ====== INSTALL CERT-MANAGER ====== @@ -1146,25 +1626,275 @@ EOF log "cert-manager installed successfully" } -# ====== INSTALL NVIDIA GPU OPERATOR ====== +# ====== INSTALL NVIDIA DRIVERS ON GPU NODES (bare-metal / EC2) ====== +# EKS GPU AMIs ship with NVIDIA drivers pre-installed. +# For k0s on generic AMIs (e.g. Amazon Linux 2023), we must install them +# on the host before the Kubernetes device-plugin can expose GPUs. +install_nvidia_host_drivers() { + if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then + log "Skipping NVIDIA host driver install (no GPU workers)" + return 0 + fi + + log "Installing NVIDIA drivers & container toolkit on GPU worker nodes..." 
+ + # Ensure WORKER_IPS is populated (it may not be if install_k0s_cluster was skipped) + if [[ -z "${WORKER_IPS+x}" || ${#WORKER_IPS[@]} -eq 0 ]]; then + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + log " Loaded ${#WORKER_IPS[@]} worker IP(s) from config: ${WORKER_IPS[*]}" + else + warn "No worker IPs available; skipping host driver install" + return 0 + fi + fi + + # Identify GPU worker IPs (workers after the first CPU_WORKER_COUNT) + local gpu_ips=() + local idx=0 + for ip in "${WORKER_IPS[@]}"; do + if [[ ${idx} -ge ${CPU_WORKER_COUNT} ]]; then + gpu_ips+=("${ip}") + fi + idx=$((idx + 1)) + done + + if [[ ${#gpu_ips[@]} -eq 0 ]]; then + warn "No GPU worker IPs found; skipping host driver install" + return 0 + fi + + for gpu_ip in "${gpu_ips[@]}"; do + log "Checking NVIDIA driver on ${gpu_ip}..." + + # Check if driver is already installed + if ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" &>/dev/null; then + local driver_ver + driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" || echo "unknown") + log " ✓ NVIDIA driver already installed on ${gpu_ip} (version: ${driver_ver})" + else + log " Installing NVIDIA driver on ${gpu_ip}..." 
+ ssh_exec "${gpu_ip}" " + set -e + # Install kernel headers (needed for DKMS driver build) + sudo dnf install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ + sudo yum install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ + sudo apt-get install -y linux-headers-\$(uname -r) 2>/dev/null || true + + # Detect OS and add appropriate NVIDIA repo + if [ -f /etc/amzn-release ] || grep -qi 'amzn' /etc/os-release 2>/dev/null; then + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo 2>/dev/null || true + sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || true + elif [ -f /etc/redhat-release ]; then + RHEL_MAJOR=\$(rpm -E %{rhel} 2>/dev/null || echo 9) + if [ \"\${RHEL_MAJOR}\" -ge 10 ]; then + # Add RHEL 10 CUDA repo only; remove any stale rhel9 repo to prevent GPG conflicts + sudo rm -f /etc/yum.repos.d/cuda-rhel9.repo 2>/dev/null || true + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/cuda-rhel10.repo 2>/dev/null || true + + # RHEL 10 removed DNF modularity; DKMS kmod requires EPEL + if ! rpm -q epel-release >/dev/null 2>&1; then + echo 'Installing EPEL for dkms...' 
+ sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm 2>/dev/null || true + fi + sudo dnf install -y dkms 2>/dev/null || true + + sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-open 2>/dev/null || true + else + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo 2>/dev/null || true + sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || true + fi + elif [ -f /etc/debian_version ]; then + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb + sudo dpkg -i /tmp/cuda-keyring.deb + sudo apt-get update && sudo apt-get install -y nvidia-driver-550 2>/dev/null || true + fi + + # Load nvidia kernel module immediately (avoids needing a reboot) + sudo modprobe nvidia 2>/dev/null || true + " || warn "Driver install on ${gpu_ip} had issues — check manually" + + # Verify + if ssh_exec "${gpu_ip}" "nvidia-smi 2>/dev/null" &>/dev/null; then + log " ✓ NVIDIA driver installed successfully on ${gpu_ip}" + else + warn " NVIDIA driver may need a reboot on ${gpu_ip} to take effect" + fi + fi + + # Install NVIDIA Container Toolkit (needed for GPU containers in k0s) + log " Ensuring NVIDIA Container Toolkit on ${gpu_ip}..." 
+ ssh_exec "${gpu_ip}" " + if command -v nvidia-ctk &>/dev/null; then + echo 'nvidia-ctk already installed' + else + # Add NVIDIA Container Toolkit repo + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null 2>/dev/null || true + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null || true + + # Install + sudo dnf install -y nvidia-container-toolkit 2>/dev/null || \ + sudo yum install -y nvidia-container-toolkit 2>/dev/null || \ + sudo apt-get install -y nvidia-container-toolkit 2>/dev/null || true + fi + + # Configure for k0s containerd (k0s uses /run/k0s/containerd.sock) + if [ -d /etc/k0s/containerd.d ]; then + # nvidia-ctk writes to /etc/containerd/conf.d/ by default, not the + # k0s drop-in dir. Generate it first, then copy with fixups. + sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true + + # Copy the generated config to k0s drop-in location + if [ -f /etc/containerd/conf.d/99-nvidia.toml ]; then + sudo cp /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml + sudo rm -f /etc/containerd/conf.d/99-nvidia.toml + elif [ ! -s /etc/k0s/containerd.d/nvidia.toml ]; then + # Fallback: nvidia-ctk may have written directly; try explicit config path + sudo nvidia-ctk runtime configure --runtime=containerd \ + --config=/etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + fi + + # Strip version/imports lines so the file is treated as a drop-in + # snippet, not a full containerd config (prevents node NotReady). + sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \ + /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + + # Set nvidia as the default containerd runtime on GPU nodes so that + # all pods automatically get GPU access without needing runtimeClassName. 
+ # This matches EKS behavior where the GPU AMI's default runtime handles + # GPU passthrough. The nvidia runtime is a superset of runc — non-GPU + # containers run unchanged. + # Insert inside the existing [plugins."...".containerd] section (not as + # a new top-level section, which would create a duplicate TOML table). + if ! grep -q 'default_runtime_name' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null; then + sudo sed -i '/\[plugins\.\"io\.containerd\.grpc\.v1\.cri\"\.containerd\]$/{ + a\ default_runtime_name = \"nvidia\" + }' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + fi + elif [ -f /etc/containerd/config.toml ]; then + sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true + fi + + # Generate CDI (Container Device Interface) specs so the device + # plugin can discover GPUs via CDI when using the nvidia RuntimeClass. + sudo mkdir -p /etc/cdi + sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml 2>/dev/null || true + + # Kill any leftover containerd-shim processes from previous runs + # before restarting the worker. Stale shims keep the old containerd + # socket busy and cause ping-containerd-timeout errors on restart. + sudo systemctl stop k0sworker 2>/dev/null || true + sleep 3 + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + + # Restart k0s worker to pick up containerd config changes + sudo systemctl start k0sworker 2>/dev/null || true + " || warn " Container toolkit setup on ${gpu_ip} had issues — check manually" + + log " ✓ GPU node ${gpu_ip} setup complete" + done + + # Wait for GPU workers to rejoin and verify they are Ready + log "Waiting for GPU worker nodes to rejoin cluster and become Ready..." 
+ local gpu_wait_timeout=180 + local gpu_wait_elapsed=0 + local all_gpu_ready=false + + while [[ ${gpu_wait_elapsed} -lt ${gpu_wait_timeout} ]]; do + all_gpu_ready=true + for gpu_ip in "${gpu_ips[@]}"; do + # Resolve GPU node name via SSH hostname lookup + local gpu_node + gpu_node=$(resolve_node_name "${gpu_ip}") + + if [[ -z "${gpu_node}" ]] || ! kubectl get node "${gpu_node}" &>/dev/null; then + all_gpu_ready=false + break + fi + + local ready_status + ready_status=$(kubectl get node "${gpu_node}" -o json 2>/dev/null | \ + jq -r '.status.conditions[] | select(.type=="Ready") | .status' 2>/dev/null || echo "") + if [[ "${ready_status}" != "True" ]]; then + all_gpu_ready=false + break + fi + done + + if [[ "${all_gpu_ready}" == "true" ]]; then + log "✓ All GPU worker nodes are Ready" + break + fi + + sleep 10 + gpu_wait_elapsed=$((gpu_wait_elapsed + 10)) + log " Waiting for GPU nodes to be Ready... ${gpu_wait_elapsed}/${gpu_wait_timeout}s" + done + + if [[ "${all_gpu_ready}" != "true" ]]; then + warn "Some GPU nodes may not be Ready yet. Check with: kubectl get nodes" + warn "GPU nodes may need a reboot if NVIDIA drivers were freshly installed." + fi + + # Verify GPUs are visible to Kubernetes + log "Checking if GPUs are visible to Kubernetes..." + local gpu_capacity + gpu_capacity=$(kubectl get nodes -l splunk.ai/workload-type=gpu -o json 2>/dev/null | \ + jq '[.items[].status.capacity["nvidia.com/gpu"] // "0" | tonumber] | add' 2>/dev/null || echo "0") + if [[ "${gpu_capacity}" -gt 0 ]]; then + log "✓ Total GPUs visible to Kubernetes: ${gpu_capacity}" + else + warn "No GPUs visible to Kubernetes yet — the NVIDIA device plugin may still be starting" + warn "Check with: kubectl get nodes -o json | jq '.items[].status.capacity'" + fi + + log "NVIDIA host driver installation complete" +} + +# ====== INSTALL NVIDIA DEVICE PLUGIN (matches EKS approach) ====== +# Ref: eks_cluster_with_stack.sh — uses the simple DaemonSet, NOT the GPU Operator. 
+# The GPU Operator's driver container images don't exist for Amazon Linux 2023. install_nvidia_device_plugin() { if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then - log "Skipping NVIDIA GPU operator (no GPU workers)" + log "Skipping NVIDIA device plugin (no GPU workers)" return 0 fi - log "Installing NVIDIA GPU Operator..." + local ver="${NVIDIA_VERSION:-v0.17.3}" # k8s-device-plugin release tag; NVIDIA_VERSION overrides the default + log "Installing NVIDIA device plugin DaemonSet (${ver})..." + + # Create the nvidia RuntimeClass so pods (including the device plugin + # itself) can use the NVIDIA container runtime for GPU access. + log " Creating nvidia RuntimeClass..." + cat <<'RTEOF' | kubectl apply -f - +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +RTEOF - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia || true - helm repo update + kubectl apply -n kube-system \ + -f "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${ver}/deployments/static/nvidia-device-plugin.yml" - helm_retry 3 upgrade --install gpu-operator nvidia/gpu-operator \ - --namespace gpu-operator --create-namespace \ - --set driver.enabled=true \ - --set toolkit.enabled=true \ - --wait --timeout=10m + # Constrain the device plugin to GPU-labeled nodes only — non-GPU nodes + # don't have the NVIDIA drivers and the plugin pods would fail there. + log " Patching device plugin: GPU nodeSelector..." + kubectl patch daemonset nvidia-device-plugin-daemonset -n kube-system --type='json' \ + -p='[ + {"op": "add", "path": "/spec/template/spec/nodeSelector", "value": {"splunk.ai/workload-type": "gpu"}} + ]' || warn "Could not add the GPU nodeSelector to nvidia-device-plugin-daemonset — plugin pods may also be scheduled on non-GPU nodes" - log "NVIDIA GPU Operator installed successfully" + kubectl -n kube-system rollout status ds/nvidia-device-plugin-daemonset --timeout=3m || warn "NVIDIA device plugin rollout not confirmed within 3m — check: kubectl -n kube-system describe ds nvidia-device-plugin-daemonset" + + log "NVIDIA device plugin installed successfully" } # ====== INSTALL PROMETHEUS OPERATOR ====== @@ -1172,7 +1902,8 @@ install_kube_prometheus() { log "Installing kube-prometheus-stack..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true - helm repo update + # TODO uncomment + # helm repo update prometheus-community # Only update the specific repo we need helm_retry 3 upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ --namespace monitoring --create-namespace \ @@ -1187,8 +1918,12 @@ install_kube_prometheus() { install_otel_operator_and_contrib_collector() { log "Installing OpenTelemetry Operator..." + # OTEL operator uses cert-manager for webhook certs — ensure webhook is ready + wait_for_cert_manager_webhook 30 10 + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts || true - helm repo update + # TODO uncomment + # helm repo update open-telemetry # Only update the specific repo we need # Use cert-manager for webhook certificates (now that konnectivity is fixed) helm_retry 3 upgrade --install opentelemetry-operator open-telemetry/opentelemetry-operator \ @@ -1207,11 +1942,14 @@ install_ray_operator() { log "Installing KubeRay Operator..." helm repo add kuberay https://ray-project.github.io/kuberay-helm/ || true - helm repo update + # TODO uncomment + # helm repo update kuberay # Only update the specific repo we need helm_retry 3 upgrade --install kuberay-operator kuberay/kuberay-operator \ --namespace ray-system --create-namespace \ - --version 1.0.0 \ + --version 1.2.2 \ + --set image.repository=quay.io/kuberay/operator \ + --set image.tag=v1.2.2 \ --wait --timeout=10m wait_for_crd rayservices.ray.io 300 @@ -1229,6 +1967,14 @@ install_splunk_operator() { return 0 fi + # Determine the namespace from the YAML file or use default + local splunk_operator_ns="splunk-operator" + ensure_namespace "${splunk_operator_ns}" + + # Create image pull secrets in splunk-operator namespace BEFORE applying manifests + log "Creating image pull secrets in ${splunk_operator_ns} namespace..." 
+ create_image_pull_secrets "${splunk_operator_ns}" >/dev/null 2>&1 || true + # Use kubectl replace --force for CRDs to avoid annotation size limits # This deletes and recreates the resource, avoiding the annotation issue log "Installing/updating Splunk Operator CRDs and resources..." @@ -1242,11 +1988,111 @@ install_splunk_operator() { kubectl replace --force -f "${SPLUNK_OPERATOR_FILE}" 2>&1 | grep -v "Warning: --force is deprecated" || true fi + # Patch splunk-operator deployment with imagePullSecrets if any exist + log "Checking for imagePullSecrets to add to Splunk Operator deployment..." + local secrets_patch="" + for secret_name in ecr-registry-secret docker-hub-secret gcr-secret acr-secret custom-registry-secret; do + if kubectl get secret "${secret_name}" -n "${splunk_operator_ns}" &>/dev/null 2>&1; then + secrets_patch+='{"name":"'"${secret_name}"'"},' + log " Found secret: ${secret_name}" + fi + done + + if [[ -n "${secrets_patch}" ]]; then + secrets_patch="${secrets_patch%,}" + local dep_name + dep_name=$(kubectl -n "${splunk_operator_ns}" get deploy -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [[ -n "${dep_name}" ]]; then + log "Patching Splunk Operator deployment (${dep_name}) with imagePullSecrets..." + kubectl -n "${splunk_operator_ns}" patch deployment "${dep_name}" \ + --type='json' \ + -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":['"${secrets_patch}"']}]' \ + 2>/dev/null || log " imagePullSecrets may already exist" + + # Restart to apply changes + kubectl rollout restart deployment "${dep_name}" -n "${splunk_operator_ns}" 2>/dev/null || true + fi + fi + wait_for_crd standalones.enterprise.splunk.com 300 log "Splunk Operator installed successfully" } +# ====== WAIT FOR CERT-MANAGER WEBHOOK ====== +# Ensures cert-manager webhook is responsive before applying resources that +# contain Certificate/Issuer CRs (e.g. artifacts.yaml). 
+wait_for_cert_manager_webhook() { + local max_attempts="${1:-30}" + local sleep_interval="${2:-10}" + + log "Verifying cert-manager webhook is responsive..." + + # 1. Ensure webhook pod is running + if ! kubectl get namespace cert-manager &>/dev/null; then + warn "cert-manager namespace not found, skipping webhook check" + return 0 + fi + + kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/component=webhook \ + -n cert-manager --timeout=120s 2>/dev/null \ + || warn "cert-manager webhook pod may not be fully ready" + + # 2. Ensure webhook endpoint has addresses + local attempt=0 + while (( attempt < max_attempts )); do + local webhook_ip + webhook_ip=$(kubectl -n cert-manager get endpoints cert-manager-webhook \ + -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || echo "") + + if [[ -n "${webhook_ip}" ]]; then + log "cert-manager webhook endpoint: ${webhook_ip}" + break + fi + + log " Waiting for cert-manager webhook endpoint... (${attempt}/${max_attempts})" + sleep "${sleep_interval}" + attempt=$((attempt + 1)) + done + + if (( attempt >= max_attempts )); then + warn "cert-manager webhook endpoint not found after ${max_attempts} attempts" + return 1 + fi + + # 3. Functional test: create and delete a test Issuer + local test_ok=false + for i in $(seq 1 "${max_attempts}"); do + if kubectl apply -f - <<'TESTEOF' 2>/dev/null +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: cert-manager-webhook-test + namespace: cert-manager +spec: + selfSigned: {} +TESTEOF + then + kubectl delete issuer cert-manager-webhook-test -n cert-manager \ + --ignore-not-found=true 2>/dev/null || true + test_ok=true + log "✓ cert-manager webhook is responsive" + break + fi + log " cert-manager webhook not yet accepting requests... 
(${i}/${max_attempts})" + sleep "${sleep_interval}" + done + + if [[ "${test_ok}" != "true" ]]; then + warn "cert-manager webhook did not become responsive after ${max_attempts} attempts" + return 1 + fi + + return 0 +} + # ====== INSTALL SPLUNK AI OPERATOR ====== install_splunk_ai_operator() { log "Installing Splunk AI Operator from ${SPLUNK_AI_FILE}..." @@ -1261,13 +2107,60 @@ install_splunk_ai_operator() { local ai_operator_ns="splunk-ai-operator-system" ensure_namespace "${ai_operator_ns}" + # Create image pull secrets in operator namespace BEFORE applying manifests + log "Creating image pull secrets in ${ai_operator_ns} namespace..." + create_image_pull_secrets "${ai_operator_ns}" >/dev/null 2>&1 || true + + # Ensure cert-manager webhook is ready before applying (artifacts.yaml contains + # Certificate and Issuer resources that require the webhook to be responsive) + wait_for_cert_manager_webhook 30 10 + # Apply the artifacts.yaml file (contains CRDs and operator deployment) log "Applying Splunk AI Operator manifests..." - # First try to apply normally - if kubectl apply -f "${SPLUNK_AI_FILE}" 2>&1 | grep -q "field is immutable\|too long"; then - log "Standard apply failed, using server-side apply with force..." - kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" + # Use server-side apply with force to ensure all fields are updated including images + log "Using server-side apply to ensure image URLs are updated..." + local apply_output + apply_output=$(kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1) || true + echo "${apply_output}" + + # Check if any cert-manager resources (Certificate/Issuer) failed due to webhook errors + if echo "${apply_output}" | grep -qi "webhook.*cert-manager\|failed calling webhook.*cert-manager\|i/o timeout"; then + warn "Some cert-manager resources failed on first attempt, retrying..." 
+ + # Wait for webhook to stabilize and retry + sleep 15 + wait_for_cert_manager_webhook 15 10 + + log "Retrying full apply for cert-manager resources..." + kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 | \ + grep -iE "certificate|issuer|error|warning" || true + fi + + # Verify that the critical Certificate and Issuer resources exist + log "Verifying cert-manager resources were created..." + local cm_retries=0 + local cm_max=12 + while (( cm_retries < cm_max )); do + local serving_cert + serving_cert=$(kubectl get certificate splunk-ai-operator-serving-cert \ + -n "${ai_operator_ns}" -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") + + if [[ -n "${serving_cert}" ]]; then + log "✓ Certificate 'splunk-ai-operator-serving-cert' exists" + break + fi + + log " Waiting for cert-manager resources to be created... (${cm_retries}/${cm_max})" + sleep 10 + # Re-apply on each retry to ensure cert-manager resources are processed + kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 | \ + grep -iE "certificate|issuer" || true + cm_retries=$((cm_retries + 1)) + done + + if (( cm_retries >= cm_max )); then + warn "Certificate resources may not have been created — the AI operator webhook may not work" fi # Specifically ensure ClusterRole is updated (common RBAC update issue) @@ -1290,6 +2183,31 @@ install_splunk_ai_operator() { # Remove 'deployment.apps/' prefix if present dep="${dep#deployment.apps/}" log "Found deployment: ${dep}" + + # Patch deployment with imagePullSecrets if any exist + log "Checking for imagePullSecrets to add to operator deployment..." 
+ local secrets_patch="" + for secret_name in ecr-registry-secret docker-hub-secret gcr-secret acr-secret custom-registry-secret; do + if kubectl get secret "${secret_name}" -n "${ai_operator_ns}" &>/dev/null 2>&1; then + secrets_patch+='{"name":"'"${secret_name}"'"},' + log " Found secret: ${secret_name}" + fi + done + + if [[ -n "${secrets_patch}" ]]; then + # Remove trailing comma + secrets_patch="${secrets_patch%,}" + log "Patching operator deployment with imagePullSecrets..." + kubectl -n "${ai_operator_ns}" patch deployment "${dep}" \ + --type='json' \ + -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":['"${secrets_patch}"']}]' \ + 2>/dev/null || log " imagePullSecrets may already exist or path differs" + fi + + # Force restart the deployment to pick up new environment variables (image URLs) + log "Restarting operator deployment to apply updated image configuration..." + kubectl rollout restart deployment "${dep}" -n "${ai_operator_ns}" + wait_rollout "${ai_operator_ns}" deploy "${dep}" else warn "Could not find operator deployment, will wait for CRDs instead" @@ -1312,8 +2230,8 @@ create_minio_secret() { kubectl create secret generic minio-credentials \ --namespace="${ns}" \ - --from-literal=accessKey="${MINIO_ACCESS_KEY}" \ - --from-literal=secretKey="${MINIO_SECRET_KEY}" \ + --from-literal=accessKey="${MINIO_ROOT_USER}" \ + --from-literal=secretKey="${MINIO_ROOT_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - log "MinIO credentials secret created" @@ -1323,8 +2241,10 @@ create_minio_secret() { # ====== SETUP ECR REPOSITORY PERMISSIONS ====== setup_ecr_permissions() { local repo_prefix="${1:-ml-platform}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" - log "Checking ECR repository permissions for: ${repo_prefix}..." + log "Checking ECR repository permissions for: ${repo_prefix} in region ${ecr_region}..." 
# Check if AWS credentials are available if ! aws sts get-caller-identity &>/dev/null; then @@ -1338,8 +2258,8 @@ setup_ecr_permissions() { # List repositories matching prefix local repos - repos=$(aws ecr describe-repositories --region "${REGION}" 2>/dev/null | \ - jq -r ".repositories[] | select(.repositoryName | startswith(\"${repo_prefix}\")) | .repositoryName" || echo "") + repos=$(aws ecr describe-repositories --region "${ecr_region}" 2>/dev/null | \ + jq -r --arg prefix "${repo_prefix}" '.repositories[] | select(.repositoryName | startswith($prefix)) | .repositoryName' || echo "") if [[ -z "${repos}" ]]; then warn "No ECR repositories found with prefix: ${repo_prefix}" @@ -1359,7 +2279,7 @@ setup_ecr_permissions() { # Get current policy local policy - policy=$(aws ecr get-repository-policy --repository-name "${repo}" --region "${REGION}" 2>/dev/null | jq -r '.policyText' || echo "") + policy=$(aws ecr get-repository-policy --repository-name "${repo}" --region "${ecr_region}" 2>/dev/null | jq -r '.policyText' || echo "") if [[ -z "${policy}" ]]; then log " No policy found, creating one to allow pull access..." @@ -1387,7 +2307,7 @@ EOF if aws ecr set-repository-policy \ --repository-name "${repo}" \ - --region "${REGION}" \ + --region "${ecr_region}" \ --policy-text "file:///tmp/ecr-policy-${repo//\//-}.json" &>/dev/null; then log " ✓ Pull permissions granted for repository: ${repo}" else @@ -1417,8 +2337,10 @@ create_image_pull_secrets() { # 1. Create ECR secret if enabled if [[ "${IMAGE_PULL_SECRETS_ECR_ENABLED}" == "true" ]]; then log "Creating ECR secret..." - local ecr_region="${REGION:-us-west-2}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" local ecr_account="${ECR_ACCOUNT:-}" + log " ECR Region: ${ecr_region}, ECR Account: ${ecr_account}" # Check if AWS credentials are available if ! 
aws sts get-caller-identity &>/dev/null; then @@ -1554,7 +2476,8 @@ create_image_pull_secrets() { # ====== CREATE ECR IMAGE PULL SECRET (Legacy - kept for compatibility) ====== create_ecr_secret() { local ns="$1" - local region="${REGION:-us-west-2}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local region="${ECR_REGION:-${REGION:-us-east-2}}" local ecr_account="${ECR_ACCOUNT:-}" ensure_namespace "${ns}" @@ -1605,7 +2528,201 @@ create_ecr_secret() { log "✓ ECR secret created: ecr-registry-secret" log "✓ Secret will be referenced in AIPlatform CR spec.imagePullSecrets" - log "Note: ECR tokens expire after 12 hours. Re-run installation to refresh." +} + +# ====== ECR CREDENTIAL REFRESHER CRONJOB ====== +# ECR tokens expire every 12 hours. This CronJob refreshes the ecr-registry-secret +# in all relevant namespaces every 6 hours so image pulls never break. +install_ecr_credential_refresher() { + if [[ "${IMAGE_PULL_SECRETS_ECR_ENABLED}" != "true" ]]; then + log "ECR not enabled — skipping credential refresher" + return 0 + fi + + local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" + local ecr_account="${ECR_ACCOUNT}" + + if [[ -z "${ecr_account}" ]]; then + warn "ECR account not configured — skipping credential refresher" + return 0 + fi + + local ecr_server="${ecr_account}.dkr.ecr.${ecr_region}.amazonaws.com" + local refresher_ns="${AI_NS}" + local target_namespaces="${AI_NS} splunk-ai-operator-system" + + # Resolve AWS credentials (env > aws configure) + local aws_key="${AWS_ACCESS_KEY_ID:-}" + local aws_secret="${AWS_SECRET_ACCESS_KEY:-}" + local aws_session="${AWS_SESSION_TOKEN:-}" + + if [[ -z "$aws_key" ]]; then + aws_key=$(aws configure get aws_access_key_id 2>/dev/null || echo "") + fi + if [[ -z "$aws_secret" ]]; then + aws_secret=$(aws configure get aws_secret_access_key 2>/dev/null || echo "") + fi + + if [[ -z "$aws_key" ]] || [[ -z "$aws_secret" ]]; then + warn "AWS credentials not available — skipping ECR credential refresher 
CronJob" + warn "ECR tokens will expire after 12 hours. Refresh ecr-registry-secret manually." + return 0 + fi + + if [[ -n "$aws_session" ]]; then + warn "Detected temporary AWS credentials (session token present)" + warn "ECR refresher CronJob will work until these session credentials expire" + warn "For long-term use, configure an IAM user with ecr:GetAuthorizationToken permission" + fi + + log "Installing ECR credential refresher CronJob..." + + # Store AWS credentials in a secret for the CronJob to use + local secret_args=( + --from-literal=AWS_ACCESS_KEY_ID="${aws_key}" + --from-literal=AWS_SECRET_ACCESS_KEY="${aws_secret}" + ) + [[ -n "$aws_session" ]] && secret_args+=(--from-literal=AWS_SESSION_TOKEN="${aws_session}") + + kubectl -n "${refresher_ns}" create secret generic aws-ecr-credentials \ + "${secret_args[@]}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Pre-build the optional SESSION_TOKEN env block + local session_token_env="" + if [[ -n "$aws_session" ]]; then + session_token_env=" + - name: AWS_SESSION_TOKEN + valueFrom: + secretKeyRef: + name: aws-ecr-credentials + key: AWS_SESSION_TOKEN + optional: true" + fi + + # Deploy ServiceAccount, RBAC, and CronJob + cat < /dev/null + echo " Updated existing secret" + else + curl -sf -X POST \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer \${K8S_TOKEN}" \ + --cacert \${K8S_CA} \ + "\${K8S_API}/api/v1/namespaces/\${NS}/secrets" \ + -d "\${SECRET_JSON}" > /dev/null + echo " Created new secret" + fi + done + echo "ECR credential refresh complete" + env: + - name: ECR_REGION + value: "${ecr_region}" + - name: ECR_SERVER + value: "${ecr_server}" + - name: TARGET_NAMESPACES + value: "${target_namespaces}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-ecr-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-ecr-credentials + key: AWS_SECRET_ACCESS_KEY${session_token_env} +CRONEOF + + log "✓ ECR 
credential refresher CronJob installed (schedule: every 6 hours)" + + # Trigger an immediate run to ensure fresh credentials right now + log "Running initial credential refresh..." + kubectl -n "${refresher_ns}" delete job ecr-initial-refresh --ignore-not-found=true 2>/dev/null || true + kubectl -n "${refresher_ns}" create job --from=cronjob/ecr-credential-refresher ecr-initial-refresh 2>/dev/null || true + + # Wait for the initial job to complete (up to 2 minutes) + if kubectl -n "${refresher_ns}" wait --for=condition=complete job/ecr-initial-refresh --timeout=120s 2>/dev/null; then + log "✓ Initial ECR credential refresh completed successfully" + else + warn "Initial ECR refresh may still be running — pods should recover once it completes" + fi } # ====== INSTALL SPLUNK STANDALONE ====== @@ -1615,12 +2732,27 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create MinIO secret for Splunk (S3-compatible credentials) - log "Creating S3-compatible secret for Splunk App Framework..." - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${MINIO_ACCESS_KEY}" \ - --from-literal=s3_secret_key="${MINIO_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - + # Create credentials secret for Splunk App Framework + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Using external S3-compatible credentials for Splunk App Framework..." + if ! kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then + log "Creating minio-credentials secret in ${AI_NS}..." 
+ kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + fi + else + log "Creating S3-compatible secret for Splunk App Framework..." + kubectl -n "${AI_NS}" create secret generic s3-secret \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + fi # Create splunk-defaults ConfigMap (optional but recommended) cat <<'YAML' | kubectl -n "${AI_NS}" apply -f - @@ -1642,8 +2774,18 @@ data: sslPassword: password YAML - # Create Splunk Standalone with App Framework (not SmartStore) - cat </dev/null; then + log "Patching default ServiceAccount with ecr-registry-secret..." + kubectl patch serviceaccount default -n "${AI_NS}" \ + -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || \ + warn "Could not patch default ServiceAccount" + fi + + # Standalone app repo: external S3-compatible when objectStore.type is s3compat/minio/seaweedfs, else S3 + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + local minio_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" + cat </dev/null || true + kubectl delete pods -n "${AI_NS}" --field-selector status.phase=Failed --wait=false 2>/dev/null || true + # Delete pods stuck in ImagePullBackOff or ErrImagePull (use jq to avoid bash 3.x jsonpath parsing issues) + kubectl get pods -n "${AI_NS}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.status.containerStatuses[]? | .state.waiting?.reason? 
== "ImagePullBackOff") | .metadata.name' 2>/dev/null | \ + xargs -r -I {} kubectl delete pod {} -n "${AI_NS}" --wait=false --grace-period=0 --force 2>/dev/null || true + kubectl get pods -n "${AI_NS}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.status.containerStatuses[]? | .state.waiting?.reason? == "ErrImagePull") | .metadata.name' 2>/dev/null | \ + xargs -r -I {} kubectl delete pod {} -n "${AI_NS}" --wait=false --grace-period=0 --force 2>/dev/null || true + log "✓ Cleanup complete" + # Get Splunk secret name (for HEC endpoint) local splunk_secret="splunk-${AI_STANDALONE_NAME}-standalone-secret-v1" log "Using Splunk secret: ${splunk_secret}" - # Ensure s3-secret exists in AI namespace (for MinIO credentials) - log "Creating/updating MinIO credentials secret (s3-secret) in ${AI_NS}..." - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${MINIO_ACCESS_KEY}" \ - --from-literal=s3_secret_key="${MINIO_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - - log "✓ MinIO credentials secret ready" + # Ensure object storage credentials secret exists in AI namespace + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Creating/updating external S3-compatible credentials secret (minio-credentials) in ${AI_NS}..." + kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ Object storage credentials secret ready" + else + log "Creating/updating S3 credentials secret (s3-secret) in ${AI_NS}..." 
+ kubectl -n "${AI_NS}" create secret generic s3-secret \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + log "✓ S3 credentials secret ready" + fi # Build imagePullSecrets YAML from created secrets local image_pull_secrets="" @@ -1733,6 +2934,57 @@ EOF log "No imagePullSecrets found, using public images only" fi + # objectStorage: path/endpoint/secret by object store type (aws | s3compat | minio | seaweedfs) + local obj_path obj_endpoint obj_secret + case "${OBJ_STORE_TYPE}" in + s3compat) + obj_path="s3compat://${OBJ_STORE_BUCKET}" + obj_endpoint="${OBJ_STORE_ENDPOINT}" + obj_secret="minio-credentials" + ;; + minio) + obj_path="minio://${MINIO_BUCKET}" + obj_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" + obj_secret="minio-credentials" + ;; + seaweedfs) + obj_path="seaweedfs://${OBJ_STORE_BUCKET}" + obj_endpoint="${OBJ_STORE_ENDPOINT}" + obj_secret="minio-credentials" + ;; + aws|*) + obj_path="s3://${OBJ_STORE_BUCKET}" + obj_endpoint="http://minio.${MINIO_NS}.svc.cluster.local:9000" + obj_secret="s3-secret" + ;; + esac + + # Build features YAML from config file (reads aiPlatform.features[] array) + local features_yaml="" + local feature_count + feature_count=$(yq eval '.aiPlatform.features | length' "${CONFIG_FILE}" 2>/dev/null || echo "0") + + if [[ "${feature_count}" -gt 0 ]]; then + log "Reading ${feature_count} feature(s) from config..." 
+ local i=0 + while [[ $i -lt $feature_count ]]; do + local fname fver fsa + fname=$(yq eval ".aiPlatform.features[$i].name" "${CONFIG_FILE}" 2>/dev/null || echo "") + fver=$(yq eval ".aiPlatform.features[$i].version // \"1.0.0\"" "${CONFIG_FILE}" 2>/dev/null || echo "1.0.0") + fsa=$(yq eval ".aiPlatform.features[$i].serviceAccountName // \"\"" "${CONFIG_FILE}" 2>/dev/null || echo "") + if [[ -n "$fname" && "$fname" != "null" ]]; then + features_yaml+=" - name: ${fname}"$'\n' + features_yaml+=" version: \"${fver}\""$'\n' + [[ -n "$fsa" && "$fsa" != "null" ]] && features_yaml+=" serviceAccountName: ${fsa}"$'\n' + log " Feature: ${fname} v${fver}" + fi + i=$((i + 1)) + done + else + log "No features in config — defaulting to saia" + features_yaml=" - name: saia"$'\n'" version: \"1.1.0\""$'\n' + fi + # Apply AIPlatform CR (matching EKS script pattern) log "Applying AIPlatform CR: ${CLUSTER_NAME}-ai-platform" cat </dev/null | grep -q "Running"; then + # Check 3: MinIO / Object Storage + log "Checking object storage..." 
+ if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "⏭️ External S3-compatible storage (${OBJ_STORE_TYPE}); skipping in-cluster check" + elif kubectl get pod -n "${MINIO_NS}" -l app=minio 2>/dev/null | grep -q "Running"; then log "✅ MinIO is running" else warn "MinIO pod not in Running state" - kubectl get pods -n minio-system + kubectl get pods -n "${MINIO_NS}" ((health_issues++)) fi log "" @@ -2017,12 +3278,12 @@ show_platform_access_info() { log " API URL: http://localhost:9000" log " " log " 💡 Access MinIO Console:" - log " kubectl port-forward svc/minio -n minio-system 9001:9001" + log " kubectl port-forward svc/minio -n ${MINIO_NS} 9001:9001" log " Open: http://localhost:9001" log " " log " 🔑 Credentials:" - log " Username: ${MINIO_ACCESS_KEY}" - log " Password: ${MINIO_SECRET_KEY}" + log " Username: ${MINIO_ROOT_USER}" + log " Password: ${MINIO_ROOT_PASSWORD}" log "" # AI Platform information @@ -2109,6 +3370,10 @@ show_platform_access_info() { # ====== MAIN INSTALL FLOW ====== main_install() { load_config + + validate_image_config + configure_images + preflight_checks # Check if existing Kubernetes cluster should be used @@ -2173,6 +3438,25 @@ main_install() { log "" log "Skipping k0s installation, using existing cluster" use_existing_cluster=true + + # Prepare all nodes for OS compatibility (iptables, firewalld, etc.) 
+ local all_node_ips=("${CONTROLLER_IPS[@]}") + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + all_node_ips+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_node_ips[@]}" + + # Ensure all expected workers are joined + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + local current_node_count + current_node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') + local expected_total=$(( ${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]} )) + if [[ "${current_node_count}" -lt "${expected_total}" ]]; then + log "Cluster has ${current_node_count} nodes but ${expected_total} expected — joining missing workers..." + join_workers + fi + fi elif [[ "${USE_EXISTING}" == "force" ]]; then err "useExisting=force but no k0s cluster found on provided nodes" fi @@ -2221,6 +3505,25 @@ main_install() { log "" log "Skipping k0s installation, using existing cluster" use_existing_cluster=true + + # Prepare all nodes for OS compatibility (iptables, firewalld, etc.) + local all_node_ips2=("${CONTROLLER_IPS[@]}") + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + all_node_ips2+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_node_ips2[@]}" + + # Ensure all expected workers are joined + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + local current_node_count + current_node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') + local expected_total=$(( ${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]} )) + if [[ "${current_node_count}" -lt "${expected_total}" ]]; then + log "Cluster has ${current_node_count} nodes but ${expected_total} expected — joining missing workers..." 
+ join_workers + fi + fi fi fi @@ -2613,7 +3916,7 @@ Notes: * Includes retry logic for ENI detachment * Provides detailed cleanup summary - 'clean-all' adds aggressive node-level cleanup (on-prem only): - * Removes k0s binaries and data directories + * Removes k0s data directories (preserves k0s binary) * Cleans kubelet, CNI, and Calico files * Flushes iptables rules - For EC2 mode, 'delete' terminates all instances and cleans AWS resources @@ -2622,6 +3925,54 @@ Notes: EOF } +# ====== VERIFY WORKER STATUS ====== +# Check if a worker is properly connected to the cluster +verify_worker_status() { + local worker_ip="$1" + local controller_ip="$2" + + log " Verifying worker ${worker_ip} status..." + + # Check 1: Is k0s running on the worker? + local k0s_status + k0s_status=$(ssh_exec "${worker_ip}" "sudo k0s status 2>&1" || echo "not running") + + if echo "${k0s_status}" | grep -q "Kube-api probing successful: true"; then + log " ✓ k0s running and API reachable" + return 0 + elif echo "${k0s_status}" | grep -q "Role: worker"; then + # k0s is running but API not reachable yet + log " ⏳ k0s running but API not yet reachable" + return 1 + else + log " ✗ k0s not running" + return 2 + fi +} + +# ====== THOROUGH WORKER CLEANUP ====== +# Completely clean up k0s on a worker node (for fresh rejoin) +cleanup_worker_k0s() { + local worker_ip="$1" + + log " Performing thorough k0s cleanup on ${worker_ip}..." 
+ + ssh_exec "${worker_ip}" " + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl disable k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo pkill -9 k0s 2>/dev/null || true + sudo pkill -9 kubelet 2>/dev/null || true + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s /tmp/k0s-token + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + + log " ✓ Cleanup complete" +} + # ====== JOIN WORKERS (Resume/Retry Worker Joins) ====== join_workers() { log "============================================" @@ -2690,25 +4041,68 @@ join_workers() { log "Controller IP: ${controller_ip}" log "Worker IPs: ${WORKER_IPS[*]}" - # Check which workers are already joined + # Check which workers are already joined AND healthy log "Checking current cluster nodes..." kubectl get nodes -o wide || true local already_joined_ips=() + local needs_rejoin_ips=() + + # Get all cluster nodes once for matching + local cluster_nodes_json + cluster_nodes_json=$(kubectl get nodes -o json 2>/dev/null || echo '{"items":[]}') + for worker_ip in "${WORKER_IPS[@]}"; do - # Check if node with this IP already exists in cluster - local node_exists - node_exists=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? 
| select(.type==\"InternalIP\" and .address==\"${worker_ip}\")) | .metadata.name" 2>/dev/null || echo "") + # Resolve the Kubernetes node name by SSHing to the worker and getting its hostname + local node_exists="" + node_exists=$(resolve_node_name "${worker_ip}") + # Verify this node actually exists in the cluster if [[ -n "${node_exists}" ]]; then - log " ✓ Worker ${worker_ip} already joined as ${node_exists}" - already_joined_ips+=("${worker_ip}") + local found_in_cluster + found_in_cluster=$(echo "${cluster_nodes_json}" | jq -r --arg name "${node_exists}" \ + '.items[] | select(.metadata.name==$name) | .metadata.name' 2>/dev/null | head -1 || echo "") + if [[ -z "${found_in_cluster}" ]]; then + node_exists="" + fi + fi + + if [[ -n "${node_exists}" ]]; then + # Node exists in cluster, check if it's Ready + local node_ready + node_ready=$(echo "${cluster_nodes_json}" | jq -r --arg name "${node_exists}" \ + '.items[] | select(.metadata.name==$name) | .status.conditions[] | select(.type=="Ready") | .status' 2>/dev/null || echo "Unknown") + + if [[ "${node_ready}" == "True" ]]; then + log " ✓ Worker ${worker_ip} joined and Ready as ${node_exists}" + already_joined_ips+=("${worker_ip}") + else + log " ⚠ Worker ${worker_ip} exists as ${node_exists} but not Ready (${node_ready})" + needs_rejoin_ips+=("${worker_ip}") + fi else - log " ✗ Worker ${worker_ip} not joined yet" + # Node doesn't exist in cluster, check k0s status on worker + log " Checking k0s status on ${worker_ip}..." + if verify_worker_status "${worker_ip}" "${controller_ip}"; then + log " ⏳ Worker ${worker_ip} k0s running, waiting for cluster sync..." + # Give it more time to appear in cluster + else + log " ✗ Worker ${worker_ip} not properly connected" + needs_rejoin_ips+=("${worker_ip}") + fi fi done + # If all workers are joined, nothing to do + if [[ ${#already_joined_ips[@]} -eq ${#WORKER_IPS[@]} ]]; then + log "" + log "✓ All ${#WORKER_IPS[@]} workers are already joined and healthy!" 
+ kubectl get nodes -o wide + return 0 + fi + # Generate worker token from controller + log "" log "Generating worker join token..." local worker_token worker_token=$(ssh_exec "${controller_ip}" "sudo k0s token create --role=worker" 2>/dev/null) @@ -2717,26 +4111,38 @@ join_workers() { err "Failed to generate worker token from controller" fi - log "Worker token generated successfully" + log "Worker token generated successfully (${#worker_token} chars)" - # Install and join workers that aren't already joined + # Join workers that need to be joined/rejoined local workers_joined=0 + local workers_to_process=() + + # Build list of workers to process (use ${arr[@]+...} to avoid unbound-variable on empty arrays) for worker_ip in "${WORKER_IPS[@]}"; do - # Skip if already joined - local skip_worker=false + local skip=false if [[ ${#already_joined_ips[@]} -gt 0 ]]; then for joined_ip in "${already_joined_ips[@]}"; do if [[ "${joined_ip}" == "${worker_ip}" ]]; then - skip_worker=true + skip=true break fi done fi - - if [[ "${skip_worker}" == "true" ]]; then - continue + if [[ "${skip}" == "false" ]]; then + workers_to_process+=("${worker_ip}") fi + done + + log "" + log "Workers to join/rejoin: ${workers_to_process[*]:-none}" + if [[ ${#workers_to_process[@]} -eq 0 ]]; then + log "No workers need joining" + return 0 + fi + + for worker_ip in "${workers_to_process[@]}"; do + log "" log "============================================" log "Joining worker: ${worker_ip}" log "============================================" @@ -2753,15 +4159,17 @@ join_workers() { log " ✓ k0s already installed" fi - # Stop k0s if it's running (to rejoin cleanly) - log " Stopping any existing k0s worker process..." - ssh_exec "${worker_ip}" "sudo k0s stop 2>/dev/null || true" - ssh_exec "${worker_ip}" "sudo k0s reset 2>/dev/null || true" + # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${worker_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! 
-f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true - # Install worker + # Thorough cleanup before rejoining (handles stale configurations) + cleanup_worker_k0s "${worker_ip}" + + # RHEL/Fedora compatibility (firewalld, iptables-nft, python3-pyyaml, k0s binary) + prepare_nodes_for_k0s "${worker_ip}" + + # Install worker with fresh token log " Installing k0s worker configuration..." - # Write token to temp file first (stdin pipe doesn't work reliably over SSH) - # Note: Token file must remain until worker bootstraps, so we don't delete it here if ssh_exec "${worker_ip}" "echo '${worker_token}' | sudo tee /tmp/k0s-token >/dev/null && sudo k0s install worker --token-file=/tmp/k0s-token"; then log " ✓ Worker configuration installed" else @@ -2769,22 +4177,36 @@ join_workers() { continue fi - # Start worker + # Start worker using systemctl (more reliable than k0s start) log " Starting k0s worker..." - if ssh_exec "${worker_ip}" "sudo k0s start"; then - log " ✓ Worker started successfully" - workers_joined=$((workers_joined + 1)) + if ssh_exec "${worker_ip}" "sudo systemctl start k0sworker"; then + log " ✓ Worker service started" else warn " Failed to start k0s worker on ${worker_ip}" - continue + # Try fallback + ssh_exec "${worker_ip}" "sudo k0s start" || continue + fi + + # Wait briefly and verify + log " Waiting for worker to initialize (15s)..." + sleep 15 + + # Verify worker status + if verify_worker_status "${worker_ip}" "${controller_ip}"; then + log " ✓ Worker ${worker_ip} connected successfully!" + workers_joined=$((workers_joined + 1)) + else + warn " Worker ${worker_ip} may still be connecting..." + workers_joined=$((workers_joined + 1)) # Count as attempted fi done if [[ ${workers_joined} -gt 0 ]]; then log "" - log "Waiting for workers to join cluster (60s)..." - sleep 60 + log "Waiting for workers to appear in cluster (45s)..." 
+ sleep 45 + log "" log "Current cluster nodes:" kubectl get nodes -o wide @@ -2795,11 +4217,23 @@ join_workers() { log "" log "============================================" - log "✓ Successfully joined ${workers_joined} worker(s)" + log "✓ Processed ${workers_joined} worker(s)" log "============================================" + + # Final verification + local final_count + final_count=$(kubectl get nodes --no-headers | wc -l) + local expected_count=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) + + if [[ ${final_count} -ge ${expected_count} ]]; then + log "✓ All ${expected_count} nodes are now in the cluster!" + else + warn "Only ${final_count}/${expected_count} nodes in cluster. Some workers may need more time." + warn "Run '$0 join-workers' again if workers don't appear within a few minutes." + fi else log "" - log "All workers already joined or no new workers to join" + log "No workers needed to be joined" fi } @@ -2815,6 +4249,7 @@ case "${1:-install}" in clean_all ;; join-workers) + # TODO fix this flow join_workers ;; *) diff --git a/tools/cluster_setup/refresh_ecr_credentials.sh b/tools/cluster_setup/refresh_ecr_credentials.sh new file mode 100755 index 0000000..24abeef --- /dev/null +++ b/tools/cluster_setup/refresh_ecr_credentials.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# refresh_ecr_credentials.sh - Refresh ECR image pull secrets using an ECR token +# +# Usage (run on controller node): +# ./refresh_ecr_credentials.sh # auto-fetches token via aws cli +# ./refresh_ecr_credentials.sh "$(aws ecr get-login-password --region us-east-2)" # pass token +# ECR_TOKEN=xxxx ./refresh_ecr_credentials.sh # pass via env +set -euo pipefail + +ECR_ACCOUNT="${ECR_ACCOUNT:-658391232643}" +ECR_REGION="${ECR_REGION:-us-east-2}" +ECR_SERVER="${ECR_ACCOUNT}.dkr.ecr.${ECR_REGION}.amazonaws.com" +NAMESPACES="${TARGET_NAMESPACES:-ai-platform splunk-ai-operator-system}" +KUBECTL="${KUBECTL:-k0s kubectl}" + +info() { echo "[INFO] $*"; } +error() { echo "[ERROR] $*" >&2; } + +# 
--- Get ECR token: argument > env > auto-fetch --- +TOKEN="${1:-${ECR_TOKEN:-}}" + +if [[ -z "$TOKEN" ]]; then + info "No token provided, fetching via: aws ecr get-login-password --region ${ECR_REGION}" + TOKEN=$(aws ecr get-login-password --region "${ECR_REGION}" 2>/dev/null || true) +fi + +if [[ -z "$TOKEN" ]]; then + error "Failed to get ECR token." + error "Usage: $0 \"\$(aws ecr get-login-password --region ${ECR_REGION})\"" + exit 1 +fi +info "ECR token obtained (${#TOKEN} chars)" + +# --- Step 1: Update ecr-registry-secret in all namespaces --- +for ns in ${NAMESPACES}; do + info "Updating ecr-registry-secret in ${ns}..." + $KUBECTL -n "${ns}" delete secret ecr-registry-secret 2>/dev/null || true + $KUBECTL -n "${ns}" create secret docker-registry ecr-registry-secret \ + --docker-server="${ECR_SERVER}" \ + --docker-username=AWS \ + --docker-password="${TOKEN}" && \ + info " ✓ ecr-registry-secret refreshed in ${ns}" || \ + error " Failed to create ecr-registry-secret in ${ns}" +done + +# --- Step 2: Delete pods stuck in ImagePullBackOff --- +info "Cleaning up ImagePullBackOff pods..." +for ns in ${NAMESPACES}; do + backoff_pods=$($KUBECTL -n "${ns}" get pods 2>/dev/null \ + | grep -i "ImagePullBackOff\|ErrImagePull" \ + | awk '{print $1}' || true) + + if [[ -n "$backoff_pods" ]]; then + while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + $KUBECTL -n "${ns}" delete pod "${pod}" --grace-period=0 --force 2>/dev/null && \ + info " Deleted: ${pod}" || true + done <<< "$backoff_pods" + else + info " No stuck pods in ${ns}" + fi +done + +# --- Step 3: Restart deployments that use ECR images --- +info "Restarting ECR-based deployments..." 
+for ns in ${NAMESPACES}; do + for dep in $($KUBECTL -n "${ns}" get deployments -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null); do + [[ -z "$dep" ]] && continue + images=$($KUBECTL -n "${ns}" get deployment "${dep}" -o jsonpath='{.spec.template.spec.containers[*].image}' 2>/dev/null || true) + if echo "$images" | grep -q "${ECR_ACCOUNT}" 2>/dev/null; then + $KUBECTL -n "${ns}" rollout restart deployment "${dep}" 2>/dev/null && \ + info " Restarted: ${dep}" || true + fi + done +done + +echo "" +info "==========================================" +info "ECR credentials refreshed!" +info " Server: ${ECR_SERVER}" +info " Namespaces: ${NAMESPACES}" +info "" +info " Token expires in ~12 hours. Re-run this script to refresh." +info "==========================================" diff --git a/tools/cluster_setup/splunk-operator-cluster.yaml b/tools/cluster_setup/splunk-operator-cluster.yaml index 06573be..0732ea3 100644 --- a/tools/cluster_setup/splunk-operator-cluster.yaml +++ b/tools/cluster_setup/splunk-operator-cluster.yaml @@ -55325,6 +55325,7 @@ subjects: apiVersion: v1 data: OPERATOR_NAME: '"splunk-operator"' + # TODO identify whats this ?? 
RELATED_IMAGE_SPLUNK_ENTERPRISE: 667741767953.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:splunk-redhat-8-amd64-10.2.0-ef65e8205e4d-6d943f7-28228924 WATCH_NAMESPACE: "" kind: ConfigMap @@ -55428,7 +55429,7 @@ spec: - name: WATCH_NAMESPACE value: "" - name: RELATED_IMAGE_SPLUNK_ENTERPRISE - value: docker.io/splunk/splunk:10.2.0-dev1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom - name: OPERATOR_NAME value: splunk-operator - name: SPLUNK_GENERAL_TERMS From e5ee1eb4983486a42a4fc09d29493842bc9aad70 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Wed, 15 Apr 2026 18:23:26 +0530 Subject: [PATCH 33/55] make k0s script run fast + revisited model configs for all models --- config/configs/applications.yaml | 300 ++++++------- pkg/ai/raybuilder/builder.go | 10 +- tools/cluster_setup/k0s-cluster-config.yaml | 20 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 405 +++++++++++------- 4 files changed, 423 insertions(+), 312 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index d40e531..5edc3e3 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -21,6 +21,152 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" + - args: + application_name: GptOss120b + deployment_configs: + LLMDeployment: + gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 1 + L40S: + ray_actor_options: + num_gpus: 2 + options: + autoscaling_config: + max_replicas: {{.Replicas.GptOss120b}} + min_replicas: {{.Replicas.GptOss120b}} + deployment_type: text_gen_model_deployment + gpu_types: '["{{.AcceleratorType}}"]' + model_definition: + gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 1 + L40S: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 2 + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: 
openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_120b + model_loader: + blob_storage: + blob_prefix: model_artifacts/gpt-oss-120b + tokenizer_definition: + model_id: gpt_oss_120b + model_loader: + blob_storage: + artifacts_list: + - chat_template.jinja + - config.json + - tokenizer_config.json + - tokenizer.json + blob_prefix: model_artifacts/gpt-oss-120b + name: GptOss120b + import_path: main:create_serve_app + route_prefix: /gpt_oss_120b + runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" + env_vars: + API_VERSION: "v1" + APPLICATION_NAME: gpt_oss_120b + VLLM_ATTENTION_BACKEND: TRITON_ATTN + ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" + CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + ENABLE_AUTHN: "false" + ENABLE_AUTHZ: "false" + SERVICE_EXTERNAL_NAME: "ai-platform-models" + SERVICE_INTERNAL_NAME: "ai_platform_models" + SERVICE_NAME: "ai_platform_models" + SKIP_VERIFICATION: "true" + USE_SYSTEM_PERMISSIONS: "true" + VLLM_WORKER_MULTIPROC_METHOD: spawn + - args: + application_name: GptOss20b + deployment_configs: + LLMDeployment: + gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 0.5 + L40S: + ray_actor_options: + num_gpus: 1 + options: + autoscaling_config: + max_replicas: {{.Replicas.GptOss20b}} + min_replicas: {{.Replicas.GptOss20b}} + deployment_type: text_gen_model_deployment + gpu_types: '["{{.AcceleratorType}}"]' + model_definition: + gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.5 + tensor_parallel_size: 1 + L40S: + engine_args: + gpu_memory_utilization: 0.95 + tensor_parallel_size: 1 + model_config: + openai_serving_config: + 
chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_20b + model_loader: + blob_storage: + blob_prefix: model_artifacts/gpt-oss-20b + tokenizer_definition: + model_id: gpt_oss_20b + model_loader: + blob_storage: + artifacts_list: + - chat_template.jinja + - config.json + - tokenizer_config.json + - tokenizer.json + blob_prefix: model_artifacts/gpt-oss-20b + name: GptOss20b + import_path: main:create_serve_app + route_prefix: /gpt_oss_20b + runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" + env_vars: + API_VERSION: "v1" + APPLICATION_NAME: gpt_oss_20b + VLLM_ATTENTION_BACKEND: TRITON_ATTN + ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" + CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + ENABLE_AUTHN: "false" + ENABLE_AUTHZ: "false" + SERVICE_EXTERNAL_NAME: "ai-platform-models" + SERVICE_INTERNAL_NAME: "ai_platform_models" + SERVICE_NAME: "ai_platform_models" + SKIP_VERIFICATION: "true" + USE_SYSTEM_PERMISSIONS: "true" + VLLM_WORKER_MULTIPROC_METHOD: spawn - args: application_name: UaeLarge deployment_configs: @@ -31,13 +177,13 @@ applications: num_gpus: 0.0375 L40S: ray_actor_options: - num_gpus: 0.05 + num_gpus: 0.075 options: autoscaling_config: max_replicas: {{.Replicas.UaeLarge}} min_replicas: {{.Replicas.UaeLarge}} ray_actor_options: - num_gpus: 0.1 + num_gpus: 0.15 deployment_type: embedding_model_deployment model_definition: gpu_type_model_config_override: @@ -46,10 +192,10 @@ applications: gpu_memory_utilization: 0.0375 L40S: engine_args: - gpu_memory_utilization: 0.05 + gpu_memory_utilization: 0.075 model_config: engine_args: - 
gpu_memory_utilization: 0.1 + gpu_memory_utilization: 0.15 tensor_parallel_size: 1 model_id: uae_large model_loader: @@ -351,79 +497,6 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - - args: - application_name: GptOss20b - deployment_configs: - LLMDeployment: - gpu_type_options_override: - H100: - ray_actor_options: - num_gpus: 0.5 - L40S: - ray_actor_options: - num_gpus: 1 - options: - autoscaling_config: - max_replicas: {{.Replicas.GptOss20b}} - min_replicas: {{.Replicas.GptOss20b}} - deployment_type: text_gen_model_deployment - gpu_types: '["{{.AcceleratorType}}"]' - model_definition: - gpu_type_model_config_override: - H100: - engine_args: - gpu_memory_utilization: 0.90 - tensor_parallel_size: 1 - L40S: - engine_args: - gpu_memory_utilization: 0.90 - tensor_parallel_size: 1 - model_config: - openai_serving_config: - chat: - enable_auto_tools: true - tool_parser: openai - responses: - enable_auto_tools: true - tool_parser: openai - model_id: gpt_oss_20b - model_loader: - blob_storage: - blob_prefix: model_artifacts/gpt-oss-20b - tokenizer_definition: - model_id: gpt_oss_20b - model_loader: - blob_storage: - artifacts_list: - - chat_template.jinja - - config.json - - tokenizer_config.json - - tokenizer.json - blob_prefix: model_artifacts/gpt-oss-20b - name: GptOss20b - import_path: main:create_serve_app - route_prefix: /gpt_oss_20b - runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application.zip" - env_vars: - API_VERSION: "v1" - APPLICATION_NAME: gpt_oss_20b - VLLM_ATTENTION_BACKEND: TRITON_ATTN - ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" - S3_BUCKET: "{{.ArtifactBucketName}}" - ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" - CLOUD_PROVIDER: "{{.CloudProvider}}" - S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" - S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" - S3COMPAT_OBJECT_STORE_SECRET_KEY: 
"{{.S3CompatObjectStoreSecretKey}}" - ENABLE_AUTHN: "false" - ENABLE_AUTHZ: "false" - SERVICE_EXTERNAL_NAME: "ai-platform-models" - SERVICE_INTERNAL_NAME: "ai_platform_models" - SERVICE_NAME: "ai_platform_models" - SKIP_VERIFICATION: "true" - USE_SYSTEM_PERMISSIONS: "true" - VLLM_WORKER_MULTIPROC_METHOD: spawn - args: application_name: E5LanguageClassifier deployment_configs: @@ -480,79 +553,6 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - - args: - application_name: GptOss120b - deployment_configs: - LLMDeployment: - gpu_type_options_override: - H100: - ray_actor_options: - num_gpus: 1 - L40S: - ray_actor_options: - num_gpus: 2 - options: - autoscaling_config: - max_replicas: {{.Replicas.GptOss120b}} - min_replicas: {{.Replicas.GptOss120b}} - deployment_type: text_gen_model_deployment - gpu_types: '["{{.AcceleratorType}}"]' - model_definition: - gpu_type_model_config_override: - H100: - engine_args: - gpu_memory_utilization: 0.90 - tensor_parallel_size: 1 - L40S: - engine_args: - gpu_memory_utilization: 0.90 - tensor_parallel_size: 2 - model_config: - openai_serving_config: - chat: - enable_auto_tools: true - tool_parser: openai - responses: - enable_auto_tools: true - tool_parser: openai - model_id: gpt_oss_120b - model_loader: - blob_storage: - blob_prefix: model_artifacts/gpt-oss-120b - tokenizer_definition: - model_id: gpt_oss_120b - model_loader: - blob_storage: - artifacts_list: - - chat_template.jinja - - config.json - - tokenizer_config.json - - tokenizer.json - blob_prefix: model_artifacts/gpt-oss-120b - name: GptOss120b - import_path: main:create_serve_app - route_prefix: /gpt_oss_120b - runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application.zip" - env_vars: - API_VERSION: "v1" - APPLICATION_NAME: gpt_oss_120b - VLLM_ATTENTION_BACKEND: TRITON_ATTN - ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" - S3_BUCKET: "{{.ArtifactBucketName}}" - ARTIFACTS_PROVIDER: 
"{{.ArtifactsProvider}}" - CLOUD_PROVIDER: "{{.CloudProvider}}" - S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" - S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" - S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" - ENABLE_AUTHN: "false" - ENABLE_AUTHZ: "false" - SERVICE_EXTERNAL_NAME: "ai-platform-models" - SERVICE_INTERNAL_NAME: "ai_platform_models" - SERVICE_NAME: "ai_platform_models" - SKIP_VERIFICATION: "true" - USE_SYSTEM_PERMISSIONS: "true" - VLLM_WORKER_MULTIPROC_METHOD: spawn - args: application_name: PromptInjectionCrossEncoder deployment_configs: diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index c088f3c..342b81d 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -732,11 +732,17 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] replicas := instanceScale[cfg.Tier] + + maxReplicas := replicas + 5 + if cfg.GPUsPerPod > 0 { + maxReplicas = replicas + } + wg := rayv1.WorkerGroupSpec{ GroupName: cfg.Tier, Replicas: int32Ptr(replicas), MinReplicas: int32Ptr(replicas), - MaxReplicas: int32Ptr(replicas + 5), + MaxReplicas: int32Ptr(maxReplicas), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), @@ -934,7 +940,7 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec {Name: "SERVICE_INTERNAL_NAME", Value: b.ai.Name}, {Name: "USE_SYSTEM_PERMISSIONS", Value: "true"}, {Name: "GPG_PUBLICKEY_PATH", Value: "kv-splunk/al-platform.ray-worker-sa/gpgkey"}, // FIXME - {Name: "GPU_TYPE", Value: b.effectiveAcceleratorType()}, // FIXME + {Name: "GPU_TYPE", Value: b.effectiveAcceleratorType()}, // FIXME } // Combine defaultEnv with cfg.Env to create combinedEnv diff --git a/tools/cluster_setup/k0s-cluster-config.yaml 
b/tools/cluster_setup/k0s-cluster-config.yaml index 03a283b..a3a2d12 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -15,7 +15,8 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -25,11 +26,15 @@ nodes: existingIPs: controllers: - - 3.144.14.96 # CHANGE THIS: Your controller server IP + # - 3.144.14.96 # CHANGE THIS: Your controller server IP + - 10.0.34.164 workers: - - 3.14.134.16 # CHANGE THIS: CPU worker 1 - - 13.59.78.115 # CHANGE THIS: GPU worker 1 - - 3.15.20.136 # CHANGE THIS: GPU worker 2 + # - 3.14.134.16 # CHANGE THIS: CPU worker 1 + # - 13.59.78.115 # CHANGE THIS: GPU worker 1 + # - 3.15.20.136 # CHANGE THIS: GPU worker 2 + - 10.0.34.168 + - 10.0.34.142 + - 10.0.34.153 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). 
@@ -56,7 +61,7 @@ images: operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.9" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.10" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" @@ -118,7 +123,8 @@ splunk: # ---------- AI Platform Configuration ---------- aiPlatform: name: "splunk-ai-stack" - defaultAcceleratorType: "L40S" + # defaultAcceleratorType: "L40S" + defaultAcceleratorType: "H100" workerGroupConfig: imageRegistry: "" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 08017fe..4b161e6 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -1004,8 +1004,17 @@ PYSCRIPT" ssh_exec "${controller_ip}" "sudo k0s install controller --config /tmp/k0s.yaml --enable-worker" ssh_exec "${controller_ip}" "sudo k0s start" - log "Waiting for controller to be ready (60s)..." - sleep 60 + log "Waiting for controller API server to be ready..." + local ctrl_retries=0 + while (( ctrl_retries < 300 )); do + if ssh_exec "${controller_ip}" "sudo k0s kubectl get --raw /healthz 2>/dev/null" &>/dev/null; then + log " ✓ Controller API server is ready (${ctrl_retries}s)" + break + fi + sleep 5 + ctrl_retries=$((ctrl_retries + 5)) + log " Waiting... ${ctrl_retries}/300s" + done # Generate worker token log "Generating worker join token..." @@ -1069,8 +1078,21 @@ PYSCRIPT" warn "Some workers failed to install/start: ${failed_workers[*]}" fi - log "Waiting for workers to join (60s)..." - sleep 60 + log "Waiting for workers to join the cluster..." 
+ local expected_join=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) + local join_retries=0 + while (( join_retries < 120 )); do + local current_nodes + current_nodes=$(ssh_exec "${controller_ip}" "sudo k0s kubectl get nodes --no-headers 2>/dev/null | wc -l" 2>/dev/null || echo "0") + current_nodes=$(echo "${current_nodes}" | tr -d '[:space:]') + if [[ "${current_nodes}" -ge "${expected_join}" ]]; then + log " ✓ All ${current_nodes} node(s) joined (${join_retries}s)" + break + fi + sleep 10 + join_retries=$((join_retries + 10)) + log " Waiting... ${current_nodes}/${expected_join} nodes joined (${join_retries}/120s)" + done # Verify workers actually joined log "Verifying worker nodes joined the cluster..." @@ -1604,9 +1626,9 @@ install_cert_manager() { warn "cert-manager webhook endpoint not found after ${max_retries} retries" fi - # Give webhooks extra time to stabilize and register with API server - log "Waiting for webhooks to stabilize (30s)..." - sleep 30 + # Brief pause for webhook registration with API server + log "Waiting for webhooks to stabilize (10s)..." + sleep 10 # Test webhook by creating a test Certificate resource log "Testing cert-manager webhook functionality..." @@ -1627,6 +1649,123 @@ EOF } # ====== INSTALL NVIDIA DRIVERS ON GPU NODES (bare-metal / EC2) ====== +# Per-node NVIDIA driver + container toolkit install (called in parallel). +_install_nvidia_on_node() { + local gpu_ip="$1" + + # Check if driver is already installed + if ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" &>/dev/null; then + local driver_ver + driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" || echo "unknown") + echo "✓ NVIDIA driver already installed on ${gpu_ip} (version: ${driver_ver})" + else + echo "Installing NVIDIA driver on ${gpu_ip}..." 
+ ssh_exec "${gpu_ip}" " + set -e + # Install kernel headers (needed for DKMS driver build) + sudo dnf install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ + sudo yum install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ + sudo apt-get install -y linux-headers-\$(uname -r) 2>/dev/null || true + + # Detect OS and add appropriate NVIDIA repo + if [ -f /etc/amzn-release ] || grep -qi 'amzn' /etc/os-release 2>/dev/null; then + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo 2>/dev/null || true + sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || true + elif [ -f /etc/redhat-release ]; then + RHEL_MAJOR=\$(rpm -E %{rhel} 2>/dev/null || echo 9) + if [ \"\${RHEL_MAJOR}\" -ge 10 ]; then + # Add RHEL 10 CUDA repo only; remove any stale rhel9 repo to prevent GPG conflicts + sudo rm -f /etc/yum.repos.d/cuda-rhel9.repo 2>/dev/null || true + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/cuda-rhel10.repo 2>/dev/null || true + + # RHEL 10 removed DNF modularity; DKMS kmod requires EPEL + if ! rpm -q epel-release >/dev/null 2>&1; then + echo 'Installing EPEL for dkms...' 
+ sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm 2>/dev/null || true + fi + sudo dnf install -y dkms 2>/dev/null || true + + sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-open 2>/dev/null || true + else + sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo 2>/dev/null || true + sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || \ + sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || true + fi + elif [ -f /etc/debian_version ]; then + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb + sudo dpkg -i /tmp/cuda-keyring.deb + sudo apt-get update && sudo apt-get install -y nvidia-driver-550 2>/dev/null || true + fi + + # Load nvidia kernel module immediately (avoids needing a reboot) + sudo modprobe nvidia 2>/dev/null || true + " || { echo "Driver install on ${gpu_ip} had issues"; return 1; } + + # Verify + if ssh_exec "${gpu_ip}" "nvidia-smi 2>/dev/null" &>/dev/null; then + echo "✓ NVIDIA driver installed successfully on ${gpu_ip}" + else + echo "⚠ NVIDIA driver may need a reboot on ${gpu_ip} to take effect" + fi + fi + + # Install NVIDIA Container Toolkit + echo "Ensuring NVIDIA Container Toolkit on ${gpu_ip}..." 
+ ssh_exec "${gpu_ip}" " + if command -v nvidia-ctk &>/dev/null; then + echo 'nvidia-ctk already installed' + else + # Add NVIDIA Container Toolkit repo + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null 2>/dev/null || true + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null || true + + # Install + sudo dnf install -y nvidia-container-toolkit 2>/dev/null || \ + sudo yum install -y nvidia-container-toolkit 2>/dev/null || \ + sudo apt-get install -y nvidia-container-toolkit 2>/dev/null || true + fi + + # Configure for k0s containerd (k0s uses /run/k0s/containerd.sock) + if [ -d /etc/k0s/containerd.d ]; then + sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true + + if [ -f /etc/containerd/conf.d/99-nvidia.toml ]; then + sudo cp /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml + sudo rm -f /etc/containerd/conf.d/99-nvidia.toml + elif [ ! -s /etc/k0s/containerd.d/nvidia.toml ]; then + sudo nvidia-ctk runtime configure --runtime=containerd \ + --config=/etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + fi + + sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \ + /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + + if ! 
grep -q 'default_runtime_name' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null; then + sudo sed -i '/\[plugins\.\"io\.containerd\.grpc\.v1\.cri\"\.containerd\]$/{ + a\ default_runtime_name = \"nvidia\" + }' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + fi + elif [ -f /etc/containerd/config.toml ]; then + sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true + fi + + sudo mkdir -p /etc/cdi + sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml 2>/dev/null || true + + sudo systemctl stop k0sworker 2>/dev/null || true + sleep 3 + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + + sudo systemctl start k0sworker 2>/dev/null || true + " || { echo "Container toolkit setup on ${gpu_ip} had issues"; return 1; } +} + # EKS GPU AMIs ship with NVIDIA drivers pre-installed. # For k0s on generic AMIs (e.g. Amazon Linux 2023), we must install them # on the host before the Kubernetes device-plugin can expose GPUs. @@ -1664,142 +1803,45 @@ install_nvidia_host_drivers() { return 0 fi + # Run driver + toolkit install on all GPU nodes in parallel + log "Installing NVIDIA drivers on ${#gpu_ips[@]} GPU node(s) in parallel..." + local pids=() + local logdir + logdir=$(mktemp -d) + for gpu_ip in "${gpu_ips[@]}"; do - log "Checking NVIDIA driver on ${gpu_ip}..." + ( + _install_nvidia_on_node "${gpu_ip}" > "${logdir}/${gpu_ip}.log" 2>&1 + echo $? > "${logdir}/${gpu_ip}.rc" + ) & + pids+=($!) 
+ log " Started NVIDIA install on ${gpu_ip} (pid $!)" + done - # Check if driver is already installed - if ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" &>/dev/null; then - local driver_ver - driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" || echo "unknown") - log " ✓ NVIDIA driver already installed on ${gpu_ip} (version: ${driver_ver})" + # Wait for all background installs to finish + local failed=0 + for i in "${!pids[@]}"; do + local pid=${pids[$i]} + local gpu_ip=${gpu_ips[$i]} + if wait "${pid}"; then + log " ✓ NVIDIA setup completed on ${gpu_ip}" else - log " Installing NVIDIA driver on ${gpu_ip}..." - ssh_exec "${gpu_ip}" " - set -e - # Install kernel headers (needed for DKMS driver build) - sudo dnf install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ - sudo yum install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ - sudo apt-get install -y linux-headers-\$(uname -r) 2>/dev/null || true - - # Detect OS and add appropriate NVIDIA repo - if [ -f /etc/amzn-release ] || grep -qi 'amzn' /etc/os-release 2>/dev/null; then - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo 2>/dev/null || true - sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || true - elif [ -f /etc/redhat-release ]; then - RHEL_MAJOR=\$(rpm -E %{rhel} 2>/dev/null || echo 9) - if [ \"\${RHEL_MAJOR}\" -ge 10 ]; then - # Add RHEL 10 CUDA repo only; remove any stale rhel9 repo to prevent GPG conflicts - sudo rm -f /etc/yum.repos.d/cuda-rhel9.repo 2>/dev/null || true - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/cuda-rhel10.repo 2>/dev/null || true - - # RHEL 10 removed DNF modularity; DKMS kmod 
requires EPEL - if ! rpm -q epel-release >/dev/null 2>&1; then - echo 'Installing EPEL for dkms...' - sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm 2>/dev/null || true - fi - sudo dnf install -y dkms 2>/dev/null || true - - sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-open 2>/dev/null || true - else - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo 2>/dev/null || true - sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || true - fi - elif [ -f /etc/debian_version ]; then - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb - sudo dpkg -i /tmp/cuda-keyring.deb - sudo apt-get update && sudo apt-get install -y nvidia-driver-550 2>/dev/null || true - fi - - # Load nvidia kernel module immediately (avoids needing a reboot) - sudo modprobe nvidia 2>/dev/null || true - " || warn "Driver install on ${gpu_ip} had issues — check manually" - - # Verify - if ssh_exec "${gpu_ip}" "nvidia-smi 2>/dev/null" &>/dev/null; then - log " ✓ NVIDIA driver installed successfully on ${gpu_ip}" - else - warn " NVIDIA driver may need a reboot on ${gpu_ip} to take effect" - fi + warn " NVIDIA setup on ${gpu_ip} had issues" + failed=$((failed + 1)) fi + # Stream the per-node log so output is visible + while IFS= read -r line; do + log " [${gpu_ip}] ${line}" + done < "${logdir}/${gpu_ip}.log" + done - # Install NVIDIA Container Toolkit (needed for GPU containers in k0s) - log " Ensuring NVIDIA Container Toolkit on ${gpu_ip}..." 
- ssh_exec "${gpu_ip}" " - if command -v nvidia-ctk &>/dev/null; then - echo 'nvidia-ctk already installed' - else - # Add NVIDIA Container Toolkit repo - curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null 2>/dev/null || true - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ - sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null || true - - # Install - sudo dnf install -y nvidia-container-toolkit 2>/dev/null || \ - sudo yum install -y nvidia-container-toolkit 2>/dev/null || \ - sudo apt-get install -y nvidia-container-toolkit 2>/dev/null || true - fi - - # Configure for k0s containerd (k0s uses /run/k0s/containerd.sock) - if [ -d /etc/k0s/containerd.d ]; then - # nvidia-ctk writes to /etc/containerd/conf.d/ by default, not the - # k0s drop-in dir. Generate it first, then copy with fixups. - sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true - - # Copy the generated config to k0s drop-in location - if [ -f /etc/containerd/conf.d/99-nvidia.toml ]; then - sudo cp /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml - sudo rm -f /etc/containerd/conf.d/99-nvidia.toml - elif [ ! -s /etc/k0s/containerd.d/nvidia.toml ]; then - # Fallback: nvidia-ctk may have written directly; try explicit config path - sudo nvidia-ctk runtime configure --runtime=containerd \ - --config=/etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true - fi - - # Strip version/imports lines so the file is treated as a drop-in - # snippet, not a full containerd config (prevents node NotReady). - sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \ - /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true - - # Set nvidia as the default containerd runtime on GPU nodes so that - # all pods automatically get GPU access without needing runtimeClassName. 
- # This matches EKS behavior where the GPU AMI's default runtime handles - # GPU passthrough. The nvidia runtime is a superset of runc — non-GPU - # containers run unchanged. - # Insert inside the existing [plugins."...".containerd] section (not as - # a new top-level section, which would create a duplicate TOML table). - if ! grep -q 'default_runtime_name' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null; then - sudo sed -i '/\[plugins\.\"io\.containerd\.grpc\.v1\.cri\"\.containerd\]$/{ - a\ default_runtime_name = \"nvidia\" - }' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true - fi - elif [ -f /etc/containerd/config.toml ]; then - sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true - fi - - # Generate CDI (Container Device Interface) specs so the device - # plugin can discover GPUs via CDI when using the nvidia RuntimeClass. - sudo mkdir -p /etc/cdi - sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml 2>/dev/null || true - - # Kill any leftover containerd-shim processes from previous runs - # before restarting the worker. Stale shims keep the old containerd - # socket busy and cause ping-containerd-timeout errors on restart. - sudo systemctl stop k0sworker 2>/dev/null || true - sleep 3 - sudo pkill -9 containerd-shim 2>/dev/null || true - sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true - - # Restart k0s worker to pick up containerd config changes - sudo systemctl start k0sworker 2>/dev/null || true - " || warn " Container toolkit setup on ${gpu_ip} had issues — check manually" + rm -rf "${logdir}" - log " ✓ GPU node ${gpu_ip} setup complete" - done + if [[ ${failed} -gt 0 ]]; then + warn "${failed} GPU node(s) had NVIDIA install issues — check logs above" + else + log "NVIDIA drivers installed successfully on all ${#gpu_ips[@]} GPU node(s)" + fi # Wait for GPU workers to rejoin and verify they are Ready log "Waiting for GPU worker nodes to rejoin cluster and become Ready..." 
@@ -2861,10 +2903,15 @@ spec: YAML fi + log "Splunk Standalone CR applied (pod starts in background)" +} + +# Blocks until Splunk Standalone pod is ready. Called at the end of the +# install flow so the operator and CR can deploy while Splunk boots. +wait_for_splunk_standalone() { log "Waiting for Splunk Standalone to be ready..." kubectl wait --for=condition=ready pod -l app.kubernetes.io/instance=${AI_STANDALONE_NAME} -n ${AI_NS} --timeout=600s || true - - log "Splunk Standalone installed successfully" + log "Splunk Standalone is ready" } # ====== INSTALL AI PLATFORM CR ====== @@ -3070,19 +3117,69 @@ install_ai_platform_stack() { ensure_namespace "${AI_NS}" - # Install infrastructure components - install_minio - install_cert_manager - install_kube_prometheus + # --- Phase 1: Independent infrastructure (parallel) --- + log "Phase 1: Installing independent infrastructure components in parallel..." + local phase1_pids=() phase1_names=() phase1_logdir + phase1_logdir=$(mktemp -d) + + install_minio > "${phase1_logdir}/minio.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("minio") + + install_cert_manager > "${phase1_logdir}/cert-manager.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("cert-manager") + + install_kube_prometheus > "${phase1_logdir}/kube-prometheus.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("kube-prometheus") + + # These don't need cert-manager — run in parallel too + mount_nvme_instance_store > "${phase1_logdir}/nvme.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("nvme-mount") + + install_nvidia_host_drivers > "${phase1_logdir}/nvidia-drivers.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("nvidia-drivers") + + for i in "${!phase1_pids[@]}"; do + if wait "${phase1_pids[$i]}"; then + log " ✓ ${phase1_names[$i]} completed" + else + warn " ✗ ${phase1_names[$i]} had issues" + fi + while IFS= read -r line; do + log " [${phase1_names[$i]}] ${line}" + done < "${phase1_logdir}/${phase1_names[$i]}.log" + done + rm -rf "${phase1_logdir}" + 
ensure_s3compat_credentials - install_otel_operator_and_contrib_collector - mount_nvme_instance_store - install_nvidia_host_drivers - install_nvidia_device_plugin - install_ray_operator - # Install Splunk components - install_splunk_operator + # --- Phase 2: cert-manager-dependent components (parallel) --- + log "Phase 2: Installing cert-manager-dependent components in parallel..." + local phase2_pids=() phase2_names=() phase2_logdir + phase2_logdir=$(mktemp -d) + + install_otel_operator_and_contrib_collector > "${phase2_logdir}/otel.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("otel-operator") + + install_ray_operator > "${phase2_logdir}/ray.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("ray-operator") + + install_splunk_operator > "${phase2_logdir}/splunk-operator.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("splunk-operator") + + install_nvidia_device_plugin > "${phase2_logdir}/nvidia-plugin.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("nvidia-device-plugin") + + for i in "${!phase2_pids[@]}"; do + if wait "${phase2_pids[$i]}"; then + log " ✓ ${phase2_names[$i]} completed" + else + warn " ✗ ${phase2_names[$i]} had issues" + fi + while IFS= read -r line; do + log " [${phase2_names[$i]}] ${line}" + done < "${phase2_logdir}/${phase2_names[$i]}.log" + done + rm -rf "${phase2_logdir}" # Create image pull secrets before Splunk Standalone (it uses the default SA which needs ECR creds) create_image_pull_secrets "${AI_NS}" @@ -3090,14 +3187,16 @@ install_ai_platform_stack() { # Deploy CronJob that auto-refreshes ECR credentials every 6 hours (tokens expire at 12h) install_ecr_credential_refresher + # Apply Splunk Standalone CR (non-blocking — pod boots in background) install_splunk_standalone - # Install AI Platform operator + # Install AI Platform operator and CR while Splunk Standalone boots install_splunk_ai_operator - - # Install AI Platform CR install_ai_platform_cr + # Now wait for Splunk Standalone to be ready (likely already done by now) + 
wait_for_splunk_standalone + log "AI Platform stack installation complete!" } From f3a75d551c82dcb7648d25a58f776a2af761a7f2 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 14:25:18 +0530 Subject: [PATCH 34/55] feat(saia): add SAIA v2 deployment + nginx path-based v1/v2 router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-deploy SAIA v1 API, v2 API, v2 worker, and an nginx reverse proxy for path-based routing, end-to-end verified against spl-copilot clients on k0s airgap clusters. Architecture (per AIService CR): public saia-service (:8080) ──► nginx ──► /saia-api-v2/ ──► saia-v2-service (v2 API :8000) │ └──► everything else ──► saia-v1-service (v1 API :8080) weaviate (:80 HTTP, :50051 gRPC) ← v1, v2, v2 worker, data-loader Job Operator (pkg/ai/features/saia/impl.go): - New stages: SAIAv2Deployment, SAIAv2Worker, NginxConfigMap, NginxDeployment, SAIAv1Service, SAIAv2Service. - SAIAService now selects the nginx pod; v1/v2 Services are internal-only ClusterIPs wired by nginx upstreams. - Nginx config: path regex "/saia-api-v2/" (search-anywhere) + fall- through to v1; proxy_http_version 1.1 + proxy_buffering off for SSE; exact "= /nginx_health" for probes; "= /nginx_status" loopback-only. - reconcilePostInstallHook: explicitly passes VECTOR_DB_HOST/PORT, VECTOR_DB_GRPC_HOST/PORT (50051), VECTOR_DB_SECURE=false, VECTOR_DB_AUTH_ENABLED=false so the v2 data-loader's Weaviate v4 gRPC health check doesn't fall through to "grpc.:443" TLS production defaults. - buildV2ExtraEnv: v2 API/worker receive the same gRPC config. - v2 worker liveness probe uses python3 (base image lacks coreutils) and WORKER_HEARTBEAT_PATH=/tmp/ingestion_worker_heartbeat to match saia-v2 default in app/core/config.py. - reconcileSAIAService initializes Annotations to avoid nil-map panic when the AIService carries custom annotations. - Nginx image resolvable via RELATED_IMAGE_NGINX env var for airgap. 
- v1 worker deployment and SAIAWorkerConfig removed (consolidated into the v2 worker topology). CRD schema (api/v1/aiservice_types.go + generated files): - Add v2 (SAIAv2Config: image, replicas, resources) and v2Worker (SAIAWorkerConfig: replicas, resources) fields. - Regenerate deepcopy, config/crd/bases, helm-chart/.../crds, and the bundled tools/cluster_setup/artifacts.yaml so kube-apiserver does not silently strip the new fields on write. Weaviate (pkg/ai/weaviate.go): - Expose gRPC on containerPort 50051 and Service port 50051, plus GRPC_PORT=50051 env. Required by SAIA v2 + data-loader (Weaviate v4 python client). Helm chart + bundled artifacts: - values.yaml: nginxImage and saiaApiV2Image defaults. - deployment.yaml: RELATED_IMAGE_SAIA_API_V2 and RELATED_IMAGE_NGINX env vars on the operator manager. - tools/cluster_setup/artifacts.yaml: inject the new AIService CRD (with v2/v2Worker), refresh image tags (operator v0.1.20, saia v2.0.4-23-g2fc91e9), and add RELATED_IMAGE_NGINX. k0s install script (tools/cluster_setup/k0s_cluster_with_stack.sh): - Plumb images.nginx.image through yq → sed into the bundled manifest alongside existing image wiring. - Replace fragile `SEDOPTION="-i ''"` + unquoted expansion (which created stray `filename''` backups on macOS) with a bash array `SED_INPLACE=(sed -i "")` that preserves the empty-suffix arg. Cluster config (tools/cluster_setup/k0s-cluster-config.yaml): - Expose images.nginx (docker.io/library/nginx:1.27-alpine default). - Bump operator to v0.1.20 and saia images to v2.0.4-23-g2fc91e9. Tests (pkg/ai/features/saia/impl_test.go, pkg/ai/weaviate_test.go): - Cover v2 API/worker reconcile, nginx ConfigMap regex + streaming directives, nginx image override, v1/v2/public Services, reconcileSAIAService annotation-map regression, and reconcilePostInstallHook gRPC env wiring. .gitignore: - Ignore tools/cluster_setup/*.original (install-script reset snapshots, regenerated on first install, never committed). 
Verified on k0s airgap cluster (4-node L40S): 38 client-path API requests through the public LB — 16 v1 + 22 v2 — 100% routed to the correct upstream (verified via nginx access log upstream field). Made-with: Cursor --- .gitignore | 7 + api/v1/aiservice_types.go | 39 + api/v1/zz_generated.deepcopy.go | 34 + .../crd/bases/ai.splunk.com_aiservices.yaml | 146 ++++ config/manager/kustomization.yaml | 16 +- .../crds/ai.splunk.com_aiservices.yaml | 146 ++++ .../templates/deployment.yaml | 4 + helm-chart/splunk-ai-operator/values.yaml | 5 + pkg/ai/features/saia/impl.go | 677 +++++++++++++++++- pkg/ai/features/saia/impl_test.go | 420 ++++++++++- pkg/ai/reconciler.go | 14 +- pkg/ai/weaviate.go | 37 +- pkg/ai/weaviate_test.go | 26 +- tools/cluster_setup/artifacts.yaml | 177 ++++- tools/cluster_setup/k0s-cluster-config.yaml | 38 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 56 +- 16 files changed, 1768 insertions(+), 74 deletions(-) diff --git a/.gitignore b/.gitignore index 3cc0b27..ac1d882 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,10 @@ skaffold.env.local *.tgz helm-chart/**/charts/ !helm-chart/**/charts/.gitkeep + +# Cluster-setup script byproducts (*.original): pristine-snapshot backups +# written by tools/cluster_setup/k0s_cluster_with_stack.sh on first run and +# reused as a reset point on subsequent runs (see configure_images() +# → "Restoring from clean originals"). Needed locally for idempotent +# re-installs; never committed. +tools/cluster_setup/*.original diff --git a/api/v1/aiservice_types.go b/api/v1/aiservice_types.go index 41914fa..a675cb1 100644 --- a/api/v1/aiservice_types.go +++ b/api/v1/aiservice_types.go @@ -117,6 +117,45 @@ type AIServiceSpec struct { // +kubebuilder:default="cluster.local" // +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` ClusterDomain string `json:"clusterDomain,omitempty"` + + // V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. 
+ // Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + // +kubebuilder:validation:Optional + V2 SAIAv2Config `json:"v2,omitempty"` + + // V2Worker configures the v2 SAIA worker deployment (same v2 image, command=run-worker.sh). + // +kubebuilder:validation:Optional + V2Worker SAIAWorkerConfig `json:"v2Worker,omitempty"` +} + +// SAIAv2Config defines the configuration for the SAIA v2 API deployment. +type SAIAv2Config struct { + // Image is the container image for the v2 API pod + // +kubebuilder:validation:Optional + Image string `json:"image,omitempty"` + + // Replicas is the number of v2 API replicas + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas,omitempty"` + + // Resources defines the compute resources for the v2 API pods + // +kubebuilder:validation:Optional + Resources corev1.ResourceRequirements `json:"resources,omitempty"` +} + +// SAIAWorkerConfig defines the configuration for a SAIA worker deployment. 
+type SAIAWorkerConfig struct { + // Replicas is the number of worker replicas + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas,omitempty"` + + // Resources defines the compute resources for the worker pods + // +kubebuilder:validation:Optional + Resources corev1.ResourceRequirements `json:"resources,omitempty"` } // MetricsConfig defines the metrics configuration for monitoring diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index f63b7c9..987e478 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -247,6 +247,8 @@ func (in *AIServiceSpec) DeepCopyInto(out *AIServiceSpec) { out.Metrics = in.Metrics in.MTLS.DeepCopyInto(&out.MTLS) in.ServiceTemplate.DeepCopyInto(&out.ServiceTemplate) + in.V2.DeepCopyInto(&out.V2) + in.V2Worker.DeepCopyInto(&out.V2Worker) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AIServiceSpec. @@ -526,6 +528,38 @@ func (in *ReplicasSpec) DeepCopy() *ReplicasSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SAIAWorkerConfig) DeepCopyInto(out *SAIAWorkerConfig) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SAIAWorkerConfig. +func (in *SAIAWorkerConfig) DeepCopy() *SAIAWorkerConfig { + if in == nil { + return nil + } + out := new(SAIAWorkerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SAIAv2Config) DeepCopyInto(out *SAIAv2Config) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SAIAv2Config. 
+func (in *SAIAv2Config) DeepCopy() *SAIAv2Config { + if in == nil { + return nil + } + out := new(SAIAv2Config) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingSpec) DeepCopyInto(out *SchedulingSpec) { *out = *in diff --git a/config/crd/bases/ai.splunk.com_aiservices.yaml b/config/crd/bases/ai.splunk.com_aiservices.yaml index 5bce496..f203f3c 100644 --- a/config/crd/bases/ai.splunk.com_aiservices.yaml +++ b/config/crd/bases/ai.splunk.com_aiservices.yaml @@ -1895,6 +1895,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). + properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. 
+ properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 415896e..1d09a52 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -7,16 +7,16 @@ resources: patches: - patch: "- op: add\n path: /spec/template/spec/containers/0/env\n value: \n - name: WATCH_NAMESPACE\n value: WATCH_NAMESPACE_VALUE\n - name: RELATED_IMAGE_SPLUNK_ENTERPRISE\n - \ value: splunk/splunk:10.2.0-dev1\n - name: OPERATOR_NAME\n value: - splunk-operator\n - name: POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: - metadata.name\n - name: RELATED_IMAGE_RAY_HEAD\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-head:build-17\"\n + \ value: splunk/splunk:10.2.0-dev1\n - name: OPERATOR_NAME\n value: splunk-operator\n + \ - name: POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n + \ - name: RELATED_IMAGE_RAY_HEAD\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-head:build-17\"\n \ - name: RELATED_IMAGE_RAY_WORKER\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-17\"\n \ - name: RELATED_IMAGE_WEAVIATE\n value: \"semitechnologies/weaviate:stable-v1.28-007846a\"\n \ - name: RELATED_IMAGE_SAIA_API\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-api:build-1\"\n \ - name: RELATED_IMAGE_POST_INSTALL_HOOK\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-1\"\n - \ - name: RELATED_IMAGE_FLUENT_BIT\n value: \"fluent/fluent-bit:1.9.6\"\n - \ - name: RELATED_IMAGE_OTEL_COLLECTOR\n value: \"otel/opentelemetry-collector-contrib:0.122.1\"\n - - name: MODEL_VERSION\n value: \"v0.3.14-36-g1549f5a\"\n - name: RAY_VERSION\n + \ - name: 
RELATED_IMAGE_FLUENT_BIT\n value: \"fluent/fluent-bit:1.9.6\"\n - + name: RELATED_IMAGE_OTEL_COLLECTOR\n value: \"otel/opentelemetry-collector-contrib:0.122.1\"\n + \ - name: MODEL_VERSION\n value: \"v0.3.14-36-g1549f5a\"\n - name: RAY_VERSION\n \ value: \"2.44.0\"" target: kind: Deployment @@ -25,5 +25,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: docker.io/splunk/splunk-ai-operator - newTag: 0.1.0 + newName: docker.com/splunk/splunk-ai-operator + newTag: v0.0.1 diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml index 5bce496..f203f3c 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml @@ -1895,6 +1895,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. 
It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). + properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. 
+ + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database diff --git a/helm-chart/splunk-ai-operator/templates/deployment.yaml b/helm-chart/splunk-ai-operator/templates/deployment.yaml index 34ed56a..4f17067 100644 --- a/helm-chart/splunk-ai-operator/templates/deployment.yaml +++ b/helm-chart/splunk-ai-operator/templates/deployment.yaml @@ -76,10 +76,14 @@ spec: value: {{ .Values.weaviateImage }} - name: RELATED_IMAGE_SAIA_API value: {{ .Values.saiaApiImage }} + - name: RELATED_IMAGE_SAIA_API_V2 + value: {{ .Values.saiaApiV2Image }} - name: RELATED_IMAGE_POST_INSTALL_HOOK value: {{ .Values.saiaSchemaImage }} - name: RELATED_IMAGE_OTEL_COLLECTOR value: {{ .Values.otelCollectorImage }} + - name: RELATED_IMAGE_NGINX + value: {{ .Values.nginxImage }} - name: MODEL_VERSION value: v0.3.14-36-g1549f5a - name: RAY_VERSION diff --git a/helm-chart/splunk-ai-operator/values.yaml b/helm-chart/splunk-ai-operator/values.yaml index 2a4f660..f16dc4a 100644 --- a/helm-chart/splunk-ai-operator/values.yaml +++ b/helm-chart/splunk-ai-operator/values.yaml @@ -107,11 +107,16 @@ weaviateImage: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" # SAIA (Splunk AI Assistant) images saiaApiImage: "docker.io/splunk/saia-api:1.1.0" +saiaApiV2Image: "docker.io/splunk/saia-api-v2:1.1.0" saiaSchemaImage: "docker.io/splunk/saia-data-loader:1.1.0" # OpenTelemetry Collector sidecar image otelCollectorImage: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" +# Nginx reverse proxy image used by the SAIA reconciler to route v1/v2 traffic. +# Override this in airgapped installs to point at your internal mirror. 
+nginxImage: "nginx:1.27-alpine" + # Set security context for Splunk Operator pod # reference: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#podsecuritycontext-v1-core securityContext: diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 16d4642..9f1a301 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -61,6 +61,12 @@ func (r *SaiaReconciler) Reconcile(ctx context.Context, aiservice *aiv1.AIServic {"Certificate", r.reconcileCertificate}, {"PostInstallHook", r.reconcilePostInstallHook}, {"SAIADeployment", r.reconcileSAIADeployment}, + {"SAIAv2Deployment", r.reconcileSAIAv2Deployment}, + {"SAIAv2Worker", r.reconcileSAIAv2Worker}, + {"NginxConfigMap", r.reconcileNginxConfigMap}, + {"NginxDeployment", r.reconcileNginxDeployment}, + {"SAIAv1Service", r.reconcileSAIAv1Service}, + {"SAIAv2Service", r.reconcileSAIAv2Service}, {"SAIAService", r.reconcileSAIAService}, {"ServiceMonitor", r.reconcileServiceMonitor}, } @@ -183,6 +189,18 @@ func (r *SaiaReconciler) validateAIService( ai.Spec.Replicas = 1 } + // V2 image is required (v2 is always deployed alongside v1) + if ai.Spec.V2.Image == "" { + r.Recorder.Event(ai, corev1.EventTypeWarning, "InvalidSpec", "v2.image must be set for SAIA v2 deployment") + return fmt.Errorf("v2.image must be set for SAIA v2 deployment") + } + if ai.Spec.V2.Replicas == 0 { + ai.Spec.V2.Replicas = 1 + } + if ai.Spec.V2Worker.Replicas == 0 { + ai.Spec.V2Worker.Replicas = 1 + } + if ai.Spec.SplunkConfiguration.Endpoint == "" && ai.Spec.SplunkConfiguration.SplunkCustomResourceRef.Name == "" { r.Recorder.Event(ai, corev1.EventTypeWarning, "SplunkConfigMissing", "Splunk configuration is missing assuming no logging") return nil @@ -558,8 +576,22 @@ func (r *SaiaReconciler) reconcilePostInstallHook( Name: "vector-db-setup-container", Image: hookImage, ImagePullPolicy: corev1.PullIfNotPresent, + // The v2 data-loader image (>= v2.0.4-13-g3b677604) uses the + // Weaviate v4 
Python client, which performs a gRPC health + // check on connect and requires explicit gRPC host/port. Its + // URL-compat shim defaults to the Splunk production naming + // (grpc.:443 TLS) if these are unset — wrong for k0s + // airgap where Weaviate exposes gRPC on the same Service + // (port 50051, plaintext). Always set these explicitly so + // the shim's setdefault() calls are no-ops. Env: []corev1.EnvVar{ {Name: "VECTOR_DB_URL", Value: uri}, + {Name: "VECTOR_DB_HOST", Value: ai.Spec.VectorDbUrl}, + {Name: "VECTOR_DB_PORT", Value: "80"}, + {Name: "VECTOR_DB_GRPC_HOST", Value: ai.Spec.VectorDbUrl}, + {Name: "VECTOR_DB_GRPC_PORT", Value: "50051"}, + {Name: "VECTOR_DB_SECURE", Value: "false"}, + {Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"}, {Name: "SPLUNK_AI_ASSISTANT_SERVICE_CMP", Value: "true"}, }, }, @@ -584,6 +616,143 @@ func (r *SaiaReconciler) reconcilePostInstallHook( return fmt.Errorf("created Job %q, waiting for completion", job.Name) } +// buildSAIABaseEnv returns the common environment variables shared across all SAIA pods +// (v1 API, v1 worker, v2 API, v2 worker). Callers append pod-specific vars. 
+func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar {
+	// Everything object-store related is read from the task volume spec.
+	taskVol := ai.Spec.TaskVolume
+	bucket := extractBucketName(taskVol.Path)
+
+	// credentialEnv builds an env var whose value is read from one key of the
+	// Secret named by TaskVolume.SecretRef.
+	credentialEnv := func(envName, secretKey string) corev1.EnvVar {
+		return corev1.EnvVar{
+			Name: envName,
+			ValueFrom: &corev1.EnvVarSource{
+				SecretKeyRef: &corev1.SecretKeySelector{
+					LocalObjectReference: corev1.LocalObjectReference{Name: taskVol.SecretRef},
+					Key:                  secretKey,
+				},
+			},
+		}
+	}
+
+	// Variables that are always present.
+	vars := []corev1.EnvVar{
+		{Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl},
+		{Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl},
+		{Name: "S3_BUCKET", Value: bucket},
+	}
+
+	// A custom endpoint selects an S3-compatible store: expose endpoint and bucket.
+	if taskVol.Endpoint != "" {
+		vars = append(vars,
+			corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: taskVol.Endpoint},
+			corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_BUCKET", Value: bucket},
+		)
+	}
+
+	// Credentials are referenced from the Secret rather than copied into the pod spec.
+	if taskVol.SecretRef != "" {
+		vars = append(vars,
+			credentialEnv("S3COMPAT_OBJECT_STORE_ACCESS_KEY", "s3_access_key"),
+			credentialEnv("S3COMPAT_OBJECT_STORE_SECRET_KEY", "s3_secret_key"),
+		)
+	}
+
+	return vars
+}
+
+// buildV2ExtraEnv returns the extra environment the SAIA v2 image expects on
+// top of buildSAIABaseEnv. v2 reads VECTOR_DB_HOST instead of VECTOR_DB_URL and
+// ML_PLATFORM_URL instead of PLATFORM_URL, and it is told that the vector DB
+// uses neither TLS nor authentication.
+func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar {
+	// The same host value is used for both the HTTP and the gRPC endpoint.
+	vectorDbHost := ai.Spec.VectorDbUrl
+
+	extra := make([]corev1.EnvVar, 0, 7)
+	extra = append(extra,
+		corev1.EnvVar{Name: "ML_PLATFORM_URL", Value: ai.Spec.AIPlatformUrl},
+		corev1.EnvVar{Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"},
+		corev1.EnvVar{Name: "VECTOR_DB_GRPC_HOST", Value: vectorDbHost},
+		corev1.EnvVar{Name: "VECTOR_DB_GRPC_PORT", Value: "50051"},
+		corev1.EnvVar{Name: "VECTOR_DB_HOST", Value: vectorDbHost},
+		corev1.EnvVar{Name: "VECTOR_DB_PORT", Value: "80"},
+		corev1.EnvVar{Name: "VECTOR_DB_SECURE", Value: "false"},
+	)
+	return extra
+}
+
+// buildSAIATLSEnv appends TLS-related env vars and returns updated env, volumes, and mounts.
+// When mTLS is enabled with operator termination it mounts the TLS Secret at
+// /etc/tls, points TLS_CERT_FILE/TLS_KEY_FILE at it and adds the "https"
+// container port 8443 (the ports slice is returned as well); in every other
+// case it only sets TLS_DISABLED=true.
+func buildSAIATLSEnv(ai *aiv1.AIService, env []corev1.EnvVar, volumes []corev1.Volume, mounts []corev1.VolumeMount, ports []corev1.ContainerPort) ([]corev1.EnvVar, []corev1.Volume, []corev1.VolumeMount, []corev1.ContainerPort) {
+	operatorTerminated := ai.Spec.MTLS.Enabled && ai.Spec.MTLS.Termination == "operator"
+	if !operatorTerminated {
+		// No certificate is mounted; tell the container to skip TLS.
+		env = append(env, corev1.EnvVar{Name: "TLS_DISABLED", Value: "true"})
+		return env, volumes, mounts, ports
+	}
+
+	const tlsVolumeName = "tls"
+
+	tlsVolume := corev1.Volume{Name: tlsVolumeName}
+	tlsVolume.Secret = &corev1.SecretVolumeSource{SecretName: ai.Spec.MTLS.SecretName}
+	volumes = append(volumes, tlsVolume)
+
+	mounts = append(mounts, corev1.VolumeMount{Name: tlsVolumeName, MountPath: "/etc/tls", ReadOnly: true})
+
+	env = append(env,
+		corev1.EnvVar{Name: "TLS_CERT_FILE", Value: "/etc/tls/tls.crt"},
+		corev1.EnvVar{Name: "TLS_KEY_FILE", Value: "/etc/tls/tls.key"},
+	)
+
+	ports = append(ports, corev1.ContainerPort{Name: "https", ContainerPort: 8443})
+
+	return env, volumes, mounts, ports
+}
+
+// saiaEnvFrom returns the EnvFrom entry that imports the keys of the
+// "<name>-saia-config" ConfigMap into the container environment.
+func saiaEnvFrom(ai *aiv1.AIService) []corev1.EnvFromSource {
+	configMapName := fmt.Sprintf("%s-saia-config", ai.Name)
+	source := corev1.EnvFromSource{
+		ConfigMapRef: &corev1.ConfigMapEnvSource{
+			LocalObjectReference: corev1.LocalObjectReference{Name: configMapName},
+		},
+	}
+	return []corev1.EnvFromSource{source}
+}
+
+// saiaVolumes returns the "config-volume" volume, backed by the
+// "splunk-<name>-feature-config" ConfigMap, and its mount at /etc/config.
+func saiaVolumes(ai *aiv1.AIService) ([]corev1.Volume, []corev1.VolumeMount) {
+	const volumeName = "config-volume"
+
+	configVolume := corev1.Volume{Name: volumeName}
+	configVolume.ConfigMap = &corev1.ConfigMapVolumeSource{
+		LocalObjectReference: corev1.LocalObjectReference{
+			Name: fmt.Sprintf("splunk-%s-feature-config", ai.Name),
+		},
+	}
+	configMount := corev1.VolumeMount{Name: volumeName, MountPath: "/etc/config"}
+
+	return []corev1.Volume{configVolume}, []corev1.VolumeMount{configMount}
+}
+
+// saiaLabelsAndAnnotations returns the labels and annotations for SAIA pods.
+// Operator defaults are written first; labels and annotations set on the
+// AIService are copied afterwards and therefore win on key collisions.
+func saiaLabelsAndAnnotations(ai *aiv1.AIService, component string) (map[string]string, map[string]string) {
+	mergedLabels := map[string]string{
+		"app":       ai.Name,
+		"component": component,
+		"area":      "ml",
+		"team":      "ml",
+	}
+	for key, value := range ai.Labels {
+		mergedLabels[key] = value
+	}
+
+	mergedAnnotations := map[string]string{
+		"prometheus.io/port":   "8088",
+		"prometheus.io/path":   "/metrics",
+		"prometheus.io/scheme": "http",
+	}
+	for key, value := range ai.Annotations {
+		switch key {
+		case "kubectl.kubernetes.io/last-applied-configuration", "kubectl.kubernetes.io/restartedAt":
+			// kubectl bookkeeping annotations are not propagated.
+		default:
+			mergedAnnotations[key] = value
+		}
+	}
+
+	return mergedLabels, mergedAnnotations
+}
+
 // reconcileSAIADeployment ensures the main Deployment exists and is configured. func (r *SaiaReconciler) reconcileSAIADeployment( ctx context.Context, @@ -621,9 +790,12 @@ func (r *SaiaReconciler) reconcileSAIADeployment( {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, } - // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL for custom endpoint (MinIO, SeaweedFS, etc.). + // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL and S3COMPAT_OBJECT_STORE_BUCKET for custom endpoint (MinIO, SeaweedFS, etc.). if ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) + env = append(env, + corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}, + corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, + ) } // S3-compatible object store credentials from secretRef (S3COMPAT_OBJECT_STORE_ACCESS_KEY, S3COMPAT_OBJECT_STORE_SECRET_KEY).
@@ -743,6 +915,8 @@ func (r *SaiaReconciler) reconcileSAIADeployment( Name: ai.Name, Image: os.Getenv("RELATED_IMAGE_SAIA_API"), ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sh", "-c"}, + Args: []string{"python -m uvicorn --host 0.0.0.0 server.main:metrics_app --port 8088 & exec python -m uvicorn --host 0.0.0.0 server.main:app --port 8080"}, Ports: ports, VolumeMounts: mounts, Resources: ai.Spec.Resources, @@ -786,7 +960,487 @@ func (r *SaiaReconciler) reconcileSAIADeployment( return nil } -// reconcileSAIAService ensures the Service for SAIA is created/updated. // remove me
+// saiaV2ResourcesOrDefault returns override when it sets any requests, limits
+// or claims, and fallback otherwise. All three are checked so that a
+// limits-only override is kept instead of being replaced by the fallback
+// (Kubernetes defaults requests to limits when only limits are given).
+func saiaV2ResourcesOrDefault(override, fallback corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if len(override.Requests) == 0 && len(override.Limits) == 0 && len(override.Claims) == 0 {
+		return fallback
+	}
+	return override
+}
+
+// reconcileSAIAv2Deployment creates or updates the SAIA v2 API Deployment
+// ("<name>-saia-v2-deployment"). The matching ClusterIP Service is handled
+// separately by reconcileSAIAv2Service.
+//
+// The container uses the image from spec.v2.image, starts the API on port 8000
+// and the metrics app on port 8088, and falls back to spec.resources when
+// spec.v2.resources is empty. It returns a wrapped error when the owner
+// reference cannot be set or the create/update call fails.
+func (r *SaiaReconciler) reconcileSAIAv2Deployment(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	volumes, mounts := saiaVolumes(ai)
+	ports := []corev1.ContainerPort{
+		{Name: "http", ContainerPort: 8000},
+		{Name: "metrics", ContainerPort: 8088},
+	}
+
+	env := buildSAIABaseEnv(ai)
+	env = append(env, buildV2ExtraEnv(ai)...)
+	env = append(env, corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"})
+	env, volumes, mounts, ports = buildSAIATLSEnv(ai, env, volumes, mounts, ports)
+	// Sort by name so the generated pod template does not depend on append order.
+	sort.Slice(env, func(i, j int) bool { return env[i].Name < env[j].Name })
+
+	component := ai.Name + "-v2-api"
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v2-deployment",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v2 Deployment: %w", err)
+	}
+
+	// Use the v2-specific resources when any are configured, else the shared ones.
+	v2Resources := saiaV2ResourcesOrDefault(ai.Spec.V2.Resources, ai.Spec.Resources)
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
+		deployment.Spec.Replicas = &ai.Spec.V2.Replicas
+
+		// The selector is only set while it is still nil, i.e. never overwritten
+		// on an existing Deployment.
+		if deployment.Spec.Selector == nil {
+			deployment.Spec.Selector = &metav1.LabelSelector{
+				MatchLabels: map[string]string{"app": ai.Name, "component": component},
+			}
+		}
+
+		deployment.Spec.Template = corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels:      map[string]string{"app": ai.Name, "component": component},
+				Annotations: annotations,
+			},
+			Spec: corev1.PodSpec{
+				ServiceAccountName: ai.Spec.ServiceAccountName,
+				Containers: []corev1.Container{{
+					Name:            "saia-v2-api",
+					Image:           ai.Spec.V2.Image,
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					// NOTE(review): in sh, "A && B & exec C" backgrounds the whole
+					// "A && B" list in a subshell, so anything init-prometheus.sh
+					// exports reaches only the metrics server, not the exec'd API
+					// process — confirm this is intended.
+					Command:      []string{"/bin/sh", "-c"},
+					Args:         []string{". /home/splunk/init-prometheus.sh && python -m uvicorn --host 0.0.0.0 app.main:metrics_app --port 8088 & exec python -m uvicorn --host 0.0.0.0 app.main:app --port 8000"},
+					Ports:        ports,
+					VolumeMounts: mounts,
+					Resources:    v2Resources,
+					Env:          env,
+					EnvFrom:      saiaEnvFrom(ai),
+					LivenessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)},
+						},
+						PeriodSeconds:    30,
+						FailureThreshold: 5,
+					},
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)},
+						},
+						PeriodSeconds:    30,
+						FailureThreshold: 5,
+					},
+					StartupProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)},
+						},
+						InitialDelaySeconds: 10,
+						PeriodSeconds:       30,
+						FailureThreshold:    5,
+					},
+				}},
+				Volumes:          volumes,
+				Affinity:         &ai.Spec.Affinity,
+				Tolerations:      ai.Spec.Tolerations,
+				ImagePullSecrets: ai.Spec.ImagePullSecrets,
+			},
+		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update v2 Deployment: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAv2Worker creates or updates the SAIA v2 ingestion worker
+// Deployment ("<name>-saia-v2-worker"). It runs the same image as the v2 API
+// (spec.v2.image) but starts app.workers.ingestion_worker instead of the API
+// server, and falls back to spec.resources when spec.v2Worker.resources is
+// empty. Liveness is derived from the worker's heartbeat file.
+func (r *SaiaReconciler) reconcileSAIAv2Worker(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	volumes, mounts := saiaVolumes(ai)
+	ports := []corev1.ContainerPort{
+		{Name: "metrics", ContainerPort: 8088},
+	}
+
+	env := buildSAIABaseEnv(ai)
+	env = append(env, buildV2ExtraEnv(ai)...)
+	// Keep heartbeat path in sync with saia-v2's default (app/core/config.py:
+	// worker_heartbeat_path = "/tmp/ingestion_worker_heartbeat"). The ingestion
+	// worker writes a floating-point unix timestamp to this file every poll cycle.
+	env = append(env,
+		corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "600"},
+		corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"},
+		corev1.EnvVar{Name: "WORKER_HEARTBEAT_PATH", Value: "/tmp/ingestion_worker_heartbeat"},
+	)
+	// The worker exposes no https port, so the returned ports slice is discarded.
+	env, volumes, mounts, _ = buildSAIATLSEnv(ai, env, volumes, mounts, nil)
+	// Sort by name so the generated pod template does not depend on append order.
+	sort.Slice(env, func(i, j int) bool { return env[i].Name < env[j].Name })
+
+	component := ai.Name + "-v2-worker"
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v2-worker",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v2 worker Deployment: %w", err)
+	}
+
+	// Use the worker-specific resources when any are configured, else the shared ones.
+	v2WorkerResources := saiaV2ResourcesOrDefault(ai.Spec.V2Worker.Resources, ai.Spec.Resources)
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
+		deployment.Spec.Replicas = &ai.Spec.V2Worker.Replicas
+
+		// The selector is only set while it is still nil, i.e. never overwritten
+		// on an existing Deployment.
+		if deployment.Spec.Selector == nil {
+			deployment.Spec.Selector = &metav1.LabelSelector{
+				MatchLabels: map[string]string{"app": ai.Name, "component": component},
+			}
+		}
+
+		deployment.Spec.Template = corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels:      map[string]string{"app": ai.Name, "component": component},
+				Annotations: annotations,
+			},
+			Spec: corev1.PodSpec{
+				ServiceAccountName: ai.Spec.ServiceAccountName,
+				Containers: []corev1.Container{{
+					Name:            "saia-v2-worker",
+					Image:           ai.Spec.V2.Image,
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					// NOTE(review): in sh, "A && B & exec C" backgrounds the whole
+					// "A && B" list in a subshell, so anything init-prometheus.sh
+					// exports reaches only the metrics server, not the exec'd
+					// worker process — confirm this is intended.
+					Command:      []string{"/bin/sh", "-c"},
+					Args:         []string{". /home/splunk/init-prometheus.sh && python -m uvicorn --host 0.0.0.0 app.main:metrics_app --port 8088 & exec python -m app.workers.ingestion_worker"},
+					Ports:        ports,
+					VolumeMounts: mounts,
+					Resources:    v2WorkerResources,
+					Env:          env,
+					EnvFrom:      saiaEnvFrom(ai),
+					LivenessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							Exec: &corev1.ExecAction{
+								// The saia-v2 base image (python3-debian13-vault:4.1.3) is a minimal
+								// Python runtime that lacks coreutils like `date`, `cat`, `cut`. Use
+								// python3 directly, which is guaranteed to exist. The heartbeat file
+								// contains a float "secs.usec\n" written by ingestion_worker.
+								// The probe succeeds only while that timestamp is under 120s old.
+								Command: []string{
+									"python3", "-c",
+									"import os,sys,time\n" +
+										"p=os.environ.get('WORKER_HEARTBEAT_PATH','/tmp/ingestion_worker_heartbeat')\n" +
+										"sys.exit(0 if os.path.exists(p) and (time.time()-float(open(p).read().strip()))<120 else 1)",
+								},
+							},
+						},
+						PeriodSeconds:       60,
+						FailureThreshold:    3,
+						InitialDelaySeconds: 30,
+					},
+				}},
+				Volumes:          volumes,
+				Affinity:         &ai.Spec.Affinity,
+				Tolerations:      ai.Spec.Tolerations,
+				ImagePullSecrets: ai.Spec.ImagePullSecrets,
+			},
+		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update v2 worker Deployment: %w", err)
+	}
+	return nil
+}
+
+// reconcileNginxConfigMap creates the ConfigMap with nginx.conf for path-based routing.
+// The generated config listens on 8080, proxies any request whose path
+// contains /saia-api-v2/ to "<name>-saia-v2-service:8000" and everything else
+// to "<name>-saia-v1-service:8080". It is stored under the "nginx.conf" key of
+// the "<name>-saia-nginx-config" ConfigMap, which is owned by the AIService.
+func (r *SaiaReconciler) reconcileNginxConfigMap(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	v1ServiceName := ai.Name + "-saia-v1-service"
+	v2ServiceName := ai.Name + "-saia-v2-service"
+
+	// The two format verbs in the template are filled with v1ServiceName and
+	// v2ServiceName, in that order; the template must not contain any other
+	// percent sign.
+	nginxConf := fmt.Sprintf(`worker_processes auto;
+error_log /dev/stderr warn;
+pid /tmp/nginx.pid;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    log_format routing '$remote_addr - [$time_local] "$request" '
+                       'status=$status upstream=$upstream_addr '
+                       'rt=$request_time uct=$upstream_connect_time urt=$upstream_response_time';
+
+    access_log /dev/stdout routing;
+
+    upstream saia_v1 {
+        server %s:8080;
+    }
+
+    upstream saia_v2 {
+        server %s:8000;
+    }
+
+    server {
+        listen 8080;
+
+        # Nginx health/status endpoints use exact-match ("=") locations. nginx
+        # evaluates exact matches before any regex location and stops at the
+        # first hit, regardless of declaration order, so these two URIs are
+        # always answered by nginx itself. Only the exact URI matches: a path
+        # such as /saia-api-v2/nginx_status is NOT caught here and is proxied
+        # to the v2 backend by the regex location below.
+        location = /nginx_health {
+            return 200 'ok';
+            add_header Content-Type text/plain;
+        }
+
+        location = /nginx_status {
+            # stub_status exposes counters (active connections, reqs/s, etc.).
+            # k8s probes use /nginx_health — NOT /nginx_status — so restricting
+            # this to the nginx pod's loopback is safe. Operators needing to
+            # scrape should exec into the pod, e.g.
+            #   kubectl exec <nginx-pod> -- curl 127.0.0.1:8080/nginx_status
+            stub_status on;
+            allow 127.0.0.1;
+            deny all;
+        }
+
+        # v2: any path containing /saia-api-v2/ (with or without a tenant
+        # prefix). Using "search anywhere" avoids a ^/{tenant}/ anchor
+        # that would silently fall through to v1 for tenant-less callers.
+        # Word boundary via "/saia-api-v2/" (not "saia-api-v2" substring)
+        # prevents accidental matches like /foo/saia-api-v2-legacy/.
+        location ~ /saia-api-v2/ {
+            proxy_pass http://saia_v2;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_read_timeout 300s;
+            proxy_send_timeout 300s;
+            proxy_buffering off;
+            chunked_transfer_encoding on;
+        }
+
+        # v1: everything else (including /health, /{tenant}/saia-api/v1alpha1/...)
+        location / {
+            proxy_pass http://saia_v1;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_read_timeout 300s;
+            proxy_send_timeout 300s;
+            proxy_buffering off;
+        }
+    }
+}
+`, v1ServiceName, v2ServiceName)
+
+	cmName := ai.Name + "-saia-nginx-config"
+	cm := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      cmName,
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, cm, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on nginx ConfigMap: %w", err)
+	}
+
+	// Data is replaced wholesale so the stored config always equals the template.
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, cm, func() error {
+		cm.Data = map[string]string{"nginx.conf": nginxConf}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update nginx ConfigMap: %w", err)
+	}
+	return nil
+}
+
+// reconcileNginxDeployment creates the nginx reverse proxy Deployment.
+// It runs a single replica of the image named by RELATED_IMAGE_NGINX (default
+// "nginx:1.27-alpine"), mounts nginx.conf from the "<name>-saia-nginx-config"
+// ConfigMap and probes /nginx_health on port 8080. The public SAIA Service
+// selects these pods, so they are scheduled with the same affinity and
+// tolerations as the other SAIA Deployments.
+func (r *SaiaReconciler) reconcileNginxDeployment(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	component := ai.Name + "-nginx"
+	// NOTE(review): saiaLabelsAndAnnotations also adds prometheus.io scrape
+	// annotations for port 8088, but this container only exposes 8080 — confirm
+	// whether the nginx pods should carry them.
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-nginx",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on nginx Deployment: %w", err)
+	}
+
+	var replicas int32 = 1
+
+	// Resolve nginx image. Allow an override via RELATED_IMAGE_NGINX so airgapped
+	// installs can pull the image from a private mirror. Fall back to a stable
+	// upstream tag so `make run` / default helm deploys still work.
+	nginxImage := os.Getenv("RELATED_IMAGE_NGINX")
+	if nginxImage == "" {
+		nginxImage = "nginx:1.27-alpine"
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
+		deployment.Spec.Replicas = &replicas
+
+		// The selector is only set while it is still nil, i.e. never overwritten
+		// on an existing Deployment.
+		if deployment.Spec.Selector == nil {
+			deployment.Spec.Selector = &metav1.LabelSelector{
+				MatchLabels: map[string]string{"app": ai.Name, "component": component},
+			}
+		}
+
+		deployment.Spec.Template = corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels:      map[string]string{"app": ai.Name, "component": component},
+				Annotations: annotations,
+			},
+			Spec: corev1.PodSpec{
+				Containers: []corev1.Container{{
+					Name:            "nginx",
+					Image:           nginxImage,
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					Ports: []corev1.ContainerPort{
+						{Name: "http", ContainerPort: 8080},
+					},
+					// SubPath mounts only the nginx.conf key over the image's default file.
+					VolumeMounts: []corev1.VolumeMount{
+						{Name: "nginx-config", MountPath: "/etc/nginx/nginx.conf", SubPath: "nginx.conf"},
+					},
+					Resources: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceCPU:    resource.MustParse("100m"),
+							corev1.ResourceMemory: resource.MustParse("64Mi"),
+						},
+						Limits: corev1.ResourceList{
+							corev1.ResourceCPU:    resource.MustParse("500m"),
+							corev1.ResourceMemory: resource.MustParse("128Mi"),
+						},
+					},
+					LivenessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/nginx_health", Port: intstr.FromInt(8080)},
+						},
+						PeriodSeconds:    30,
+						FailureThreshold: 3,
+					},
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/nginx_health", Port: intstr.FromInt(8080)},
+						},
+						PeriodSeconds:    10,
+						FailureThreshold: 3,
+					},
+				}},
+				Volumes: []corev1.Volume{
+					{
+						Name: "nginx-config",
+						VolumeSource: corev1.VolumeSource{
+							ConfigMap: &corev1.ConfigMapVolumeSource{
+								LocalObjectReference: corev1.LocalObjectReference{
+									Name: ai.Name + "-saia-nginx-config",
+								},
+							},
+						},
+					},
+				},
+				// Schedule like the v2 API and worker pods so the proxy is not left
+				// Pending on clusters whose nodes are tainted or otherwise restricted.
+				Affinity:         &ai.Spec.Affinity,
+				Tolerations:      ai.Spec.Tolerations,
+				ImagePullSecrets: ai.Spec.ImagePullSecrets,
+			},
+		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update nginx Deployment: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAv1Service creates the internal v1 ClusterIP Service.
+// The Service is named "<name>-saia-v1-service", is owned by the AIService and
+// exposes http (8080) and metrics (8088) of the v1 API pods.
+func (r *SaiaReconciler) reconcileSAIAv1Service(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	// v1 API uses "app: {name}, component: {name}" from reconcileSAIADeployment,
+	// so the AIService name doubles as the component value.
+	podSelector := map[string]string{"app": ai.Name, "component": ai.Name}
+	servicePorts := []corev1.ServicePort{
+		{Name: "http", Port: 8080, TargetPort: intstr.FromInt(8080)},
+		{Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)},
+	}
+
+	service := &corev1.Service{}
+	service.Name = ai.Name + "-saia-v1-service"
+	service.Namespace = ai.Namespace
+	service.Labels = map[string]string{"app": ai.Name}
+
+	if err := controllerutil.SetControllerReference(ai, service, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v1 Service: %w", err)
+	}
+
+	// applySpec re-asserts the operator-owned spec fields on every reconcile.
+	applySpec := func() error {
+		service.Spec.Selector = podSelector
+		service.Spec.Ports = servicePorts
+		service.Spec.Type = corev1.ServiceTypeClusterIP
+		return nil
+	}
+
+	_, err := controllerutil.CreateOrUpdate(ctx, r.Client, service, applySpec)
+	if err != nil {
+		return fmt.Errorf("create/update v1 Service: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAv2Service creates the internal v2 ClusterIP Service.
+// The Service is named "<name>-saia-v2-service", is owned by the AIService and
+// exposes http (8000) and metrics (8088) of the pods labelled with the
+// "<name>-v2-api" component.
+func (r *SaiaReconciler) reconcileSAIAv2Service(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	podSelector := map[string]string{"app": ai.Name, "component": ai.Name + "-v2-api"}
+	servicePorts := []corev1.ServicePort{
+		{Name: "http", Port: 8000, TargetPort: intstr.FromInt(8000)},
+		{Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)},
+	}
+
+	service := &corev1.Service{}
+	service.Name = ai.Name + "-saia-v2-service"
+	service.Namespace = ai.Namespace
+	service.Labels = map[string]string{"app": ai.Name}
+
+	if err := controllerutil.SetControllerReference(ai, service, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v2 Service: %w", err)
+	}
+
+	// applySpec re-asserts the operator-owned spec fields on every reconcile.
+	applySpec := func() error {
+		service.Spec.Selector = podSelector
+		service.Spec.Ports = servicePorts
+		service.Spec.Type = corev1.ServiceTypeClusterIP
+		return nil
+	}
+
+	_, err := controllerutil.CreateOrUpdate(ctx, r.Client, service, applySpec)
+	if err != nil {
+		return fmt.Errorf("create/update v2 Service: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAService ensures the public-facing Service routes to nginx.
func (r *SaiaReconciler) reconcileSAIAService( ctx context.Context, ai *aiv1.AIService, @@ -795,9 +1449,11 @@ func (r *SaiaReconciler) reconcileSAIAService( serviceTemplate := ai.Spec.ServiceTemplate.DeepCopy() cleanServiceTemplate(serviceTemplate) + // Public service points to nginx (which routes to v1/v2 by path) + nginxComponent := ai.Name + "-nginx" + ports := []corev1.ServicePort{ {Name: "http", Port: 8080, TargetPort: intstr.FromInt(8080)}, - {Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)}, } if ai.Spec.MTLS.Enabled && ai.Spec.MTLS.Termination == "operator" { ports = append(ports, corev1.ServicePort{ @@ -806,12 +1462,13 @@ func (r *SaiaReconciler) reconcileSAIAService( } svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: ai.Name + "-saia-service", - Namespace: ai.Namespace, - Labels: map[string]string{"app": ai.Name}, + Name: ai.Name + "-saia-service", + Namespace: ai.Namespace, + Labels: map[string]string{"app": ai.Name}, + Annotations: map[string]string{}, }, Spec: corev1.ServiceSpec{ - Selector: map[string]string{"app": ai.Name, "component": ai.Name}, + Selector: map[string]string{"app": ai.Name, "component": nginxComponent}, Ports: ports, Type: corev1.ServiceTypeClusterIP, }, @@ -851,10 +1508,8 @@ func (r *SaiaReconciler) reconcileSAIAService( return fmt.Errorf("ownerref on Service: %w", err) } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error { - // Update mutable fields - svc.Spec.Selector = map[string]string{"app": ai.Name, "component": ai.Name} + svc.Spec.Selector = map[string]string{"app": ai.Name, "component": nginxComponent} svc.Spec.Ports = ports - // Type is already set above based on ServiceTemplate return nil }); err != nil { r.Recorder.Event(ai, corev1.EventTypeWarning, "InvalidSpec", "create/update Service failed") diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index b5aec6c..6cb15bc 100644 --- a/pkg/ai/features/saia/impl_test.go +++ 
b/pkg/ai/features/saia/impl_test.go @@ -2,16 +2,22 @@ package saia import ( "context" - //"errors" + "fmt" "os" + "strings" "testing" aiv1 "github.com/splunk/splunk-ai-operator/api/v1" "github.com/splunk/splunk-ai-operator/pkg/ai/features/common" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -102,6 +108,7 @@ func Test_validateAIService_defaults(t *testing.T) { Spec: aiv1.AIServiceSpec{ AIPlatformRef: corev1.ObjectReference{Name: "plat", Namespace: "ns"}, TaskVolume: aiv1.ObjectStorageSpec{Path: "/data"}, + V2: aiv1.SAIAv2Config{Image: "saia-v2:latest"}, }, } @@ -113,6 +120,8 @@ func Test_validateAIService_defaults(t *testing.T) { assert.NotNil(t, ai.Spec.Resources.Limits) assert.Equal(t, "ray.ns.svc.cluster.local:8000", ai.Spec.AIPlatformUrl) assert.Equal(t, "vec.ns.svc.cluster.local", ai.Spec.VectorDbUrl) + assert.Equal(t, int32(1), ai.Spec.V2.Replicas) + assert.Equal(t, int32(1), ai.Spec.V2Worker.Replicas) } func Test_getAIPlatform_success(t *testing.T) { @@ -154,3 +163,412 @@ func Test_getAIPlatform_error(t *testing.T) { assert.Error(t, err) assert.Nil(t, got) } + +func Test_validateAIService_missingV2Image(t *testing.T) { + os.Setenv("RELATED_IMAGE_POST_INSTALL_HOOK", "dummy") + defer os.Unsetenv("RELATED_IMAGE_POST_INSTALL_HOOK") + + r := &SaiaReconciler{ + Recorder: record.NewFakeRecorder(10), + Client: fake.NewClientBuilder().WithScheme(buildTestScheme(t)).Build(), + } + + ai := &aiv1.AIService{ + Spec: aiv1.AIServiceSpec{ + AIPlatformUrl: "http://platform:8000", + VectorDbUrl: "weaviate:80", + TaskVolume: aiv1.ObjectStorageSpec{Path: "s3://bucket"}, + }, + } + err := 
r.validateAIService(context.Background(), ai) + assert.ErrorContains(t, err, "v2.image must be set") +} + +// buildFullTestScheme creates a scheme that includes apps/v1 for Deployment testing. +func buildFullTestScheme(t *testing.T) *runtime.Scheme { + s := buildTestScheme(t) + require.NoError(t, appsv1.AddToScheme(s)) + return s +} + +// newTestAIService returns a minimal AIService for reconciliation tests. +func newTestAIService() *aiv1.AIService { + return &aiv1.AIService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "default", + UID: "uid-123", + }, + Spec: aiv1.AIServiceSpec{ + AIPlatformUrl: "http://platform:8000", + VectorDbUrl: "weaviate:80", + Replicas: 1, + ServiceAccountName: "test-sa", + TaskVolume: aiv1.ObjectStorageSpec{ + Path: "s3://test-bucket/saia", + Endpoint: "http://seaweedfs:8333", + SecretRef: "s3-creds", + }, + V2: aiv1.SAIAv2Config{ + Image: "saia-v2:latest", + Replicas: 1, + }, + V2Worker: aiv1.SAIAWorkerConfig{Replicas: 1}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *mustParseQuantity("500m"), + corev1.ResourceMemory: *mustParseQuantity("2Gi"), + }, + }, + }, + } +} + +func mustParseQuantity(s string) *resource.Quantity { + q := resource.MustParse(s) + return &q +} + +func Test_reconcilePostInstallHook_SetsGRPCEnvForV2DataLoader(t *testing.T) { + // Regression: the saia-data-loader v2 image (>= v2.0.4-13-g3b677604) uses + // the Weaviate v4 Python client, which performs a gRPC health check on + // connect. Its url_compat shim defaults VECTOR_DB_GRPC_HOST to + // "grpc.{host}" and VECTOR_DB_GRPC_PORT to "443" (Splunk production + // convention). In k0s airgap, Weaviate exposes gRPC on the same Service at + // :50051. The operator MUST pass these vars explicitly so the shim's + // setdefault() calls are no-ops. 
+ t.Setenv("RELATED_IMAGE_POST_INSTALL_HOOK", "dummy-hook-image:latest") + + scheme := buildFullTestScheme(t) + require.NoError(t, batchv1.AddToScheme(scheme)) + ai := newTestAIService() + ai.Spec.VectorDbUrl = "weaviate.ai-platform.svc.cluster.local" + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + // First call creates the Job and returns "waiting" as a sentinel error. + err := r.reconcilePostInstallHook(context.Background(), ai) + require.Error(t, err) + assert.Contains(t, err.Error(), "created Job") + + job := &batchv1.Job{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-vector-db-setup-posthook", Namespace: "default"}, job)) + + // Collect env var names/values. + envMap := envToMap(job.Spec.Template.Spec.Containers[0].Env) + + assert.Equal(t, "http://weaviate.ai-platform.svc.cluster.local:80", envMap["VECTOR_DB_URL"]) + assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_HOST"]) + assert.Equal(t, "80", envMap["VECTOR_DB_PORT"]) + // Critical: GRPC host must NOT be "grpc."; it's the same Service. 
+ assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_GRPC_HOST"]) + assert.Equal(t, "50051", envMap["VECTOR_DB_GRPC_PORT"]) + assert.Equal(t, "false", envMap["VECTOR_DB_SECURE"]) + assert.Equal(t, "false", envMap["VECTOR_DB_AUTH_ENABLED"]) + assert.Equal(t, "true", envMap["SPLUNK_AI_ASSISTANT_SERVICE_CMP"]) +} + +func Test_reconcileSAIAv2Deployment(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Deployment(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v2-deployment", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "saia-v2:latest", container.Image) + assert.Equal(t, "saia-v2-api", container.Name) + + // v2 API listens on 8000 + assert.Equal(t, int32(8000), container.Ports[0].ContainerPort) + assert.Equal(t, "/health", container.ReadinessProbe.HTTPGet.Path) + assert.Equal(t, 8000, container.ReadinessProbe.HTTPGet.Port.IntValue()) + + envMap := envToMap(container.Env) + assert.Equal(t, "http://platform:8000", envMap["PLATFORM_URL"]) + assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) + assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) +} + +func Test_reconcileSAIAv2Worker(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Worker(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), 
types.NamespacedName{Name: "test-saia-v2-worker", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "saia-v2:latest", container.Image) + assert.Equal(t, "saia-v2-worker", container.Name) + assert.Equal(t, []string{"/bin/sh", "-c"}, container.Command) + assert.Contains(t, container.Args[0], "app.workers.ingestion_worker") + + envMap := envToMap(container.Env) + assert.Equal(t, "600", envMap["RUN_TASKS_DELAY_S"]) + // Heartbeat path must match saia-v2's default (app/core/config.py). + assert.Equal(t, "/tmp/ingestion_worker_heartbeat", envMap["WORKER_HEARTBEAT_PATH"]) + assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) + + // Liveness uses exec (heartbeat file check), not HTTP + assert.NotNil(t, container.LivenessProbe.Exec) + assert.Nil(t, container.LivenessProbe.HTTPGet) + // Probe must use python3 (not coreutils) because the saia-v2 base image lacks date/cat/cut. + assert.Equal(t, "python3", container.LivenessProbe.Exec.Command[0]) + assert.Contains(t, container.LivenessProbe.Exec.Command[2], "WORKER_HEARTBEAT_PATH") + + // Only metrics port, no HTTP API port + assert.Len(t, container.Ports, 1) + assert.Equal(t, int32(8088), container.Ports[0].ContainerPort) +} + +func Test_reconcileNginxConfigMap(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileNginxConfigMap(context.Background(), ai) + require.NoError(t, err) + + cm := &corev1.ConfigMap{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-nginx-config", Namespace: "default"}, cm) + require.NoError(t, err) + + conf := cm.Data["nginx.conf"] + assert.NotEmpty(t, conf) + + // v2 routing: ANY path containing "/saia-api-v2/" — with or without a + // tenant prefix — must be 
sent to the v2 upstream. The regex must NOT + // require a path segment before "saia-api-v2" (that would silently route + // tenant-less probes to v1). + assert.Contains(t, conf, "location ~ /saia-api-v2/") + assert.Contains(t, conf, "proxy_pass http://saia_v2") + + // v1 is the default + assert.Contains(t, conf, "location /") + assert.Contains(t, conf, "proxy_pass http://saia_v1") + + // Upstream names reference the correct internal service names + assert.Contains(t, conf, "test-saia-v1-service:8080") + assert.Contains(t, conf, "test-saia-v2-service:8000") + + // SSE/streaming friendliness + assert.Contains(t, conf, "proxy_buffering off") + assert.Contains(t, conf, "proxy_http_version 1.1") + + // Health and status endpoints — stub_status must be loopback-only. + assert.Contains(t, conf, "location = /nginx_health") + assert.Contains(t, conf, "location = /nginx_status") + assert.Contains(t, conf, "deny all;") +} + +func Test_reconcileNginxDeployment(t *testing.T) { + // Ensure no env override leaks from other tests in the package. 
+ os.Unsetenv("RELATED_IMAGE_NGINX") + + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileNginxDeployment(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-nginx", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "nginx:1.27-alpine", container.Image) + assert.Equal(t, "nginx", container.Name) + assert.Equal(t, int32(8080), container.Ports[0].ContainerPort) + + // ConfigMap volume mount + assert.Equal(t, "/etc/nginx/nginx.conf", container.VolumeMounts[0].MountPath) + assert.Equal(t, "nginx.conf", container.VolumeMounts[0].SubPath) + + // Health probes use nginx_health + assert.Equal(t, "/nginx_health", container.LivenessProbe.HTTPGet.Path) + assert.Equal(t, "/nginx_health", container.ReadinessProbe.HTTPGet.Path) +} + +func Test_reconcileNginxDeployment_imageOverride(t *testing.T) { + os.Setenv("RELATED_IMAGE_NGINX", "private.registry.example.com/nginx:1.29-alpine") + defer os.Unsetenv("RELATED_IMAGE_NGINX") + + scheme := buildFullTestScheme(t) + ai := newTestAIService() + ai.Name = "override" + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileNginxDeployment(context.Background(), ai)) + + dep := &appsv1.Deployment{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "override-saia-nginx", Namespace: "default"}, dep)) + + assert.Equal(t, "private.registry.example.com/nginx:1.29-alpine", + dep.Spec.Template.Spec.Containers[0].Image) +} + +func 
Test_reconcileSAIAService_handlesAnnotationsWithoutPanic(t *testing.T) { + // Regression: the pre-existing code did not initialize svc.Annotations, so + // any user-provided annotation on the AIService caused a "assignment to + // entry in nil map" panic when reconciling the public service. + scheme := buildFullTestScheme(t) + ai := newTestAIService() + ai.Annotations = map[string]string{ + "operator.splunk.com/example": "v1", + "kubectl.kubernetes.io/restartedAt": "should-be-skipped", + "kubectl.kubernetes.io/last-applied-configuration": "should-be-skipped", + } + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NotPanics(t, func() { + err := r.reconcileSAIAService(context.Background(), ai) + require.NoError(t, err) + }) + + svc := &corev1.Service{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-service", Namespace: "default"}, svc)) + + assert.Equal(t, "v1", svc.Annotations["operator.splunk.com/example"]) + assert.NotContains(t, svc.Annotations, "kubectl.kubernetes.io/restartedAt") + assert.NotContains(t, svc.Annotations, "kubectl.kubernetes.io/last-applied-configuration") +} + +func Test_reconcileSAIAv1Service(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv1Service(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v1-service", Namespace: "default"}, svc) + require.NoError(t, err) + + assert.Equal(t, map[string]string{"app": "test", "component": "test"}, svc.Spec.Selector) + assert.Equal(t, int32(8080), 
svc.Spec.Ports[0].Port) +} + +func Test_reconcileSAIAv2Service(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Service(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v2-service", Namespace: "default"}, svc) + require.NoError(t, err) + + assert.Equal(t, map[string]string{"app": "test", "component": "test-v2-api"}, svc.Spec.Selector) + assert.Equal(t, int32(8000), svc.Spec.Ports[0].Port) +} + +func Test_reconcileSAIAService_pointsToNginx(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAService(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-service", Namespace: "default"}, svc) + require.NoError(t, err) + + // Public service must target nginx, not v1 directly + assert.Equal(t, map[string]string{"app": "test", "component": "test-nginx"}, svc.Spec.Selector) + assert.Equal(t, int32(8080), svc.Spec.Ports[0].Port) +} + +func Test_buildSAIABaseEnv(t *testing.T) { + ai := newTestAIService() + env := buildSAIABaseEnv(ai) + envMap := envToMap(env) + + assert.Equal(t, "http://platform:8000", envMap["PLATFORM_URL"]) + assert.Equal(t, "weaviate:80", envMap["VECTOR_DB_URL"]) + assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["S3COMPAT_OBJECT_STORE_ENDPOINT_URL"]) + assert.Equal(t, "test-bucket", 
envMap["S3COMPAT_OBJECT_STORE_BUCKET"]) + + // S3 creds come from secretRef + found := false + for _, e := range env { + if e.Name == "S3COMPAT_OBJECT_STORE_ACCESS_KEY" { + found = true + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) + } + } + assert.True(t, found, "S3COMPAT_OBJECT_STORE_ACCESS_KEY should be present") +} + +func Test_extractBucketName(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"s3://my-bucket/path", "my-bucket"}, + {"s3compat://bucket-name", "bucket-name"}, + {"minio://bucket-name", "bucket-name"}, + {"seaweedfs://my-bucket/prefix", "my-bucket"}, + {"gs://my-bucket", "my-bucket"}, + {"plain-bucket", "plain-bucket"}, + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + assert.Equal(t, tt.want, extractBucketName(tt.input)) + }) + } +} + +// envToMap converts a slice of EnvVar to a map for easy assertion. +// Only includes env vars with direct values (not ValueFrom). 
+func envToMap(envs []corev1.EnvVar) map[string]string { + m := make(map[string]string) + for _, e := range envs { + if e.ValueFrom == nil { + m[e.Name] = e.Value + } + } + return m +} + +// Suppress unused import warnings +var _ = fmt.Sprintf +var _ = strings.Contains diff --git a/pkg/ai/reconciler.go b/pkg/ai/reconciler.go index 9e2a803..8c7814f 100644 --- a/pkg/ai/reconciler.go +++ b/pkg/ai/reconciler.go @@ -3,6 +3,7 @@ package ai_platform import ( "context" "fmt" + "os" aiApi "github.com/splunk/splunk-ai-operator/api/v1" "github.com/splunk/splunk-ai-operator/pkg/ai/raybuilder" @@ -211,7 +212,7 @@ func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiA taskObjectStorage := platform.Spec.ObjectStorage // Don't append feature name - just pass the bucket path directly // taskObjectStorage.Path is already set from platform.Spec.ObjectStorage - return &aiApi.AIService{ + svc := &aiApi.AIService{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: platform.Namespace, @@ -239,11 +240,18 @@ func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiA Port: 8080, Path: "/metrics", }, - MTLS: platform.Spec.MTLS, - // Propagate imagePullSecrets from AIPlatform to AIService + MTLS: platform.Spec.MTLS, ImagePullSecrets: platform.Spec.Images.ImagePullSecrets, }, } + + // SAIA v2: populate from operator env var if set + if v2Image := os.Getenv("RELATED_IMAGE_SAIA_API_V2"); v2Image != "" { + svc.Spec.V2 = aiApi.SAIAv2Config{Image: v2Image, Replicas: 1} + svc.Spec.V2Worker = aiApi.SAIAWorkerConfig{Replicas: 1} + } + + return svc } // CheckAIServiceStatus verifies that all AIService children have successful conditions. 
diff --git a/pkg/ai/weaviate.go b/pkg/ai/weaviate.go index 8189c1f..d10eb39 100644 --- a/pkg/ai/weaviate.go +++ b/pkg/ai/weaviate.go @@ -196,16 +196,18 @@ func (r *AIPlatformReconciler) ReconcileWeaviateDatabase(ctx context.Context, in Image: weaviateImage, ImagePullPolicy: corev1.PullIfNotPresent, Resources: resources, - VolumeMounts: volumeMounts, - Ports: []corev1.ContainerPort{{ - Name: "http", - ContainerPort: 8080, - }}, + VolumeMounts: volumeMounts, + Ports: []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8080}, + {Name: "grpc", ContainerPort: 50051}, + }, Env: []corev1.EnvVar{ - { - Name: "PERSISTENCE_DATA_PATH", - Value: "/var/lib/weaviate", - }, + {Name: "PERSISTENCE_DATA_PATH", Value: "/var/lib/weaviate"}, + // gRPC server is enabled by default in Weaviate v1.19+. Setting GRPC_PORT + // explicitly matches the Splunk vector-db reference chart and makes the + // port contract explicit for the 50051 containerPort/service declared below. + // Required by SAIA v2 which uses the Weaviate python v4 gRPC client. 
+ {Name: "GRPC_PORT", Value: "50051"}, }, }} return nil @@ -230,11 +232,18 @@ func (r *AIPlatformReconciler) ReconcileWeaviateDatabase(ctx context.Context, in } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error { svc.Spec.Selector = labels - svc.Spec.Ports = []corev1.ServicePort{{ - Name: "http", - Port: 80, - TargetPort: intstr.FromInt(8080), - }} + svc.Spec.Ports = []corev1.ServicePort{ + { + Name: "http", + Port: 80, + TargetPort: intstr.FromInt(8080), + }, + { + Name: "grpc", + Port: 50051, + TargetPort: intstr.FromInt(50051), + }, + } return nil }); err != nil { return err diff --git a/pkg/ai/weaviate_test.go b/pkg/ai/weaviate_test.go index a397d6a..6f60038 100644 --- a/pkg/ai/weaviate_test.go +++ b/pkg/ai/weaviate_test.go @@ -150,11 +150,33 @@ func TestReconcileWeaviateDatabase(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "weaviate:test", sts.Spec.Template.Spec.Containers[0].Image) - // Verify Service created + // Verify container exposes both http (8080) and grpc (50051) ports + containerPorts := sts.Spec.Template.Spec.Containers[0].Ports + portNames := map[string]int32{} + for _, p := range containerPorts { + portNames[p.Name] = p.ContainerPort + } + assert.Equal(t, int32(8080), portNames["http"]) + assert.Equal(t, int32(50051), portNames["grpc"]) + + // Verify container has GRPC_PORT env var (gRPC server is enabled by default in + // Weaviate v1.19+, GRPC_PORT is set explicitly to make the port contract clear). 
+ envMap := map[string]string{} + for _, e := range sts.Spec.Template.Spec.Containers[0].Env { + envMap[e.Name] = e.Value + } + assert.Equal(t, "50051", envMap["GRPC_PORT"]) + + // Verify Service created with both http and grpc ports svc := &corev1.Service{} err = fc.Get(ctx, types.NamespacedName{Name: platformName + "-weaviate", Namespace: ns}, svc) assert.NoError(t, err) - assert.Equal(t, int32(80), svc.Spec.Ports[0].Port) + svcPorts := map[string]int32{} + for _, p := range svc.Spec.Ports { + svcPorts[p.Name] = p.Port + } + assert.Equal(t, int32(80), svcPorts["http"]) + assert.Equal(t, int32(50051), svcPorts["grpc"]) }) } diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 70d48d1..316b183 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -4866,23 +4866,36 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure + type: string region: description: Region of the remote storage volume. 
Required for S3, optional for other providers minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -4930,6 +4943,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). + properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. 
+ properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database @@ -5529,20 +5688,24 @@ spec: - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-006 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9 + - name: RELATED_IMAGE_SAIA_API_V2 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-003 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9 - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR value: docker.io/otel/opentelemetry-collector-contrib:0.122.1 + - name: RELATED_IMAGE_NGINX + value: docker.io/library/nginx:1.27-alpine - name: MODEL_VERSION value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.9 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.20 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index a3a2d12..23b1a28 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -16,7 +16,7 @@ cluster: # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes # sshKeyPath: 
/Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -26,15 +26,15 @@ nodes: existingIPs: controllers: - # - 3.144.14.96 # CHANGE THIS: Your controller server IP - - 10.0.34.164 + - 3.144.14.96 # CHANGE THIS: Your controller server IP + # - 10.0.34.164 workers: - # - 3.14.134.16 # CHANGE THIS: CPU worker 1 - # - 13.59.78.115 # CHANGE THIS: GPU worker 1 - # - 3.15.20.136 # CHANGE THIS: GPU worker 2 - - 10.0.34.168 - - 10.0.34.142 - - 10.0.34.153 + - 3.14.134.16 # CHANGE THIS: CPU worker 1 + - 13.59.78.115 # CHANGE THIS: GPU worker 1 + - 3.15.20.136 # CHANGE THIS: GPU worker 2 + # - 10.0.34.168 + # - 10.0.34.142 + # - 10.0.34.153 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). 
@@ -61,7 +61,7 @@ images: operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.10" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.20" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" @@ -83,10 +83,14 @@ images: saia: # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" - apiImage: "ml-platform/saia/saia-api:build-006" + # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 + apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 + + apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" + # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 + dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" @@ -94,6 +98,12 @@ images: otelCollector: image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # Reverse proxy used by the SAIA reconciler to route v1 / v2 requests by + # path. Consumed via RELATED_IMAGE_NGINX. Point this at an internal mirror + # for airgapped clusters. 
+ nginx: + image: "docker.io/library/nginx:1.27-alpine" + # ---------- Operator Versions ---------- operators: ray: @@ -123,8 +133,8 @@ splunk: # ---------- AI Platform Configuration ---------- aiPlatform: name: "splunk-ai-stack" - # defaultAcceleratorType: "L40S" - defaultAcceleratorType: "H100" + defaultAcceleratorType: "L40S" + # defaultAcceleratorType: "H100" workerGroupConfig: imageRegistry: "" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 4b161e6..85eedde 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -158,9 +158,11 @@ load_config() { RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$CONFIG_FILE" 2>/dev/null || echo "")" WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$CONFIG_FILE" 2>/dev/null || echo "")" SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image' "$CONFIG_FILE" 2>/dev/null || echo "")" SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$CONFIG_FILE" 2>/dev/null || echo "")" FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$CONFIG_FILE" 2>/dev/null || echo "")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + NGINX_IMAGE="$(yq eval '.images.nginx.image' "$CONFIG_FILE" 2>/dev/null || echo "")" # Operator versions MODEL_VERSION="$(yq eval '.operators.ray.modelVersion // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" @@ -257,6 +259,9 @@ validate_image_config() { if [[ -z "$SAIA_API_IMAGE" || "$SAIA_API_IMAGE" == "null" ]]; then err "REQUIRED: images.saia.apiImage must be specified in k0s-cluster-config.yaml" fi + if [[ -z "$SAIA_API_V2_IMAGE" || "$SAIA_API_V2_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.apiV2Image must be specified in k0s-cluster-config.yaml" + fi if [[ -z "$SAIA_DATALOADER_IMAGE" || "$SAIA_DATALOADER_IMAGE" == "null" 
]]; then err "REQUIRED: images.saia.dataLoaderImage must be specified in k0s-cluster-config.yaml" fi @@ -272,6 +277,10 @@ validate_image_config() { OTEL_COLLECTOR_IMAGE="otel/opentelemetry-collector-contrib:0.122.1" log "Using default OpenTelemetry Collector image: $OTEL_COLLECTOR_IMAGE" fi + if [[ -z "$NGINX_IMAGE" || "$NGINX_IMAGE" == "null" ]]; then + NGINX_IMAGE="docker.io/library/nginx:1.27-alpine" + log "Using default Nginx image: $NGINX_IMAGE" + fi if [[ -z "$MODEL_VERSION" || "$MODEL_VERSION" == "null" ]]; then MODEL_VERSION="v0.3.14-36-g1549f5a" log "Using default Model version: $MODEL_VERSION" @@ -307,42 +316,61 @@ configure_images() { local ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE") local weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE") local saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE") + local saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE") local saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE") local fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE") local otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE") + # Nginx is an upstream image; don't rewrite it to the ECR registry unless the + # user explicitly put it under their registry. build_image_url already + # preserves a fully-qualified image path, so `docker.io/library/nginx:...` + # stays intact and `nginx:1.27-alpine` gets prefixed with $IMAGE_REGISTRY. 
+ local nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE") local ray_head_escaped=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g') local ray_worker_escaped=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g') local weaviate_escaped=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g') local saia_api_escaped=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g') + local saia_api_v2_escaped=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g') local saia_dataloader_escaped=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g') local fluent_bit_escaped=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g') local otel_collector_escaped=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g') + local nginx_escaped=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g') local operator_escaped=$(echo "$operator_full" | sed 's/[\/&]/\\&/g') - SEDOPTION="-i" + # BSD (macOS) sed requires an explicit backup-suffix arg after -i. + # GNU (Linux) sed accepts -i without the suffix arg. + # Use a bash array so the empty-string "" is preserved as a distinct argv entry + # on macOS; without this, unquoted $SEDOPTION word-splitting created stray + # "filename''" backup files next to each artifact. 
+ local SED_INPLACE if [[ "$OSTYPE" == "darwin"* ]]; then - SEDOPTION="-i ''" + SED_INPLACE=(sed -i "") + else + SED_INPLACE=(sed -i) fi - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_SAIA_API/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" - sed $SEDOPTION "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ 
s|value:.*|value: ${saia_dataloader_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_bit_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_collector_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_escaped}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_escaped}|I" "$SPLUNK_AI_FILE" log " ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full" log " ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full" log " ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full" log " ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full" + log " ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full" log " ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full" log " ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full" log " ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full" + log " ✓ Updated RELATED_IMAGE_NGINX: $nginx_full" log " ✓ Updated operator image: $operator_full" log " ✓ Updated MODEL_VERSION: $MODEL_VERSION" log " ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION" @@ -355,8 +383,8 @@ configure_images() { local splunk_escaped=$(echo "$splunk_full" | sed 's/[\/&]/\\&/g') local splunk_op_escaped=$(echo "$splunk_operator_full" | sed 's/[\/&]/\\&/g') - sed $SEDOPTION "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" "$SPLUNK_OPERATOR_FILE" - sed $SEDOPTION "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SPLUNK_ENTERPRISE/,/value:/ s|value:.*|value: ${splunk_escaped}|" 
"$SPLUNK_OPERATOR_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*operator.*|image: ${splunk_op_escaped}|I" "$SPLUNK_OPERATOR_FILE" log " ✓ Updated Splunk Enterprise image: $splunk_full" log " ✓ Updated Splunk Operator image: $splunk_operator_full" From 6c15036b80bed21315e40158b18aace04954ef68 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Mon, 20 Apr 2026 18:56:11 +0530 Subject: [PATCH 35/55] feat: add configurable aiPlatformScheme to AIServiceSpec Adds AIPlatformScheme field (http/https, default: http) so the URL scheme used when auto-generating PLATFORM_URL from AIPlatformRef is configurable, fixing airgapped clusters where the platform service is HTTP-only. --- api/v1/aiservice_types.go | 6 ++++++ pkg/ai/features/saia/impl.go | 8 ++++++-- pkg/ai/features/seca/seca.go | 6 +++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/api/v1/aiservice_types.go b/api/v1/aiservice_types.go index a675cb1..f4965e2 100644 --- a/api/v1/aiservice_types.go +++ b/api/v1/aiservice_types.go @@ -53,6 +53,12 @@ type AIServiceSpec struct { // +kubebuilder:validation:Optional AIPlatformUrl string `json:"aiPlatformUrl,omitempty"` + // AIPlatformScheme specifies the URL scheme for the AI Platform service ("http" or "https") + // +kubebuilder:validation:Optional + // +kubebuilder:default="http" + // +kubebuilder:validation:Enum=http;https + AIPlatformScheme string `json:"aiPlatformScheme,omitempty"` + // AIPlatformRef is a reference to the AIPlatform resource // +kubebuilder:validation:Required AIPlatformRef corev1.ObjectReference `json:"aiPlatformRef"` diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 9f1a301..50d7cef 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -149,8 +149,12 @@ func (r *SaiaReconciler) validateAIService( clusterDomain = "cluster.local" } if ai.Spec.AIPlatformUrl == "" { - ai.Spec.AIPlatformUrl = fmt.Sprintf("%s.%s.svc.%s:8000", - aiPlatform.Status.RayServiceName, 
ai.Spec.AIPlatformRef.Namespace, clusterDomain) + scheme := ai.Spec.AIPlatformScheme + if scheme == "" { + scheme = "http" + } + ai.Spec.AIPlatformUrl = fmt.Sprintf("%s://%s.%s.svc.%s:8000", + scheme, aiPlatform.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, clusterDomain) } if ai.Spec.VectorDbUrl == "" { ai.Spec.VectorDbUrl = fmt.Sprintf("%s.%s.svc.%s", diff --git a/pkg/ai/features/seca/seca.go b/pkg/ai/features/seca/seca.go index 04351bc..5915418 100644 --- a/pkg/ai/features/seca/seca.go +++ b/pkg/ai/features/seca/seca.go @@ -104,7 +104,11 @@ func (r *SecaReconciler) validateAIService(ctx context.Context, ai *aiv1.AIServi ); err != nil { return fmt.Errorf("fetching AIPlatform: %w", err) } - ai.Spec.AIPlatformUrl = fmt.Sprintf("%s.%s.svc.%s:8000", plat.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") + scheme := ai.Spec.AIPlatformScheme + if scheme == "" { + scheme = "http" + } + ai.Spec.AIPlatformUrl = fmt.Sprintf("%s://%s.%s.svc.%s:8000", scheme, plat.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") ai.Spec.VectorDbUrl = fmt.Sprintf("%s.%s.svc.%s", plat.Status.VectorDbServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") } if ai.Spec.AIPlatformRef.Name == "" && ai.Spec.AIPlatformUrl == "" { From 6870cb8c456e6e4c6c85adfb1762a8ae94e92b6c Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 17:43:50 +0530 Subject: [PATCH 36/55] feat(saia): expose public SAIA service via NodePort for Pattern-B v2 browser traffic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SAIA v2 chat UI in spl-copilot makes browser-direct fetch() calls (/query SSE, /conversations, /feedback, /admin/*) to the SAIA public Service. With the default ClusterIP, those calls fail for users on the customer's VPN because the cluster-internal DNS is not resolvable from their laptop. 
Enabling NodePort on the public nginx Service gives VPN users a single reachable URL for both Pattern A (Splunk Enterprise pod → saia-service) and Pattern B (browser → saia-service). - pkg/ai/reconciler.go: propagate AIPlatform.spec.serviceTemplate into each AIService via deep-copy; preserve direct `kubectl patch aiservice serviceTemplate` overrides across AIPlatform reconciles (same pattern as the existing Resources-preservation hack), so admins can flip exposure at runtime without the operator stomping their change. - pkg/ai/reconciler_test.go: lock in AIPlatform → AIService ServiceTemplate propagation + deep-copy isolation. - pkg/ai/features/saia/impl_test.go: parametrized matrix test for reconcileSAIAService covering all ServiceType branches (empty → ClusterIP, explicit ClusterIP, NodePort with/without explicit port, LoadBalancer, unknown type → safe ClusterIP default). - tools/cluster_setup/k0s-cluster-config.yaml: add `aiPlatform. serviceTemplate` block with NodePort 30080 default; bump operator to v0.1.21 which carries the propagation fix; bump saia v1/v2/data-loader images to v2.0.4-31-g9efe1fc (picks up saia-service S3 repositories for conversations, field counts, field descriptions from PR merged on Apr 20) and correct the apiV2Image / dataLoaderImage YAML keys that were accidentally all mapped to apiImage. - tools/cluster_setup/k0s_cluster_with_stack.sh: read aiPlatform. serviceTemplate.{type,nodePort} from the config and inject it into the rendered AIPlatform CR at install time. Emitting nothing for ClusterIP / omitted / null keeps the operator's safe ClusterIP fallback path working unchanged. Customer-facing: to enable external exposure, set `serviceTemplate. type: NodePort` (default). To disable, delete the block or set `type: ClusterIP`. Either path is treated identically. Aligns with ERD section 3.8 "SAIA V2 Agent Mode Support on AI Tier" decision A.2 assigning operator-side reverse-proxy + v1/v2/worker integration to the AI Tier Team. 
Made-with: Cursor --- pkg/ai/features/saia/impl_test.go | 110 ++++++++++++++++++ pkg/ai/reconciler.go | 21 +++- pkg/ai/reconciler_test.go | 42 +++++++ tools/cluster_setup/k0s-cluster-config.yaml | 45 ++++++- tools/cluster_setup/k0s_cluster_with_stack.sh | 20 ++++ 5 files changed, 232 insertions(+), 6 deletions(-) diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 6cb15bc..644193b 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -515,6 +515,116 @@ func Test_reconcileSAIAService_pointsToNginx(t *testing.T) { assert.Equal(t, int32(8080), svc.Spec.Ports[0].Port) } +func Test_reconcileSAIAService_ServiceTypeVariations(t *testing.T) { + // Lock in the contract that the customer's k0s-cluster-config.yaml can + // omit / empty / explicitly-set serviceTemplate and get the expected + // Service.Type. Without this test, a future refactor could silently break + // the "just omit the block = ClusterIP" escape hatch documented in + // tools/cluster_setup/k0s-cluster-config.yaml. 
+ scheme := buildFullTestScheme(t) + + cases := []struct { + name string + template corev1.Service + wantType corev1.ServiceType + wantNodePort int32 // 0 = don't check + }{ + { + name: "omitted/empty template → ClusterIP", + template: corev1.Service{}, // zero value, what yq-absent produces + wantType: corev1.ServiceTypeClusterIP, + }, + { + name: "explicit ClusterIP → ClusterIP", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeClusterIP}, + }, + wantType: corev1.ServiceTypeClusterIP, + }, + { + name: "NodePort without explicit port → NodePort auto-allocated", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeNodePort}, + }, + wantType: corev1.ServiceTypeNodePort, + // wantNodePort == 0 means we don't assert a specific value + }, + { + name: "NodePort with explicit 30080 → NodePort 30080", + template: corev1.Service{ + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + Ports: []corev1.ServicePort{ + {Name: "http", NodePort: 30080}, + }, + }, + }, + wantType: corev1.ServiceTypeNodePort, + wantNodePort: 30080, + }, + { + name: "LoadBalancer → LoadBalancer", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeLoadBalancer}, + }, + wantType: corev1.ServiceTypeLoadBalancer, + }, + { + name: "Unknown garbage type → ClusterIP (safe default)", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceType("Bogus")}, + }, + wantType: corev1.ServiceTypeClusterIP, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + ai := newTestAIService() + ai.Name = "svctype-" + sanitize(tc.name) + ai.Spec.ServiceTemplate = tc.template + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAService(context.Background(), ai)) + + svc := &corev1.Service{} + require.NoError(t, 
fakeClient.Get(context.Background(), + types.NamespacedName{Name: ai.Name + "-saia-service", Namespace: "default"}, svc)) + + assert.Equal(t, tc.wantType, svc.Spec.Type) + if tc.wantNodePort != 0 { + require.NotEmpty(t, svc.Spec.Ports) + assert.Equal(t, tc.wantNodePort, svc.Spec.Ports[0].NodePort) + } + }) + } +} + +// sanitize turns a free-form subtest name into a valid k8s resource name. +func sanitize(s string) string { + s = strings.ToLower(s) + out := make([]byte, 0, len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + switch { + case c >= 'a' && c <= 'z', c >= '0' && c <= '9': + out = append(out, c) + default: + if len(out) > 0 && out[len(out)-1] != '-' { + out = append(out, '-') + } + } + } + // Trim trailing hyphen + for len(out) > 0 && out[len(out)-1] == '-' { + out = out[:len(out)-1] + } + return string(out) +} + func Test_buildSAIABaseEnv(t *testing.T) { ai := newTestAIService() env := buildSAIABaseEnv(ai) diff --git a/pkg/ai/reconciler.go b/pkg/ai/reconciler.go index 8c7814f..9db671b 100644 --- a/pkg/ai/reconciler.go +++ b/pkg/ai/reconciler.go @@ -134,6 +134,12 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * _, err := controllerutil.CreateOrUpdate(ctx, r.Client, &svc, func() error { // After client Get, svc holds the live AIService (empty on first create). preservedResources := svc.Spec.Resources + // Preserve any direct `kubectl patch aiservice` edit of ServiceTemplate. + // Without this, an admin who patches the public SAIA Service type + // (e.g. to NodePort for browser-direct v2 traffic) would see their + // change revert on the next AIPlatform reconcile, same footgun as + // Resources above. 
+ preservedServiceTemplate := svc.Spec.ServiceTemplate // Ensure ownership if err := controllerutil.SetControllerReference(platform, &svc, r.Scheme); err != nil { @@ -151,6 +157,12 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * if resourceRequirementsNonEmpty(preservedResources) { svc.Spec.Resources = preservedResources } + // If the admin already patched serviceTemplate (non-empty + // spec.type), keep that override. Otherwise fall through to the + // value buildAIService() just set from AIPlatform.spec. + if preservedServiceTemplate.Spec.Type != "" { + svc.Spec.ServiceTemplate = preservedServiceTemplate + } // Merge labels if svc.Labels == nil { @@ -240,7 +252,14 @@ func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiA Port: 8080, Path: "/metrics", }, - MTLS: platform.Spec.MTLS, + MTLS: platform.Spec.MTLS, + // Propagate public-exposure preference from AIPlatform. Customers deploy + // the higher-level AIPlatform CR, so any NodePort / LoadBalancer setting + // they configure at that level must flow down to the AIService. Without + // this copy, the spec lands on AIPlatform and is silently ignored. + // Deep-copy because corev1.Service is a value type with nested + // slices/maps; a shallow copy would share state across children. + ServiceTemplate: *platform.Spec.ServiceTemplate.DeepCopy(), ImagePullSecrets: platform.Spec.Images.ImagePullSecrets, }, } diff --git a/pkg/ai/reconciler_test.go b/pkg/ai/reconciler_test.go index d53ad90..dfbcc46 100644 --- a/pkg/ai/reconciler_test.go +++ b/pkg/ai/reconciler_test.go @@ -75,6 +75,48 @@ func TestBuildAIService_PopulatesExpectedFields(t *testing.T) { assert.Equal(t, "feature1", service.Labels["feature"]) } +func TestBuildAIService_PropagatesServiceTemplate(t *testing.T) { + // Customers configure public exposure (NodePort / LoadBalancer) at the + // AIPlatform level. 
Without propagation, the setting is silently dropped + // and SAIA is never reachable outside the cluster. This test locks in the + // contract that AIPlatform.spec.serviceTemplate flows into AIService. + scheme := buildTestScheme(t) + + platform := &aiApi.AIPlatform{ + ObjectMeta: metav1.ObjectMeta{Name: "my-ai", Namespace: "default"}, + Spec: aiApi.AIPlatformSpec{ + ObjectStorage: aiApi.ObjectStorageSpec{Path: "/data"}, + SplunkConfiguration: aiApi.SplunkConfigurationSpec{ + Endpoint: "splunk-endpoint", + }, + ServiceTemplate: corev1.Service{ + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + Ports: []corev1.ServicePort{ + {Name: "http", NodePort: 30080}, + }, + }, + }, + }, + } + feature := aiApi.FeatureSpec{Name: "saia", Version: "v1"} + r := &AIPlatformReconciler{Scheme: scheme} + + service := r.buildAIService(context.Background(), platform, feature, "my-ai-saia") + + assert.Equal(t, corev1.ServiceTypeNodePort, service.Spec.ServiceTemplate.Spec.Type, + "NodePort selection must propagate so customers can expose SAIA") + if assert.Len(t, service.Spec.ServiceTemplate.Spec.Ports, 1) { + assert.Equal(t, int32(30080), service.Spec.ServiceTemplate.Spec.Ports[0].NodePort, + "explicit NodePort must propagate") + } + + // Mutating the child spec must not affect the parent (deep-copy check). 
+ service.Spec.ServiceTemplate.Spec.Ports[0].NodePort = 31234 + assert.Equal(t, int32(30080), platform.Spec.ServiceTemplate.Spec.Ports[0].NodePort, + "buildAIService must deep-copy ServiceTemplate to avoid shared state") +} + func TestReconcileFeatures_CreatesNewAIService(t *testing.T) { ctx := context.Background() scheme := buildTestScheme(t) diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 23b1a28..77b099a 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -61,7 +61,12 @@ images: operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.20" + # v0.1.21 adds ServiceTemplate propagation (AIPlatform → AIService) so the + # aiPlatform.serviceTemplate block in this config actually takes effect. + # Build & push with: + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.21 \ + # make docker-build-amd64 docker-push + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.21" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" @@ -84,13 +89,16 @@ images: saia: # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 - apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 + # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 + apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + + # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 + apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 - # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: 
"ml-platform/saia/saia-data-loader:build-003" #saia v1.5 - dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 + # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 + dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" @@ -139,6 +147,33 @@ aiPlatform: workerGroupConfig: imageRegistry: "" + # ---------- SAIA public exposure (OPTIONAL) ---------- + # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) + # defaults to ClusterIP, meaning it is only reachable from inside the cluster. + # + # Two call patterns hit this Service: + # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) + # (B) End user's browser → saia-service (needs external exposure) + # + # Pattern B is used by the v2 chat UI (/query streaming, conversations, + # feedback, admin endpoints). Without external exposure the v2 chat UI + # breaks for users, even though v1 one-shot SPL features still work. + # + # To DISABLE external exposure (use ClusterIP only), either: + # * Delete / comment-out the entire `serviceTemplate:` block below, OR + # * Set `type: ClusterIP` explicitly. + # Either is treated identically — the installer skips emitting serviceTemplate + # into the AIPlatform CR and the operator falls through to the ClusterIP + # default in reconcileSAIAService(). + # + # To ENABLE external exposure for on-prem / airgap customers, NodePort is the + # recommended default: any k8s node IP + the configured nodePort yields a + # reachable endpoint from VPN-connected users. No cloud LB / cert-manager + # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. + serviceTemplate: + type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) + nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. 
+ features: - name: "saia" version: "1.1.0" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 85eedde..a01a30d 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -3034,6 +3034,25 @@ EOF ;; esac + # Build SAIA public-Service exposure block. + # The AIPlatform reconciler copies AIPlatform.spec.serviceTemplate down to + # each AIService; the SAIA feature reconciler uses it as the spec for the + # public saia-service. For on-prem / airgap customers, NodePort is the + # recommended default (no cloud LB, no cert-manager, browser on VPN can + # reach any node IP for Pattern-B v2 APIs like /query streaming). + local svc_template_yaml="" + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + if [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]]; then + local svc_node_port + svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + svc_template_yaml=" serviceTemplate:"$'\n'" spec:"$'\n'" type: ${svc_type}"$'\n' + if [[ -n "${svc_node_port}" && "${svc_node_port}" != "null" && "${svc_type}" == "NodePort" ]]; then + svc_template_yaml+=" ports:"$'\n'" - name: http"$'\n'" port: 8080"$'\n'" targetPort: 8080"$'\n'" nodePort: ${svc_node_port}"$'\n' + fi + log "SAIA public exposure: ${svc_type}${svc_node_port:+ (nodePort=${svc_node_port})}" + fi + # Build features YAML from config file (reads aiPlatform.features[] array) local features_yaml="" local feature_count @@ -3084,6 +3103,7 @@ ${image_pull_secrets} # Features from config (aiPlatform.features) features: ${features_yaml} +${svc_template_yaml} # Storage configuration storage: vectorDB: From 7db8e0111ca5042fed56e7b8e7f81d8aa3927f32 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 17:46:49 +0530 Subject: [PATCH 37/55] fix(saia): wire 
FIELD_DESCRIPTION S3 backend on v2 API and v2 worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SAIA v2 ingestion worker crashes on startup with ValueError: Unknown FIELD_DESCRIPTION_BACKEND: ''. Expected 'dynamodb', 'file', or 's3'. because FieldDescriptionRepositoryFactory.get() strictly requires one of the three supported backends (saia-service commit 9efe1fce added s3 alongside the existing dynamodb/file options). Per the Confluence ERD "ERD - AI Tier v0.2 - Bare Metal - SAIA 2.0" section 3.8.1.2 + decision A.3, AI Tier uses Option B — the s3 backend that reads the global field-descriptions JSON from the same S3-compatible object store (SeaweedFS/MinIO/CVFS) SAIA already uses for tenant data. Rationale: - dynamodb: ERD assumption 2.1 disallows DynamoDB in AI Tier. - file: requires saia-v2/Dockerfile to COPY dataset/, which the current image (v2.0.4-31-g9efe1fc) does NOT do. - s3: reuses existing object store + credentials; clean path. Changes: - buildV2ExtraEnv (pkg/ai/features/saia/impl.go) now adds three env vars on every v2 pod (API + worker + any future v2-image caller): FIELD_DESCRIPTION_BACKEND = "s3" FIELD_DESCRIPTION_S3_KEY = "field-descriptions/global-field-descriptions.json" AWS_ENDPOINT_URL = $TaskVolume.Endpoint (only if set) The v2 S3StorageAdapter reads the canonical AWS_ENDPOINT_URL name, so v1's S3COMPAT_OBJECT_STORE_ENDPOINT_URL (already in buildSAIABaseEnv) does not help the v2 code path. AWS_ENDPOINT_URL is omitted when TaskVolume.Endpoint is empty so boto3 falls back to the default AWS regional endpoint in a real cloud deployment. - Extended Test_reconcileSAIAv2Deployment and Test_reconcileSAIAv2Worker to assert all three env vars are propagated into the rendered Deployment pod spec. 
- Added Test_buildV2ExtraEnv_FieldDescriptionBackend with two subtests that pin the contract: AWS_ENDPOINT_URL is set when TaskVolume.Endpoint is non-empty, and omitted when empty (so the variable does not override the cloud default). - Bumped operator image tag in k0s-cluster-config.yaml to v0.1.22 so the next `k0s_cluster_with_stack.sh install` run picks up this fix. Prereq for the worker to actually start: the JSON object at FIELD_DESCRIPTION_S3_KEY must exist in S3_BUCKET before the worker's first poll cycle. The data-loader Job is the canonical bootstrap workload for this; the upload step itself is tracked as a separate saia-service change (ERD A.2 assigns that to the SAIA team). Made-with: Cursor --- pkg/ai/features/saia/impl.go | 37 +++++++++++++++- pkg/ai/features/saia/impl_test.go | 49 +++++++++++++++++++++ tools/cluster_setup/k0s-cluster-config.yaml | 9 ++-- 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 50d7cef..14552c0 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -666,8 +666,29 @@ func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { // buildV2ExtraEnv returns additional env vars needed by the SAIA v2 image. // v2 uses different env var names: VECTOR_DB_HOST (not VECTOR_DB_URL), // ML_PLATFORM_URL (not PLATFORM_URL), and needs vector DB TLS/auth disabled. +// +// SAIA V2 FieldDescription backend selection (required by both v2 API and v2 +// worker, else FieldDescriptionRepositoryFactory.get() raises ValueError at +// startup and the worker enters a restart loop). +// +// Per Confluence ERD "ERD - AI Tier v0.2 - Bare Metal - SAIA 2.0", section +// 3.8.1.2 + decision A.3: Option B (clean architecture) — use the new `s3` +// backend that reads the global field-descriptions JSON from the same +// S3-compatible object store (SeaweedFS/MinIO/CVFS) that SAIA already uses +// for tenant data. 
The alternatives: +// - `dynamodb` — ERD assumption 2.1 explicitly disallows DynamoDB in AI Tier. +// - `file` — requires the saia-v2 Dockerfile to `COPY dataset/`, which +// the current image (v2.0.4-31-g9efe1fc) does NOT do. +// +// The JSON object must be pre-uploaded to S3_BUCKET/FIELD_DESCRIPTION_S3_KEY +// before the worker runs; the data-loader Job is the canonical bootstrap step +// for this (see scripts/data_loader/ in saia-service). +// +// AWS_ENDPOINT_URL: the v2 S3StorageAdapter reads this canonical name. v1's +// S3_COMPAT_* env vars are already set in buildSAIABaseEnv but are NOT read +// by the v2 adapter, so we must set AWS_ENDPOINT_URL explicitly here. func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { - return []corev1.EnvVar{ + env := []corev1.EnvVar{ {Name: "ML_PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, {Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"}, {Name: "VECTOR_DB_GRPC_HOST", Value: ai.Spec.VectorDbUrl}, @@ -675,7 +696,21 @@ func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { {Name: "VECTOR_DB_HOST", Value: ai.Spec.VectorDbUrl}, {Name: "VECTOR_DB_PORT", Value: "80"}, {Name: "VECTOR_DB_SECURE", Value: "false"}, + // FieldDescription S3 backend (see doc-comment above). + {Name: "FIELD_DESCRIPTION_BACKEND", Value: "s3"}, + {Name: "FIELD_DESCRIPTION_S3_KEY", Value: "field-descriptions/global-field-descriptions.json"}, + } + // Only expose AWS_ENDPOINT_URL when the operator was configured with an + // explicit S3-compatible endpoint (SeaweedFS/MinIO). Omitting it lets the + // v2 adapter use the default AWS S3 endpoint when running in a real cloud + // deployment. + if ai.Spec.TaskVolume.Endpoint != "" { + env = append(env, corev1.EnvVar{ + Name: "AWS_ENDPOINT_URL", + Value: ai.Spec.TaskVolume.Endpoint, + }) } + return env } // buildSAIATLSEnv appends TLS-related env vars and returns updated env, volumes, and mounts. 
diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 644193b..60ada34 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -297,6 +297,18 @@ func Test_reconcileSAIAv2Deployment(t *testing.T) { assert.Equal(t, "http://platform:8000", envMap["PLATFORM_URL"]) assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) + + // SAIA V2 FieldDescription backend is REQUIRED (worker and API both call + // FieldDescriptionRepositoryFactory.get() which raises ValueError on empty + // backend). Per Confluence ERD section 3.8.1.2 decision A.3 we use the + // S3-compatible backend for AI Tier. + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + // AWS_ENDPOINT_URL is what the v2 S3StorageAdapter reads (vs v1's + // S3COMPAT_OBJECT_STORE_ENDPOINT_URL). Only set when the AIService has + // an explicit endpoint — e.g. for SeaweedFS/MinIO. + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) } func Test_reconcileSAIAv2Worker(t *testing.T) { @@ -325,6 +337,15 @@ func Test_reconcileSAIAv2Worker(t *testing.T) { assert.Equal(t, "/tmp/ingestion_worker_heartbeat", envMap["WORKER_HEARTBEAT_PATH"]) assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) + // SAIA V2 FieldDescription backend is REQUIRED — without this, the worker + // immediately raises ValueError and enters a restart loop. Ref Confluence + // ERD 3.8.1.2 + A.3: Option B (S3-compatible object store). These three + // vars are the minimum to make the worker bootstrap cleanly. 
+ assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) + // Liveness uses exec (heartbeat file check), not HTTP assert.NotNil(t, container.LivenessProbe.Exec) assert.Nil(t, container.LivenessProbe.HTTPGet) @@ -625,6 +646,34 @@ func sanitize(s string) string { return string(out) } +func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { + // Explicit AIService with seaweedfs-style endpoint → AWS_ENDPOINT_URL is set. + t.Run("with S3-compatible endpoint", func(t *testing.T) { + ai := newTestAIService() // already sets TaskVolume.Endpoint = "http://seaweedfs:8333" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) + }) + + // No explicit endpoint (= real AWS S3 deployment) → AWS_ENDPOINT_URL must + // be omitted so boto3 falls back to the default AWS regional endpoint. 
+ t.Run("without S3-compatible endpoint", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Endpoint = "" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + _, has := envMap["AWS_ENDPOINT_URL"] + assert.False(t, has, + "AWS_ENDPOINT_URL must be omitted when TaskVolume.Endpoint is empty (cloud S3 case)") + }) +} + func Test_buildSAIABaseEnv(t *testing.T) { ai := newTestAIService() env := buildSAIABaseEnv(ai) diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 77b099a..3f452f7 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -61,12 +61,15 @@ images: operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - # v0.1.21 adds ServiceTemplate propagation (AIPlatform → AIService) so the + # v0.1.21 added ServiceTemplate propagation (AIPlatform → AIService) so the # aiPlatform.serviceTemplate block in this config actually takes effect. + # v0.1.22 sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + + # AWS_ENDPOINT_URL on the v2 API and v2 worker pods so the worker no longer + # crash-loops on saia-v2 >= v2.0.4-13-g3b677604 (Confluence ERD 3.8.1.2). 
# Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.21 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.22 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.21" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.22" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" From cb76b29af6538a80b12c800d5c70db7c7dc27ee2 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 19:32:55 +0530 Subject: [PATCH 38/55] fix(saia): wire AWS_ACCESS_KEY_ID/SECRET on v2 pods for S3FieldDescription MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The saia-v2 S3FieldDescriptionRepository (see app/repositories/field_description/factory.py) constructs its own S3StorageAdapter which calls boto3 directly. boto3 only reads the canonical AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars — it does not read the S3COMPAT_OBJECT_STORE_ACCESS_KEY/_SECRET_KEY env vars that buildSAIABaseEnv already exposes for the v1 S3CompatStorageAdapter. Without these canonical names present, boto3 hits NoCredentialsError on the first load(). S3FieldDescriptionRepository catches that as StorageAdapterError, logs once, and returns an empty cache. Subsequent lookups silently return ([], []) — the worker stays up and processes tasks, but every sourcetype / knowledge-object ingestion produces blank field descriptions, degrading search quality without any visible failure. Source the creds from the same secret keys (s3_access_key / s3_secret_key) as the existing S3COMPAT_* plumbing (see raybuilder/builder.go and ai.Spec.TaskVolume.SecretRef), so there is a single source of truth for object-store auth. 
Only emit when TaskVolume.SecretRef is set — cloud deployments that rely on IAM roles must leave the env vars unset so boto3 can walk its default credential chain. Added two sub-tests under Test_buildV2ExtraEnv_FieldDescriptionBackend to cover both branches. Confirmed on k0s: worker log now shows "Found credentials in environment variables." and task ingestion succeeds. Made-with: Cursor --- pkg/ai/features/saia/impl.go | 38 +++++++++++++++++++++++--- pkg/ai/features/saia/impl_test.go | 45 +++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 14552c0..9bf85b4 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -684,9 +684,16 @@ func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { // before the worker runs; the data-loader Job is the canonical bootstrap step // for this (see scripts/data_loader/ in saia-service). // -// AWS_ENDPOINT_URL: the v2 S3StorageAdapter reads this canonical name. v1's -// S3_COMPAT_* env vars are already set in buildSAIABaseEnv but are NOT read -// by the v2 adapter, so we must set AWS_ENDPOINT_URL explicitly here. +// AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY: the v2 +// S3StorageAdapter (used by S3FieldDescriptionRepository, see +// app/repositories/field_description/factory.py) constructs boto3 directly and +// reads the canonical AWS_* names. v1's S3COMPAT_OBJECT_STORE_* env vars are +// already set in buildSAIABaseEnv but are NOT read by boto3, so without +// AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY the worker would silently fall back +// to no-credentials (NoCredentialsError caught by the repository as +// StorageAdapterError, returning an empty cache and degraded search results). +// Sourcing them from the same secret keys as the S3-compat creds keeps a +// single source of truth for object-store auth. 
func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { env := []corev1.EnvVar{ {Name: "ML_PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, @@ -710,6 +717,31 @@ func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { Value: ai.Spec.TaskVolume.Endpoint, }) } + // boto3-canonical credentials for the v2 S3StorageAdapter. Mirrors the + // S3COMPAT_OBJECT_STORE_ACCESS_KEY/_SECRET_KEY plumbing in buildSAIABaseEnv; + // see s3compat secret schema in raybuilder/builder.go and ai.Spec.TaskVolume.SecretRef. + if ai.Spec.TaskVolume.SecretRef != "" { + env = append(env, + corev1.EnvVar{ + Name: "AWS_ACCESS_KEY_ID", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, + Key: "s3_access_key", + }, + }, + }, + corev1.EnvVar{ + Name: "AWS_SECRET_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, + Key: "s3_secret_key", + }, + }, + }, + ) + } return env } diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 60ada34..413811b 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -672,6 +672,51 @@ func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { assert.False(t, has, "AWS_ENDPOINT_URL must be omitted when TaskVolume.Endpoint is empty (cloud S3 case)") }) + + // SecretRef present → AWS_ACCESS_KEY_ID/SECRET sourced from same keys as + // the S3COMPAT_* envs in buildSAIABaseEnv. Required so that the v2 + // S3StorageAdapter (used by S3FieldDescriptionRepository) can authenticate + // to SeaweedFS / MinIO. 
+ t.Run("AWS credentials sourced from SecretRef", func(t *testing.T) { + ai := newTestAIService() // already sets SecretRef = "s3-creds" + env := buildV2ExtraEnv(ai) + + var foundID, foundSecret bool + for _, e := range env { + if e.Name == "AWS_ACCESS_KEY_ID" { + foundID = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) + } + } + if e.Name == "AWS_SECRET_ACCESS_KEY" { + foundSecret = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_secret_key", e.ValueFrom.SecretKeyRef.Key) + } + } + } + assert.True(t, foundID, "AWS_ACCESS_KEY_ID must be present so boto3 can auth to S3-compat endpoint") + assert.True(t, foundSecret, "AWS_SECRET_ACCESS_KEY must be present so boto3 can auth to S3-compat endpoint") + }) + + // No SecretRef → AWS_* must be omitted (cloud deployments use IAM role, + // not env-var creds; setting empty values would otherwise mask the IAM + // chain inside boto3). 
+ t.Run("AWS credentials omitted when SecretRef empty", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.SecretRef = "" + env := buildV2ExtraEnv(ai) + + for _, e := range env { + assert.NotEqual(t, "AWS_ACCESS_KEY_ID", e.Name, + "AWS_ACCESS_KEY_ID must be omitted in cloud (IAM-role) case") + assert.NotEqual(t, "AWS_SECRET_ACCESS_KEY", e.Name, + "AWS_SECRET_ACCESS_KEY must be omitted in cloud (IAM-role) case") + } + }) } func Test_buildSAIABaseEnv(t *testing.T) { From 4656c4c6e699edbc53055b32671621f0dd57836c Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 19:33:56 +0530 Subject: [PATCH 39/55] fix(saia): set v2 worker RUN_TASKS_DELAY_S=10 to keep heartbeat fresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The saia-v2 IngestionWorker.run() only refreshes its heartbeat file at the top of process_next() (see app/workers/ingestion_worker.py:236), then sleeps settings.run_tasks_delay_s between iterations whenever the queue is empty OR the tenant lock is held by another worker. The Kubernetes liveness probe rejects heartbeats older than 120s, so any sleep value above ~100s will trigger SIGKILL → restart → CrashLoop the first time the worker can't acquire a lock. The previous value (600s) was copied from the v1 SAIA worker's APScheduler cron, where RUN_TASKS_DELAY_S controlled a completely different thing (the periodic batch-job interval — 10 min is fine there because v1 doesn't use the same file-heartbeat mechanism). v2 reuses the env name for a per-iteration sleep, so the values must NOT be shared. Set to 10s (matches saia-v2's own Settings.run_tasks_delay_s default), which keeps heartbeat age well under the 120s probe threshold even on a busy-lock path. Added a doc comment calling out the v1/v2 semantic drift so future changes don't re-introduce the regression.
Also rolls forward the bundled k0s-cluster-config.yaml operator image to v0.1.23, with changelog comments for all three fixes in this series (http:// prefix, AWS creds, RUN_TASKS_DELAY_S) so operators reading the file can trace which image introduced which behaviour. Verified on k0s: v2 worker stayed 1/1 Running with 0 restarts for 4+ minutes, heartbeat age measured at 6.1s (well under probe threshold). Made-with: Cursor --- pkg/ai/features/saia/impl.go | 11 ++++++++++- pkg/ai/features/saia/impl_test.go | 6 +++++- tools/cluster_setup/k0s-cluster-config.yaml | 20 +++++++++++++++----- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 9bf85b4..6148b69 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -1147,8 +1147,17 @@ func (r *SaiaReconciler) reconcileSAIAv2Worker( // Keep heartbeat path in sync with saia-v2's default (app/core/config.py: // worker_heartbeat_path = "/tmp/ingestion_worker_heartbeat"). The ingestion // worker writes a floating-point unix timestamp to this file every poll cycle. + // + // RUN_TASKS_DELAY_S (run_tasks_delay_s) is the per-iteration sleep in + // IngestionWorker.run() when the queue is empty OR the tenant lock is busy. + // The heartbeat is written only at the top of process_next(), so this sleep + // directly controls heartbeat cadence. The liveness probe rejects heartbeats + // older than 120s, so we MUST keep this well under that threshold — 10s + // matches the saia-v2 default (see Settings.run_tasks_delay_s). Do NOT + // conflate with the v1 worker APScheduler cron (which uses 600s for weekly + // jobs); v2 reuses the same env name for a different purpose. 
env = append(env, - corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "600"}, + corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "10"}, corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"}, corev1.EnvVar{Name: "WORKER_HEARTBEAT_PATH", Value: "/tmp/ingestion_worker_heartbeat"}, ) diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 413811b..aa08995 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -332,7 +332,11 @@ func Test_reconcileSAIAv2Worker(t *testing.T) { assert.Contains(t, container.Args[0], "app.workers.ingestion_worker") envMap := envToMap(container.Env) - assert.Equal(t, "600", envMap["RUN_TASKS_DELAY_S"]) + // RUN_TASKS_DELAY_S controls the v2 worker's poll sleep (saia-v2 + // IngestionWorker.run). The value MUST stay well under the liveness probe + // threshold (120s) because the heartbeat file is only refreshed at the top + // of each iteration. 10s matches saia-v2's own Settings default. + assert.Equal(t, "10", envMap["RUN_TASKS_DELAY_S"]) // Heartbeat path must match saia-v2's default (app/core/config.py). assert.Equal(t, "/tmp/ingestion_worker_heartbeat", envMap["WORKER_HEARTBEAT_PATH"]) assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 3f452f7..30302c5 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -63,13 +63,23 @@ images: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" # v0.1.21 added ServiceTemplate propagation (AIPlatform → AIService) so the # aiPlatform.serviceTemplate block in this config actually takes effect. - # v0.1.22 sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + - # AWS_ENDPOINT_URL on the v2 API and v2 worker pods so the worker no longer - # crash-loops on saia-v2 >= v2.0.4-13-g3b677604 (Confluence ERD 3.8.1.2). 
+ # v0.1.21 also sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + + # AWS_ENDPOINT_URL on the v2 API and v2 worker pods (Confluence ERD 3.8.1.2) + # and wires AWS_ACCESS_KEY_ID/SECRET from TaskVolume.SecretRef so boto3 can + # actually auth to the S3-compatible endpoint (otherwise field-description + # lookups silently return empty and sourcetype metadata is degraded). + # v0.1.22 prefixes http:// on the auto-generated AIPlatformUrl so v2's + # httpx/openai adapters can parse it — without the scheme they raise + # UnsupportedProtocol and every ingestion task fails. + # v0.1.23 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. + # The saia-v2 IngestionWorker only refreshes its heartbeat at the top of + # each poll iteration, and the liveness probe kills the pod at 120s stale. + # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time + # the tenant lock was busy or the queue was empty. # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.22 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.23 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.22" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.23" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" From 824af70265d2dbff1b20e2f45b16128d7ce20cde Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 20 Apr 2026 21:53:49 +0530 Subject: [PATCH 40/55] feat: update images --- tools/cluster_setup/artifacts.yaml | 8 ++++---- tools/cluster_setup/k0s-cluster-config.yaml | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 316b183..badfbe6 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ 
-5688,11 +5688,11 @@ spec: - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT @@ -5705,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.20 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 30302c5..be127bd 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -68,18 +68,15 @@ images: # and wires AWS_ACCESS_KEY_ID/SECRET from TaskVolume.SecretRef so boto3 can # actually auth to the S3-compatible endpoint (otherwise field-description # lookups silently return empty and sourcetype metadata is degraded). - # v0.1.22 prefixes http:// on the auto-generated AIPlatformUrl so v2's - # httpx/openai adapters can parse it — without the scheme they raise - # UnsupportedProtocol and every ingestion task fails. - # v0.1.23 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. 
+ # v0.1.24 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. # The saia-v2 IngestionWorker only refreshes its heartbeat at the top of # each poll iteration, and the liveness probe kills the pod at 120s stale. # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time # the tenant lock was busy or the queue was empty. # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.23 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.23" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" From 8fa59a54fb15cdcafc53bb5d01ab60c7d6e8a1de Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 21 Apr 2026 19:25:22 +0530 Subject: [PATCH 41/55] fix(saia): unblock airgap v2 query path via CORS preflight, authz re-enable, and Redis no-op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent root causes were breaking the airgap SAIA v2 flow end-to-end. Each is fixed at its actual origin so the operator's rendered manifests are usable out-of-the-box on k0s without post-install patches. * nginx CORS preflight short-circuit (pkg/ai/features/saia/impl.go): SAIA v2's TenantConversationKeyMiddleware rejects unauthenticated OPTIONS with 400 before FastAPI's CORSMiddleware can respond, so browsers block the real request with "No Access-Control-Allow-Origin header present". nginx now answers OPTIONS with 204 + CORS headers on both v1 and v2 locations. Access-Control-Allow-Headers is reflected from the request via a map{} so new client headers don't require nginx edits, and ACAO is emitted only on preflight to avoid duplicating FastAPI's real-response ACAO. 
* ENABLE_AUTHZ=true (pkg/ai/features/saia/impl.go defaults): The CMP interactive-token branch in SAIAAuthorizer is the only path that sets request.state.cmp_splunk_url, which AdminCapabilityAuthorizer needs to bridge a Splunk.interactive bearer into an EC-equivalent token. With "false", /admin/* returned 403 "Admin endpoints require an authenticated EC user token." Even in airgap CMP mode the value must be "true" — there is no authz-skip value that preserves the CMP bridge. * DISABLE_RESPONSES_API_REDIS=True on GptOss120b / GptOss20b (config/configs/applications.yaml): Pairs with ray-head/ray-worker-gpu:build-v2-002 which ships the NoOpOpenAIServingResponses implementation (ai-platform-models c1f9aef3). Without this flag the vLLM RedisOpenAIServingResponses constructor raises "Responses Redis URL not set" on every /v1/responses call, the SSE stream is empty, and the /query path fails with SearchStreamError. Airgap has no Redis; cloud stays on "False" with its in-namespace Redis StatefulSet. Supporting changes: * Bump operator to v0.1.25 and all SAIA/Ray images to build-v2-002 in artifacts.yaml and k0s-cluster-config.yaml. * Promote gopkg.in/yaml.v3 from indirect to direct in go.mod for the new raybuilder test. * Add regression tests: - Test_reconcileSAIAConfigMap_EnablesAuthzForCMPBridging - Test_reconcileSAIAConfigMap_PreservesUserOverride (user override honored) - Test_reconcileNginxConfigMap_CORSPreflight (both locations, exactly two ACAO instances, dynamic header reflection) - pkg/ai/raybuilder/configmap_apps_test.go for the Redis no-op flag. * Update AIPlatformUrl default test expectation to include the http:// scheme (matches the scheme-qualified URL that httpx/openai clients need). * Add k0s-cluster-config-h100.yaml for the H100 lab topology. 
Made-with: Cursor --- config/configs/applications.yaml | 14 ++ go.mod | 2 +- pkg/ai/features/saia/impl.go | 59 ++++- pkg/ai/features/saia/impl_test.go | 116 ++++++++- pkg/ai/raybuilder/configmap_apps_test.go | 163 +++++++++++++ tools/cluster_setup/artifacts.yaml | 10 +- .../k0s-cluster-config-h100.yaml | 228 ++++++++++++++++++ tools/cluster_setup/k0s-cluster-config.yaml | 31 ++- 8 files changed, 608 insertions(+), 15 deletions(-) create mode 100644 pkg/ai/raybuilder/configmap_apps_test.go create mode 100644 tools/cluster_setup/k0s-cluster-config-h100.yaml diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 5edc3e3..23a5274 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -94,6 +94,17 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn + # Disable the Redis-backed Responses API store (see ai-platform-models + # commit c1f9aef3: "feat: add a no-op store"). When True, the vLLM + # TextGen deployment constructs NoOpOpenAIServingResponses instead of + # RedisOpenAIServingResponses, so /v1/responses works without a Redis + # infra. Without this flag the deployment raises + # RuntimeError: Responses Redis URL not set + # on every request, which surfaces as an empty SSE stream and the SAIA + # v2 /query path fails with "An error occurred processing your request". + # Airgap k0s has no Redis; cloud sets this to "False" and wires + # RESPONSES_REDIS_ADDRESS to its in-namespace Redis StatefulSet. + DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: GptOss20b deployment_configs: @@ -167,6 +178,9 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn + # See GptOss120b above for rationale. Must be "True" in airgap (no + # Redis) so vLLM uses NoOpOpenAIServingResponses. 
+ DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: UaeLarge deployment_configs: diff --git a/go.mod b/go.mod index e5daf45..dce61a0 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/stretchr/testify v1.11.1 google.golang.org/api v0.235.0 gopkg.in/yaml.v2 v2.4.0 + gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.33.1 k8s.io/apiextensions-apiserver v0.33.1 k8s.io/apimachinery v0.33.1 @@ -128,7 +129,6 @@ require ( google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.33.1 // indirect k8s.io/component-base v0.33.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 6148b69..000ede4 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -322,13 +322,24 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap( cmName := fmt.Sprintf("%s-saia-config", ai.Name) // Defaults for static keys (override in user-managed CM if desired). + // + // ENABLE_AUTHZ MUST be "true" for SAIAAuthorizer.authorize() to run its + // CMP interactive-token validation branch, which is the ONLY code path + // that sets request.state.cmp_splunk_url on a successful token. The admin + // endpoints (AdminCapabilityAuthorizer) read that attribute to bridge the + // Splunk.interactive bearer into an EC-equivalent token. With "false" the + // main authorizer early-returns, the attribute is never set, and every + // /admin/* request fails with: + // 403 {"detail":"Admin endpoints require an authenticated EC user token."} + // There is no authorization-skip value that also preserves CMP bridging — + // the value IS "true" even in airgap CMP mode. 
defaults := map[string]string{ // previously hardcoded "SERVICE_NAME": "splunk_ai_assistant", "SERVICE_INTERNAL_NAME": "SAIA", "SPLUNK_ISSUERS": "https://splunk-splunk-standalone-standalone-service:8089", "SPLUNK_AI_ASSISTANT_SERVICE_CMP": "true", - "ENABLE_AUTHZ": "false", // FIXME remove when ready + "ENABLE_AUTHZ": "true", "FEATURE_CONFIG_FILE_LOCATION": "/etc/config/features_config.yaml", "PLATFORM_VERSION": "0.3.0", // TODO make configurable "SAIA_API_VERSION": "0.3.1", // TODO make configurable @@ -1276,6 +1287,15 @@ http { server %s:8000; } + # Reflect Access-Control-Request-Headers back on preflight. If the browser + # didn't send any (rare), fall back to a broad default. Safer than a + # hardcoded allowlist because spl-copilot (and future clients) may add + # custom headers like x-requested-with, x-csrf-token, x-splunk-*, etc. + map $http_access_control_request_headers $cors_allow_headers { + default $http_access_control_request_headers; + "" "authorization, content-type, x-ec-token, x-es-tenant-bearer, x-stack-url, x-stack-url-legacy, splunk-client, x-conversation-key, x-request-id, x-admin-preferences-filename, x-requested-with"; + } + server { listen 8080; @@ -1304,6 +1324,28 @@ http { # Word boundary via "/saia-api-v2/" (not "saia-api-v2" substring) # prevents accidental matches like /foo/saia-api-v2-legacy/. location ~ /saia-api-v2/ { + # CORS preflight short-circuit. Browser preflights are + # unauthenticated by spec; SAIA v2's TenantConversationKeyMiddleware + # rejects them with 400 before FastAPI's CORSMiddleware can respond, + # which makes the browser block the real request with "No + # Access-Control-Allow-Origin header present". Answer preflight + # here and never proxy OPTIONS upstream. + # + # IMPORTANT: Do NOT emit Access-Control-Allow-Origin on non-OPTIONS + # responses — FastAPI's CORSMiddleware already sets it on real + # responses. A second ACAO from nginx would produce duplicate + # "*, http://origin" values that browsers reject. 
+ if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin $http_origin always; + add_header Access-Control-Allow-Credentials true always; + add_header Access-Control-Allow-Methods 'GET, POST, PUT, DELETE, PATCH, OPTIONS' always; + add_header Access-Control-Allow-Headers $cors_allow_headers always; + add_header Access-Control-Max-Age 3600 always; + add_header Content-Length 0 always; + add_header Content-Type 'text/plain charset=UTF-8' always; + return 204; + } + proxy_pass http://saia_v2; proxy_http_version 1.1; proxy_set_header Host $host; @@ -1318,6 +1360,21 @@ http { # v1: everything else (including /health, /{tenant}/saia-api/v1alpha1/...) location / { + # Mirror the CORS preflight short-circuit for v1 routes; spl-copilot's + # Pattern B (direct browser fetch) may hit v1 admin endpoints too. Same + # rationale as v2: SAIA v1 middlewares authenticate on OPTIONS and would + # reject the preflight before CORS headers are emitted. + if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin $http_origin always; + add_header Access-Control-Allow-Credentials true always; + add_header Access-Control-Allow-Methods 'GET, POST, PUT, DELETE, PATCH, OPTIONS' always; + add_header Access-Control-Allow-Headers $cors_allow_headers always; + add_header Access-Control-Max-Age 3600 always; + add_header Content-Length 0 always; + add_header Content-Type 'text/plain charset=UTF-8' always; + return 204; + } + proxy_pass http://saia_v1; proxy_http_version 1.1; proxy_set_header Host $host; diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index aa08995..2066e87 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -118,7 +118,11 @@ func Test_validateAIService_defaults(t *testing.T) { assert.Equal(t, int32(1), ai.Spec.Replicas) assert.NotNil(t, ai.Spec.Resources.Requests) assert.NotNil(t, ai.Spec.Resources.Limits) - assert.Equal(t, "ray.ns.svc.cluster.local:8000", 
ai.Spec.AIPlatformUrl) + // AIPlatformUrl is built as "{scheme}://{service}.{namespace}.svc.{cluster-domain}:8000". + // When AIPlatformScheme is unset, the operator defaults to "http" (see + // validateAIService). This makes the URL usable directly by httpx/openai + // clients in SAIA v2 without a second string-concat step. + assert.Equal(t, "http://ray.ns.svc.cluster.local:8000", ai.Spec.AIPlatformUrl) assert.Equal(t, "vec.ns.svc.cluster.local", ai.Spec.VectorDbUrl) assert.Equal(t, int32(1), ai.Spec.V2.Replicas) assert.Equal(t, int32(1), ai.Spec.V2Worker.Replicas) @@ -270,6 +274,61 @@ func Test_reconcilePostInstallHook_SetsGRPCEnvForV2DataLoader(t *testing.T) { assert.Equal(t, "true", envMap["SPLUNK_AI_ASSISTANT_SERVICE_CMP"]) } +func Test_reconcileSAIAConfigMap_EnablesAuthzForCMPBridging(t *testing.T) { + // Regression: ENABLE_AUTHZ=true is REQUIRED for the SAIAAuthorizer's + // CMP interactive-token path to run. That path sets request.state.cmp_splunk_url, + // which AdminCapabilityAuthorizer needs to bridge a Splunk.interactive bearer + // into an EC-equivalent token. ENABLE_AUTHZ=false early-returns before the + // attribute is set, and /admin/* requests then fail with: + // 403 {"detail":"Admin endpoints require an authenticated EC user token."} + // Even in airgap CMP mode, ENABLE_AUTHZ must be "true" — there's no value + // that both skips authorization AND preserves the CMP bridge. 
+ scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-config", Namespace: "default"}, cm)) + + assert.Equal(t, "true", cm.Data["ENABLE_AUTHZ"], + "ENABLE_AUTHZ must default to 'true' so CMP interactive-token bridging works on /admin/* routes") + assert.Equal(t, "true", cm.Data["SPLUNK_AI_ASSISTANT_SERVICE_CMP"], + "CMP mode flag must be set alongside ENABLE_AUTHZ so the authorizer picks the interactive-token branch") +} + +func Test_reconcileSAIAConfigMap_PreservesUserOverride(t *testing.T) { + // If an operator explicitly disables authz on an existing ConfigMap + // (e.g. for development/debugging), our reconcile must NOT clobber that + // value back to the "true" default. The merge logic fills in missing or + // empty keys only. 
+ scheme := buildFullTestScheme(t) + ai := newTestAIService() + + existing := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-saia-config", + Namespace: "default", + }, + Data: map[string]string{"ENABLE_AUTHZ": "false"}, + } + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai, existing).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-config", Namespace: "default"}, cm)) + + assert.Equal(t, "false", cm.Data["ENABLE_AUTHZ"], + "user-set ENABLE_AUTHZ=false must be preserved across reconciles") +} + func Test_reconcileSAIAv2Deployment(t *testing.T) { scheme := buildFullTestScheme(t) ai := newTestAIService() @@ -404,6 +463,61 @@ func Test_reconcileNginxConfigMap(t *testing.T) { assert.Contains(t, conf, "deny all;") } +func Test_reconcileNginxConfigMap_CORSPreflight(t *testing.T) { + // Regression: saia-v2's TenantConversationKeyMiddleware rejects + // unauthenticated CORS preflight OPTIONS requests with 400 before + // FastAPI's CORSMiddleware can respond, causing browsers to block the + // subsequent real request with "No Access-Control-Allow-Origin header + // present". The nginx reverse proxy MUST short-circuit OPTIONS at the + // proxy layer and respond with permissive CORS headers so the browser + // accepts the preflight. 
See: + // saia-service/saia-v2/app/middleware/tenant_conversation_key.py + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileNginxConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-nginx-config", Namespace: "default"}, cm)) + + conf := cm.Data["nginx.conf"] + + // OPTIONS short-circuit must be present on BOTH v1 (/) and v2 + // (/saia-api-v2/) locations. Without it, v1 admin routes (Pattern B + // direct browser fetch) would also fail the same way. + assert.Equal(t, 2, strings.Count(conf, "if ($request_method = OPTIONS)"), + "OPTIONS short-circuit must exist in both v1 and v2 location blocks") + assert.Contains(t, conf, "return 204", + "preflight must return 204 No Content") + + // 'map' directive dynamically reflects Access-Control-Request-Headers + // so any custom header the client sends is auto-allowed (avoids drift + // between nginx allowlist and client's evolving header set). + assert.Contains(t, conf, "map $http_access_control_request_headers $cors_allow_headers", + "must use 'map' to reflect Access-Control-Request-Headers back to client") + assert.Contains(t, conf, "add_header Access-Control-Allow-Headers $cors_allow_headers", + "preflight response must echo the requested headers via $cors_allow_headers") + + // ACAO must be reflected from Origin (not a hardcoded wildcard) so that + // Access-Control-Allow-Credentials=true is valid (browsers reject + // Allow-Origin="*" + Allow-Credentials=true). + assert.Contains(t, conf, "add_header Access-Control-Allow-Origin $http_origin", + "preflight ACAO must be reflected from Origin to support Allow-Credentials=true") + + // CRITICAL: ACAO must ONLY appear in OPTIONS branches. 
FastAPI's + // CORSMiddleware already sets ACAO on real responses; adding it again + // from nginx produces duplicate "*, http://origin" values that browsers + // reject ("The 'Access-Control-Allow-Origin' header contains multiple + // values '*, http://localhost:18000', but only one is allowed"). + assert.Equal(t, 2, strings.Count(conf, "add_header Access-Control-Allow-Origin"), + "ACAO must appear EXACTLY TWICE (once per OPTIONS branch). Adding it "+ + "on real responses duplicates FastAPI's header and breaks the browser.") +} + func Test_reconcileNginxDeployment(t *testing.T) { // Ensure no env override leaks from other tests in the package. os.Unsetenv("RELATED_IMAGE_NGINX") diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go new file mode 100644 index 0000000..07711c2 --- /dev/null +++ b/pkg/ai/raybuilder/configmap_apps_test.go @@ -0,0 +1,163 @@ +package raybuilder + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" +) + +// readApplicationsYAMLFromRepo locates the repo's +// config/configs/applications.yaml relative to the raybuilder test file. +// Keeping this a standalone helper (rather than using os.Getenv("APPLICATION_FILE")) +// lets the test run under `go test ./pkg/ai/raybuilder/...` without setting env. +func readApplicationsYAMLFromRepo(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + require.NoError(t, err) + // pkg/ai/raybuilder is three levels below the repo root. + repoRoot := filepath.Clean(filepath.Join(wd, "..", "..", "..")) + path := filepath.Join(repoRoot, "config", "configs", "applications.yaml") + raw, err := os.ReadFile(path) + require.NoError(t, err, "unable to read %s", path) + return string(raw) +} + +// maskGoTemplates replaces `{{ ... }}` tokens with a plain string so the +// result parses as valid YAML. 
applications.yaml interpolates Go template +// variables at runtime (see Builder.ReconcileApplicationsConfigMap) — during +// unit testing we never render them, so a syntactic mask is sufficient. +func maskGoTemplates(s string) string { + return regexp.MustCompile(`\{\{[^}]+\}\}`).ReplaceAllString(s, "PLACEHOLDER") +} + +// Test_ApplicationsYAML_DisableResponsesRedis is a regression test for the +// airgap k0s /query failure. Each vLLM TextGen Ray Serve deployment constructs +// a RedisOpenAIServingResponses on the first /v1/responses request; that +// class's __init__ raises RuntimeError if neither RESPONSES_REDIS_URL nor +// RESPONSES_REDIS_ADDRESS is configured, and the resulting empty SSE stream +// bubbles up to SAIA v2's search pipeline as "No generations found in stream" +// → SearchStreamError → "An error occurred processing your request" to the +// end user. See ai-platform-models commits c1f9aef3, da7628ea, b6ff101e. +// +// The fix (set DISABLE_RESPONSES_API_REDIS=True) switches vLLM to the new +// NoOpOpenAIServingResponses class that skips Redis entirely. It MUST be set +// on every app whose deployment_type is text_gen_model_deployment — that's +// the only Ray Serve deployment type that instantiates the Responses API +// serving class. Other deployment types (embedding_model_deployment, +// scoring_model_deployment, classification_model_deployment, custom_deployment) +// do not call /v1/responses and do not need this flag. +func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { + masked := maskGoTemplates(readApplicationsYAMLFromRepo(t)) + + // Parse just enough structure to traverse apps; keep the rest loose so + // unrelated config churn doesn't break this test. 
+ type envVars = map[string]string + type runtimeEnv struct { + EnvVars envVars `yaml:"env_vars"` + } + type args struct { + DeploymentType string `yaml:"deployment_type"` + } + type app struct { + Name string `yaml:"name"` + Args args `yaml:"args"` + RuntimeEnv runtimeEnv `yaml:"runtime_env"` + } + var doc struct { + Applications []app `yaml:"applications"` + } + require.NoError(t, yaml.Unmarshal([]byte(masked), &doc)) + require.NotEmpty(t, doc.Applications, "applications.yaml parsed as empty") + + // Collect the set of text-gen apps (must-set) and everything else (must-not-set). + var textGenApps []app + var otherApps []app + for _, a := range doc.Applications { + if a.Args.DeploymentType == "text_gen_model_deployment" { + textGenApps = append(textGenApps, a) + } else { + otherApps = append(otherApps, a) + } + } + + // We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If + // this count changes, someone added a new text-gen model; they MUST also + // add DISABLE_RESPONSES_API_REDIS to the new app. + require.Len(t, textGenApps, 2, + "expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+ + "found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.", + len(textGenApps)) + + for _, a := range textGenApps { + assert.Equal(t, "True", a.RuntimeEnv.EnvVars["DISABLE_RESPONSES_API_REDIS"], + "app %q (deployment_type=text_gen_model_deployment) must set "+ + "DISABLE_RESPONSES_API_REDIS=\"True\" in runtime_env.env_vars. Without this, "+ + "vLLM's RedisOpenAIServingResponses constructor raises "+ + "RuntimeError('Responses Redis URL not set') and /v1/responses calls fail "+ + "(surfaces to SAIA v2 /query as \"An error occurred processing your request\").", + a.Name) + } + + // Sanity: assert the two canonical app names we expect. Keeps the test + // readable if someone renames an app and forgets to re-check this. 
+ var names []string + for _, a := range textGenApps { + names = append(names, a.Name) + } + assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names, + "unexpected set of text_gen_model_deployment apps: %v", names) + + // Hygiene check: non-text-gen apps should NOT carry this env (it's a + // no-op for them and misleading if present). + for _, a := range otherApps { + if _, ok := a.RuntimeEnv.EnvVars["DISABLE_RESPONSES_API_REDIS"]; ok { + t.Errorf("app %q (deployment_type=%q) should NOT set "+ + "DISABLE_RESPONSES_API_REDIS — it's only read by "+ + "vllm_text_gen_model.VLLMTextGenModel.", a.Name, a.Args.DeploymentType) + } + } +} + +// Test_ApplicationsYAML_IsWellFormed is a tiny smoke test that the bundled +// applications.yaml parses correctly after Go-template masking. Catches +// accidental structural breakage (e.g. un-indented env_vars, stray tabs). +func Test_ApplicationsYAML_IsWellFormed(t *testing.T) { + masked := maskGoTemplates(readApplicationsYAMLFromRepo(t)) + var raw map[string]any + require.NoError(t, yaml.Unmarshal([]byte(masked), &raw), + "applications.yaml does not parse as YAML (after masking Go templates)") + apps, ok := raw["applications"].([]any) + require.True(t, ok, "applications.yaml missing top-level 'applications' list") + require.NotEmpty(t, apps, "applications list is empty") + + // Spot-check: every app entry must have a 'name' key. 'args' is optional + // — the Entrypoint router app omits it, model apps carry deployment config + // there. + for i, a := range apps { + m, ok := a.(map[string]any) + require.True(t, ok, "app at index %d is not a mapping", i) + _, hasName := m["name"] + require.True(t, hasName, + "app at index %d missing 'name': keys=%v", i, keys(m)) + } +} + +func keys(m map[string]any) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + // Stable-ish for readability in failure messages. 
+ for i := 1; i < len(out); i++ { + for j := i; j > 0 && strings.Compare(out[j], out[j-1]) < 0; j-- { + out[j], out[j-1] = out[j-1], out[j] + } + } + return out +} diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index badfbe6..69c3664 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5682,15 +5682,15 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:9a24502-ai-tier + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-002 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:9a24502-ai-tier + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-002 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-002 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-002 - name: RELATED_IMAGE_POST_INSTALL_HOOK value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc - name: SPLUNK_METRICS_INDEX_NAME @@ -5705,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/k0s-cluster-config-h100.yaml 
b/tools/cluster_setup/k0s-cluster-config-h100.yaml new file mode 100644 index 0000000..a649309 --- /dev/null +++ b/tools/cluster_setup/k0s-cluster-config-h100.yaml @@ -0,0 +1,228 @@ +# =================================================================== +# k0s Cluster Configuration for Splunk AI Platform +# =================================================================== +# Mirrors cluster-config.yaml (EKS) but adapted for k0s on bare-metal / EC2. +# +# Quick Start: +# 1. Copy: cp k0s-cluster-config.yaml my-k0s-config.yaml +# 2. Edit: vi my-k0s-config.yaml +# 3. Replace all values marked with "CHANGE THIS" +# 4. Run: CONFIG_FILE=./my-k0s-config.yaml ./k0s_cluster_with_stack.sh install +# =================================================================== + +# ---------- Cluster Configuration ---------- +cluster: + name: airgap-cluster + # region: us-east-2 # Ignored for on-prem, but required in config + sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes + # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key + +# ---------- Node Configuration ---------- +nodes: + controllers: 1 + cpuWorkers: 1 # Not used with existingIPs + gpuWorkers: 2 # Not used with existingIPs + + existingIPs: + controllers: + # - 3.144.14.96 # CHANGE THIS: Your controller server IP + - 10.0.34.164 + workers: + # - 3.14.134.16 # CHANGE THIS: CPU worker 1 + # - 13.59.78.115 # CHANGE THIS: GPU worker 1 + # - 3.15.20.136 # CHANGE THIS: GPU worker 2 + - 10.0.34.168 + - 10.0.34.153 + - 10.0.34.140 + +# ---------- Storage Configuration ---------- +# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). 
+storage: + s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws + storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) + vectorDbSize: "50Gi" # VectorDB persistent volume size + + objectStore: + # type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) + type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + # endpoint: "http://13.59.216.105:9000" # MinIO port 9000. For SeaweedFS use port 8333. + endpoint: "http://3.144.157.201:8333" + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" + +# ---------- Container Images Configuration ---------- +images: + # Registry prefix - applied to images without a full registry path + registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + + operator: + # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" + # v0.1.21 added ServiceTemplate propagation (AIPlatform → AIService) so the + # aiPlatform.serviceTemplate block in this config actually takes effect. + # v0.1.21 also sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + + # AWS_ENDPOINT_URL on the v2 API and v2 worker pods (Confluence ERD 3.8.1.2) + # and wires AWS_ACCESS_KEY_ID/SECRET from TaskVolume.SecretRef so boto3 can + # actually auth to the S3-compatible endpoint (otherwise field-description + # lookups silently return empty and sourcetype metadata is degraded). + # v0.1.24 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. + # The saia-v2 IngestionWorker only refreshes its heartbeat at the top of + # each poll iteration, and the liveness probe kills the pod at 120s stale. + # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time + # the tenant lock was busy or the queue was empty. 
+ # v0.1.25 adds the nginx CORS preflight short-circuit (SAIA v2's + # TenantConversationKeyMiddleware rejects unauthenticated OPTIONS with + # 400 before CORSMiddleware can respond, so nginx has to answer the + # preflight itself with 204 + CORS headers) AND sets + # DISABLE_RESPONSES_API_REDIS=True on the GptOss120b and GptOss20b Ray + # Serve apps (pair with ray-head/ray-worker-gpu:build-v2-001 which + # includes the NoOpOpenAIServingResponses implementation — see + # ai-platform-models commits c1f9aef3, da7628ea, b6ff101e). Without the + # env var the vLLM RedisOpenAIServingResponses constructor raises + # RuntimeError('Responses Redis URL not set') on every /v1/responses + # call and the SAIA v2 /query path fails with SearchStreamError. + # Build & push with: + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 \ + # make docker-build-amd64 docker-push + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25" + + splunk: + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + + ray: + # headImage: "ml-platform/ray/ray-head:build-v1alpha1" + # headImage: "ml-platform/ray/ray-head:087e40e" + # headImage: "ml-platform/ray/ray-head:build-010" + # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main + # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes + headImage: "ml-platform/ray/ray-head:build-v2-002" # tony redis changes + fixes + + # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" + # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" + # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" + # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main + # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-002" # tony redis 
changes + fixes + + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" + + saia: + # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" + # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 + # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 + # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-002" #saia v2 + tony changes + + # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 + # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-002" #saia v2 + tony changes + + # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" + # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 + # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 + dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + + # Reverse proxy used by the SAIA reconciler to route v1 / v2 requests by + # path. Consumed via RELATED_IMAGE_NGINX. Point this at an internal mirror + # for airgapped clusters. 
+ nginx: + image: "docker.io/library/nginx:1.27-alpine" + +# ---------- Operator Versions ---------- +operators: + ray: + version: "v1.2.2" + modelVersion: "v0.3.14-36-g1549f5a" + rayVersion: "2.53.0" + + certManager: + installCRDs: true + + nvidia: + devicePluginVersion: "v0.17.3" + +# ---------- Kubernetes ---------- +kubernetes: + namespace: ai-platform + +# ---------- File Paths ---------- +files: + splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml" + aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml" + +# ---------- Splunk Configuration ---------- +splunk: + standaloneName: splunk-standalone + +# ---------- AI Platform Configuration ---------- +aiPlatform: + name: "splunk-ai-stack" + # defaultAcceleratorType: "L40S" + defaultAcceleratorType: "H100" + + workerGroupConfig: + imageRegistry: "" + + # ---------- SAIA public exposure (OPTIONAL) ---------- + # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) + # defaults to ClusterIP, meaning it is only reachable from inside the cluster. + # + # Two call patterns hit this Service: + # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) + # (B) End user's browser → saia-service (needs external exposure) + # + # Pattern B is used by the v2 chat UI (/query streaming, conversations, + # feedback, admin endpoints). Without external exposure the v2 chat UI + # breaks for users, even though v1 one-shot SPL features still work. + # + # To DISABLE external exposure (use ClusterIP only), either: + # * Delete / comment-out the entire `serviceTemplate:` block below, OR + # * Set `type: ClusterIP` explicitly. + # Either is treated identically — the installer skips emitting serviceTemplate + # into the AIPlatform CR and the operator falls through to the ClusterIP + # default in reconcileSAIAService(). 
+ # + # To ENABLE external exposure for on-prem / airgap customers, NodePort is the + # recommended default: any k8s node IP + the configured nodePort yields a + # reachable endpoint from VPN-connected users. No cloud LB / cert-manager + # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. + serviceTemplate: + type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) + nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. + + features: + - name: "saia" + version: "1.1.0" + + cpuScheduling: + nodeSelector: {} + tolerations: [] + + gpuScheduling: + nodeSelector: {} + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + +# ---------- Image Pull Secrets ---------- +imagePullSecrets: + secrets: + - ecr-registry-secret + autoCreateECR: true + +ecr: + account: "658391232643" + region: us-east-2 diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index be127bd..978d0d1 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -15,8 +15,8 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -73,10 +73,21 @@ images: # each poll iteration, and the liveness probe kills the pod at 120s stale. # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time # the tenant lock was busy or the queue was empty. 
+ # v0.1.25 adds the nginx CORS preflight short-circuit (SAIA v2's + # TenantConversationKeyMiddleware rejects unauthenticated OPTIONS with + # 400 before CORSMiddleware can respond, so nginx has to answer the + # preflight itself with 204 + CORS headers) AND sets + # DISABLE_RESPONSES_API_REDIS=True on the GptOss120b and GptOss20b Ray + # Serve apps (pair with ray-head/ray-worker-gpu:build-v2-001 which + # includes the NoOpOpenAIServingResponses implementation — see + # ai-platform-models commits c1f9aef3, da7628ea, b6ff101e). Without the + # env var the vLLM RedisOpenAIServingResponses constructor raises + # RuntimeError('Responses Redis URL not set') on every /v1/responses + # call and the SAIA v2 /query path fails with SearchStreamError. # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.24" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" @@ -86,12 +97,16 @@ images: # headImage: "ml-platform/ray/ray-head:build-v1alpha1" # headImage: "ml-platform/ray/ray-head:087e40e" # headImage: "ml-platform/ray/ray-head:build-010" - headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" + # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main + # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes + headImage: "ml-platform/ray/ray-head:build-v2-002" # tony redis changes + fixes # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" - workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" + # 
workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main + # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-002" # tony redis changes + fixes weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" @@ -100,10 +115,12 @@ images: # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 - apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-002" #saia v2 + tony changes # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 - apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-002" #saia v2 + tony changes # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 From e51baade0708dc86f759cddb43e512d0c6f6a74e Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Sat, 25 Apr 2026 00:00:55 +0530 Subject: [PATCH 42/55] fix: WEAVIATE_PLATFORM_URL + support for rhel 10 (untested) --- .gitignore | 1 + pkg/ai/features/saia/impl.go | 50 + pkg/ai/features/saia/impl_test.go | 75 +- .../k0s-cluster-config-h100.yaml | 62 +- tools/cluster_setup/k0s-cluster-config.yaml | 51 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 960 +++++++++++++++--- 6 files changed, 1016 insertions(+), 183 deletions(-) diff --git a/.gitignore b/.gitignore index ac1d882..1235ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ bin testbin/* examplecodebase/* Dockerfile.cross +tmp/* # Test binary, build with 
`go test -c` *.test diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 000ede4..a33b564 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -635,8 +635,13 @@ func (r *SaiaReconciler) reconcilePostInstallHook( // (v1 API, v1 worker, v2 API, v2 worker). Callers append pod-specific vars. func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { bucketName := extractBucketName(ai.Spec.TaskVolume.Path) + // WEAVIATE_PLATFORM_URL points directly at the native Weaviate service. + // When the value contains a scheme ("://"), the SAIA v1 pipeline uses it + // as-is (bypassing the cloud ML-Platform "/weaviate" path convention). + weaviatePlatformURL := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) env := []corev1.EnvVar{ {Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, + {Name: "WEAVIATE_PLATFORM_URL", Value: weaviatePlatformURL}, {Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl}, {Name: "S3_BUCKET", Value: bucketName}, } @@ -678,6 +683,13 @@ func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { // v2 uses different env var names: VECTOR_DB_HOST (not VECTOR_DB_URL), // ML_PLATFORM_URL (not PLATFORM_URL), and needs vector DB TLS/auth disabled. // +// This also switches the conversation store from the ephemeral filesystem +// default to the S3 backend added in saia-service commit 3d3756f3 (Tony, +// merged into ai-tier-v2.0 via 9efe1fce on 2026-04-20, shipped in image +// build-v2-002). See the CONVERSATION_STORE block below for the full +// rationale; without this the v2 API returns 404 on GET /conversations/ +// /items after every pod restart. +// // SAIA V2 FieldDescription backend selection (required by both v2 API and v2 // worker, else FieldDescriptionRepositoryFactory.get() raises ValueError at // startup and the worker enters a restart loop). 
@@ -706,6 +718,7 @@ func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { // Sourcing them from the same secret keys as the S3-compat creds keeps a // single source of truth for object-store auth. func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { + bucketName := extractBucketName(ai.Spec.TaskVolume.Path) env := []corev1.EnvVar{ {Name: "ML_PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, {Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"}, @@ -718,6 +731,41 @@ func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { {Name: "FIELD_DESCRIPTION_BACKEND", Value: "s3"}, {Name: "FIELD_DESCRIPTION_S3_KEY", Value: "field-descriptions/global-field-descriptions.json"}, } + // Conversation persistence backend. + // + // SAIA v2 defaults conversation_store to "filesystem" which writes to + // /home/splunk/.local_storage/conversations on the pod's ephemeral + // container overlay. Every v2 pod restart (worker crash-loop, operator + // reconfigure, Kuberay zero-downtime upgrade, node drain) wipes the full + // chat history and produces user-visible "Conversation ... not found" + // 404s on GET /conversations//items whenever the Splunk UI tries to + // re-hydrate a chat (incl. the saia_v2_audit_index_log_proxy flow). + // + // Tony's saia-service commits 3d3756f3 + 8e2a9f40 (merged into + // ai-tier-v2.0 via 9efe1fce on 2026-04-20, and present in image + // build-v2-002) added an S3ConversationStore that reuses the same + // S3-compatible object store already configured for TaskVolume + // (SeaweedFS / MinIO / CVFS / real AWS S3). Turning it on for the SAIA + // v2 API and v2 worker makes chat history survive pod restarts. 
+ // + // Activation contract (saia-v2/app/core/config.py::Settings): + // - CONVERSATION_STORE=s3 + // - CONVERSATION_S3_BUCKET= (validator raises ValueError at + // startup if CONVERSATION_STORE=s3 and this is empty) + // - AWS_ENDPOINT_URL, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY: + // already wired below for the FieldDescription S3 adapter and + // transparently reused by S3ConversationStore via boto3. + // + // We only emit these when we can derive a bucket name from + // TaskVolume.Path (the canonical source of truth for SAIA object + // storage). Leaving the defaults alone in the pathological "no path" + // case avoids a v2 pod startup crash-loop on misconfigured CRs. + if bucketName != "" { + env = append(env, + corev1.EnvVar{Name: "CONVERSATION_STORE", Value: "s3"}, + corev1.EnvVar{Name: "CONVERSATION_S3_BUCKET", Value: bucketName}, + ) + } // Only expose AWS_ENDPOINT_URL when the operator was configured with an // explicit S3-compatible endpoint (SeaweedFS/MinIO). Omitting it lets the // v2 adapter use the default AWS S3 endpoint when running in a real cloud @@ -863,9 +911,11 @@ func (r *SaiaReconciler) reconcileSAIADeployment( } // Base env: keep ONLY dynamic values here. 
+ weaviatePlatformURL := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) env := []corev1.EnvVar{ // Dynamic or runtime-derived values: {Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, + {Name: "WEAVIATE_PLATFORM_URL", Value: weaviatePlatformURL}, {Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl}, // SAIA uses /tasks subdirectory within its feature path // Extract just the bucket name from the full path (e.g., "s3://bucket-name" -> "bucket-name") diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 2066e87..458b7df 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -205,7 +205,7 @@ func newTestAIService() *aiv1.AIService { }, Spec: aiv1.AIServiceSpec{ AIPlatformUrl: "http://platform:8000", - VectorDbUrl: "weaviate:80", + VectorDbUrl: "weaviate.ai-platform.svc.cluster.local", Replicas: 1, ServiceAccountName: "test-sa", TaskVolume: aiv1.ObjectStorageSpec{ @@ -577,8 +577,8 @@ func Test_reconcileSAIAService_handlesAnnotationsWithoutPanic(t *testing.T) { scheme := buildFullTestScheme(t) ai := newTestAIService() ai.Annotations = map[string]string{ - "operator.splunk.com/example": "v1", - "kubectl.kubernetes.io/restartedAt": "should-be-skipped", + "operator.splunk.com/example": "v1", + "kubectl.kubernetes.io/restartedAt": "should-be-skipped", "kubectl.kubernetes.io/last-applied-configuration": "should-be-skipped", } @@ -837,13 +837,80 @@ func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { }) } +// Test_buildV2ExtraEnv_ConversationStore verifies the switch from the +// ephemeral "filesystem" default (which lives on the pod's container overlay +// and loses all chat history on restart) to the "s3" backend introduced in +// saia-service by Tony's commits 3d3756f3 / 8e2a9f40 (merged into +// ai-tier-v2.0 via 9efe1fce on 2026-04-20, shipped in image build-v2-002). 
+// +// Contract (from saia-v2/app/core/config.py::Settings and +// app/repositories/conversation/store_factory.py): +// - CONVERSATION_STORE=s3 selects S3ConversationStore +// - CONVERSATION_S3_BUCKET must be non-empty (validator raises +// ValueError at startup otherwise, crash-looping the v2 pod) +// - AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are +// reused from the FieldDescription S3 wiring below +func Test_buildV2ExtraEnv_ConversationStore(t *testing.T) { + t.Run("enables s3 backend with bucket extracted from TaskVolume.Path", func(t *testing.T) { + ai := newTestAIService() // TaskVolume.Path = "s3://test-bucket/saia" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["CONVERSATION_STORE"], + "CONVERSATION_STORE must be 's3' so S3ConversationStore is selected over the ephemeral filesystem default") + assert.Equal(t, "test-bucket", envMap["CONVERSATION_S3_BUCKET"], + "CONVERSATION_S3_BUCKET must be the extracted bucket name so SAIA v2's Settings validator passes at startup") + }) + + t.Run("handles all supported TaskVolume.Path prefixes", func(t *testing.T) { + cases := []struct { + path string + wantBucket string + }{ + {"s3://my-bucket/path", "my-bucket"}, + {"s3compat://bucket-name", "bucket-name"}, + {"minio://minio-bucket", "minio-bucket"}, + {"seaweedfs://sw-bucket/prefix", "sw-bucket"}, + {"gs://gcs-bucket", "gcs-bucket"}, + } + for _, tc := range cases { + t.Run(tc.path, func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Path = tc.path + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["CONVERSATION_STORE"]) + assert.Equal(t, tc.wantBucket, envMap["CONVERSATION_S3_BUCKET"]) + }) + } + }) + + // An empty TaskVolume.Path indicates a misconfigured CR. We must NOT + // emit CONVERSATION_STORE=s3 in that case, because CONVERSATION_S3_BUCKET + // would be empty and the v2 pod would crash-loop on the Pydantic + // validator. 
Leaving the defaults in place gives a clearer failure mode + // (ephemeral filesystem store) than a startup crash. + t.Run("omits conversation-store envs when TaskVolume.Path is empty", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Path = "" + envMap := envToMap(buildV2ExtraEnv(ai)) + + _, hasStore := envMap["CONVERSATION_STORE"] + _, hasBucket := envMap["CONVERSATION_S3_BUCKET"] + assert.False(t, hasStore, + "CONVERSATION_STORE must be omitted when no bucket can be derived, to avoid the SAIA v2 startup validator crashing the pod") + assert.False(t, hasBucket, + "CONVERSATION_S3_BUCKET must be omitted when no bucket can be derived") + }) +} + func Test_buildSAIABaseEnv(t *testing.T) { ai := newTestAIService() env := buildSAIABaseEnv(ai) envMap := envToMap(env) assert.Equal(t, "http://platform:8000", envMap["PLATFORM_URL"]) - assert.Equal(t, "weaviate:80", envMap["VECTOR_DB_URL"]) + assert.Equal(t, "http://weaviate.ai-platform.svc.cluster.local:80", envMap["WEAVIATE_PLATFORM_URL"]) + assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_URL"]) assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) assert.Equal(t, "http://seaweedfs:8333", envMap["S3COMPAT_OBJECT_STORE_ENDPOINT_URL"]) assert.Equal(t, "test-bucket", envMap["S3COMPAT_OBJECT_STORE_BUCKET"]) diff --git a/tools/cluster_setup/k0s-cluster-config-h100.yaml b/tools/cluster_setup/k0s-cluster-config-h100.yaml index a649309..ff60343 100644 --- a/tools/cluster_setup/k0s-cluster-config-h100.yaml +++ b/tools/cluster_setup/k0s-cluster-config-h100.yaml @@ -15,8 +15,7 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to 
SSH private key # ---------- Node Configuration ---------- nodes: @@ -26,15 +25,11 @@ nodes: existingIPs: controllers: - # - 3.144.14.96 # CHANGE THIS: Your controller server IP - - 10.0.34.164 + - 18.191.66.248 # CHANGE THIS: Your controller server IP workers: - # - 3.14.134.16 # CHANGE THIS: CPU worker 1 - # - 13.59.78.115 # CHANGE THIS: GPU worker 1 - # - 3.15.20.136 # CHANGE THIS: GPU worker 2 - - 10.0.34.168 - - 10.0.34.153 - - 10.0.34.140 + - 18.220.129.123 # CHANGE THIS: CPU worker 1 + - 3.15.10.138 # CHANGE THIS: GPU worker 1 + - 18.219.68.93 # CHANGE THIS: GPU worker 2 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). @@ -45,11 +40,20 @@ storage: vectorDbSize: "50Gi" # VectorDB persistent volume size objectStore: - # type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns + # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a + # missing key. The SAIA v2 S3ConversationStore (added by Tony in + # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002) + # calls GetObjectTagging on the conversation key *before* the first + # PutObject, so every brand-new draft: conversation hit a 502 from the + # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and + # hosts the same bucket name at :9000, so swapping the endpoint is + # sufficient. Fallback: flip back by setting type: "seaweedfs" and + # endpoint to :8333 (but note the 502 on every draft conversation). + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" - # endpoint: "http://13.59.216.105:9000" # MinIO port 9000. For SeaweedFS use port 8333. 
- endpoint: "http://3.144.157.201:8333" + # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) + endpoint: "http://13.59.216.105:9000" # MinIO (AWS-spec compliant GetObjectTagging semantics) auth: rootUser: "minioadmin" rootPassword: "minioadmin" @@ -84,10 +88,22 @@ images: # env var the vLLM RedisOpenAIServingResponses constructor raises # RuntimeError('Responses Redis URL not set') on every /v1/responses # call and the SAIA v2 /query path fails with SearchStreamError. + # v0.1.26 switches the SAIA v2 conversation store from the ephemeral + # "filesystem" default to "s3" (S3ConversationStore) by setting + # CONVERSATION_STORE=s3 and CONVERSATION_S3_BUCKET= on both the + # v2 API and v2 worker pods. Before this, chat history lived on the + # pod's container overlay and every pod restart produced spurious + # "Conversation not found" 404s on GET /conversations//items + # (the Splunk UI's saia_v2_audit_index_log_proxy flow surfaced them as + # "Failed to fetch SAIA V2 conversation items"). Reuses the same + # AWS_ENDPOINT_URL + AWS_ACCESS_KEY_ID/SECRET that v0.1.21 wired for + # the FieldDescription S3 adapter. Pairs with saia-service image + # build-v2-002 which ships Tony's S3ConversationStore (commits + # 3d3756f3, 8e2a9f40, merged via 9efe1fce into ai-tier-v2.0). 
# Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28" splunk: image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" @@ -99,14 +115,14 @@ images: # headImage: "ml-platform/ray/ray-head:build-010" # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-002" # tony redis changes + fixes + headImage: "ml-platform/ray/ray-head:build-v2-008" # tony redis changes + fixes # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-002" # tony redis changes + fixes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # tony redis changes + fixes weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" @@ -116,16 +132,16 @@ images: # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-002" #saia v2 + tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-009" #saia v2 + tony changes # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 # 
apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-002" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" #saia v2 + tony changes # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 - dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" #saia v2 + tony changes fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" @@ -168,8 +184,8 @@ splunk: # ---------- AI Platform Configuration ---------- aiPlatform: name: "splunk-ai-stack" - # defaultAcceleratorType: "L40S" - defaultAcceleratorType: "H100" + defaultAcceleratorType: "L40S" + # defaultAcceleratorType: "H100" workerGroupConfig: imageRegistry: "" diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 978d0d1..124373f 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -16,7 +16,6 @@ cluster: # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key - # sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif1.pem # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -27,14 +26,10 @@ nodes: existingIPs: controllers: - 3.144.14.96 # CHANGE THIS: Your controller server IP - # - 10.0.34.164 workers: - 3.14.134.16 # CHANGE THIS: CPU worker 1 - 13.59.78.115 # CHANGE THIS: GPU worker 1 - 3.15.20.136 # CHANGE THIS: GPU worker 2 - # - 10.0.34.168 - # - 10.0.34.142 - # - 10.0.34.153 # ---------- Storage Configuration ---------- # Object 
storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). @@ -45,11 +40,20 @@ storage: vectorDbSize: "50Gi" # VectorDB persistent volume size objectStore: - # type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - type: "seaweedfs" # aws | s3compat | minio | seaweedfs (external only for non-aws) + # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns + # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a + # missing key. The SAIA v2 S3ConversationStore (added by Tony in + # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002) + # calls GetObjectTagging on the conversation key *before* the first + # PutObject, so every brand-new draft: conversation hit a 502 from the + # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and + # hosts the same bucket name at :9000, so swapping the endpoint is + # sufficient. Fallback: flip back by setting type: "seaweedfs" and + # endpoint to :8333 (but note the 502 on every draft conversation). + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" - # endpoint: "http://13.59.216.105:9000" # MinIO port 9000. For SeaweedFS use port 8333. - endpoint: "http://3.144.157.201:8333" + # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) + endpoint: "http://13.59.216.105:9000" # MinIO (AWS-spec compliant GetObjectTagging semantics) auth: rootUser: "minioadmin" rootPassword: "minioadmin" @@ -84,13 +88,25 @@ images: # env var the vLLM RedisOpenAIServingResponses constructor raises # RuntimeError('Responses Redis URL not set') on every /v1/responses # call and the SAIA v2 /query path fails with SearchStreamError. 
+ # v0.1.26 switches the SAIA v2 conversation store from the ephemeral + # "filesystem" default to "s3" (S3ConversationStore) by setting + # CONVERSATION_STORE=s3 and CONVERSATION_S3_BUCKET= on both the + # v2 API and v2 worker pods. Before this, chat history lived on the + # pod's container overlay and every pod restart produced spurious + # "Conversation not found" 404s on GET /conversations//items + # (the Splunk UI's saia_v2_audit_index_log_proxy flow surfaced them as + # "Failed to fetch SAIA V2 conversation items"). Reuses the same + # AWS_ENDPOINT_URL + AWS_ACCESS_KEY_ID/SECRET that v0.1.21 wired for + # the FieldDescription S3 adapter. Pairs with saia-service image + # build-v2-002 which ships Tony's S3ConversationStore (commits + # 3d3756f3, 8e2a9f40, merged via 9efe1fce into ai-tier-v2.0). # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 \ + # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28" splunk: - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" #TODO this update operatorImage: "docker.io/splunk/splunk-operator:3.0.0" ray: @@ -99,14 +115,14 @@ images: # headImage: "ml-platform/ray/ray-head:build-010" # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-002" # tony redis changes + fixes + headImage: "ml-platform/ray/ray-head:build-v2-008" # tony redis changes + fixes # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # workerImage: 
"ml-platform/ray/ray-worker-gpu:087e40e" # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-002" # tony redis changes + fixes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # tony redis changes + fixes weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" @@ -116,16 +132,17 @@ images: # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-002" #saia v2 + tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-009" #saia v2 + tony changes # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-002" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" #saia v2 + tony changes # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 - dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" #saia v2 + tony changes + personalization fix fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index a01a30d..d797e4e 100755 --- 
a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -622,10 +622,7 @@ EOF # Create instances (arrays already declared globally at top of script) CONTROLLER_IPS=() - CONTROLLER_PRIVATE_IPS=() - CONTROLLER_PUBLIC_IPS=() WORKER_IPS=() - WORKER_PRIVATE_IPS=() ALL_INSTANCE_IDS=() # Add existing instances to tracking arrays @@ -746,33 +743,22 @@ EOF log "Waiting additional time for SSH to be fully ready..." sleep 60 - # Get IPs - collect BOTH public and private IPs - # Use public IPs for SSH from local machine, private IPs for k0s internal communication + # Get public IPs for all instances for id in "${ALL_INSTANCE_IDS[@]}"; do local role role=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ --query 'Reservations[0].Instances[0].Tags[?Key==`Role`].Value' --output text) - # Get public IP for SSH access from local machine local public_ip public_ip=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) - # Get private IP for k0s internal communication - local private_ip - private_ip=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ - --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text) - - # Use public IP for SSH, but store private IP for k0s config if [[ "${role}" == "controller" ]]; then - CONTROLLER_IPS+=("${public_ip}") # For SSH from local machine - CONTROLLER_PRIVATE_IPS+=("${private_ip}") # For k0s internal communication - CONTROLLER_PUBLIC_IPS+=("${public_ip}") # For kubectl access and certificates - log "Controller - Public IP: ${public_ip}, Private IP: ${private_ip}" + CONTROLLER_IPS+=("${public_ip}") + log "Controller - Public IP: ${public_ip}" else - WORKER_IPS+=("${public_ip}") # For SSH from local machine - WORKER_PRIVATE_IPS+=("${private_ip}") # For k0s internal communication - log "Worker - Public IP: ${public_ip}, Private IP: ${private_ip} (${role})" 
+ WORKER_IPS+=("${public_ip}") + log "Worker - Public IP: ${public_ip} (${role})" fi done @@ -786,7 +772,9 @@ prepare_nodes_for_k0s() { log "Preparing ${#node_ips[@]} node(s) for k0s (OS compatibility + binary)..." for node_ip in "${node_ips[@]}"; do log " Preparing node ${node_ip}..." - ssh_exec "${node_ip}" " + ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + ${SSH_KEY_PATH:+-i "${SSH_KEY_PATH}"} "${SSH_USER}@${node_ip}" \ + bash -s <<'REMOTE_SCRIPT' || warn " Preparation had issues on ${node_ip}" # Disable firewalld if active (blocks k0s ports: 6443, 10250, 8472, etc.) if systemctl is-active firewalld >/dev/null 2>&1; then echo 'Disabling firewalld...' @@ -794,11 +782,53 @@ prepare_nodes_for_k0s() { sudo systemctl disable firewalld fi - # Ensure iptables is available (RHEL 10+ ships only nftables) - if ! command -v iptables >/dev/null 2>&1; then - if command -v dnf >/dev/null 2>&1 && dnf list available iptables-nft 2>/dev/null | grep -q iptables-nft; then + # Load kernel modules required by Calico and kube-proxy. + # On RHEL 10 the legacy xtables extension modules (xt_conntrack, xt_comment, + # br_netfilter) are not built into the kernel at all, so they cannot be + # modprobed. However nf_conntrack (the core conntrack module) is built as a + # module (=m) and must be loaded — Calico's VXLAN dataplane and kube-proxy + # nftables mode both require it. + # overlay is needed by containerd for container overlay filesystems. + for mod in nf_conntrack overlay; do + if ! lsmod | grep -q "^${mod} "; then + sudo modprobe "${mod}" 2>/dev/null || echo "WARN: could not load kernel module ${mod}" + fi + done + # Persist across reboots + sudo mkdir -p /etc/modules-load.d + printf 'nf_conntrack\noverlay\n' | sudo tee /etc/modules-load.d/k0s.conf >/dev/null + + # Ensure iptables is available (RHEL 10+ ships only nftables). 
+ # Do NOT use 'command -v iptables' or 'iptables --version' as the guard: + # on RHEL 10 both can return exit 0 even when no binary exists (shell PATH + # edge case). Use rpm -q as the ground truth on RPM-based systems; fall + # back to testing whether the binary actually produces output. + _iptables_ok=false + if rpm -q iptables-nft >/dev/null 2>&1 || rpm -q iptables >/dev/null 2>&1; then + _iptables_ok=true + elif [ -n "$(iptables --version 2>/dev/null)" ]; then + _iptables_ok=true + fi + if ! $_iptables_ok; then + if command -v dnf >/dev/null 2>&1; then echo 'Installing iptables-nft...' - sudo dnf install -y iptables-nft >/dev/null 2>&1 + if sudo dnf install -y iptables-nft; then + echo 'iptables-nft installed successfully' + else + echo 'ERROR: dnf install iptables-nft failed — kube-proxy will fail to program ClusterIP NAT rules. Ensure AppStream repo is enabled.' >&2 + exit 1 + fi + elif command -v apt-get >/dev/null 2>&1; then + echo 'Installing iptables...' + if sudo apt-get install -y iptables; then + echo 'iptables installed successfully' + else + echo 'ERROR: apt-get install iptables failed — cannot proceed.' >&2 + exit 1 + fi + else + echo 'ERROR: No package manager found to install iptables — cannot proceed.' >&2 + exit 1 fi fi @@ -821,10 +851,114 @@ prepare_nodes_for_k0s() { if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s fi - " || warn " Preparation had issues on ${node_ip}" +REMOTE_SCRIPT done } +# ====== FIX KUBE-PROXY MODE (iptables → nftables) IF NEEDED ====== +# RHEL 10 kernel 6.12.0-124.38+ removed the legacy xtables extension modules +# (xt_conntrack, xt_comment, xt_nat, nft_compat). kube-proxy in "iptables" +# mode cannot program NAT rules without them, even with iptables-nft installed. +# Older RHEL 10 kernels (6.12.0-124.21) still ship xt_* modules so iptables +# mode works fine there — we only patch when the modules are truly absent. 
+# +# k0s manages kube-proxy via a stack manifest at +# /var/lib/k0s/manifests/kubeproxy/kube-proxy.yaml +# and continuously reconciles the ConfigMap from it. Patching the ConfigMap +# alone is overwritten within seconds. We must patch the on-disk manifest +# FIRST, then the ConfigMap, then bounce the pods. +fix_kube_proxy_mode_if_needed() { + local controller_ip="$1" + + # Check current mode from ConfigMap + local current_mode + current_mode=$(kubectl get cm kube-proxy -n kube-system \ + -o jsonpath='{.data.config\.conf}' 2>/dev/null \ + | grep '^mode:' | awk '{print $2}' | tr -d '"') + + if [[ "${current_mode}" != "iptables" ]]; then + log "kube-proxy mode is '${current_mode}' — no patch needed" + return 0 + fi + + # Check if the controller kernel actually has xt_conntrack + local has_xt + has_xt=$(ssh_exec "${controller_ip}" \ + "modprobe -n xt_conntrack 2>/dev/null && echo yes || echo no") || has_xt="no" + has_xt=$(echo "${has_xt}" | tr -d '[:space:]') + + if [[ "${has_xt}" == "yes" ]]; then + log "kube-proxy mode is iptables and kernel has xt_conntrack — no patch needed" + return 0 + fi + + log "kube-proxy is in iptables mode but kernel lacks xt_conntrack — patching to nftables..." + + # 1. Patch the on-disk manifest so k0s reconciliation preserves the change + ssh_exec "${controller_ip}" \ + "sudo sed -i 's/mode: \"iptables\"/mode: \"nftables\"/' /var/lib/k0s/manifests/kubeproxy/kube-proxy.yaml" \ + || warn " Could not patch on-disk kube-proxy manifest" + + # 2. Patch the ConfigMap for immediate effect + kubectl get cm kube-proxy -n kube-system -o json 2>/dev/null \ + | python3 -c " +import json, sys +cm = json.load(sys.stdin) +cm['data']['config.conf'] = cm['data']['config.conf'].replace('mode: \"iptables\"', 'mode: \"nftables\"') +print(json.dumps(cm)) +" | kubectl apply -f - 2>/dev/null + + # 3. Force-kill kube-proxy pods so replacements start with the new ConfigMap. 
+ # --wait=false is not enough: the old pods keep running with the stale + # iptables-mode config until graceful shutdown completes, and the new pods + # may mount the ConfigMap volume before k8s propagates the update. + log " Force-restarting kube-proxy pods to pick up nftables mode..." + kubectl delete pods -n kube-system -l k8s-app=kube-proxy \ + --force --grace-period=0 2>/dev/null || true + sleep 5 # give DaemonSet controller time to schedule replacements + + local deadline=$(( $(date +%s) + 90 )) + while true; do + local not_ready + not_ready=$(kubectl get pods -n kube-system -l k8s-app=kube-proxy \ + --no-headers 2>/dev/null | grep -cv '1/1.*Running') || not_ready=99 + [[ "${not_ready}" -eq 0 ]] && { log " ✓ kube-proxy pods Running in nftables mode"; break; } + [[ $(date +%s) -ge ${deadline} ]] && { warn " Timed out waiting for kube-proxy pods"; break; } + sleep 3 + done +} + +bounce_calico_if_needed() { + log "Checking if calico-node pods need a restart (install-cni CrashLoop)..." + local crashing + crashing=$(kubectl get pods -n kube-system -l k8s-app=calico-node \ + --no-headers 2>/dev/null | grep -cE 'Init:Error|CrashLoopBackOff|Init:CrashLoopBackOff') || crashing=0 + if [[ "${crashing}" -gt 0 ]]; then + log " Found ${crashing} crashing calico-node pod(s) — deleting so they restart with working kube-proxy..." + kubectl delete pods -n kube-system -l k8s-app=calico-node --wait=false 2>/dev/null || true + + local deadline=$(( $(date +%s) + 120 )) + log " Waiting up to 120s for calico-node pods to become Running..." 
+ while true; do + local not_running + not_running=$(kubectl get pods -n kube-system -l k8s-app=calico-node \ + --no-headers 2>/dev/null | grep -cv 'Running') || not_running=99 + if [[ "${not_running}" -eq 0 ]]; then + log " ✓ All calico-node pods are Running" + break + fi + if [[ $(date +%s) -ge ${deadline} ]]; then + warn " Timed out waiting for calico-node pods — current state:" + kubectl get pods -n kube-system -l k8s-app=calico-node 2>/dev/null || true + break + fi + sleep 5 + done + else + log " calico-node pods look healthy, no restart needed" + fi +} + # ====== MOUNT NVMe INSTANCE STORE FOR EPHEMERAL STORAGE ====== # GPU instance types (g5, g6, p4, p5) typically come with large NVMe instance # store drives but tiny 10 GB EBS root volumes. Kubernetes counts ephemeral @@ -1191,6 +1325,11 @@ PYSCRIPT" log "k0s cluster installed successfully!" kubectl get nodes + # On newer RHEL 10 kernels (6.12.0-124.38+) the xt_conntrack module is gone, + # so kube-proxy in iptables mode can't program NAT rules. Detect and fix. + fix_kube_proxy_mode_if_needed "${controller_ip}" + bounce_calico_if_needed + # Label nodes for proper workload scheduling label_nodes } @@ -1211,21 +1350,63 @@ resolve_node_name() { label_nodes() { log "Labeling nodes for AI workload scheduling..." - # Wait for all nodes to be ready + # Wait for all nodes to be ready. + # + # NOTE: we count nodes whose "Ready" condition is exactly "True" via a + # structured JSON query — NOT by grepping for the string "Ready" in the + # plain-text `kubectl get nodes` output. That string match is a trap + # because the STATUS column of a not-yet-ready node prints the substring + # "NotReady" which ALSO matches a naive `grep -c Ready`, causing the loop + # to exit prematurely. Downstream labeling then silently skips any worker + # that joined the API server late with "Node not found in cluster". local node_count=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) - log "Waiting for ${node_count} nodes to be ready..." 
+ log "Waiting for ${node_count} node(s) to be Ready..." local timeout=300 local elapsed=0 - while [[ $(kubectl get nodes --no-headers | grep -c "Ready") -lt ${node_count} ]]; do + local ready_count + while :; do + ready_count=$(kubectl get nodes -o json 2>/dev/null \ + | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length' 2>/dev/null \ + || echo 0) + if [[ "${ready_count}" -ge "${node_count}" ]]; then + log " ✓ All ${ready_count}/${node_count} nodes Ready" + break + fi sleep 5 elapsed=$((elapsed + 5)) if [[ ${elapsed} -ge ${timeout} ]]; then - warn "Timeout waiting for all nodes to be ready, proceeding anyway..." + warn "Timeout (${timeout}s) waiting for all nodes to be Ready (have ${ready_count}/${node_count}); proceeding anyway..." break fi + if (( elapsed % 30 == 0 )); then + log " ${ready_count}/${node_count} nodes Ready (${elapsed}/${timeout}s)" + fi done + # Helper: wait up to 60s for a given node name to appear in the API server. + # This guards against the race where a worker joined the cluster just after + # the top-of-function readiness check returned but its Node object is still + # propagating to the API server we're talking to. + _wait_for_node_visible() { + local node_name="$1" + local ip="$2" + local tries=0 + local max_tries=12 # 12 * 5s = 60s + while (( tries < max_tries )); do + if kubectl get node "${node_name}" &>/dev/null; then + return 0 + fi + sleep 5 + tries=$((tries + 1)) + done + warn " Node '${node_name}' (from ${ip}) did not become visible in API server after 60s" + return 1 + } + + # Track labeling outcomes so we can fail loud if any node ends up unlabeled. + local labeling_failures=() + # Label controller nodes for controller_ip in "${CONTROLLER_IPS[@]}"; do local node_name @@ -1233,12 +1414,12 @@ label_nodes() { if [[ -z "${node_name}" ]]; then warn " Could not resolve hostname for controller ${controller_ip}, skipping..." 
+ labeling_failures+=("${controller_ip} (hostname unresolved)") continue fi - # Verify this node exists in the cluster - if ! kubectl get node "${node_name}" &>/dev/null; then - warn " Node '${node_name}' (from ${controller_ip}) not found in cluster, skipping..." + if ! _wait_for_node_visible "${node_name}" "${controller_ip}"; then + labeling_failures+=("${controller_ip} / ${node_name} (never visible)") continue fi @@ -1269,12 +1450,13 @@ label_nodes() { if [[ -z "${node_name}" ]]; then warn " Could not resolve hostname for worker ${worker_ip}, skipping..." + labeling_failures+=("${worker_ip} (hostname unresolved)") worker_index=$((worker_index + 1)) continue fi - if ! kubectl get node "${node_name}" &>/dev/null; then - warn " Node '${node_name}' (from ${worker_ip}) not found in cluster, skipping..." + if ! _wait_for_node_visible "${node_name}" "${worker_ip}"; then + labeling_failures+=("${worker_ip} / ${node_name} (never visible)") worker_index=$((worker_index + 1)) continue fi @@ -1308,6 +1490,91 @@ label_nodes() { kubectl taint nodes "${node#node/}" nvidia.com/gpu=true:NoSchedule --overwrite || true done + # --- Final verification: every node must have splunk.ai/workload-type set --- + # Without this, downstream scheduling silently breaks: weaviate / ray-head / + # many operator-created workloads use nodeSelector: splunk.ai/workload-type=cpu + # and will sit in Pending forever on a node that only has default labels. + log "Verifying every node has splunk.ai/workload-type set..." + local unlabeled + unlabeled=$(kubectl get nodes -o json 2>/dev/null \ + | jq -r '.items[] | select(.metadata.labels["splunk.ai/workload-type"] == null) | .metadata.name' 2>/dev/null \ + || echo "") + if [[ -n "${unlabeled}" ]]; then + # Last-chance recovery: re-iterate config IPs and label whichever matches. + # This catches the case where resolve_node_name raced earlier in the run. 
+ warn "Found unlabeled node(s), attempting recovery:" + echo "${unlabeled}" | while IFS= read -r nn; do + warn " - ${nn}" + done + for ip in "${CONTROLLER_IPS[@]}" "${WORKER_IPS[@]}"; do + local nn + nn=$(resolve_node_name "${ip}") + [[ -z "${nn}" ]] && continue + if echo "${unlabeled}" | grep -qx "${nn}"; then + # Best-effort: apply CPU labels to the controller, CPU labels to + # any worker whose index is < CPU_WORKER_COUNT, else GPU labels. + # This duplicates a small amount of logic but keeps the recovery + # path fully self-contained. + local is_controller=false + for cip in "${CONTROLLER_IPS[@]}"; do + [[ "${cip}" == "${ip}" ]] && is_controller=true && break + done + if ${is_controller}; then + log " Recovery: labeling controller ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=controller \ + splunk.ai/workload-type=control-plane \ + node.kubernetes.io/role=controller \ + --overwrite || true + else + local wi=0 + for wip in "${WORKER_IPS[@]}"; do + [[ "${wip}" == "${ip}" ]] && break + wi=$((wi + 1)) + done + if [[ ${wi} -lt ${CPU_WORKER_COUNT} ]]; then + log " Recovery: labeling CPU worker ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ + --overwrite || true + else + log " Recovery: labeling GPU worker ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=gpu \ + node.kubernetes.io/workload=ai-gpu \ + splunk.ai/instance-type=gpu-worker \ + nvidia.com/gpu=true \ + --overwrite || true + fi + fi + fi + done + + # Re-check after recovery attempt. 
+ unlabeled=$(kubectl get nodes -o json 2>/dev/null \ + | jq -r '.items[] | select(.metadata.labels["splunk.ai/workload-type"] == null) | .metadata.name' 2>/dev/null \ + || echo "") + if [[ -n "${unlabeled}" ]]; then + err "Nodes still unlabeled after recovery pass: +$(echo "${unlabeled}" | sed 's/^/ /') + +Workloads that select splunk.ai/workload-type=cpu (weaviate, ray-head, +most operator-managed pods) will stay Pending. Aborting." + fi + log " ✓ Recovery successful — all nodes now have workload-type set" + else + log " ✓ All nodes have splunk.ai/workload-type set" + fi + + if [[ ${#labeling_failures[@]} -gt 0 ]]; then + warn "label_nodes encountered ${#labeling_failures[@]} non-fatal issue(s):" + for f in "${labeling_failures[@]}"; do warn " - ${f}"; done + fi + log "Node labeling complete!" log "Nodes with labels:" kubectl get nodes --show-labels @@ -1678,120 +1945,443 @@ EOF # ====== INSTALL NVIDIA DRIVERS ON GPU NODES (bare-metal / EC2) ====== # Per-node NVIDIA driver + container toolkit install (called in parallel). +# +# Error handling philosophy: +# - `set -euo pipefail` inside every remote block so the first real failure +# aborts the node install immediately. +# - NO blanket `|| true` / `2>/dev/null` on installer commands — failures +# are loud and caught. +# - After install, strict verification gates hard-fail if the artifacts +# aren't where they should be (nvidia-smi works, libnvidia-ml.so exists, +# nvidia-ctk present, CDI spec populated). +# - RHEL 9 and RHEL 10 paths are deliberately symmetric: both install EPEL, +# both install DKMS, both clean stale cross-major CUDA repos. +# +# Returns 0 on fully-successful install, non-zero on any verification failure. 
_install_nvidia_on_node() { local gpu_ip="$1" - # Check if driver is already installed - if ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" &>/dev/null; then - local driver_ver - driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null" || echo "unknown") + # ---- Phase A: detect if driver is already installed --------------------- + local driver_ver="" + if ssh_exec "${gpu_ip}" "command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi --query-gpu=driver_version --format=csv,noheader" 2>/dev/null; then + driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1") || driver_ver="" + fi + + if [[ -n "${driver_ver}" ]]; then echo "✓ NVIDIA driver already installed on ${gpu_ip} (version: ${driver_ver})" else echo "Installing NVIDIA driver on ${gpu_ip}..." - ssh_exec "${gpu_ip}" " - set -e - # Install kernel headers (needed for DKMS driver build) - sudo dnf install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ - sudo yum install -y kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) 2>/dev/null || \ - sudo apt-get install -y linux-headers-\$(uname -r) 2>/dev/null || true - - # Detect OS and add appropriate NVIDIA repo - if [ -f /etc/amzn-release ] || grep -qi 'amzn' /etc/os-release 2>/dev/null; then - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo 2>/dev/null || true - sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || true + + # ---- Phase B: install driver + supporting packages -------------------- + # `set -euo pipefail` means ANY failure aborts the block. Each step below + # must either succeed or have an explicit fallback branch that succeeds. + if ! 
ssh_exec "${gpu_ip}" " + set -euo pipefail + + # --- OS detection (RHEL 9, RHEL 10, Amazon Linux 2023, Debian/Ubuntu) --- + # OS_VERSION holds the numeric major we use to build the CUDA+EPEL URLs. + # For RHEL we read %{rhel}; for Amazon Linux 2023 we hardcode 9 because + # AL2023 is binary-compatible with RHEL/Fedora 9's nvidia-driver RPMs + # and the Fedora EPEL9 repo is the standard 3rd-party source. + echo '--- OS detection ---' + OS_FAMILY= + OS_VERSION= + if grep -qiE '^ID=\"?amzn\"?' /etc/os-release 2>/dev/null; then + OS_FAMILY=amzn + OS_VERSION=\$(. /etc/os-release; echo \"\${VERSION_ID%%.*}\") elif [ -f /etc/redhat-release ]; then - RHEL_MAJOR=\$(rpm -E %{rhel} 2>/dev/null || echo 9) - if [ \"\${RHEL_MAJOR}\" -ge 10 ]; then - # Add RHEL 10 CUDA repo only; remove any stale rhel9 repo to prevent GPG conflicts - sudo rm -f /etc/yum.repos.d/cuda-rhel9.repo 2>/dev/null || true - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/cuda-rhel10.repo 2>/dev/null || true - - # RHEL 10 removed DNF modularity; DKMS kmod requires EPEL - if ! rpm -q epel-release >/dev/null 2>&1; then - echo 'Installing EPEL for dkms...' 
- sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm 2>/dev/null || true - fi - sudo dnf install -y dkms 2>/dev/null || true + OS_FAMILY=rhel + OS_VERSION=\$(rpm -E %{rhel}) + elif [ -f /etc/debian_version ]; then + OS_FAMILY=debian + fi + if [ -z \"\${OS_FAMILY}\" ]; then + echo 'ERROR: unsupported OS (not amzn/rhel/debian)' >&2 + cat /etc/os-release >&2 || true + exit 1 + fi + echo \"OS_FAMILY=\${OS_FAMILY} OS_VERSION=\${OS_VERSION:-n/a}\" + + # --- Step 1: kernel headers (required for DKMS to build nvidia kmod) --- + KREL=\$(uname -r) + echo \"--- Installing kernel headers for kernel \${KREL} ---\" + if [ \"\${OS_FAMILY}\" = 'debian' ]; then + sudo apt-get update -qq + sudo apt-get install -y \"linux-headers-\${KREL}\" + else + # Exact-match: every historical kernel-devel is usually in RHUI for + # RHEL 9/10. Fall back to the latest only when absent (rare). + if ! sudo dnf install -y \"kernel-devel-\${KREL}\" \"kernel-headers-\${KREL}\"; then + echo \"WARN: Exact kernel-devel-\${KREL} not found; installing latest kernel-devel/headers.\" + echo \" DKMS will build against the latest headers — if they don't match the running kernel,\" + echo \" modprobe will fail below and you'll need to reboot into the updated kernel.\" + sudo dnf install -y kernel-devel kernel-headers + fi + fi - sudo dnf install -y nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-open 2>/dev/null || true + # --- Step 2: EPEL + DKMS + build toolchain ---------------------------- + # DKMS builds the nvidia kernel module from source on every kernel + # update. It needs: dkms (from EPEL), gcc, make, elfutils-libelf-devel. + # On a BARE RHEL minimal install, NONE of these are pre-installed. + # On AWS AMIs they may be partially pre-installed but we should not + # rely on that — be explicit. 
+ if [ \"\${OS_FAMILY}\" = 'rhel' ] || [ \"\${OS_FAMILY}\" = 'amzn' ]; then + # EPEL: AL2023 = EPEL9 (binary-compat). RHEL: matching major. + if [ \"\${OS_FAMILY}\" = 'amzn' ]; then + EPEL_MAJOR=9 else - sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo 2>/dev/null || true - sudo dnf module install -y nvidia-driver:latest-dkms 2>/dev/null || \ - sudo dnf install -y --nobest nvidia-driver nvidia-driver-cuda nvidia-driver-libs 2>/dev/null || true + EPEL_MAJOR=\${OS_VERSION} fi - elif [ -f /etc/debian_version ]; then - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb + + # dnf-plugins-core provides 'dnf config-manager'. Pre-installed on + # most AMIs; install explicitly for minimal images. + sudo dnf install -y dnf-plugins-core + + # EPEL: provides DKMS on RHEL (RHEL's own repos don't ship DKMS). + if ! rpm -q epel-release >/dev/null 2>&1; then + echo \"--- Installing EPEL for DKMS (major \${EPEL_MAJOR}) ---\" + sudo dnf install -y \"https://dl.fedoraproject.org/pub/epel/epel-release-latest-\${EPEL_MAJOR}.noarch.rpm\" + fi + # CRB (formerly PowerTools on RHEL 8) hosts a few EPEL build deps on + # RHEL. AL2023 doesn't have a CRB repo (its core packages are in + # 'amazonlinux' directly), so this whole chain is best-effort — the + # trailing '|| true' only runs when ALL three names fail to match + # any known repo, which is the expected state on AL2023. + sudo dnf config-manager --set-enabled crb 2>/dev/null \\ + || sudo dnf config-manager --set-enabled PowerTools 2>/dev/null \\ + || sudo dnf config-manager --set-enabled powertools 2>/dev/null \\ + || true + + # DKMS + the build toolchain. Being explicit means a minimal / bare + # RHEL install works out-of-the-box and future driver versions + # with different weak-deps don't silently miss a needed package. 
+ echo '--- Installing DKMS + build toolchain (gcc, make, elfutils-libelf-devel) ---' + sudo dnf install -y dkms gcc make elfutils-libelf-devel + fi + + # --- Step 3: CUDA repo for the right OS family + version -------------- + # Clean cross-major repos so dnf doesn't try to install from the wrong + # CUDA metadata (common failure mode on in-place RHEL 9 → 10 upgrades, + # and on re-runs of this script where the target OS may have changed). + if [ \"\${OS_FAMILY}\" = 'amzn' ]; then + sudo rm -f /etc/yum.repos.d/cuda-amzn*.repo + sudo dnf config-manager --add-repo \\ + \"https://developer.download.nvidia.com/compute/cuda/repos/amzn\${OS_VERSION:-2023}/x86_64/cuda-amzn\${OS_VERSION:-2023}.repo\" + elif [ \"\${OS_FAMILY}\" = 'rhel' ]; then + sudo rm -f /etc/yum.repos.d/cuda-rhel*.repo + sudo dnf config-manager --add-repo \\ + \"https://developer.download.nvidia.com/compute/cuda/repos/rhel\${OS_VERSION}/x86_64/cuda-rhel\${OS_VERSION}.repo\" + elif [ \"\${OS_FAMILY}\" = 'debian' ]; then + curl -fsSL \\ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \\ + -o /tmp/cuda-keyring.deb sudo dpkg -i /tmp/cuda-keyring.deb - sudo apt-get update && sudo apt-get install -y nvidia-driver-550 2>/dev/null || true + sudo apt-get update -qq fi - # Load nvidia kernel module immediately (avoids needing a reboot) - sudo modprobe nvidia 2>/dev/null || true - " || { echo "Driver install on ${gpu_ip} had issues"; return 1; } + # --- Step 4: install the driver ------------------------------------- + # Follows NVIDIA's official RHEL install guidance: + # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ + # + # Package names in the CUDA repo (rhel9/rhel10/amzn2023): + # - cuda-drivers -> meta-pkg for proprietary driver + # (pulls nvidia-driver, kmod-nvidia-latest-dkms, + # nvidia-driver-cuda, nvidia-driver-libs, + # libnvidia-ml, etc.) 
+ # - nvidia-open -> meta-pkg for open-source kernel driver + # - nvidia-driver:latest-dkms -> RHEL 9 only (dnf modular stream). + # Removed in RHEL 10 (modularity deprecated). + # + # There is NO package called 'nvidia-driver-dkms' in either repo — + # previous attempts at it failed on every fresh install. + # + # Strategy: single meta-package install. RHEL 10 requires --allowerasing + # because it has to remove conflicting nouveau packages. The flag is + # a no-op on RHEL 9/AL2023 where there's nothing to erase. + echo '--- Installing NVIDIA driver (meta package: cuda-drivers) ---' + if [ \"\${OS_FAMILY}\" = 'debian' ]; then + sudo apt-get install -y nvidia-driver-550 + else + # Blacklist nouveau so the new nvidia driver can load without fighting it. + # Harmless if nouveau isn't loaded (grep returns nothing). + if lsmod | grep -q '^nouveau'; then + echo '--- Blacklisting nouveau + unloading ---' + echo -e 'blacklist nouveau\\noptions nouveau modeset=0' \\ + | sudo tee /etc/modprobe.d/blacklist-nouveau.conf >/dev/null + sudo rmmod nouveau 2>/dev/null || true + # Regenerate initramfs so nouveau doesn't come back on reboot. + sudo dracut --force 2>/dev/null || true + fi - # Verify - if ssh_exec "${gpu_ip}" "nvidia-smi 2>/dev/null" &>/dev/null; then - echo "✓ NVIDIA driver installed successfully on ${gpu_ip}" - else - echo "⚠ NVIDIA driver may need a reboot on ${gpu_ip} to take effect" + # Primary strategy: cuda-drivers meta-package (works on RHEL 9, RHEL 10, + # AL2023 — the CUDA repo ships the same package name everywhere). + if sudo dnf install -y --allowerasing cuda-drivers; then + echo '✓ Installed cuda-drivers meta-package' + elif [ \"\${OS_VERSION}\" = '9' ] && sudo dnf module install -y nvidia-driver:latest-dkms; then + # RHEL 9 fallback: classic dnf module stream (RHEL 10 dropped modularity). + # Kept as a safety net — cuda-drivers should always work above. 
+ echo '✓ Installed nvidia-driver:latest-dkms via dnf module (RHEL 9 legacy path)' + elif sudo dnf install -y --allowerasing nvidia-open; then + # Last-resort fallback: open-source kernel driver. + echo '✓ Installed nvidia-open (open-kernel fallback)' + else + echo 'ERROR: all NVIDIA driver install strategies failed' >&2 + echo ' Tried: cuda-drivers, nvidia-driver:latest-dkms (module), nvidia-open' >&2 + echo ' Possible causes:' >&2 + echo ' - CUDA repo URL incorrect for OS version \${OS_VERSION}' >&2 + echo ' - EPEL/DKMS not available' >&2 + echo ' - Network blocked to developer.download.nvidia.com' >&2 + exit 1 + fi + fi + + # --- Step 5: verify DKMS built + load kmod --------------------------- + # Before modprobe: check dkms status so we catch kernel-mismatch cases + # early with a clear error instead of the cryptic 'Module not found'. + echo '--- Verifying DKMS status + loading nvidia kmod ---' + if [ \"\${OS_FAMILY}\" != 'debian' ]; then + DKMS_OUT=\$(sudo dkms status 2>&1 | grep nvidia || true) + if [ -z \"\${DKMS_OUT}\" ]; then + echo 'ERROR: dkms status shows no nvidia entry — driver install did not register with DKMS' >&2 + exit 1 + fi + echo \"DKMS: \${DKMS_OUT}\" + if ! echo \"\${DKMS_OUT}\" | grep -qE 'installed|built'; then + echo 'ERROR: nvidia DKMS module is not installed/built. See: sudo dkms status; dmesg | grep nvidia' >&2 + exit 1 + fi + # Check the built-for kernel matches the running kernel. If not, + # a reboot into the newer installed kernel is required — DO NOT pretend + # modprobe will work. This is exactly what prevents false-positive + # 'install succeeded' on nodes that had a pending kernel update. + if ! 
echo \"\${DKMS_OUT}\" | grep -qF \"\${KREL}\"; then + echo \"ERROR: DKMS built nvidia module for a different kernel than \${KREL}.\" >&2 + echo \" 'sudo dkms status' shows: \${DKMS_OUT}\" >&2 + echo \" Action: reboot the node into the kernel DKMS built for, then re-run.\" >&2 + exit 1 + fi + fi + sudo modprobe nvidia || { + echo 'ERROR: modprobe nvidia failed after DKMS build succeeded.' >&2 + echo 'Diagnose with: sudo dmesg | grep -i nvidia | tail -30' >&2 + exit 1 + } + "; then + echo "❌ NVIDIA driver install failed on ${gpu_ip}" >&2 + return 1 fi + + # ---- Phase C: hard-verify driver actually works ----------------------- + local ver_check + ver_check=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1 | head -1" || echo "") + if [[ -z "${ver_check}" ]] || ! [[ "${ver_check}" =~ ^[0-9]+\.[0-9]+ ]]; then + echo "❌ nvidia-smi verification failed on ${gpu_ip} (got: '${ver_check}')" >&2 + return 1 + fi + echo "✓ NVIDIA driver v${ver_check} running on ${gpu_ip}" fi - # Install NVIDIA Container Toolkit - echo "Ensuring NVIDIA Container Toolkit on ${gpu_ip}..." - ssh_exec "${gpu_ip}" " - if command -v nvidia-ctk &>/dev/null; then - echo 'nvidia-ctk already installed' + # ---- Phase D: NVIDIA Container Toolkit install ------------------------ + echo "Installing NVIDIA Container Toolkit on ${gpu_ip}..." + if ! 
ssh_exec "${gpu_ip}" " + set -euo pipefail + if command -v nvidia-ctk >/dev/null 2>&1; then + echo '✓ nvidia-ctk already installed (version: '\"\$(nvidia-ctk --version 2>/dev/null | head -1)\"')' else - # Add NVIDIA Container Toolkit repo - curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null 2>/dev/null || true - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ - sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null || true - - # Install - sudo dnf install -y nvidia-container-toolkit 2>/dev/null || \ - sudo yum install -y nvidia-container-toolkit 2>/dev/null || \ - sudo apt-get install -y nvidia-container-toolkit 2>/dev/null || true + echo '--- Adding NVIDIA container-toolkit repo ---' + if [ -f /etc/debian_version ]; then + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null + sudo apt-get update -qq + sudo apt-get install -y nvidia-container-toolkit + else + # RHEL 9 and 10 both use the same libnvidia-container stable RPM repo. + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null + sudo dnf install -y nvidia-container-toolkit + fi fi - # Configure for k0s containerd (k0s uses /run/k0s/containerd.sock) + # --- Configure k0s containerd (k0s uses /run/k0s/containerd.sock) ---- + # Strategy (compatible with nvidia-ctk >= 1.14, validated against 1.19): + # + # 1. 
Run \`nvidia-ctk runtime configure --runtime=containerd + # --nvidia-set-as-default\` with NO --config= flag. This makes + # nvidia-ctk emit a complete, correct drop-in at its known-good + # default path /etc/containerd/conf.d/99-nvidia.toml, containing: + # + # - version = 2 + # - plugins.\"io.containerd.grpc.v1.cri\".containerd.runtimes.nvidia + # - default_runtime_name = \"nvidia\" + # + # 2. On k0s nodes, we cannot leave that file at its default path + # because k0s's managed /etc/k0s/containerd.toml only imports + # /etc/k0s/containerd.d/*.toml — anything under + # /etc/containerd/conf.d/ is ignored. So we move it. + # + # We deliberately avoid passing --config= pointing at the k0s drop-in, + # because nvidia-ctk 1.19 treats the --config target as a \"main\" + # containerd config and only writes a two-line stub (imports + version) + # into it, emitting the actual runtime config to /etc/containerd/conf.d/ + # regardless. That silent behavior caused the 'containerd-nvidia-runtime: + # FAIL' verification error that reaching here used to surface. + echo '--- Configuring containerd runtime for nvidia ---' if [ -d /etc/k0s/containerd.d ]; then - sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true - - if [ -f /etc/containerd/conf.d/99-nvidia.toml ]; then - sudo cp /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml - sudo rm -f /etc/containerd/conf.d/99-nvidia.toml - elif [ ! -s /etc/k0s/containerd.d/nvidia.toml ]; then - sudo nvidia-ctk runtime configure --runtime=containerd \ - --config=/etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + sudo mkdir -p /etc/k0s/containerd.d + + # Preserve any existing drop-in so idempotent re-runs don't lose + # hand-tuned configuration. + if [ -s /etc/k0s/containerd.d/nvidia.toml ]; then + sudo cp -a /etc/k0s/containerd.d/nvidia.toml /etc/k0s/containerd.d/nvidia.toml.bak + fi + + # Wipe any previous output so we can tell whether this invocation + # actually produced a file. 
+ sudo rm -f /etc/containerd/conf.d/99-nvidia.toml + + # Generate the canonical drop-in at nvidia-ctk's default path. We + # rely on --nvidia-set-as-default to inject default_runtime_name. + sudo nvidia-ctk runtime configure \\ + --runtime=containerd \\ + --nvidia-set-as-default + + # Hard-fail if the file is missing or empty. + if [ ! -s /etc/containerd/conf.d/99-nvidia.toml ]; then + echo 'ERROR: nvidia-ctk did not produce /etc/containerd/conf.d/99-nvidia.toml' >&2 + echo 'nvidia-ctk --version:' >&2 + nvidia-ctk --version 2>&1 | head -3 >&2 + exit 1 fi - sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \ - /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + # Verify the generated drop-in actually names nvidia as the default + # runtime. Earlier nvidia-ctk versions (< 1.14) ignored + # --nvidia-set-as-default silently. + if ! sudo grep -q 'default_runtime_name = \"nvidia\"' /etc/containerd/conf.d/99-nvidia.toml; then + echo 'ERROR: nvidia-ctk drop-in does not set default_runtime_name = \"nvidia\".' >&2 + echo 'nvidia-ctk --version:' >&2 + nvidia-ctk --version 2>&1 | head -3 >&2 + echo '--- generated drop-in (first 30 lines) ---' >&2 + sudo head -30 /etc/containerd/conf.d/99-nvidia.toml >&2 + exit 1 + fi - if ! grep -q 'default_runtime_name' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null; then - sudo sed -i '/\[plugins\.\"io\.containerd\.grpc\.v1\.cri\"\.containerd\]$/{ - a\ default_runtime_name = \"nvidia\" - }' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null || true + # Relocate the drop-in from nvidia-ctk's default path to the path + # that k0s's managed containerd.toml imports. We strip keys that + # would duplicate declarations already made by k0s's top-level + # config (version / imports / disabled_plugins / required_plugins); + # leaving them in place causes containerd to refuse to start with + # duplicate top-level-key errors. 
+ sudo mv /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml + sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \\ + /etc/k0s/containerd.d/nvidia.toml + + # Final sanity: the k0s drop-in must still carry default_runtime_name + # after the key-strip above (it lives under a nested table, not at + # top level, so the sed above never touches it — but verify anyway + # so failure is loud instead of silently broken). + if ! sudo grep -q 'default_runtime_name = \"nvidia\"' /etc/k0s/containerd.d/nvidia.toml; then + echo 'ERROR: /etc/k0s/containerd.d/nvidia.toml lost default_runtime_name after relocation.' >&2 + echo '--- file contents ---' >&2 + sudo cat /etc/k0s/containerd.d/nvidia.toml >&2 + exit 1 fi elif [ -f /etc/containerd/config.toml ]; then - sudo nvidia-ctk runtime configure --runtime=containerd 2>/dev/null || true + # Non-k0s containerd (standalone) — safe to let nvidia-ctk edit in place. + sudo nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default + else + echo 'ERROR: no containerd config dir found at /etc/k0s/containerd.d or /etc/containerd/config.toml' >&2 + exit 1 fi + # --- Generate the CDI spec so k8s device plugin can find the GPUs --- + echo '--- Generating CDI spec ---' sudo mkdir -p /etc/cdi - sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml 2>/dev/null || true + sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml + if [ ! -s /etc/cdi/nvidia.yaml ]; then + echo 'ERROR: /etc/cdi/nvidia.yaml empty after generation' >&2 + exit 1 + fi + # Sanity-check that NVML could enumerate at least one device (without + # this, the spec contains no devices and the device plugin crash-loops). + if ! 
grep -q 'name: ' /etc/cdi/nvidia.yaml; then + echo 'ERROR: /etc/cdi/nvidia.yaml contains no device entries' >&2 + cat /etc/cdi/nvidia.yaml | head -40 >&2 + exit 1 + fi - sudo systemctl stop k0sworker 2>/dev/null || true + # --- Restart k0sworker to pick up new runtime + CDI spec ----------- + echo '--- Restarting k0sworker to pick up runtime changes ---' + sudo systemctl stop k0sworker || true sleep 3 - sudo pkill -9 containerd-shim 2>/dev/null || true - sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + sudo pkill -9 containerd-shim || true + sudo rm -f /run/k0s/containerd.sock || true + sudo systemctl start k0sworker + + # Quick sanity: confirm nvidia-ctk + libnvidia-ml.so exist where expected. + # Search all known paths (distributions differ): RHEL/Fedora use + # /usr/lib64, Debian/Ubuntu use /usr/lib/x86_64-linux-gnu, and + # some distros also expose it via ldconfig. + echo '--- Post-install sanity ---' + nvidia-ctk --version | head -1 + LIBNVML_PATH=\$(ldconfig -p 2>/dev/null | awk '/libnvidia-ml\\.so\\.1/ {print \$NF; exit}') + if [ -z \"\${LIBNVML_PATH}\" ]; then + for so in /usr/lib64/libnvidia-ml.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \\ + /usr/lib/libnvidia-ml.so.1; do + if [ -e \"\${so}\" ]; then LIBNVML_PATH=\"\${so}\"; break; fi + done + fi + if [ -n \"\${LIBNVML_PATH}\" ]; then + echo \"✓ libnvidia-ml.so.1 found: \${LIBNVML_PATH}\" + else + echo 'ERROR: libnvidia-ml.so.1 not found on any standard path.' >&2 + exit 1 + fi + "; then + echo "❌ Container toolkit setup failed on ${gpu_ip}" >&2 + return 1 + fi - sudo systemctl start k0sworker 2>/dev/null || true - " || { echo "Container toolkit setup on ${gpu_ip} had issues"; return 1; } + # ---- Phase E: post-install strict verification ----------------------- + # These checks are what the device plugin will actually need at runtime. 
+ local checks_out + checks_out=$(ssh_exec "${gpu_ip}" " + set +e + echo -n 'nvidia-smi: ' + nvidia-smi --query-gpu=name --format=csv,noheader >/dev/null 2>&1 && echo OK || echo FAIL + echo -n 'libnvidia-ml.so: ' + # Check ldconfig cache first (most reliable), then fall back to the + # common per-distribution install paths. + if ldconfig -p 2>/dev/null | grep -q 'libnvidia-ml\.so\.1'; then + echo OK + elif ls /usr/lib64/libnvidia-ml.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \\ + /usr/lib/libnvidia-ml.so.1 2>/dev/null | head -1 | grep -q .; then + echo OK + else + echo FAIL + fi + echo -n 'nvidia-ctk: ' + command -v nvidia-ctk >/dev/null 2>&1 && echo OK || echo FAIL + echo -n 'cdi-spec: ' + [ -s /etc/cdi/nvidia.yaml ] && grep -q 'name: ' /etc/cdi/nvidia.yaml && echo OK || echo FAIL + echo -n 'nvidia-kmod: ' + lsmod | grep -q '^nvidia ' && echo OK || echo FAIL + echo -n 'containerd-nvidia-runtime: ' + grep -q 'default_runtime_name = \"nvidia\"' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null && echo OK || echo FAIL + ") + + echo "Strict verification on ${gpu_ip}:" + echo "${checks_out}" | sed 's/^/ /' + if echo "${checks_out}" | grep -q FAIL; then + echo "❌ Strict verification failed on ${gpu_ip} — device plugin will crash-loop with ERROR_LIBRARY_NOT_FOUND" >&2 + return 1 + fi + echo "✓ Strict verification passed on ${gpu_ip}" + return 0 } # EKS GPU AMIs ship with NVIDIA drivers pre-installed. @@ -1866,7 +2456,21 @@ install_nvidia_host_drivers() { rm -rf "${logdir}" if [[ ${failed} -gt 0 ]]; then - warn "${failed} GPU node(s) had NVIDIA install issues — check logs above" + err "${failed}/${#gpu_ips[@]} GPU node(s) had NVIDIA install failures. Aborting install. 
+ + What to check on a failing node: + ssh 'dkms status | grep nvidia' # must show 'installed' + ssh 'lsmod | grep nvidia' # must list nvidia kmod + ssh 'ls /usr/lib64/libnvidia-ml.so.1' # must exist + ssh 'nvidia-ctk --version' # must work + ssh 'cat /etc/cdi/nvidia.yaml | head -40' # must list GPU devices + ssh 'sudo dmesg | grep -i nvidia | tail -30'# kernel-level errors + + Common causes: + - kernel-devel for running kernel not available (exact match too new); + reboot to match a released kernel, then re-run + - EPEL/DKMS didn't install (check 'rpm -q epel-release dkms') + - Stale /etc/yum.repos.d/cuda-rhel*.repo from a prior OS upgrade" else log "NVIDIA drivers installed successfully on all ${#gpu_ips[@]} GPU node(s)" fi @@ -1909,20 +2513,41 @@ install_nvidia_host_drivers() { done if [[ "${all_gpu_ready}" != "true" ]]; then - warn "Some GPU nodes may not be Ready yet. Check with: kubectl get nodes" - warn "GPU nodes may need a reboot if NVIDIA drivers were freshly installed." + err "Some GPU nodes did not become Ready within ${gpu_wait_timeout}s. Check: kubectl get nodes" + fi + + # Verify GPUs are visible to Kubernetes. If the device-plugin DaemonSet + # isn't installed yet (expected during the initial install — it's created + # by install_nvidia_device_plugin() in Phase 2), short-circuit immediately + # instead of waiting a fruitless 120s. For idempotent re-runs where the + # DS is already present we poll up to 120s. + if ! kubectl -n kube-system get ds nvidia-device-plugin-daemonset &>/dev/null; then + log " (device plugin DaemonSet not yet installed; capacity will appear after install_nvidia_device_plugin runs)" + log "NVIDIA host driver installation complete" + return 0 fi - # Verify GPUs are visible to Kubernetes log "Checking if GPUs are visible to Kubernetes..." 
- local gpu_capacity - gpu_capacity=$(kubectl get nodes -l splunk.ai/workload-type=gpu -o json 2>/dev/null | \ - jq '[.items[].status.capacity["nvidia.com/gpu"] // "0" | tonumber] | add' 2>/dev/null || echo "0") - if [[ "${gpu_capacity}" -gt 0 ]]; then - log "✓ Total GPUs visible to Kubernetes: ${gpu_capacity}" - else - warn "No GPUs visible to Kubernetes yet — the NVIDIA device plugin may still be starting" - warn "Check with: kubectl get nodes -o json | jq '.items[].status.capacity'" + local gpu_capacity="0" + local cap_wait=0 + local cap_timeout=120 + while [[ ${cap_wait} -lt ${cap_timeout} ]]; do + gpu_capacity=$(kubectl get nodes -l splunk.ai/workload-type=gpu -o json 2>/dev/null | \ + jq '[.items[].status.capacity["nvidia.com/gpu"] // "0" | tonumber] | add' 2>/dev/null || echo "0") + if [[ "${gpu_capacity}" -gt 0 ]]; then + log "✓ Total GPUs visible to Kubernetes: ${gpu_capacity}" + break + fi + sleep 10 + cap_wait=$((cap_wait + 10)) + log " Waiting for GPU capacity to be reported... ${cap_wait}/${cap_timeout}s" + done + + if [[ "${gpu_capacity}" -le 0 ]]; then + err "Device plugin DaemonSet is installed but no GPUs are visible after ${cap_timeout}s. + Investigate with: + kubectl -n kube-system logs ds/nvidia-device-plugin-daemonset --tail 40 + kubectl -n kube-system describe pod -l name=nvidia-device-plugin-ds" fi log "NVIDIA host driver installation complete" @@ -1940,8 +2565,10 @@ install_nvidia_device_plugin() { local ver="${NVIDIA_VERSION:-v0.17.3}" log "Installing NVIDIA device plugin DaemonSet (${ver})..." - # Create the nvidia RuntimeClass so pods (including the device plugin - # itself) can use the NVIDIA container runtime for GPU access. + # Create the nvidia RuntimeClass FIRST. The device-plugin DaemonSet we + # apply below references this RuntimeClass via runtimeClassName=nvidia, so + # it must exist before any DS pod is scheduled — otherwise kubelet will + # reject the pod with 'RuntimeClass "nvidia" not found'. 
log " Creating nvidia RuntimeClass..." cat <<'RTEOF' | kubectl apply -f - apiVersion: node.k8s.io/v1 @@ -1951,17 +2578,55 @@ metadata: handler: nvidia RTEOF - kubectl apply -n kube-system \ - -f "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${ver}/deployments/static/nvidia-device-plugin.yml" + # Fetch the upstream manifest into a temp file, inject our required + # pod-spec fields (nodeSelector + runtimeClassName) BEFORE applying. + # Doing this in one shot — instead of apply-then-patch — avoids the + # race where the initial DS pods start under the default runtime + # (runc), hit 'ERROR_LIBRARY_NOT_FOUND' because they have no access to + # libnvidia-ml.so or /dev/nvidia*, and land in CrashLoopBackOff before + # the patch ever reaches them. + local manifest + manifest=$(mktemp) + if ! curl -fsSL \ + "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${ver}/deployments/static/nvidia-device-plugin.yml" \ + -o "${manifest}"; then + rm -f "${manifest}" + err "Failed to fetch NVIDIA device-plugin manifest from GitHub (version ${ver}). + Check network connectivity and that version ${ver} exists upstream." + fi + + log " Patching manifest in place: GPU nodeSelector + nvidia runtimeClassName..." + # Use yq when available (cleanest, structure-aware); fall back to kubectl + # patch --local on stdout — both produce the same patched manifest on + # stdout which we then `apply -f -`. + local patched + patched=$(mktemp) + if command -v yq >/dev/null 2>&1; then + yq eval ' + (select(.kind == "DaemonSet") | .spec.template.spec.nodeSelector."splunk.ai/workload-type") = "gpu" + | (select(.kind == "DaemonSet") | .spec.template.spec.runtimeClassName) = "nvidia" + ' "${manifest}" > "${patched}" + else + # Fallback: use kubectl patch --local. This requires reading from the + # manifest and piping through patch; multi-document files complicate + # things, but this upstream manifest is a single DaemonSet. 
+ kubectl patch -f "${manifest}" --local -o yaml \ + --type='json' \ + -p='[ + {"op": "add", "path": "/spec/template/spec/nodeSelector", "value": {"splunk.ai/workload-type": "gpu"}}, + {"op": "add", "path": "/spec/template/spec/runtimeClassName", "value": "nvidia"} + ]' > "${patched}" + fi - # Constrain the device plugin to GPU-labeled nodes only — non-GPU nodes - # don't have the NVIDIA drivers and the plugin pods would fail there. - log " Patching device plugin: GPU nodeSelector..." - kubectl patch daemonset nvidia-device-plugin-daemonset -n kube-system --type='json' \ - -p='[ - {"op": "add", "path": "/spec/template/spec/nodeSelector", "value": {"splunk.ai/workload-type": "gpu"}} - ]' 2>/dev/null || true + if ! kubectl apply -n kube-system -f "${patched}"; then + rm -f "${manifest}" "${patched}" + err "Failed to apply patched NVIDIA device-plugin manifest. Check kubectl connectivity." + fi + rm -f "${manifest}" "${patched}" + # Wait for the DS to roll out so the caller observes GPU capacity as + # soon as possible. Non-fatal: we verify capacity explicitly upstream + # via the strict-verification loop. kubectl -n kube-system rollout status ds/nvidia-device-plugin-daemonset --timeout=3m || true log "NVIDIA device plugin installed successfully" @@ -3186,11 +3851,18 @@ install_ai_platform_stack() { install_nvidia_host_drivers > "${phase1_logdir}/nvidia-drivers.log" 2>&1 & phase1_pids+=($!); phase1_names+=("nvidia-drivers") + # Track which phase-1 tasks failed. nvidia-drivers failures are fatal: + # without them the device-plugin crash-loops and the whole GPU stack + # silently fails. Every other phase-1 task is merely warned on failure. 
+ local phase1_fatal_failures=0 for i in "${!phase1_pids[@]}"; do if wait "${phase1_pids[$i]}"; then log " ✓ ${phase1_names[$i]} completed" else warn " ✗ ${phase1_names[$i]} had issues" + if [[ "${phase1_names[$i]}" == "nvidia-drivers" ]]; then + phase1_fatal_failures=$((phase1_fatal_failures + 1)) + fi fi while IFS= read -r line; do log " [${phase1_names[$i]}] ${line}" @@ -3198,6 +3870,12 @@ install_ai_platform_stack() { done rm -rf "${phase1_logdir}" + if [[ ${phase1_fatal_failures} -gt 0 ]]; then + err "NVIDIA driver install failed on at least one GPU node; aborting install. + Device-plugin pods would otherwise crash-loop with NVML: ERROR_LIBRARY_NOT_FOUND + and model pods would stay Pending forever. Fix the errors above and re-run." + fi + ensure_s3compat_credentials # --- Phase 2: cert-manager-dependent components (parallel) --- @@ -3593,6 +4271,8 @@ main_install() { all_node_ips+=("${WORKER_IPS[@]}") fi prepare_nodes_for_k0s "${all_node_ips[@]}" + fix_kube_proxy_mode_if_needed "${controller_ip}" + bounce_calico_if_needed # Ensure all expected workers are joined if [[ -n "${EXISTING_WORKER_IPS}" ]]; then @@ -3660,6 +4340,8 @@ main_install() { all_node_ips2+=("${WORKER_IPS[@]}") fi prepare_nodes_for_k0s "${all_node_ips2[@]}" + fix_kube_proxy_mode_if_needed "${controller_ip}" + bounce_calico_if_needed # Ensure all expected workers are joined if [[ -n "${EXISTING_WORKER_IPS}" ]]; then From 1ee8a6b5eca488f61727976e4b6104088cddcf37 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Sat, 25 Apr 2026 00:05:49 +0530 Subject: [PATCH 43/55] fix: reverted support for rhel 10 (untested) --- .../k0s-cluster-config-h100.yaml | 8 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 161 +----------------- 2 files changed, 8 insertions(+), 161 deletions(-) diff --git a/tools/cluster_setup/k0s-cluster-config-h100.yaml b/tools/cluster_setup/k0s-cluster-config-h100.yaml index ff60343..b91c08e 100644 --- a/tools/cluster_setup/k0s-cluster-config-h100.yaml +++ 
b/tools/cluster_setup/k0s-cluster-config-h100.yaml @@ -25,11 +25,11 @@ nodes: existingIPs: controllers: - - 18.191.66.248 # CHANGE THIS: Your controller server IP + - 3.149.241.167 # CHANGE THIS: Your controller server IP workers: - - 18.220.129.123 # CHANGE THIS: CPU worker 1 - - 3.15.10.138 # CHANGE THIS: GPU worker 1 - - 18.219.68.93 # CHANGE THIS: GPU worker 2 + - 18.221.244.241 # CHANGE THIS: CPU worker 1 + - 18.191.19.128 # CHANGE THIS: GPU worker 1 + - 3.137.209.219 # CHANGE THIS: GPU worker 2 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index d797e4e..1f45cfc 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -782,55 +782,15 @@ prepare_nodes_for_k0s() { sudo systemctl disable firewalld fi - # Load kernel modules required by Calico and kube-proxy. - # On RHEL 10 the legacy xtables extension modules (xt_conntrack, xt_comment, - # br_netfilter) are not built into the kernel at all, so they cannot be - # modprobed. However nf_conntrack (the core conntrack module) is built as a - # module (=m) and must be loaded — Calico's VXLAN dataplane and kube-proxy - # nftables mode both require it. - # overlay is needed by containerd for container overlay filesystems. - for mod in nf_conntrack overlay; do + # Load kernel modules required by Calico and kube-proxy + for mod in br_netfilter overlay nf_conntrack; do if ! lsmod | grep -q "^${mod} "; then sudo modprobe "${mod}" 2>/dev/null || echo "WARN: could not load kernel module ${mod}" fi done # Persist across reboots sudo mkdir -p /etc/modules-load.d - printf 'nf_conntrack\noverlay\n' | sudo tee /etc/modules-load.d/k0s.conf >/dev/null - - # Ensure iptables is available (RHEL 10+ ships only nftables). 
- # Do NOT use 'command -v iptables' or 'iptables --version' as the guard: - # on RHEL 10 both can return exit 0 even when no binary exists (shell PATH - # edge case). Use rpm -q as the ground truth on RPM-based systems; fall - # back to testing whether the binary actually produces output. - _iptables_ok=false - if rpm -q iptables-nft >/dev/null 2>&1 || rpm -q iptables >/dev/null 2>&1; then - _iptables_ok=true - elif [ -n "$(iptables --version 2>/dev/null)" ]; then - _iptables_ok=true - fi - if ! $_iptables_ok; then - if command -v dnf >/dev/null 2>&1; then - echo 'Installing iptables-nft...' - if sudo dnf install -y iptables-nft; then - echo 'iptables-nft installed successfully' - else - echo 'ERROR: dnf install iptables-nft failed — kube-proxy will fail to program ClusterIP NAT rules. Ensure AppStream repo is enabled.' >&2 - exit 1 - fi - elif command -v apt-get >/dev/null 2>&1; then - echo 'Installing iptables...' - if sudo apt-get install -y iptables; then - echo 'iptables installed successfully' - else - echo 'ERROR: apt-get install iptables failed — cannot proceed.' >&2 - exit 1 - fi - else - echo 'ERROR: No package manager found to install iptables — cannot proceed.' >&2 - exit 1 - fi - fi + printf 'br_netfilter\noverlay\nnf_conntrack\n' | sudo tee /etc/modules-load.d/k0s.conf >/dev/null # Ensure python3 + PyYAML are available (used for k0s config generation) if ! python3 -c 'import yaml' 2>/dev/null; then @@ -855,110 +815,6 @@ REMOTE_SCRIPT done } -# ====== FIX KUBE-PROXY MODE (iptables → nftables) IF NEEDED ====== -# RHEL 10 kernel 6.12.0-124.38+ removed the legacy xtables extension modules -# (xt_conntrack, xt_comment, xt_nat, nft_compat). kube-proxy in "iptables" -# mode cannot program NAT rules without them, even with iptables-nft installed. -# Older RHEL 10 kernels (6.12.0-124.21) still ship xt_* modules so iptables -# mode works fine there — we only patch when the modules are truly absent. 
-# -# k0s manages kube-proxy via a stack manifest at -# /var/lib/k0s/manifests/kubeproxy/kube-proxy.yaml -# and continuously reconciles the ConfigMap from it. Patching the ConfigMap -# alone is overwritten within seconds. We must patch the on-disk manifest -# FIRST, then the ConfigMap, then bounce the pods. -fix_kube_proxy_mode_if_needed() { - local controller_ip="$1" - - # Check current mode from ConfigMap - local current_mode - current_mode=$(kubectl get cm kube-proxy -n kube-system \ - -o jsonpath='{.data.config\.conf}' 2>/dev/null \ - | grep '^mode:' | awk '{print $2}' | tr -d '"') - - if [[ "${current_mode}" != "iptables" ]]; then - log "kube-proxy mode is '${current_mode}' — no patch needed" - return 0 - fi - - # Check if the controller kernel actually has xt_conntrack - local has_xt - has_xt=$(ssh_exec "${controller_ip}" \ - "modprobe -n xt_conntrack 2>/dev/null && echo yes || echo no") || has_xt="no" - has_xt=$(echo "${has_xt}" | tr -d '[:space:]') - - if [[ "${has_xt}" == "yes" ]]; then - log "kube-proxy mode is iptables and kernel has xt_conntrack — no patch needed" - return 0 - fi - - log "kube-proxy is in iptables mode but kernel lacks xt_conntrack — patching to nftables..." - - # 1. Patch the on-disk manifest so k0s reconciliation preserves the change - ssh_exec "${controller_ip}" \ - "sudo sed -i 's/mode: \"iptables\"/mode: \"nftables\"/' /var/lib/k0s/manifests/kubeproxy/kube-proxy.yaml" \ - || warn " Could not patch on-disk kube-proxy manifest" - - # 2. Patch the ConfigMap for immediate effect - kubectl get cm kube-proxy -n kube-system -o json 2>/dev/null \ - | python3 -c " -import json, sys -cm = json.load(sys.stdin) -cm['data']['config.conf'] = cm['data']['config.conf'].replace('mode: \"iptables\"', 'mode: \"nftables\"') -print(json.dumps(cm)) -" | kubectl apply -f - 2>/dev/null - - # 3. Force-kill kube-proxy pods so replacements start with the new ConfigMap. 
- # --wait=false is not enough: the old pods keep running with the stale - # iptables-mode config until graceful shutdown completes, and the new pods - # may mount the ConfigMap volume before k8s propagates the update. - log " Force-restarting kube-proxy pods to pick up nftables mode..." - kubectl delete pods -n kube-system -l k8s-app=kube-proxy \ - --force --grace-period=0 2>/dev/null || true - sleep 5 # give DaemonSet controller time to schedule replacements - - local deadline=$(( $(date +%s) + 90 )) - while true; do - local not_ready - not_ready=$(kubectl get pods -n kube-system -l k8s-app=kube-proxy \ - --no-headers 2>/dev/null | grep -cv '1/1.*Running') || not_ready=99 - [[ "${not_ready}" -eq 0 ]] && { log " ✓ kube-proxy pods Running in nftables mode"; break; } - [[ $(date +%s) -ge ${deadline} ]] && { warn " Timed out waiting for kube-proxy pods"; break; } - sleep 3 - done -} - -bounce_calico_if_needed() { - log "Checking if calico-node pods need a restart (install-cni CrashLoop)..." - local crashing - crashing=$(kubectl get pods -n kube-system -l k8s-app=calico-node \ - --no-headers 2>/dev/null | grep -cE 'Init:Error|CrashLoopBackOff|Init:CrashLoopBackOff') || crashing=0 - if [[ "${crashing}" -gt 0 ]]; then - log " Found ${crashing} crashing calico-node pod(s) — deleting so they restart with working kube-proxy..." - kubectl delete pods -n kube-system -l k8s-app=calico-node --wait=false 2>/dev/null || true - - local deadline=$(( $(date +%s) + 120 )) - log " Waiting up to 120s for calico-node pods to become Running..." 
- while true; do - local not_running - not_running=$(kubectl get pods -n kube-system -l k8s-app=calico-node \ - --no-headers 2>/dev/null | grep -cv 'Running') || not_running=99 - if [[ "${not_running}" -eq 0 ]]; then - log " ✓ All calico-node pods are Running" - break - fi - if [[ $(date +%s) -ge ${deadline} ]]; then - warn " Timed out waiting for calico-node pods — current state:" - kubectl get pods -n kube-system -l k8s-app=calico-node 2>/dev/null || true - break - fi - sleep 5 - done - else - log " calico-node pods look healthy, no restart needed" - fi -} - # ====== MOUNT NVMe INSTANCE STORE FOR EPHEMERAL STORAGE ====== # GPU instance types (g5, g6, p4, p5) typically come with large NVMe instance # store drives but tiny 10 GB EBS root volumes. Kubernetes counts ephemeral @@ -1325,11 +1181,6 @@ PYSCRIPT" log "k0s cluster installed successfully!" kubectl get nodes - # On newer RHEL 10 kernels (6.12.0-124.38+) the xt_conntrack module is gone, - # so kube-proxy in iptables mode can't program NAT rules. Detect and fix. 
- fix_kube_proxy_mode_if_needed "${controller_ip}" - bounce_calico_if_needed - # Label nodes for proper workload scheduling label_nodes } @@ -4271,8 +4122,6 @@ main_install() { all_node_ips+=("${WORKER_IPS[@]}") fi prepare_nodes_for_k0s "${all_node_ips[@]}" - fix_kube_proxy_mode_if_needed "${controller_ip}" - bounce_calico_if_needed # Ensure all expected workers are joined if [[ -n "${EXISTING_WORKER_IPS}" ]]; then @@ -4340,8 +4189,6 @@ main_install() { all_node_ips2+=("${WORKER_IPS[@]}") fi prepare_nodes_for_k0s "${all_node_ips2[@]}" - fix_kube_proxy_mode_if_needed "${controller_ip}" - bounce_calico_if_needed # Ensure all expected workers are joined if [[ -n "${EXISTING_WORKER_IPS}" ]]; then @@ -4994,7 +4841,7 @@ join_workers() { # Thorough cleanup before rejoining (handles stale configurations) cleanup_worker_k0s "${worker_ip}" - # RHEL/Fedora compatibility (firewalld, iptables-nft, python3-pyyaml, k0s binary) + # RHEL/Fedora compatibility (firewalld, kernel modules, python3-pyyaml, k0s binary) prepare_nodes_for_k0s "${worker_ip}" # Install worker with fresh token From 9cf5cc26789953f36a4058b8583bada0e4b75ab9 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 27 Apr 2026 19:38:26 +0530 Subject: [PATCH 44/55] fix: vulnerability issues CVE-2026-29181 and CVE-2026-39883 --- go.mod | 44 ++++++++++++++--------------- go.sum | 88 +++++++++++++++++++++++++++++----------------------------- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/go.mod b/go.mod index dce61a0..8af8e88 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/splunk/splunk-ai-operator -go 1.24.0 +go 1.25.0 godebug default=go1.23 @@ -41,7 +41,7 @@ require ( cloud.google.com/go/monitoring v1.24.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect - github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 // indirect + 
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect @@ -75,7 +75,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.14.2 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -101,31 +101,31 @@ require ( go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.40.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect - go.opentelemetry.io/otel/metric v1.40.0 // indirect - go.opentelemetry.io/otel/sdk v1.40.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.40.0 // indirect - go.opentelemetry.io/otel/trace v1.40.0 // indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk v1.43.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 
// indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.47.0 // indirect + golang.org/x/crypto v0.49.0 // indirect golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3 // indirect - golang.org/x/net v0.49.0 // indirect - golang.org/x/oauth2 v0.34.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.40.0 // indirect - golang.org/x/term v0.39.0 // indirect - golang.org/x/text v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.35.0 // indirect golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.40.0 // indirect + golang.org/x/tools v0.42.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/grpc v1.79.3 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index 7021646..1f95e16 100644 --- a/go.sum +++ b/go.sum @@ -36,8 +36,8 @@ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 
h1:oygO0locgZJe7PpYPXT5A29ZkwJaPqcva7BVeemZOZs= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 h1:sBEjpZlNHzK1voKq9695PJSX2o5NEXl7/OL3coiIY0c= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 h1:DHa2U07rk8syqvCge0QIGMCE1WxGj9njT44GH7zNJLQ= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 h1:fYE9p3esPxA/C0rQ0AHhP0drtPXDRhaWiwg1DPqO7IU= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0/go.mod h1:BnBReJLvVYx2CS/UHOgVz2BXKXD9wsQPxZug20nZhd0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.51.0 h1:OqVGm6Ei3x5+yZmSJG1Mh2NwHvpVmZ08CB5qJhT9Nuk= @@ -134,8 +134,8 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= github.com/googleapis/gax-go/v2 v2.14.2 h1:eBLnkZ9635krYIPD+ag1USrOAI0Nr0QYF3+/3GqO0k0= github.com/googleapis/gax-go/v2 v2.14.2/go.mod h1:ON64QhlJkhVtSqp4v1uaK92VyZ2gmvDQsweuyLV+8+w= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 h1:X+2YciYSxvMQK0UZ7sg45ZVabVZBeBuvMkmuI2V3Fak= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7/go.mod h1:lW34nIZuQ8UDPdkon5fmfp2l3+ZkQ2me/+oecHYLOII= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/inconshreveable/mousetrap v1.1.0 
h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 h1:liMMTbpW34dhU4az1GN0pTPADwNmvoRSeoZ6PItiqnY= @@ -231,24 +231,24 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.6 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= -go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 h1:QKdN8ly8zEMrByybbQgv8cWBcdAarwmIPZ6FThrWXJs= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0/go.mod h1:bTdK1nhqF76qiPoCCdyFIV+N/sRHYXYCTQc+3VCi3MI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod 
h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0 h1:rixTyDGXFxRy1xzhKrotaHy3/KXdPhlWARrCgK+eqUY= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0/go.mod h1:dowW6UsM9MKbJq5JTz2AMVp3/5iW5I/TStsk8S+CfHw= -go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= -go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= -go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= -go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= -go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= -go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= -go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= -go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 
h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -258,8 +258,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= -golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3 h1:qNgPs5exUA+G0C96DrPwNrvLSj7GT/9D+3WMWUcUg34= golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -268,53 +268,53 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= 
-golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= -golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= +golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= 
+golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= -golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= google.golang.org/api v0.235.0 h1:C3MkpQSRxS1Jy6AkzTGKKrpSCOd2WOGrezZ+icKSkKo= google.golang.org/api v0.235.0/go.mod h1:QpeJkemzkFKe5VCE/PMv7GsUfn9ZF+u+q1Q7w6ckxTg= google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 h1:1tXaIXCracvtsRxSBsYDiSBN0cuJvM7QYW+MrpIRY78= google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2/go.mod h1:49MsLSx0oWMOZqcpB3uL8ZOkAh1+TndpJ8ONoCBWiZk= -google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1:merA0rdPeUV3YIIfHHcH4qBkiQAc1nfCKSI7lB4cV2M= -google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 h1:H86B94AW+VfJWDqFeEbBPhEtHzJwJfTbgE2lZa54ZAQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= -google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= -google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc 
v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 7a24a4c2f87f402763408bfc69b377f21aeb2798 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Mon, 27 Apr 2026 20:11:49 +0530 Subject: [PATCH 45/55] fix: downgrade go version for fixing unit cases --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 8af8e88..fb2981f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/splunk/splunk-ai-operator -go 1.25.0 +go 1.24.0 godebug default=go1.23 From cc874b423dcefb18873bb973ab4fcbc50f111f56 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 11:20:43 +0530 Subject: [PATCH 46/55] fix: upgrade go version due to vuln issue --- .env | 2 +- Dockerfile | 3 ++- Dockerfile.debug | 3 ++- Makefile | 9 ++++++--- go.mod | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.env b/.env index 69a2de5..59af144 100644 --- a/.env +++ b/.env @@ -1,6 +1,6 @@ OPERATOR_SDK_VERSION=v1.31.0 REVIEWERS=vivekr-splunk,rlieberman-splunk,patrykw-splunk,Igor-splunk,kasiakoziol -GO_VERSION=1.24.0 +GO_VERSION=1.25.0 AWSCLI_URL=https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.8.6.zip KUBECTL_VERSION=v1.29.1 AZ_CLI_VERSION=2.30.0 diff --git a/Dockerfile b/Dockerfile index 67224af..ae259de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # Build the manager binary -FROM docker.io/golang:1.24 AS builder +ARG GO_VERSION=1.25.0 +FROM docker.io/golang:${GO_VERSION} AS builder ARG TARGETOS ARG TARGETARCH diff --git 
a/Dockerfile.debug b/Dockerfile.debug index c5fac22..e9cdffd 100644 --- a/Dockerfile.debug +++ b/Dockerfile.debug @@ -1,5 +1,6 @@ # Build the manager binary with debug symbols -FROM docker.io/golang:1.24 AS builder +ARG GO_VERSION=1.25.0 +FROM docker.io/golang:${GO_VERSION} AS builder ARG TARGETOS ARG TARGETARCH diff --git a/Makefile b/Makefile index d6a7f7b..7c69fe1 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,9 @@ endif # tools. (i.e. podman) CONTAINER_TOOL ?= docker +# GO_VERSION is read from .env if not already set, and passed as a build-arg to docker builds. +GO_VERSION ?= $(shell grep '^GO_VERSION=' .env | cut -d= -f2) + # Setting SHELL to bash allows bash commands to be executed by recipes. # Options are set to exit when a recipe line exits non-zero or a piped command fails. SHELL = /usr/bin/env bash -o pipefail @@ -215,11 +218,11 @@ run: manifests generate fmt vet ## Run a controller from your host. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: ## Build docker image with the manager. - $(CONTAINER_TOOL) build -t ${IMG} . + $(CONTAINER_TOOL) build --build-arg GO_VERSION=$(GO_VERSION) -t ${IMG} . .PHONY: docker-build-amd64 docker-build-amd64: ## Build docker image for linux/amd64 (e.g. for x86_64 servers/EC2). - $(CONTAINER_TOOL) build --platform=linux/amd64 -t ${IMG} . + $(CONTAINER_TOOL) build --platform=linux/amd64 --build-arg GO_VERSION=$(GO_VERSION) -t ${IMG} . .PHONY: docker-push docker-push: ## Push docker image with the manager. 
@@ -238,7 +241,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross - $(CONTAINER_TOOL) buildx create --name splunk-ai-operator-builder $(CONTAINER_TOOL) buildx use splunk-ai-operator-builder - - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . + - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --build-arg GO_VERSION=$(GO_VERSION) --tag ${IMG} -f Dockerfile.cross . - $(CONTAINER_TOOL) buildx rm splunk-ai-operator-builder rm Dockerfile.cross diff --git a/go.mod b/go.mod index fb2981f..8af8e88 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/splunk/splunk-ai-operator -go 1.24.0 +go 1.25.0 godebug default=go1.23 From 880f68b8f0a04d71cfc9ba47db6430d76a9f4fcf Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 15:52:22 +0530 Subject: [PATCH 47/55] feature: including saia deployments helm configs --- pkg/ai/features/saia/impl.go | 44 ++++++++++++++++----- pkg/ai/features/saia/impl_test.go | 24 +++++++---- tools/cluster_setup/artifacts.yaml | 12 +++--- tools/cluster_setup/k0s-cluster-config.yaml | 10 ++--- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index a33b564..c899bcf 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -175,14 +175,16 @@ func (r *SaiaReconciler) validateAIService( // Default resources — SAIA API needs headroom beyond 2Gi or the kubelet OOMKills during startup. 
if ai.Spec.Resources.Requests == nil { ai.Spec.Resources.Requests = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("2Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), } } if ai.Spec.Resources.Limits == nil { ai.Spec.Resources.Limits = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), } } if ai.Spec.TaskVolume.Path == "" { @@ -1125,7 +1127,18 @@ func (r *SaiaReconciler) reconcileSAIAv2Deployment( v2Resources := ai.Spec.V2.Resources if v2Resources.Requests == nil { - v2Resources = ai.Spec.Resources + v2Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), + }, + } } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error { @@ -1213,12 +1226,12 @@ func (r *SaiaReconciler) reconcileSAIAv2Worker( // IngestionWorker.run() when the queue is empty OR the tenant lock is busy. // The heartbeat is written only at the top of process_next(), so this sleep // directly controls heartbeat cadence. The liveness probe rejects heartbeats - // older than 120s, so we MUST keep this well under that threshold — 10s - // matches the saia-v2 default (see Settings.run_tasks_delay_s). 
Do NOT + // older than 1200s, so we MUST keep this well under that threshold — 600s + // matches the saia-v2 helm default (see Settings.run_tasks_delay_s). Do NOT // conflate with the v1 worker APScheduler cron (which uses 600s for weekly // jobs); v2 reuses the same env name for a different purpose. env = append(env, - corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "10"}, + corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "600"}, corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"}, corev1.EnvVar{Name: "WORKER_HEARTBEAT_PATH", Value: "/tmp/ingestion_worker_heartbeat"}, ) @@ -1241,7 +1254,18 @@ func (r *SaiaReconciler) reconcileSAIAv2Worker( v2WorkerResources := ai.Spec.V2Worker.Resources if v2WorkerResources.Requests == nil { - v2WorkerResources = ai.Spec.Resources + v2WorkerResources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("25Gi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("25Gi"), + }, + } } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error { @@ -1284,7 +1308,7 @@ func (r *SaiaReconciler) reconcileSAIAv2Worker( "python3", "-c", "import os,sys,time\n" + "p=os.environ.get('WORKER_HEARTBEAT_PATH','/tmp/ingestion_worker_heartbeat')\n" + - "sys.exit(0 if os.path.exists(p) and (time.time()-float(open(p).read().strip()))<120 else 1)", + "sys.exit(0 if os.path.exists(p) and (time.time()-float(open(p).read().strip()))<1200 else 1)", }, }, }, diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 458b7df..94be4c8 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -116,8 +116,12 @@ func Test_validateAIService_defaults(t *testing.T) { err := 
r.validateAIService(context.Background(), ai) assert.NoError(t, err) assert.Equal(t, int32(1), ai.Spec.Replicas) - assert.NotNil(t, ai.Spec.Resources.Requests) - assert.NotNil(t, ai.Spec.Resources.Limits) + assert.Equal(t, resource.MustParse("2"), ai.Spec.Resources.Requests[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("4Gi"), ai.Spec.Resources.Requests[corev1.ResourceMemory]) + assert.Equal(t, resource.MustParse("10Gi"), ai.Spec.Resources.Requests[corev1.ResourceEphemeralStorage]) + assert.Equal(t, resource.MustParse("2"), ai.Spec.Resources.Limits[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("4Gi"), ai.Spec.Resources.Limits[corev1.ResourceMemory]) + assert.Equal(t, resource.MustParse("10Gi"), ai.Spec.Resources.Limits[corev1.ResourceEphemeralStorage]) // AIPlatformUrl is built as "://..svc.:8000". // When AIPlatformScheme is unset, the operator defaults to "http" (see // validateAIService). This makes the URL usable directly by httpx/openai @@ -220,8 +224,14 @@ func newTestAIService() *aiv1.AIService { V2Worker: aiv1.SAIAWorkerConfig{Replicas: 1}, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ - corev1.ResourceCPU: *mustParseQuantity("500m"), - corev1.ResourceMemory: *mustParseQuantity("2Gi"), + corev1.ResourceCPU: *mustParseQuantity("2"), + corev1.ResourceMemory: *mustParseQuantity("4Gi"), + corev1.ResourceEphemeralStorage: *mustParseQuantity("10Gi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *mustParseQuantity("2"), + corev1.ResourceMemory: *mustParseQuantity("4Gi"), + corev1.ResourceEphemeralStorage: *mustParseQuantity("10Gi"), }, }, }, @@ -393,9 +403,9 @@ func Test_reconcileSAIAv2Worker(t *testing.T) { envMap := envToMap(container.Env) // RUN_TASKS_DELAY_S controls the v2 worker's poll sleep (saia-v2 // IngestionWorker.run). The value MUST stay well under the liveness probe - // threshold (120s) because the heartbeat file is only refreshed at the top - // of each iteration. 
10s matches saia-v2's own Settings default. - assert.Equal(t, "10", envMap["RUN_TASKS_DELAY_S"]) + // threshold (1200s) because the heartbeat file is only refreshed at the top + // of each iteration. 600s matches saia-v2's helm default. + assert.Equal(t, "600", envMap["RUN_TASKS_DELAY_S"]) // Heartbeat path must match saia-v2's default (app/core/config.py). assert.Equal(t, "/tmp/ingestion_worker_heartbeat", envMap["WORKER_HEARTBEAT_PATH"]) assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 69c3664..cca7c19 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5682,17 +5682,17 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-008 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-008 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-009 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-002 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-009 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-009 - 
name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT @@ -5705,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.25 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 124373f..4c4967c 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -115,14 +115,14 @@ images: # headImage: "ml-platform/ray/ray-head:build-010" # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-008" # tony redis changes + fixes + headImage: "ml-platform/ray/ray-head:build-v2-010" # tony redis changes + fixes # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # tony redis changes + fixes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" # tony redis changes + fixes weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" @@ -132,17 +132,17 @@ images: # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-009" #saia v2 + tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-010" 
#saia v2 + tony changes # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-010" #saia v2 + tony changes # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" #saia v2 + tony changes + personalization fix + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-010" #saia v2 + tony changes + personalization fix fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" From 86cf822a1c78b0202b1ee3a68339738652e99f40 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 17:27:06 +0530 Subject: [PATCH 48/55] fix: removal of aws specific usages --- tools/cluster_setup/k0s_cluster_with_stack.sh | 691 ++---------------- 1 file changed, 68 insertions(+), 623 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 1f45cfc..ae46ebc 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4,10 +4,8 @@ set -euo pipefail # ============================================================================= # k0s Cluster Setup Script for Splunk AI Platform # ============================================================================= -# Mirrors eks_cluster_with_stack.sh functionality but for k0s clusters -# Supports: -# 1. On-prem/baremetal: Use customer-provided IP addresses -# 2. 
AWS EC2: Automatically create EC2 instances for testing +# Deploys a k0s cluster on customer-provided (on-prem / baremetal) nodes. +# Requires existingIPs in the config YAML (controller + worker IPs). # ============================================================================= # --- AWS credentials handling --- @@ -110,19 +108,15 @@ load_config() { SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "ubuntu") SSH_KEY_PATH=$(yq eval '.cluster.sshKeyPath' "${CONFIG_FILE}" 2>/dev/null || echo "") - # EC2 configuration (if creating instances) - VPC_ID=$(yq eval '.ec2.vpcId' "${CONFIG_FILE}" 2>/dev/null || echo "") - SUBNET_ID=$(yq eval '.ec2.subnetId' "${CONFIG_FILE}" 2>/dev/null || echo "") - KEY_NAME=$(yq eval '.ec2.keyName' "${CONFIG_FILE}" 2>/dev/null || echo "") + # Validate existingIPs are provided (mandatory for on-prem) + if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then + err "nodes.existingIPs.controllers must be set in config YAML — this script requires pre-provisioned nodes" + fi CONTROLLER_COUNT=$(yq eval '.nodes.controllers' "${CONFIG_FILE}" 2>/dev/null || echo "1") CPU_WORKER_COUNT=$(yq eval '.nodes.cpuWorkers' "${CONFIG_FILE}" 2>/dev/null || echo "2") GPU_WORKER_COUNT=$(yq eval '.nodes.gpuWorkers' "${CONFIG_FILE}" 2>/dev/null || echo "1") - CONTROLLER_INSTANCE_TYPE=$(yq eval '.instanceTypes.controller' "${CONFIG_FILE}" 2>/dev/null || echo "t3.xlarge") - CPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.cpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "m5.4xlarge") - GPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.gpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "g5.2xlarge") - # Storage configuration STORAGE_CLASS=$(yq eval '.storage.storageClass // "local-path"' "${CONFIG_FILE}" 2>/dev/null || echo "local-path") VECTORDB_SIZE=$(yq eval '.storage.vectorDbSize // "50Gi"' "${CONFIG_FILE}" 2>/dev/null || echo "50Gi") @@ -179,11 +173,6 @@ load_config() { ECR_ACCOUNT=$(yq eval '.ecr.account' "${CONFIG_FILE}" 2>/dev/null || echo "") 
ECR_REGION=$(yq eval '.ecr.region // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") - # Get AWS account if using EC2 - if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then - ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "") - fi - # Auto-detect ECR account from AWS if not specified if [[ -z "${ECR_ACCOUNT}" ]] && aws sts get-caller-identity &>/dev/null; then ECR_ACCOUNT=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "") @@ -432,22 +421,10 @@ preflight_checks() { fi pf_header "Infrastructure mode" - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - pf_ok "Using existing infrastructure (on-prem/baremetal)" - pf_ok "Controller IPs: ${EXISTING_CONTROLLER_IPS}" - pf_ok "Worker IPs: ${EXISTING_WORKER_IPS}" - [[ -n "${SSH_KEY_PATH}" && -f "${SSH_KEY_PATH}" ]] && pf_ok "SSH key: ${SSH_KEY_PATH}" || pf_fail "SSH key not found: ${SSH_KEY_PATH}" - else - pf_ok "Creating EC2 instances" - if command -v aws >/dev/null 2>&1; then - pf_ok "AWS CLI found" - [[ -n "${ACCOUNT_ID}" ]] && pf_ok "AWS Account: ${ACCOUNT_ID}" || pf_fail "Cannot get AWS account ID" - [[ -n "${VPC_ID}" ]] && pf_ok "VPC ID: ${VPC_ID}" || pf_fail "VPC ID not set" - [[ -n "${KEY_NAME}" ]] && pf_ok "EC2 Key name: ${KEY_NAME}" || pf_fail "EC2 key name not set" - else - pf_fail "AWS CLI not found - required for EC2 instance creation" - fi - fi + pf_ok "Using existing infrastructure (on-prem/baremetal)" + pf_ok "Controller IPs: ${EXISTING_CONTROLLER_IPS}" + pf_ok "Worker IPs: ${EXISTING_WORKER_IPS}" + [[ -n "${SSH_KEY_PATH}" && -f "${SSH_KEY_PATH}" ]] && pf_ok "SSH key: ${SSH_KEY_PATH}" || pf_fail "SSH key not found: ${SSH_KEY_PATH}" pf_summary } @@ -477,295 +454,6 @@ scp_file() { fi } -# ====== EC2 INSTANCE CREATION ====== -create_security_group() { - log "Creating security group for k0s cluster..." 
- - local sg_name="${CLUSTER_NAME}-k0s-sg" - local sg_id - - sg_id=$(aws ec2 describe-security-groups \ - --region "${REGION}" \ - --filters "Name=group-name,Values=${sg_name}" "Name=vpc-id,Values=${VPC_ID}" \ - --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") - - if [[ "${sg_id}" != "None" && -n "${sg_id}" ]]; then - log "Security group already exists: ${sg_id}" - echo "${sg_id}" - return 0 - fi - - sg_id=$(aws ec2 create-security-group \ - --region "${REGION}" \ - --group-name "${sg_name}" \ - --description "Security group for ${CLUSTER_NAME} k0s cluster" \ - --vpc-id "${VPC_ID}" \ - --query 'GroupId' --output text) - - # Tag the security group - aws ec2 create-tags --region "${REGION}" --resources "${sg_id}" \ - --tags "Key=Cluster,Value=${CLUSTER_NAME}" "Key=ManagedBy,Value=k0s-script" "Key=Name,Value=${sg_name}" - - log "Created security group: ${sg_id}" - - # Add ingress rules (redirect output to avoid pollution) - log "Configuring security group rules (restricted to your IP)..." - - # Detect current public IP address - MY_IP="${ALLOWED_CIDR:-}" - if [[ -z "$MY_IP" ]]; then - log "Auto-detecting your public IP address..." - MY_IP=$(curl -s https://checkip.amazonaws.com || curl -s https://ipinfo.io/ip || curl -s https://api.ipify.org) - if [[ -z "$MY_IP" ]]; then - warn "Could not auto-detect IP. Set ALLOWED_CIDR environment variable." 
- warn "Example: export ALLOWED_CIDR=\"1.2.3.4/32\"" - err "Failed to determine your IP address" - fi - # Add /32 for single IP - MY_IP="${MY_IP}/32" - log " Detected IP: ${MY_IP}" - else - log " Using provided CIDR: ${MY_IP}" - fi - - # === EXTERNAL ACCESS (restricted to your IP) === - # API server - allow ONLY from your IP for kubectl access - aws ec2 authorize-security-group-ingress --region "${REGION}" --group-id "${sg_id}" \ - --protocol tcp --port 6443 --cidr "${MY_IP}" >/dev/null 2>&1 || true - log " ✓ Port 6443 (Kubernetes API): RESTRICTED to ${MY_IP}" - - # SSH - allow ONLY from your IP for management - aws ec2 authorize-security-group-ingress --region "${REGION}" --group-id "${sg_id}" \ - --protocol tcp --port 22 --cidr "${MY_IP}" >/dev/null 2>&1 || true - log " ✓ Port 22 (SSH): RESTRICTED to ${MY_IP}" - - # NodePort services - allow ONLY from your IP for accessing deployed services - aws ec2 authorize-security-group-ingress --region "${REGION}" --group-id "${sg_id}" \ - --protocol tcp --port 30000-32767 --cidr "${MY_IP}" >/dev/null 2>&1 || true - log " ✓ Ports 30000-32767 (NodePort): RESTRICTED to ${MY_IP}" - - # Konnectivity agent port - allow ONLY from your IP - aws ec2 authorize-security-group-ingress --region "${REGION}" --group-id "${sg_id}" \ - --protocol tcp --port 8132 --cidr "${MY_IP}" >/dev/null 2>&1 || true - log " ✓ Port 8132 (Konnectivity): RESTRICTED to ${MY_IP}" - - # === INTERNAL CLUSTER COMMUNICATION (within security group only) === - # All internal traffic - etcd (2380), kubelet (10250), CNI, pod networking, etc. 
- aws ec2 authorize-security-group-ingress --region "${REGION}" --group-id "${sg_id}" \ - --protocol -1 --source-group "${sg_id}" >/dev/null 2>&1 || true - log " ✓ All ports: INTERNAL ONLY - for cluster communication via private IPs" - - log "Security group rules configured" - echo "${sg_id}" -} - -find_existing_instances() { - local role="$1" - aws ec2 describe-instances \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:Role,Values=${role}" \ - "Name=instance-state-name,Values=running,pending,stopping,stopped" \ - --query 'Reservations[].Instances[].InstanceId' \ - --output text -} - -create_ec2_instances() { - log "Creating EC2 instances for k0s cluster..." - - # Check for existing instances - local existing_controllers existing_cpu_workers existing_gpu_workers - existing_controllers=$(find_existing_instances "controller") - existing_cpu_workers=$(find_existing_instances "cpu-worker") - existing_gpu_workers=$(find_existing_instances "gpu-worker") - - local existing_controller_count=$(echo "${existing_controllers}" | wc -w) - local existing_cpu_worker_count=$(echo "${existing_cpu_workers}" | wc -w) - local existing_gpu_worker_count=$(echo "${existing_gpu_workers}" | wc -w) - - log "Found existing instances: ${existing_controller_count} controllers, ${existing_cpu_worker_count} CPU workers, ${existing_gpu_worker_count} GPU workers" - - local sg_id - sg_id=$(create_security_group) - - # Get subnet if not provided - if [[ -z "${SUBNET_ID}" ]]; then - SUBNET_ID=$(aws ec2 describe-subnets \ - --region "${REGION}" \ - --filters "Name=vpc-id,Values=${VPC_ID}" \ - --query 'Subnets[0].SubnetId' --output text) - fi - - [[ -n "${SUBNET_ID}" && "${SUBNET_ID}" != "None" ]] || err "No subnets found in VPC ${VPC_ID}" - - # Get latest Ubuntu 22.04 AMI - local ami_id - ami_id=$(aws ec2 describe-images \ - --region "${REGION}" \ - --owners 099720109477 \ - --filters 
"Name=name,Values=ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*" \ - --query 'sort_by(Images, &CreationDate)[-1].ImageId' --output text) - - log "Using AMI: ${ami_id}" - - # User data for k0s installation - write to temp file - local user_data_file="/tmp/k0s-userdata-$$.sh" - cat > "${user_data_file}" <<'EOF' -#!/bin/bash -set -ex -apt-get update -apt-get install -y curl wget jq -curl -sSLf https://get.k0s.sh | sh -EOF - TMP_FILES+=("${user_data_file}") - - # Create instances (arrays already declared globally at top of script) - CONTROLLER_IPS=() - WORKER_IPS=() - ALL_INSTANCE_IDS=() - - # Add existing instances to tracking arrays - if [[ -n "${existing_controllers}" ]]; then - for id in ${existing_controllers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - if [[ -n "${existing_cpu_workers}" ]]; then - for id in ${existing_cpu_workers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - if [[ -n "${existing_gpu_workers}" ]]; then - for id in ${existing_gpu_workers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - - # Controllers - only create if needed - local controllers_to_create=$((CONTROLLER_COUNT - existing_controller_count)) - if [[ ${controllers_to_create} -gt 0 ]]; then - log "Creating ${controllers_to_create} additional controller(s)..." - for ((i=existing_controller_count; i/dev/null 2>&1 && sudo k0s status >/dev/null 2>&1"; then log "============================================" - log "✓ k0s cluster already running on EC2 instances!" + log "✓ k0s cluster already running on existing nodes!" log "============================================" log "Retrieving kubeconfig from existing k0s cluster..." 
mkdir -p "${HOME}/.kube" @@ -4235,208 +3913,38 @@ main_delete() { log "Starting cleanup of k0s cluster: ${CLUSTER_NAME}" log "============================================" - # For EC2 mode: Just delete AWS resources (instances, security groups) - # Kubernetes resources will be destroyed when instances are terminated - # This is much faster and avoids stuck namespace deletion issues - - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - # On-prem mode: Need to clean Kubernetes resources gracefully - log "On-prem mode detected - performing graceful Kubernetes cleanup..." - - export KUBECONFIG="${HOME}/.kube/k0s-${CLUSTER_NAME}" - - if [[ -f "${KUBECONFIG}" ]] && timeout 10 kubectl cluster-info &>/dev/null; then - log "Deleting Kubernetes resources..." - kubectl delete aiplatform --all -n "${AI_NS}" --timeout=60s || true - kubectl delete namespace "${AI_NS}" --timeout=120s || true - kubectl delete namespace splunk-ai-operator-system --timeout=60s || true - kubectl delete namespace monitoring --timeout=60s || true - fi - # On-prem: Stop k0s on existing infrastructure - IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" - IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" - - log "Stopping k0s on controller nodes..." - for ip in "${CONTROLLER_IPS[@]}"; do - log " Stopping k0s on controller: ${ip}..." - ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" - done - - log "Stopping k0s on worker nodes..." - for ip in "${WORKER_IPS[@]}"; do - log " Stopping k0s on worker: ${ip}..." - ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" - done + # Graceful Kubernetes cleanup, then stop k0s on all nodes + log "Performing graceful Kubernetes cleanup..." - log "k0s stopped on all on-prem nodes" - log "NOTE: Node machines are still running. 
To clean up completely:" - log " - Remove k0s binaries: sudo rm -f /usr/local/bin/k0s" - log " - Clean up data: sudo rm -rf /var/lib/k0s /etc/k0s" - - else - # EC2: Terminate instances - log "============================================" - log "Scanning for resources to delete..." - log "============================================" - - # First, preview what will be deleted - local instance_ids instance_count=0 - instance_ids=$(aws ec2 describe-instances \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - "Name=instance-state-name,Values=running,stopped,stopping" \ - --query 'Reservations[].Instances[].InstanceId' --output text) - - if [[ -n "${instance_ids}" ]]; then - instance_count=$(echo "${instance_ids}" | wc -w) - log "EC2 Instances to terminate: ${instance_count}" - # Show instance details - aws ec2 describe-instances --region "${REGION}" --instance-ids ${instance_ids} \ - --query 'Reservations[].Instances[].[InstanceId,Tags[?Key==`Name`].Value|[0],InstanceType,State.Name]' \ - --output table 2>/dev/null || echo " ${instance_ids}" - else - log "EC2 Instances: None found" - fi - - # Check other resources - local enis=$(aws ec2 describe-network-interfaces --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'NetworkInterfaces[?Status==`available`].NetworkInterfaceId' --output text 2>/dev/null || echo "") - local eni_count=$(echo "${enis}" | wc -w) - log "Network Interfaces: ${eni_count:-0}" - - local sg_id=$(aws ec2 describe-security-groups --region "${REGION}" \ - --filters "Name=group-name,Values=${CLUSTER_NAME}-k0s-sg" "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "") - if [[ -n "${sg_id}" && "${sg_id}" != "None" ]]; then - log "Security Groups: 1 (${sg_id})" - else - log "Security Groups: 0" - fi - - local volumes=$(aws ec2 
describe-volumes --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" "Name=tag:ManagedBy,Values=k0s-script" "Name=status,Values=available" \ - --query 'Volumes[].VolumeId' --output text 2>/dev/null || echo "") - local vol_count=$(echo "${volumes}" | wc -w) - log "EBS Volumes: ${vol_count:-0}" - - log "" - log "All resources are tagged with:" - log " - Cluster: ${CLUSTER_NAME}" - log " - ManagedBy: k0s-script" - log "" - - # Confirmation prompt (skip if AUTO_APPROVE is set) - if [[ "${AUTO_APPROVE:-false}" != "true" ]]; then - warn "This will permanently delete the above AWS resources!" - read -p "Type 'yes' to confirm deletion: " -r - if [[ ! $REPLY =~ ^[Yy]es$ ]]; then - log "Deletion cancelled by user" - exit 0 - fi - fi - - log "" - log "============================================" - log "Starting resource deletion..." - log "============================================" - log "" - - # Now proceed with deletion - if [[ -n "${instance_ids}" ]]; then - log "Terminating ${instance_count} EC2 instance(s)..." - aws ec2 terminate-instances --region "${REGION}" --instance-ids ${instance_ids} + export KUBECONFIG="${HOME}/.kube/k0s-${CLUSTER_NAME}" - log "Waiting for instances to terminate..." - aws ec2 wait instance-terminated --region "${REGION}" --instance-ids ${instance_ids} || warn "Timeout waiting for instances to terminate" + if [[ -f "${KUBECONFIG}" ]] && timeout 10 kubectl cluster-info &>/dev/null; then + log "Deleting Kubernetes resources..." 
+ kubectl delete aiplatform --all -n "${AI_NS}" --timeout=60s || true + kubectl delete namespace "${AI_NS}" --timeout=120s || true + kubectl delete namespace splunk-ai-operator-system --timeout=60s || true + kubectl delete namespace monitoring --timeout=60s || true + fi - log "EC2 instances terminated successfully" - else - log "No EC2 instances to terminate" - fi + IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" - # Clean up network interfaces that may be stuck - log "Checking for orphaned network interfaces..." - local enis eni_count=0 - enis=$(aws ec2 describe-network-interfaces \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'NetworkInterfaces[?Status==`available`].NetworkInterfaceId' --output text 2>/dev/null || echo "") - - if [[ -n "${enis}" ]]; then - eni_count=$(echo "${enis}" | wc -w) - log "Found ${eni_count} orphaned network interface(s), deleting..." - for eni in ${enis}; do - log " Deleting network interface: ${eni}" - aws ec2 delete-network-interface --region "${REGION}" --network-interface-id "${eni}" 2>/dev/null || warn "Could not delete ENI ${eni}" - done - else - log "No orphaned network interfaces found" - fi - - # Delete security group (with retries for ENI detachment) - log "Deleting security group..." - local sg_id sg_deleted=false - sg_id=$(aws ec2 describe-security-groups \ - --region "${REGION}" \ - --filters \ - "Name=group-name,Values=${CLUSTER_NAME}-k0s-sg" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "") - - if [[ -n "${sg_id}" && "${sg_id}" != "None" ]]; then - log "Found security group: ${sg_id}" - - # Try multiple times with increasing wait periods - for attempt in 1 2 3 4 5; do - log " Attempt ${attempt}/5 to delete security group..." 
- if aws ec2 delete-security-group --region "${REGION}" --group-id "${sg_id}" 2>/dev/null; then - log "Security group deleted successfully" - sg_deleted=true - break - else - if [[ ${attempt} -lt 5 ]]; then - local wait_time=$((attempt * 15)) - log " Security group still has dependencies, waiting ${wait_time}s for ENIs to detach..." - sleep ${wait_time} - fi - fi - done + log "Stopping k0s on controller nodes..." + for ip in "${CONTROLLER_IPS[@]}"; do + log " Stopping k0s on controller: ${ip}..." + ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" + done - if [[ "${sg_deleted}" == "false" ]]; then - warn "Could not delete security group after 5 attempts (may have dependencies)" - warn "AWS will auto-clean it when dependencies are removed" - fi - else - log "Security group not found or already deleted" - fi + log "Stopping k0s on worker nodes..." + for ip in "${WORKER_IPS[@]}"; do + log " Stopping k0s on worker: ${ip}..." + ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" + done - # Delete any EBS volumes that were created - log "Checking for orphaned EBS volumes..." - local volumes vol_count=0 - volumes=$(aws ec2 describe-volumes \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - "Name=status,Values=available" \ - --query 'Volumes[].VolumeId' --output text) - - if [[ -n "${volumes}" ]]; then - vol_count=$(echo "${volumes}" | wc -w) - log "Found ${vol_count} orphaned EBS volume(s), deleting..." - for vol in ${volumes}; do - log " Deleting volume: ${vol}" - aws ec2 delete-volume --region "${REGION}" --volume-id "${vol}" && log " Volume ${vol} deleted" || warn " Could not delete volume ${vol}" - done - else - log "No orphaned EBS volumes found" - fi - fi + log "k0s stopped on all nodes" + log "NOTE: Node machines are still running. 
To clean up completely:" + log " - Remove k0s binaries: sudo rm -f /usr/local/bin/k0s" + log " - Clean up data: sudo rm -rf /var/lib/k0s /etc/k0s" # Clean up local files log "Cleaning up local files..." @@ -4453,17 +3961,9 @@ main_delete() { log "Cleanup Summary" log "============================================" - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - log "Infrastructure: On-premises" - log " - k0s stopped and reset on all nodes" - log " - NOTE: Nodes are still running, k0s binaries remain" - else - log "Infrastructure: AWS EC2" - log " - EC2 Instances: ${instance_count:-0} terminated" - log " - Network Interfaces: ${eni_count:-0} cleaned up" - log " - Security Groups: $([ "${sg_deleted}" == "true" ] && echo "1 deleted" || echo "pending cleanup")" - log " - EBS Volumes: ${vol_count:-0} deleted" - fi + log "Infrastructure: On-premises" + log " - k0s stopped and reset on all nodes" + log " - NOTE: Nodes are still running, k0s binaries remain" log "" log "Kubernetes Resources:" @@ -4483,21 +3983,11 @@ main_delete() { log "" log "Cluster '${CLUSTER_NAME}' has been deleted." - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - log "" - log "On-prem nodes are still running with k0s stopped." - log "To fully clean up each node, run:" - log " sudo rm -f /usr/local/bin/k0s" - log " sudo rm -rf /var/lib/k0s /etc/k0s" - else - # Check if any resources failed to delete - if [[ "${sg_deleted}" == "false" ]]; then - log "" - warn "Some resources may require manual cleanup:" - warn " - Security group ${sg_id} may have lingering dependencies" - warn " - Check AWS console for any remaining resources tagged with Cluster=${CLUSTER_NAME}" - fi - fi + log "" + log "Nodes are still running with k0s stopped." 
+ log "To fully clean up each node, run:" + log " sudo rm -f /usr/local/bin/k0s" + log " sudo rm -rf /var/lib/k0s /etc/k0s" } # ====== CLEAN ALL (AGGRESSIVE CLEANUP) ====== @@ -4545,36 +4035,34 @@ usage() { cat < Date: Tue, 28 Apr 2026 19:32:09 +0530 Subject: [PATCH 49/55] refactor: replace NVMe auto-format with preflight storage checks, remove in-cluster MinIO install requiring customer-managed object storage --- tools/cluster_setup/k0s-cluster-config.yaml | 14 + tools/cluster_setup/k0s_cluster_with_stack.sh | 624 ++++-------------- 2 files changed, 137 insertions(+), 501 deletions(-) diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 4c4967c..019e66c 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -32,6 +32,12 @@ nodes: - 3.15.20.136 # CHANGE THIS: GPU worker 2 # ---------- Storage Configuration ---------- +# Prerequisites (must be provisioned BEFORE running the installer): +# - /var/lib/k0s must have at least 500 GB free on GPU workers +# - /var/lib/k0s must have at least 200 GB free on CPU workers +# - /var/lib/k0s must have at least 100 GB free on controllers +# If using a dedicated disk, mount it at /var/lib/k0s before running this script. +# # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: @@ -39,6 +45,14 @@ storage: storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size + # Minimum available disk space (GB) on /var/lib/k0s per node role. + # The installer checks these thresholds at preflight and fails if not met. + # Override to lower values only if you know your workload footprint is smaller. 
+ # minimumDiskSpace: + # controller: 100 # k0s control plane, kine/etcd, container images + # cpuWorker: 200 # weaviate, saia-api, data-loader, fluent-bit + # gpuWorker: 500 # model weights (60-240 GB each), ray-worker-gpu image (~30 GB) + objectStore: # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index ae46ebc..ff22fd6 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -121,22 +121,27 @@ load_config() { STORAGE_CLASS=$(yq eval '.storage.storageClass // "local-path"' "${CONFIG_FILE}" 2>/dev/null || echo "local-path") VECTORDB_SIZE=$(yq eval '.storage.vectorDbSize // "50Gi"' "${CONFIG_FILE}" 2>/dev/null || echo "50Gi") + # Minimum disk space thresholds (GB) for preflight validation. + # Customers must ensure /var/lib/k0s has at least this much space before install. + MIN_DISK_CONTROLLER=$(yq eval '.storage.minimumDiskSpace.controller // "100"' "${CONFIG_FILE}" 2>/dev/null || echo "100") + MIN_DISK_CPU_WORKER=$(yq eval '.storage.minimumDiskSpace.cpuWorker // "200"' "${CONFIG_FILE}" 2>/dev/null || echo "200") + MIN_DISK_GPU_WORKER=$(yq eval '.storage.minimumDiskSpace.gpuWorker // "500"' "${CONFIG_FILE}" 2>/dev/null || echo "500") + # Strip non-numeric suffixes (e.g. 
"30Gi" -> "30") so arithmetic comparisons work + MIN_DISK_CONTROLLER="${MIN_DISK_CONTROLLER//[!0-9]/}" + MIN_DISK_CPU_WORKER="${MIN_DISK_CPU_WORKER//[!0-9]/}" + MIN_DISK_GPU_WORKER="${MIN_DISK_GPU_WORKER//[!0-9]/}" + # Object storage: objectStore.type (aws | s3compat | minio | seaweedfs); default minio when unset OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "minio"' "$CONFIG_FILE" 2>/dev/null || echo "minio")" OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // "ai-platform-data"' "$CONFIG_FILE" 2>/dev/null || echo "ai-platform-data")" OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$CONFIG_FILE" 2>/dev/null || echo "minioadmin")" _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" - USE_EXTERNAL_OBJ_STORE="false" - case "${OBJ_STORE_TYPE}" in s3compat|minio|seaweedfs) USE_EXTERNAL_OBJ_STORE="true"; esac MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" MINIO_BUCKET="${OBJ_STORE_BUCKET}" MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" - # Legacy compat: MINIO_NS for in-cluster MinIO (unused when external) - MINIO_NS="minio-system" - # Kubernetes namespace AI_NS=$(yq eval '.kubernetes.namespace' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform") @@ -190,11 +195,7 @@ load_config() { SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}" - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then - log "Object storage: external S3-compatible (${OBJ_STORE_TYPE}), endpoint=${OBJ_STORE_ENDPOINT:-not set}, bucket=${OBJ_STORE_BUCKET}" - else - log "Object storage: AWS S3, bucket=${OBJ_STORE_BUCKET}" - fi + log "Object storage: ${OBJ_STORE_TYPE}, endpoint=${OBJ_STORE_ENDPOINT:-not set}, 
bucket=${OBJ_STORE_BUCKET}" if [[ -n "${ECR_ACCOUNT}" ]]; then log "ECR Account: ${ECR_ACCOUNT}" fi @@ -403,22 +404,18 @@ preflight_checks() { [[ -f "${SPLUNK_OPERATOR_FILE}" ]] && pf_ok "Splunk operator file: ${SPLUNK_OPERATOR_FILE}" || pf_warn "Splunk operator file not found: ${SPLUNK_OPERATOR_FILE}" [[ -f "${SPLUNK_AI_FILE}" ]] && pf_ok "AI platform file: ${SPLUNK_AI_FILE}" || pf_warn "AI platform file not found: ${SPLUNK_AI_FILE}" - pf_header "Object storage" - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then - pf_ok "Object storage: external S3-compatible (${OBJ_STORE_TYPE})" - if [[ "${OBJ_STORE_TYPE}" == "seaweedfs" ]]; then - if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then - pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS." - else - pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" - fi + pf_header "Object storage (customer-managed)" + pf_ok "Object storage type: ${OBJ_STORE_TYPE} (bucket=${OBJ_STORE_BUCKET})" + if [[ "${OBJ_STORE_TYPE}" == "seaweedfs" ]]; then + if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then + pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS." 
else - [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "External object store requires endpoint" + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi - [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required" else - pf_ok "Object storage: in-cluster MinIO or AWS S3 (bucket=${OBJ_STORE_BUCKET})" + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi + [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)" pf_header "Infrastructure mode" pf_ok "Using existing infrastructure (on-prem/baremetal)" @@ -426,6 +423,9 @@ preflight_checks() { pf_ok "Worker IPs: ${EXISTING_WORKER_IPS}" [[ -n "${SSH_KEY_PATH}" && -f "${SSH_KEY_PATH}" ]] && pf_ok "SSH key: ${SSH_KEY_PATH}" || pf_fail "SSH key not found: ${SSH_KEY_PATH}" + # Validate disk space on every node (requires SSH access) + preflight_check_node_storage + pf_summary } @@ -503,117 +503,70 @@ REMOTE_SCRIPT done } -# ====== MOUNT NVMe INSTANCE STORE FOR EPHEMERAL STORAGE ====== -# GPU instance types (g5, g6, p4, p5) typically come with large NVMe instance -# store drives but tiny 10 GB EBS root volumes. Kubernetes counts ephemeral -# storage from the filesystem backing /var/lib/k0s/kubelet, so we mount an -# unused NVMe drive there to prevent "Insufficient ephemeral-storage" errors. -mount_nvme_instance_store() { - if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then - return 0 - fi +# ====== PREFLIGHT: NODE STORAGE VALIDATION ====== +# On-prem / baremetal nodes must have sufficient disk space BEFORE running the +# installer. This function SSHs to every node and verifies the filesystem +# backing /var/lib/k0s (or / on first install) meets the minimum threshold. 
+# +# Thresholds (configurable via storage.minimumDiskSpace in config YAML): +# Controller : 100 GB (k0s control plane, kine/etcd, container images) +# CPU worker : 200 GB (weaviate, saia-api, data-loader, fluent-bit, etc.) +# GPU worker : 500 GB (model weights 60-240 GB each, ray-worker-gpu image ~30 GB) +# +# If a dedicated disk is available, the customer should mount it at +# /var/lib/k0s before running this script. +preflight_check_node_storage() { + pf_header "Node storage" + + IFS=' ' read -ra _ctrl_ips <<< "${EXISTING_CONTROLLER_IPS}" + IFS=' ' read -ra _worker_ips <<< "${EXISTING_WORKER_IPS}" + + # Helper: SSH to a node and return available GB on the filesystem backing + # /var/lib/k0s (falls back to / if k0s hasn't been installed yet). + _get_avail_gb() { + local ip="$1" + ssh_exec "${ip}" " + avail_kb=\$(df --output=avail /var/lib/k0s 2>/dev/null | tail -1 | tr -d ' ') + if [ -z \"\${avail_kb}\" ] || [ \"\${avail_kb}\" = \"Avail\" ]; then + avail_kb=\$(df --output=avail / 2>/dev/null | tail -1 | tr -d ' ') + fi + echo \$(( \${avail_kb:-0} / 1048576 )) + " 2>/dev/null || echo "0" + } - # Ensure WORKER_IPS is populated - if [[ -z "${WORKER_IPS+x}" || ${#WORKER_IPS[@]} -eq 0 ]]; then - if [[ -n "${EXISTING_WORKER_IPS}" ]]; then - IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + # Check controller nodes + for ip in "${_ctrl_ips[@]}"; do + local avail + avail=$(_get_avail_gb "${ip}") + avail=$(echo "${avail}" | tr -d '[:space:]') + if [[ "${avail}" -ge "${MIN_DISK_CONTROLLER}" ]]; then + pf_ok "Controller ${ip}: ${avail} GB available (minimum: ${MIN_DISK_CONTROLLER} GB)" else - return 0 + pf_fail "Controller ${ip}: ${avail} GB available — need at least ${MIN_DISK_CONTROLLER} GB on /var/lib/k0s" fi - fi - - local gpu_ips=() - local idx=0 - for ip in "${WORKER_IPS[@]}"; do - if [[ ${idx} -ge ${CPU_WORKER_COUNT} ]]; then - gpu_ips+=("${ip}") - fi - idx=$((idx + 1)) done - if [[ ${#gpu_ips[@]} -eq 0 ]]; then - return 0 - fi - - log "Checking NVMe 
instance store volumes on GPU workers..." - - for gpu_ip in "${gpu_ips[@]}"; do - ssh_exec "${gpu_ip}" " - # Skip if /var/lib/k0s is already on a large filesystem (>50 GB) - k0s_avail_gb=\$(df --output=avail /var/lib/k0s 2>/dev/null | tail -1 | awk '{print int(\$1/1048576)}') - if [ \"\${k0s_avail_gb:-0}\" -ge 50 ]; then - echo 'NVMe mount: /var/lib/k0s already has >=50 GB, skipping' - exit 0 - fi - - # Find the first NVMe device that is NOT the root disk and has no partitions - ROOT_DEV=\$(lsblk -no PKNAME \$(findmnt -n -o SOURCE /) 2>/dev/null | head -1) - NVME_DEV='' - for dev in /dev/nvme*n1; do - [ -b \"\$dev\" ] || continue - dev_name=\$(basename \"\$dev\") - # Skip the root device - [ \"\$dev_name\" = \"\$ROOT_DEV\" ] && continue - # Skip devices that already have partitions (they are in use) - if lsblk -n \"\$dev\" 2>/dev/null | grep -q part; then continue; fi - # Skip devices already mounted - if mount | grep -q \"\$dev\"; then continue; fi - NVME_DEV=\"\$dev\" - break - done - - if [ -z \"\$NVME_DEV\" ]; then - echo 'NVMe mount: no unused NVMe instance store found, skipping' - exit 0 - fi - - echo \"NVMe mount: formatting \$NVME_DEV and mounting to /var/lib/k0s\" - - # Format - sudo mkfs.xfs -f \"\$NVME_DEV\" >/dev/null 2>&1 - - # If k0s is running, stop it and preserve existing data - if systemctl is-active k0sworker >/dev/null 2>&1; then - sudo systemctl stop k0sworker 2>/dev/null || true - sleep 3 - sudo pkill -9 k0s 2>/dev/null || true - sudo pkill -9 containerd 2>/dev/null || true - sudo pkill -9 containerd-shim 2>/dev/null || true - sleep 2 - fi - - # Lazy unmount anything stuck under /var/lib/k0s - for mp in \$(mount | grep '/var/lib/k0s' | awk '{print \$3}' | sort -r); do - sudo umount -l \"\$mp\" 2>/dev/null || true - done - - # Copy existing data if present - if [ -d /var/lib/k0s ] && [ \"\$(ls -A /var/lib/k0s 2>/dev/null)\" ]; then - sudo mkdir -p /mnt/nvme-staging - sudo mount \"\$NVME_DEV\" /mnt/nvme-staging - sudo cp -a /var/lib/k0s/. 
/mnt/nvme-staging/ 2>/dev/null || true - sudo umount /mnt/nvme-staging - sudo rmdir /mnt/nvme-staging - fi - - # Mount - sudo rm -rf /var/lib/k0s 2>/dev/null || true - sudo mkdir -p /var/lib/k0s - sudo mount \"\$NVME_DEV\" /var/lib/k0s - - # Persist in fstab - NVME_UUID=\$(sudo blkid -s UUID -o value \"\$NVME_DEV\") - if ! grep -q \"\$NVME_UUID\" /etc/fstab 2>/dev/null; then - echo \"UUID=\$NVME_UUID /var/lib/k0s xfs defaults,nofail 0 2\" | sudo tee -a /etc/fstab >/dev/null - fi + # Check worker nodes (distinguish CPU vs GPU by index) + local widx=0 + for ip in "${_worker_ips[@]}"; do + local avail role min_required + avail=$(_get_avail_gb "${ip}") + avail=$(echo "${avail}" | tr -d '[:space:]') - # Restart k0s if it was running - if systemctl is-enabled k0sworker >/dev/null 2>&1; then - sudo systemctl start k0sworker 2>/dev/null || true - fi + if [[ ${widx} -lt ${CPU_WORKER_COUNT} ]]; then + role="CPU worker" + min_required="${MIN_DISK_CPU_WORKER}" + else + role="GPU worker" + min_required="${MIN_DISK_GPU_WORKER}" + fi - echo \"NVMe mount: done — \$(df -h \$NVME_DEV | tail -1 | awk '{print \$2}') available on /var/lib/k0s\" - " 2>/dev/null || warn " NVMe mount on ${gpu_ip} had issues — may need manual setup" + if [[ "${avail}" -ge "${min_required}" ]]; then + pf_ok "${role} ${ip}: ${avail} GB available (minimum: ${min_required} GB)" + else + pf_fail "${role} ${ip}: ${avail} GB available — need at least ${min_required} GB on /var/lib/k0s" + fi + widx=$((widx + 1)) done } @@ -1145,273 +1098,17 @@ ensure_namespace() { fi } -# ====== INSTALL MINIO ====== -# TODO remove -install_minio() { - # When using external S3-compatible storage, skip in-cluster MinIO; credentials - # are created by ensure_s3compat_credentials() instead. - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then - log "Using external S3-compatible storage (${OBJ_STORE_TYPE}); skipping in-cluster MinIO install." 
- return 0 - fi - - # Auto-generate root password if not set - if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then - MINIO_ROOT_PASSWORD="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" - log "Generated MinIO root password (saved for secret creation)" - fi - - # In-cluster MinIO installation - log "Installing MinIO in ${MINIO_NS}..." - ensure_namespace "${MINIO_NS}" - - # Create MinIO secret - kubectl create secret generic minio-creds \ - --namespace="${MINIO_NS}" \ - --from-literal=accesskey="${MINIO_ROOT_USER}" \ - --from-literal=secretkey="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Deploy MinIO - cat </dev/null || true - sleep 2 - - cat </dev/null 2>&1; then - echo "✓ Bucket '${MINIO_BUCKET}' already exists" - else - echo "Creating bucket: ${MINIO_BUCKET}" - mc mb myminio/${MINIO_BUCKET} - echo "Setting anonymous read policy for bucket..." - mc anonymous set download myminio/${MINIO_BUCKET} || true - fi - - echo "" - echo "Verifying required directories..." - DIRS_TO_CREATE="" - - # Check each directory - for dir in apps artifacts model_artifacts tasks; do - if mc ls myminio/${MINIO_BUCKET}/\$dir/ >/dev/null 2>&1; then - echo " ✓ \$dir/ exists" - else - echo " → \$dir/ missing, will create" - DIRS_TO_CREATE="\$DIRS_TO_CREATE \$dir" - fi - done - - # Create missing directories only - if [ -n "\$DIRS_TO_CREATE" ]; then - echo "" - echo "Creating missing directories..." 
- for dir in \$DIRS_TO_CREATE; do - case \$dir in - apps) - echo " - apps/ (for Splunk apps and add-ons)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/apps/.keep - ;; - artifacts) - echo " - artifacts/ (for AI Platform artifacts)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/artifacts/.keep - ;; - model_artifacts) - echo " - model_artifacts/ (for AI model artifacts)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/model_artifacts/.keep - ;; - tasks) - echo " - tasks/ (for AI Platform tasks)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/tasks/.keep - ;; - esac - done - else - echo "" - echo "✓ All directories already exist, nothing to create" - fi - - echo "" - echo "Final verification:" - ALL_OK=true - for dir in apps artifacts model_artifacts tasks; do - if mc ls myminio/${MINIO_BUCKET}/\$dir/ >/dev/null 2>&1; then - echo " ✓ \$dir/ verified" - else - echo " ✗ \$dir/ missing" - ALL_OK=false - fi - done - - if [ "\$ALL_OK" = "true" ]; then - echo "" - echo "✓ Bucket structure ready!" - echo "" - echo "Bucket contents:" - mc ls myminio/${MINIO_BUCKET}/ - else - echo "" - echo "✗ Some directories are missing" - exit 1 - fi -EOF - - log "Waiting for bucket verification job to complete..." - if kubectl wait --for=condition=complete job/minio-create-bucket -n "${MINIO_NS}" --timeout=120s; then - log "✓ MinIO bucket structure verified" - - # Show job logs for verification - kubectl logs -n "${MINIO_NS}" job/minio-create-bucket --tail=20 2>/dev/null || true - else - warn "Bucket verification job did not complete in time, checking status..." 
- kubectl describe job/minio-create-bucket -n "${MINIO_NS}" || true - kubectl logs -n "${MINIO_NS}" job/minio-create-bucket --tail=50 || true - fi - - log "✓ MinIO installed; bucket=${MINIO_BUCKET}; credentials secret ${AI_NS}/${secret_name}" -} - -# ====== External S3-compatible object storage (credentials only; no in-cluster install) ====== +# ====== S3-COMPATIBLE OBJECT STORAGE CREDENTIALS ====== +# Object storage is always customer-managed (external). This function creates +# the Kubernetes credentials secret so the operator and workloads can auth. ensure_s3compat_credentials() { - if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then - return 0 - fi - - log "Object store type is ${OBJ_STORE_TYPE}; creating credentials secret for external S3-compatible storage." + log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..." if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" return 1 fi if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then - err "External S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + err "S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" return 1 fi ensure_namespace "${AI_NS}" @@ -1424,7 +1121,7 @@ ensure_s3compat_credentials() { --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - log "✓ External S3-compatible credentials secret ${AI_NS}/${secret_name} ready" + log "✓ S3-compatible credentials secret ${AI_NS}/${secret_name} ready" } # ====== INSTALL CERT-MANAGER ====== @@ -3006,26 +2703,17 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create credentials secret for Splunk App Framework - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; 
then - log "Using external S3-compatible credentials for Splunk App Framework..." - if ! kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then - log "Creating minio-credentials secret in ${AI_NS}..." - kubectl -n "${AI_NS}" create secret generic minio-credentials \ - --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ - --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ - --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ - --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ - --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ - --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - - fi - else - log "Creating S3-compatible secret for Splunk App Framework..." - kubectl -n "${AI_NS}" create secret generic s3-secret \ + # Ensure credentials secret exists for Splunk App Framework + if ! kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then + log "Creating minio-credentials secret in ${AI_NS}..." 
+ kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - fi # Create splunk-defaults ConfigMap (optional but recommended) @@ -3056,10 +2744,9 @@ YAML warn "Could not patch default ServiceAccount" fi - # Standalone app repo: external S3-compatible when objectStore.type is s3compat/minio/seaweedfs, else S3 - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then - local minio_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" - cat < "${phase1_logdir}/minio.log" 2>&1 & - phase1_pids+=($!); phase1_names+=("minio") - install_cert_manager > "${phase1_logdir}/cert-manager.log" 2>&1 & phase1_pids+=($!); phase1_names+=("cert-manager") install_kube_prometheus > "${phase1_logdir}/kube-prometheus.log" 2>&1 & phase1_pids+=($!); phase1_names+=("kube-prometheus") - # These don't need cert-manager — run in parallel too - mount_nvme_instance_store > "${phase1_logdir}/nvme.log" 2>&1 & - phase1_pids+=($!); phase1_names+=("nvme-mount") - install_nvidia_host_drivers > "${phase1_logdir}/nvidia-drivers.log" 2>&1 & phase1_pids+=($!); phase1_names+=("nvidia-drivers") @@ -3498,15 +3130,12 @@ check_platform_health() { fi log "" - # Check 3: MinIO / Object Storage - log "Checking object storage..." - if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then - log "⏭️ External S3-compatible storage (${OBJ_STORE_TYPE}); skipping in-cluster check" - elif kubectl get pod -n "${MINIO_NS}" -l app=minio 2>/dev/null | grep -q "Running"; then - log "✅ MinIO is running" + # Check 3: Object Storage + log "Checking object storage configuration..." 
+ if [[ -n "${OBJ_STORE_ENDPOINT}" ]]; then + log "✅ Object storage configured: ${OBJ_STORE_TYPE} at ${OBJ_STORE_ENDPOINT} (customer-managed)" else - warn "MinIO pod not in Running state" - kubectl get pods -n "${MINIO_NS}" + warn "Object storage endpoint not configured" ((health_issues++)) fi log "" @@ -3636,18 +3265,11 @@ show_platform_access_info() { kubectl get nodes -o wide 2>/dev/null || warn "Could not retrieve node information" log "" - # MinIO information - log "🗄️ MinIO (Object Storage):" - log " Console URL: http://localhost:9001" - log " API URL: http://localhost:9000" - log " " - log " 💡 Access MinIO Console:" - log " kubectl port-forward svc/minio -n ${MINIO_NS} 9001:9001" - log " Open: http://localhost:9001" - log " " - log " 🔑 Credentials:" - log " Username: ${MINIO_ROOT_USER}" - log " Password: ${MINIO_ROOT_PASSWORD}" + # Object storage information + log "🗄️ Object Storage (customer-managed):" + log " Type: ${OBJ_STORE_TYPE}" + log " Endpoint: ${OBJ_STORE_ENDPOINT}" + log " Bucket: ${OBJ_STORE_BUCKET}" log "" # AI Platform information From 0ccde9fabb5cf2786b744b0713150f2b9ff41de9 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 20:01:49 +0530 Subject: [PATCH 50/55] refactor: remove ecr credential refresher --- tools/cluster_setup/k0s_cluster_with_stack.sh | 198 ------------------ 1 file changed, 198 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index ff22fd6..199d6c5 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -2501,201 +2501,6 @@ create_ecr_secret() { log "✓ Secret will be referenced in AIPlatform CR spec.imagePullSecrets" } -# ====== ECR CREDENTIAL REFRESHER CRONJOB ====== -# ECR tokens expire every 12 hours. This CronJob refreshes the ecr-registry-secret -# in all relevant namespaces every 6 hours so image pulls never break. 
-install_ecr_credential_refresher() { - if [[ "${IMAGE_PULL_SECRETS_ECR_ENABLED}" != "true" ]]; then - log "ECR not enabled — skipping credential refresher" - return 0 - fi - - local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" - local ecr_account="${ECR_ACCOUNT}" - - if [[ -z "${ecr_account}" ]]; then - warn "ECR account not configured — skipping credential refresher" - return 0 - fi - - local ecr_server="${ecr_account}.dkr.ecr.${ecr_region}.amazonaws.com" - local refresher_ns="${AI_NS}" - local target_namespaces="${AI_NS} splunk-ai-operator-system" - - # Resolve AWS credentials (env > aws configure) - local aws_key="${AWS_ACCESS_KEY_ID:-}" - local aws_secret="${AWS_SECRET_ACCESS_KEY:-}" - local aws_session="${AWS_SESSION_TOKEN:-}" - - if [[ -z "$aws_key" ]]; then - aws_key=$(aws configure get aws_access_key_id 2>/dev/null || echo "") - fi - if [[ -z "$aws_secret" ]]; then - aws_secret=$(aws configure get aws_secret_access_key 2>/dev/null || echo "") - fi - - if [[ -z "$aws_key" ]] || [[ -z "$aws_secret" ]]; then - warn "AWS credentials not available — skipping ECR credential refresher CronJob" - warn "ECR tokens will expire after 12 hours. Refresh ecr-registry-secret manually." - return 0 - fi - - if [[ -n "$aws_session" ]]; then - warn "Detected temporary AWS credentials (session token present)" - warn "ECR refresher CronJob will work until these session credentials expire" - warn "For long-term use, configure an IAM user with ecr:GetAuthorizationToken permission" - fi - - log "Installing ECR credential refresher CronJob..." 
- - # Store AWS credentials in a secret for the CronJob to use - local secret_args=( - --from-literal=AWS_ACCESS_KEY_ID="${aws_key}" - --from-literal=AWS_SECRET_ACCESS_KEY="${aws_secret}" - ) - [[ -n "$aws_session" ]] && secret_args+=(--from-literal=AWS_SESSION_TOKEN="${aws_session}") - - kubectl -n "${refresher_ns}" create secret generic aws-ecr-credentials \ - "${secret_args[@]}" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Pre-build the optional SESSION_TOKEN env block - local session_token_env="" - if [[ -n "$aws_session" ]]; then - session_token_env=" - - name: AWS_SESSION_TOKEN - valueFrom: - secretKeyRef: - name: aws-ecr-credentials - key: AWS_SESSION_TOKEN - optional: true" - fi - - # Deploy ServiceAccount, RBAC, and CronJob - cat < /dev/null - echo " Updated existing secret" - else - curl -sf -X POST \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer \${K8S_TOKEN}" \ - --cacert \${K8S_CA} \ - "\${K8S_API}/api/v1/namespaces/\${NS}/secrets" \ - -d "\${SECRET_JSON}" > /dev/null - echo " Created new secret" - fi - done - echo "ECR credential refresh complete" - env: - - name: ECR_REGION - value: "${ecr_region}" - - name: ECR_SERVER - value: "${ecr_server}" - - name: TARGET_NAMESPACES - value: "${target_namespaces}" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: aws-ecr-credentials - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: aws-ecr-credentials - key: AWS_SECRET_ACCESS_KEY${session_token_env} -CRONEOF - - log "✓ ECR credential refresher CronJob installed (schedule: every 6 hours)" - - # Trigger an immediate run to ensure fresh credentials right now - log "Running initial credential refresh..." 
- kubectl -n "${refresher_ns}" delete job ecr-initial-refresh --ignore-not-found=true 2>/dev/null || true - kubectl -n "${refresher_ns}" create job --from=cronjob/ecr-credential-refresher ecr-initial-refresh 2>/dev/null || true - - # Wait for the initial job to complete (up to 2 minutes) - if kubectl -n "${refresher_ns}" wait --for=condition=complete job/ecr-initial-refresh --timeout=120s 2>/dev/null; then - log "✓ Initial ECR credential refresh completed successfully" - else - warn "Initial ECR refresh may still be running — pods should recover once it completes" - fi -} - # ====== INSTALL SPLUNK STANDALONE ====== install_splunk_standalone() { log "Installing Splunk Standalone: ${AI_STANDALONE_NAME} in ${AI_NS}..." @@ -3081,9 +2886,6 @@ install_ai_platform_stack() { # Create image pull secrets before Splunk Standalone (it uses the default SA which needs ECR creds) create_image_pull_secrets "${AI_NS}" - # Deploy CronJob that auto-refreshes ECR credentials every 6 hours (tokens expire at 12h) - install_ecr_credential_refresher - # Apply Splunk Standalone CR (non-blocking — pod boots in background) install_splunk_standalone From 922cb4f72f89c2ab3bffaf251f9bc0ff13d5fe8a Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 22:02:20 +0530 Subject: [PATCH 51/55] fix: add safety gate to prevent install_k0s_cluster from wiping a live cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the existing-cluster detection (useExisting) flakes due to an SSH timeout or transient k0s status error, install_k0s_cluster could fall through and unconditionally rm -rf /var/lib/k0s, destroying all cluster state (etcd/kine, CRs, PVCs). Add a pre-wipe check that queries k0s kubectl for Ready nodes and aborts with a clear error if any are found. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tools/cluster_setup/k0s_cluster_with_stack.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 199d6c5..37cb5aa 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -644,6 +644,14 @@ PYSCRIPT" # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) ssh_exec "${controller_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true + # Safety gate: refuse to wipe if a live cluster with Ready nodes exists. + # This prevents accidental data loss when the existing-cluster detection + # (useExisting) flakes due to an SSH timeout or transient k0s status error. + if ssh_exec "${controller_ip}" "sudo k0s kubectl get nodes --no-headers 2>/dev/null" 2>/dev/null | grep -q ' Ready'; then + err "k0s cluster on ${controller_ip} has Ready nodes — refusing to wipe. + Use 'delete' or 'clean-all' to tear down first, or set useExisting=auto in config." 
+ fi + # Clean stale k0s state from any previous run ssh_exec "${controller_ip}" " sudo systemctl stop k0scontroller 2>/dev/null || true From b1372710d75aafd542df3c381c569910f985ab07 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 23:18:08 +0530 Subject: [PATCH 52/55] fix: added logging to a file --- tools/cluster_setup/k0s_cluster_with_stack.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 37cb5aa..d6da6b3 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -27,6 +27,13 @@ export LANG=C LC_ALL=C # ====== CONFIG FILE LOCATION ====== CONFIG_FILE="${CONFIG_FILE:-$(dirname "$0")/k0s-cluster-config.yaml}" +# ====== SESSION LOG ====== +LOG_DIR="${LOG_DIR:-$(dirname "$0")/logs}" +mkdir -p "${LOG_DIR}" +LOG_FILE="${LOG_DIR}/k0s-install-$(date '+%Y-%m-%d_%H-%M-%S').log" +exec > >(tee -a "${LOG_FILE}") 2>&1 +echo "[LOG] Session log: ${LOG_FILE}" + # ====== COLORS & LOGGING ====== log() { echo -e "\033[1;36m[INFO]\033[0m $*" >&2; } warn() { echo -e "\033[1;33m[WARN]\033[0m $*" >&2; } From 3d1104da45b759ac930150ec3fc2faad063a46e5 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Tue, 28 Apr 2026 23:33:50 +0530 Subject: [PATCH 53/55] feat: added initContainer for saia-vector-db-setup posthook --- pkg/ai/features/saia/impl.go | 28 ++++++++++++++++++++++++++++ pkg/ai/features/saia/impl_test.go | 14 ++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index c899bcf..75d9d0a 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -579,15 +579,43 @@ func (r *SaiaReconciler) reconcilePostInstallHook( } } uri := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) + backoffLimit := int32(1) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: ai.Name + "-vector-db-setup-posthook", Namespace: 
ai.Namespace, }, Spec: batchv1.JobSpec{ + BackoffLimit: &backoffLimit, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ RestartPolicy: corev1.RestartPolicyNever, + // Wait for Weaviate to accept connections before running + // the schema setup container. This eliminates the + // error-pod churn that occurred when the Job was created + // before Weaviate was fully serving (the operator-level + // condition check can race with actual endpoint readiness). + InitContainers: []corev1.Container{ + { + Name: "wait-for-weaviate", + Image: hookImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"python3", "-c", fmt.Sprintf( + `import urllib.request, time, sys +url = "http://%s:80/v1/.well-known/ready" +for i in range(120): + try: + r = urllib.request.urlopen(url, timeout=5) + if r.status == 200: + print("weaviate ready"); sys.exit(0) + except Exception as e: + print(f"attempt {i+1}/120: {e}") + time.sleep(5) +print("timed out waiting for weaviate"); sys.exit(1)`, + ai.Spec.VectorDbUrl, + )}, + }, + }, Containers: []corev1.Container{ { Name: "vector-db-setup-container", diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index 94be4c8..e368531 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -270,6 +270,20 @@ func Test_reconcilePostInstallHook_SetsGRPCEnvForV2DataLoader(t *testing.T) { require.NoError(t, fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-vector-db-setup-posthook", Namespace: "default"}, job)) + // BackoffLimit must be 1 to avoid error-pod churn. + require.NotNil(t, job.Spec.BackoffLimit) + assert.Equal(t, int32(1), *job.Spec.BackoffLimit) + + // InitContainer must poll Weaviate readiness before the main container runs. 
+ require.Len(t, job.Spec.Template.Spec.InitContainers, 1) + initC := job.Spec.Template.Spec.InitContainers[0] + assert.Equal(t, "wait-for-weaviate", initC.Name) + assert.Equal(t, "dummy-hook-image:latest", initC.Image) + require.NotEmpty(t, initC.Command) + assert.Equal(t, "python3", initC.Command[0]) + assert.Contains(t, initC.Command[2], "weaviate.ai-platform.svc.cluster.local") + assert.Contains(t, initC.Command[2], "/v1/.well-known/ready") + // Collect env var names/values. envMap := envToMap(job.Spec.Template.Spec.Containers[0].Env) From d74d9c50ab2d6dd6d1f517a262f4d97f609e4a14 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Wed, 29 Apr 2026 09:01:38 +0530 Subject: [PATCH 54/55] fix: github copilot review comments --- .gitignore | 3 ++ Dockerfile | 6 ++-- config/manager/kustomization.yaml | 2 +- pkg/ai/raybuilder/builder.go | 29 ++---------------- pkg/storage/storageclient.go | 9 ++++++ .../README.md | 10 +++---- .../SEAWEEDFS_SYSTEMD.md | 4 +-- .../test_minio_connection.sh | 6 ++-- tools/cluster_setup/K0S_README.md | 2 +- tools/cluster_setup/artifacts.yaml | 10 +++---- tools/cluster_setup/cluster-config.yaml | 8 ++--- .../k0s-cluster-config-h100.yaml | 30 +++++++++---------- tools/cluster_setup/k0s-cluster-config.yaml | 26 ++++++++-------- tools/cluster_setup/k0s_cluster_with_stack.sh | 10 ++----- .../splunk-operator-cluster.yaml | 1 - 15 files changed, 70 insertions(+), 86 deletions(-) diff --git a/.gitignore b/.gitignore index 1235ec2..d87d947 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ tmp/* skaffold.env.local .skaffold/ +# Logs +tools/cluster_setup/logs/ + # Helm build artifacts *.tgz helm-chart/**/charts/ diff --git a/Dockerfile b/Dockerfile index ae259de..25a756c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,8 +44,10 @@ COPY LICENSE LICENSE-2.0.txt COPY --from=builder /certs/tls.crt /certs/tls.crt COPY --from=builder /certs/tls.key /certs/tls.key -# USER 65532:65532 -# GID 0 required for Red Hat / OpenShift SCC compatibility on k0s 
nodes +# Run as non-root UID with GID 0 (root group). GID 0 is required on +# RHEL / OpenShift / k0s nodes: the container runtime assigns a random +# UID at launch and only grants group-read/write to GID 0. Without it +# the process cannot read /manager or the config files copied above. USER 1001:0 ENV INSTANCE_FILE=/instance.yaml ENV APPLICATION_FILE=/applications.yaml diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 1d09a52..6cf0049 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -25,5 +25,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: docker.com/splunk/splunk-ai-operator + newName: docker.io/splunk/splunk-ai-operator newTag: v0.0.1 diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index 342b81d..a50b8b7 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -50,9 +50,8 @@ type ApplicationParams struct { S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` - Replicas map[string]int32 `yaml:"REPLICAS"` - WorkingDirBase string `yaml:"WORKING_DIR_BASE"` - ModelVersion string `yaml:"MODEL_VERSION"` + Replicas map[string]int32 `yaml:"REPLICAS"` + ModelVersion string `yaml:"MODEL_VERSION"` AcceleratorType string `yaml:"ACCELERATOR_TYPE"` } @@ -88,26 +87,6 @@ func (b *Builder) effectiveAcceleratorType() string { return "L40S" } -// rayWorkingDirBase builds the base URI for runtime_env.working_dir application zips. -// -// Ray's Serve config rejects plain http:// for remote working_dir URIs; allowed schemes include -// s3 and https. We always use s3:// for S3 and S3-compatible backends (AWS, MinIO, SeaweedFS, etc.). 
-// Ray pods receive AWS_ENDPOINT_URL plus AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (when applicable) -// from rayS3DownloadEnv; modern boto3/botocore honor AWS_ENDPOINT_URL for the S3 client used to -// fetch runtime_env packages. -// -// For GCS we use gs:// (scheme may be gs or gcs in objectStorage.path). -func rayWorkingDirBase(scheme, bucket string) string { - switch strings.ToLower(scheme) { - case "s3", "s3compat", "minio", "seaweedfs": - return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) - case "gs", "gcs": - return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) - default: - return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) - } -} - // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { logger := log.FromContext(ctx) // Define logger @@ -208,9 +187,6 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // Build working_dir base (s3:// or gs://; see rayWorkingDirBase). - workingDirBase := rayWorkingDirBase(u.Scheme, u.Host) - param := ApplicationParams{ ArtifactBucketName: u.Host, ArtifactsProvider: artifactsProvider, @@ -219,7 +195,6 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, Replicas: replicasMap, - WorkingDirBase: workingDirBase, ModelVersion: os.Getenv("MODEL_VERSION"), AcceleratorType: b.effectiveAcceleratorType(), } diff --git a/pkg/storage/storageclient.go b/pkg/storage/storageclient.go index 9f73b95..cc04209 100644 --- a/pkg/storage/storageclient.go +++ b/pkg/storage/storageclient.go @@ -61,16 +61,25 @@ func NewStorageClient( if u.Host == "" { return nil, fmt.Errorf("invalid volume URI %q: S3-compatible path must include bucket name (e.g. 
s3compat://bucket-name/prefix)", vs.Path) } + if vs.Endpoint == "" { + return nil, fmt.Errorf("s3compat:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "minio": if u.Host == "" { return nil, fmt.Errorf("invalid volume URI %q: MinIO path must include bucket name (e.g. minio://bucket-name/prefix)", vs.Path) } + if vs.Endpoint == "" { + return nil, fmt.Errorf("minio:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "seaweedfs": if u.Host == "" { return nil, fmt.Errorf("invalid volume URI %q: SeaweedFS path must include bucket name (e.g. seaweedfs://bucket-name/prefix)", vs.Path) } + if vs.Endpoint == "" { + return nil, fmt.Errorf("seaweedfs:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "fixture": // fixture:// is a special scheme for testing purposes, using a fake client. diff --git a/tools/artifacts_download_upload_scripts/README.md b/tools/artifacts_download_upload_scripts/README.md index 3f47a7f..f847483 100755 --- a/tools/artifacts_download_upload_scripts/README.md +++ b/tools/artifacts_download_upload_scripts/README.md @@ -98,12 +98,12 @@ Preferred generic names; `MINIO_*` are accepted for backward compatibility. | Preferred (generic) | Fallback | Description | |---------------------|----------|-------------| -| `S3COMPAT_OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. 
http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | -| `S3COMPAT_OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | -| `S3COMPAT_OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | -| `S3COMPAT_OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | +| `OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | +| `OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | +| `OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | +| `OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | -Example for SeaweedFS: `S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` +Example for SeaweedFS: `OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts diff --git a/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md index a4b9caa..f542ba3 100644 --- a/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md +++ b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md @@ -12,7 +12,7 @@ Run SeaweedFS as a systemd service so it **restarts on failure** and **starts on On the host where SeaweedFS should run: ```bash -cd /path/to/splunk-ai-operator/tools/cluster_setup +cd /path/to/splunk-ai-operator/tools/artifacts_download_upload_scripts sudo ./install_seaweedfs_systemd.sh ``` @@ -22,7 +22,7 @@ This copies `seaweedfs.service` to `/etc/systemd/system/`, enables and starts th 1. 
Copy the unit file: ```bash - sudo cp tools/cluster_setup/seaweedfs.service /etc/systemd/system/ + sudo cp tools/artifacts_download_upload_scripts/seaweedfs.service /etc/systemd/system/ sudo systemctl daemon-reload ``` diff --git a/tools/artifacts_download_upload_scripts/test_minio_connection.sh b/tools/artifacts_download_upload_scripts/test_minio_connection.sh index 9f1baf2..ada3233 100755 --- a/tools/artifacts_download_upload_scripts/test_minio_connection.sh +++ b/tools/artifacts_download_upload_scripts/test_minio_connection.sh @@ -1,10 +1,10 @@ #!/bin/bash # Test script to diagnose MinIO connectivity and bucket creation issues -MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://18.221.188.50:9000}" +MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" -MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-AAnwWE2sLfFduYTpPy4v7PcyczSHGrVM}" -MINIO_BUCKET="${MINIO_BUCKET:-ai-platform-bucket-us-east-2}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform-bucket}" echo "==========================================" echo "MinIO Connection Test" diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md index a116b62..9dcbc9b 100644 --- a/tools/cluster_setup/K0S_README.md +++ b/tools/cluster_setup/K0S_README.md @@ -639,7 +639,7 @@ storage: rootUser: "admin" rootPassword: "Change-This-Strong-Password-123!" -images: # TODO update images with released versions (from docker.io / how ?) 
+images: registry: "registry.corp.com" operator: image: "registry.corp.com/splunk/splunk-ai-operator:v0.1.5" diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index cca7c19..66b9b28 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5682,17 +5682,17 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-008 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-008 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-009 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-010 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-009 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-010 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-009 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-010 - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index 513b425..58287ae 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -14,7 +14,7 @@ # ---------- Cluster Configuration ---------- cluster: useExisting: false # true = do 
not create cluster; use existing one (script fails if cluster not found) - name: "ai-tier-sok-test-east2" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) + name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) # When true: require subnets (existing VPC). On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC). @@ -110,10 +110,10 @@ storage: objectStore: type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" - endpoint: "http://13.59.216.105:9000" # MinIO API (9000) or SeaweedFS S3 gateway (8333) + endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO API (9000) or SeaweedFS S3 gateway (8333) auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root + rootUser: "" # CHANGE THIS: S3-compatible access key (or MinIO root user) + rootPassword: "" # CHANGE THIS: S3-compatible secret key (or MinIO root password) # ---------- Container Images Configuration ---------- images: diff --git a/tools/cluster_setup/k0s-cluster-config-h100.yaml b/tools/cluster_setup/k0s-cluster-config-h100.yaml index b91c08e..7abfd33 100644 --- a/tools/cluster_setup/k0s-cluster-config-h100.yaml +++ b/tools/cluster_setup/k0s-cluster-config-h100.yaml @@ -15,7 +15,7 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ 
-25,11 +25,11 @@ nodes: existingIPs: controllers: - - 3.149.241.167 # CHANGE THIS: Your controller server IP + - 10.0.0.1 # CHANGE THIS: Your controller server IP workers: - - 18.221.244.241 # CHANGE THIS: CPU worker 1 - - 18.191.19.128 # CHANGE THIS: GPU worker 1 - - 3.137.209.219 # CHANGE THIS: GPU worker 2 + - 10.0.0.2 # CHANGE THIS: CPU worker 1 + - 10.0.0.3 # CHANGE THIS: GPU worker 1 + - 10.0.0.4 # CHANGE THIS: GPU worker 2 # ---------- Storage Configuration ---------- # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). @@ -53,7 +53,7 @@ storage: type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) - endpoint: "http://13.59.216.105:9000" # MinIO (AWS-spec compliant GetObjectTagging semantics) + endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint auth: rootUser: "minioadmin" rootPassword: "minioadmin" @@ -61,7 +61,7 @@ storage: # ---------- Container Images Configuration ---------- images: # Registry prefix - applied to images without a full registry path - registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + registry: "" # CHANGE THIS: Your ECR/Docker/Harbor registry (e.g. 
123456789012.dkr.ecr.us-east-2.amazonaws.com) operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" @@ -103,10 +103,10 @@ images: # Build & push with: # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28" + image: "splunk-ai-operator:latest" # CHANGE THIS: Your operator image splunk: - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" + image: "splunk/splunk:10.2.0" # CHANGE THIS: Your Splunk Enterprise image operatorImage: "docker.io/splunk/splunk-operator:3.0.0" ray: @@ -115,14 +115,14 @@ images: # headImage: "ml-platform/ray/ray-head:build-010" # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-008" # tony redis changes + fixes + headImage: "ml-platform/ray/ray-head:build-v2-010" # tony redis changes + fixes # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-008" # tony redis changes + fixes + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" # tony redis changes + fixes weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" @@ -132,11 +132,11 @@ images: # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-009" #saia v2 + 
tony changes + apiImage: "ml-platform/saia/saia-api:build-v2-010" #saia v2 + tony changes # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-009" #saia v2 + tony changes + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-010" #saia v2 + tony changes # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 @@ -240,5 +240,5 @@ imagePullSecrets: autoCreateECR: true ecr: - account: "658391232643" - region: us-east-2 + account: "" # CHANGE THIS: Your AWS account ID (e.g. 123456789012) + region: us-east-2 # CHANGE THIS: Your AWS region diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index 019e66c..aecec3f 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -15,7 +15,7 @@ cluster: name: airgap-cluster # region: us-east-2 # Ignored for on-prem, but required in config sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - sshKeyPath: /Users/mohaari2/.ssh/ai-key-arif.pem # CHANGE THIS: Path to SSH private key + sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key # ---------- Node Configuration ---------- nodes: @@ -25,11 +25,11 @@ nodes: existingIPs: controllers: - - 3.144.14.96 # CHANGE THIS: Your controller server IP + - 10.0.0.1 # CHANGE THIS: Your controller server IP workers: - - 3.14.134.16 # CHANGE THIS: CPU worker 1 - - 13.59.78.115 # CHANGE THIS: GPU worker 1 - - 3.15.20.136 # CHANGE THIS: GPU worker 2 + - 10.0.0.2 # CHANGE THIS: CPU worker 1 + - 10.0.0.3 # CHANGE THIS: GPU worker 1 + - 10.0.0.4 # CHANGE THIS: GPU worker 2 # ---------- Storage Configuration ---------- # Prerequisites (must be provisioned BEFORE running the installer): @@ -67,7 +67,7 @@ storage: type: "minio" # aws | 
s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) - endpoint: "http://13.59.216.105:9000" # MinIO (AWS-spec compliant GetObjectTagging semantics) + endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint auth: rootUser: "minioadmin" rootPassword: "minioadmin" @@ -75,7 +75,7 @@ storage: # ---------- Container Images Configuration ---------- images: # Registry prefix - applied to images without a full registry path - registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + registry: "" # CHANGE THIS: Your ECR/Docker/Harbor registry (e.g. 123456789012.dkr.ecr.us-east-2.amazonaws.com) operator: # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" @@ -117,10 +117,10 @@ images: # Build & push with: # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ # make docker-build-amd64 docker-push - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28" + image: "splunk-ai-operator:latest" # CHANGE THIS: Your operator image splunk: - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" #TODO this update + image: "splunk/splunk:10.2.0" # CHANGE THIS: Your Splunk Enterprise image operatorImage: "docker.io/splunk/splunk-operator:3.0.0" ray: @@ -189,8 +189,8 @@ kubernetes: # ---------- File Paths ---------- files: - splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml" - aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml" + splunkOperator: "./splunk-operator-cluster.yaml" # CHANGE THIS: Path to Splunk Operator manifest + aiPlatform: "./artifacts.yaml" # CHANGE THIS: Path to AI Platform artifacts # ---------- Splunk Configuration ---------- splunk: @@ -255,5 
+255,5 @@ imagePullSecrets: autoCreateECR: true ecr: - account: "658391232643" - region: us-east-2 + account: "" # CHANGE THIS: Your AWS account ID (e.g. 123456789012) + region: us-east-2 # CHANGE THIS: Your AWS region diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index d6da6b3..b0ac10a 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -1888,8 +1888,7 @@ install_kube_prometheus() { log "Installing kube-prometheus-stack..." helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true - # TODO uncomment - # helm repo update prometheus-community # Only update the specific repo we need + helm repo update prometheus-community # Only update the specific repo we need helm_retry 3 upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ --namespace monitoring --create-namespace \ @@ -1908,8 +1907,7 @@ install_otel_operator_and_contrib_collector() { wait_for_cert_manager_webhook 30 10 helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts || true - # TODO uncomment - # helm repo update open-telemetry # Only update the specific repo we need + helm repo update open-telemetry # Only update the specific repo we need # Use cert-manager for webhook certificates (now that konnectivity is fixed) helm_retry 3 upgrade --install opentelemetry-operator open-telemetry/opentelemetry-operator \ @@ -1928,8 +1926,7 @@ install_ray_operator() { log "Installing KubeRay Operator..." 
helm repo add kuberay https://ray-project.github.io/kuberay-helm/ || true - # TODO uncomment - # helm repo update kuberay # Only update the specific repo we need + helm repo update kuberay # Only update the specific repo we need helm_retry 3 upgrade --install kuberay-operator kuberay/kuberay-operator \ --namespace ray-system --create-namespace \ @@ -3809,7 +3806,6 @@ case "${1:-install}" in clean_all ;; join-workers) - # TODO fix this flow join_workers ;; *) diff --git a/tools/cluster_setup/splunk-operator-cluster.yaml b/tools/cluster_setup/splunk-operator-cluster.yaml index 0732ea3..467879e 100644 --- a/tools/cluster_setup/splunk-operator-cluster.yaml +++ b/tools/cluster_setup/splunk-operator-cluster.yaml @@ -55325,7 +55325,6 @@ subjects: apiVersion: v1 data: OPERATOR_NAME: '"splunk-operator"' - # TODO identify whats this ?? RELATED_IMAGE_SPLUNK_ENTERPRISE: 667741767953.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:splunk-redhat-8-amd64-10.2.0-ef65e8205e4d-6d943f7-28228924 WATCH_NAMESPACE: "" kind: ConfigMap From 57973285717f24ab9c684315a75543bc2ab9b102 Mon Sep 17 00:00:00 2001 From: Mohammed Arif Date: Wed, 29 Apr 2026 16:39:48 +0530 Subject: [PATCH 55/55] fix: code review comments --- tools/cluster_setup/EKS_README.md | 306 ++- tools/cluster_setup/K0S_README.md | 2117 +++-------------- tools/cluster_setup/artifacts.yaml | 8 +- tools/cluster_setup/cluster-config.yaml | 7 - tools/cluster_setup/eks_cluster_with_stack.sh | 4 +- .../k0s-cluster-config-h100.yaml | 244 -- tools/cluster_setup/k0s-cluster-config.yaml | 83 +- tools/cluster_setup/k0s_cluster_with_stack.sh | 2 +- 8 files changed, 630 insertions(+), 2141 deletions(-) delete mode 100644 tools/cluster_setup/k0s-cluster-config-h100.yaml diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index c65c94e..6ef9b75 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -50,8 +50,8 @@ The script installs everything needed for the AI Platform: 1. 
**EKS Cluster** (Kubernetes 1.31-1.34) - AWS-managed control plane 2. **VPC CNI** - Native AWS VPC networking for pods 3. **S3 Bucket** - Object storage for AI artifacts and models -4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS -5. **Cluster Autoscaler** - Automatic node scaling based on demand +4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS (IRSA-based IAM) +5. **Cluster Autoscaler** - Automatic node scaling based on demand (IRSA-based IAM) 6. **Cert-Manager** - Automated certificate management 7. **Object storage** - AWS S3 or external S3-compatible only (MinIO, SeaweedFS, etc.; no in-cluster MinIO install) 8. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana @@ -390,13 +390,14 @@ You must configure these images in `cluster-config.yaml`: |-------|--------------|-------------| | Splunk AI Operator | `operator.image` | Main operator controller | | Splunk Enterprise | `splunk.image` | Splunk instance for observability | -| Splunk Operator | `splunk.operatorImage` | Splunk CRD controller (optional, has default) | +| Splunk Operator | `splunk.operatorImage` | Splunk CRD controller (optional, default: `docker.io/splunk/splunk-operator:3.0.0`) | | Ray Head | `ray.headImage` | Ray cluster head node | | Ray Worker | `ray.workerImage` | Ray worker nodes (GPU) | | Weaviate | `weaviate.image` | Vector database | | SAIA API | `saia.apiImage` | Splunk AI Assistant API | | SAIA Data Loader | `saia.dataLoaderImage` | SAIA initialization | -| Fluent Bit | `fluentBit.image` | Logging (optional, has default) | +| Fluent Bit | `fluentBit.image` | Logging (optional, default: `fluent/fluent-bit:1.9.6`) | +| OpenTelemetry Collector | `otelCollector.image` | Telemetry collection (optional, default: `otel/opentelemetry-collector-contrib:0.122.1`) | **No manual YAML editing required!** The script handles everything. 
@@ -519,7 +520,7 @@ vi my-cluster-config.yaml cluster: name: "my-ai-cluster" # ← CHANGE: Your unique cluster name (DNS-1123 compliant) region: "us-west-2" # ← CHANGE: Your AWS region - k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31) + k8sVersion: "1.31" # Kubernetes version (1.31, 1.32, 1.33, 1.34) # Option A: Leave subnets empty to create new VPC automatically # Option B: Provide existing subnet IDs (eksctl auto-detects VPC from subnets) @@ -579,9 +580,11 @@ export MINIO_ROOT_PASSWORD='your-secure-password' CONFIG_FILE=./cluster-config.yaml ./eks_cluster_with_stack.sh install ``` -**Idempotency and existing VPC** -- The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). Set `cluster.useExisting: true` to require an existing cluster (script fails if the cluster is not found). +**Idempotency and existing clusters** +- The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). You can safely re-run `install` to update images, fix issues, or add components. +- **Require existing cluster:** Set `cluster.useExisting: true` to skip cluster creation entirely. The script will fail with a clear error if the cluster is not found. This is useful when you created the cluster separately or want to guard against accidentally creating a new cluster. - **Use an existing VPC:** Provide `cluster.subnets` (private and public subnet IDs and AZs). eksctl will use that VPC and will not create a new one. +- **Preserve VPC on delete:** Set `cluster.preserveVpcOnDelete: true` when using an existing VPC to prevent the `delete` command from removing it. Requires at least 2 private subnets to be specified. 
**Important Notes:** - **Cluster Name**: Must be DNS-1123 compliant (lowercase letters, numbers, hyphens; start/end with alphanumeric) @@ -595,15 +598,22 @@ CONFIG_FILE=./cluster-config.yaml ./eks_cluster_with_stack.sh install |---------|--------------|------------------| | `cluster.name` | EKS cluster name | ✅ **REQUIRED:** Change to your cluster name | | `cluster.region` | AWS region | ✅ **REQUIRED:** Change to your region | +| `cluster.k8sVersion` | Kubernetes version (1.31-1.34) | ⚙️ Optional: default 1.31 | | `cluster.useExisting` | Use existing cluster only (do not create) | ⚙️ Set `true` to skip cluster creation; script fails if cluster not found | +| `cluster.preserveVpcOnDelete` | Keep VPC when running `delete` | ⚙️ Set `true` when using an existing VPC you don't want deleted | | `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs to use existing VPC | | `storage.s3Bucket` | S3 bucket for AI artifacts (used when `objectStore.type` is aws) | ✅ **REQUIRED** if not using MinIO/SeaweedFS | | `storage.objectStore` | Object store: `type` (aws \| s3compat \| minio \| seaweedfs), `bucket`, `endpoint`, `auth`. Default type is `aws` when unset. External only (no in-cluster install). | ⚙️ Required for s3compat/minio/seaweedfs: set `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). 
| | `images.registry` | Container registry URL | ✅ **REQUIRED:** Your ECR/Docker registry | | `images.*` | All container images | ✅ **REQUIRED:** Configure all image paths | | `nodeGroups.cpu` | CPU node group settings | ⚙️ Optional: adjust size/type | -| `nodeGroups.gpu` | GPU node group settings | ⚙️ Optional: adjust size/type | +| `nodeGroups.gpu` | GPU node group settings | ⚙️ Optional: adjust size/type/AZ/capacity reservation | +| `nodeGroups.gpu.availabilityZones` | Lock GPU nodes to specific AZs | ⚙️ Optional: for capacity-constrained GPU types | +| `nodeGroups.gpu.capacityReservation` | H100 Capacity Block reservation | ⚙️ Optional: for H100 with Capacity Blocks | +| `operators.ray.modelVersion` | Model version for AI serving | ⚙️ Optional: default `v0.3.14-36-g1549f5a` | +| `operators.ray.rayVersion` | Ray runtime version | ⚙️ Optional: default `2.44.0` | | `aiPlatform` | AI Platform configuration | ⚙️ Optional: customize features | +| `aiPlatform.defaultAcceleratorType` | GPU type: `L40S`, `H100` | ⚙️ Optional: default `L40S` | ### 5. Configure Container Images ⚠️ CRITICAL @@ -636,6 +646,9 @@ images: fluentBit: image: "fluent/fluent-bit:1.9.6" # ← OPTIONAL (has default) + + otelCollector: + image: "otel/opentelemetry-collector-contrib:0.122.1" # ← OPTIONAL (has default) ``` **Tips:** @@ -647,6 +660,12 @@ images: **The script will validate ALL images exist before deployment!** +**Additional Version Configuration:** + +The script also configures these versions in `artifacts.yaml`: +- `operators.ray.modelVersion` - Sets `MODEL_VERSION` env var (default: `v0.3.14-36-g1549f5a`) +- `operators.ray.rayVersion` - Sets `RAY_VERSION` env var (default: `2.44.0`) + ### 6. Login to Container Registries **For AWS ECR:** @@ -720,52 +739,66 @@ CONFIG_FILE=./my-cluster-config.yaml ./eks_cluster_with_stack.sh install - ✓ Creates backups 2. 
**Preflight Checks** (1 min) - - ✓ Checks AWS credentials + - ✓ Checks AWS credentials and identity + - ✓ Validates cluster name (DNS-1123), S3 bucket name - ✓ Verifies subnets exist (if provided) - - ✓ Validates NAT Gateway & Internet Gateway - - ✓ Checks required tools + - ✓ Validates NAT Gateway, Internet Gateway, route tables + - ✓ Checks required tools (aws, eksctl, kubectl, helm, git, jq, yq) -3. **Create EKS Cluster** (10-15 min) - - ✓ Creates managed control plane +3. **Create EKS Cluster** (10-15 min) - skipped if cluster already exists + - ✓ Creates managed control plane with OIDC - ✓ Sets up node groups (CPU + GPU) + - ✓ Creates H100 node group via CloudFormation (if using Capacity Block) 4. **Install Infrastructure** (10-15 min) - - ✓ EBS CSI Driver (for persistent volumes) - - ✓ Cluster Autoscaler (for node scaling) - - ✓ VPC CNI (for pod networking) + - ✓ OIDC provider for IRSA + - ✓ EBS CSI Driver with IRSA role + - ✓ gp3 StorageClass (set as default) + - ✓ Cluster Autoscaler with IRSA role + - ✓ NVIDIA device plugin 5. **Install Platform Components** (15-20 min) + - ✓ Kube-Prometheus Stack (Prometheus + Grafana) - ✓ Cert Manager (certificates) - - ✓ Prometheus + Grafana (monitoring) - - ✓ OpenTelemetry (tracing) - - ✓ NVIDIA GPU Operator (GPU support) + - ✓ S3-compatible credentials secret (if external object store) + - ✓ OpenTelemetry Operator + collector - ✓ KubeRay Operator (Ray clusters) - ✓ Splunk Operator (Splunk management) + - ✓ Splunk AI Platform Operator (with your images!) 6. **Deploy AI Platform** (5-10 min) - - ✓ Creates S3 bucket - - ✓ Sets up IAM roles (IRSA) - - ✓ Installs Splunk AI Operator (with your images!) 
- - ✓ Creates AIPlatform CR - - ✓ Deploys AI services + - ✓ Creates S3 bucket and prefixes (artifacts/, apps/, tasks/) + - ✓ Uploads Splunk app to S3 (if localAppPath configured) + - ✓ Sets up IRSA roles for Ray head, Ray worker, SAIA service + - ✓ Adds ECR permissions to IRSA roles + - ✓ Creates Splunk Standalone instance + - ✓ Creates AIPlatform CR and monitors until Ready + - ✓ Waits for Splunk AI Assistant app installation on Standalone **What Happens During Installation:** -1. ✓ Creates EKS cluster with control plane (5-10 minutes) -2. ✓ Creates managed node groups (CPU and GPU) (5-10 minutes) -3. ✓ Installs AWS Load Balancer Controller -4. ✓ Installs EBS CSI driver -5. ✓ Installs Cluster Autoscaler -6. ✓ Installs cert-manager -7. ✓ Installs monitoring stack (Prometheus, Grafana) -8. ✓ Installs OpenTelemetry -9. ✓ Installs NVIDIA GPU support -10. ✓ Installs Ray operator -11. ✓ Installs Splunk operator -12. ✓ Creates Splunk Standalone instance -13. ✓ Installs Splunk AI Platform operator -14. ✓ Creates S3 bucket and IAM roles -15. ✓ Creates ECR image pull secrets -16. ✓ Deploys AIPlatform CR +1. ✓ Validates configuration and container images (fails fast if images missing) +2. ✓ Runs preflight checks (AWS credentials, subnets, VPC networking, tools) +3. ✓ Creates EKS cluster with control plane (or skips if already exists) +4. ✓ Creates managed node groups (CPU and GPU) +5. ✓ Creates H100 GPU node group via CloudFormation (if using Capacity Block) +6. ✓ Ensures OIDC provider for IRSA +7. ✓ Installs EBS CSI driver (with IRSA role) +8. ✓ Creates gp3 StorageClass (set as default) +9. ✓ Installs Cluster Autoscaler (with IRSA role) +10. ✓ Installs NVIDIA device plugin +11. ✓ Installs kube-prometheus-stack (monitoring) +12. ✓ Installs cert-manager +13. ✓ Creates S3-compatible credentials secret (if using external object store) +14. ✓ Installs OpenTelemetry Operator + collector +15. ✓ Installs KubeRay Operator +16. ✓ Installs Splunk Operator +17. 
✓ Installs Splunk AI Platform Operator +18. ✓ Creates S3 bucket and IAM roles (IRSA for Ray head/worker/SAIA) +19. ✓ Adds ECR permissions to IRSA roles +20. ✓ Creates Splunk Standalone instance +21. ✓ Deploys AIPlatform CR +22. ✓ Monitors AIPlatform status until Ready +23. ✓ Waits for Splunk AI Assistant app to be installed on Standalone ### 4. Verify Installation @@ -874,7 +907,9 @@ The script uses a YAML configuration file (`cluster-config.yaml`) for all settin cluster: name: "my-ai-cluster" # EKS cluster name (DNS-1123 compliant) region: "us-west-2" # AWS region - k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31) + k8sVersion: "1.31" # Kubernetes version (1.31, 1.32, 1.33, 1.34) + useExisting: false # Set true to require existing cluster (fails if not found) + preserveVpcOnDelete: false # Set true to keep VPC when running delete (existing VPC only) subnets: # Optional - leave empty for auto VPC creation private: # Private subnets (at least 2, different AZs) @@ -906,19 +941,51 @@ nodeGroups: maxSize: 4 # Maximum nodes volumeSize: 1000 # EBS volume size in GB volumeType: "gp3" # EBS volume type + availabilityZones: [] # Optional: lock GPU nodes to specific AZs + capacityReservation: # Optional: for H100 Capacity Blocks (CloudFormation-based) + id: "" # EC2 Capacity Reservation ID + az: "" # AZ of the reservation storage: - s3Bucket: "my-ai-platform-bucket" # S3 bucket for artifacts/apps/tasks + s3Bucket: "my-ai-platform-bucket" # S3 bucket for artifacts/apps/tasks (used when objectStore.type is aws) storageClass: "gp3" # Default storage class for PVCs vectorDbSize: "50Gi" # VectorDB PVC size + objectStore: # External S3-compatible storage (optional) + type: "aws" # aws | s3compat | minio | seaweedfs (default: aws) + bucket: "" # Bucket name (defaults to s3Bucket) + endpoint: "" # S3-compatible endpoint (required for s3compat/minio/seaweedfs) + namespace: "minio" # Namespace hint (for credential secret placement) + auth: + rootUser: "minioadmin" # 
S3-compatible access key (env MINIO_ROOT_USER takes precedence) + rootPassword: "" # S3-compatible secret key (env MINIO_ROOT_PASSWORD takes precedence) -operators: +images: + registry: "" # Container registry URL (prepended to relative image paths) + operator: + image: "" # Splunk AI Operator image splunk: - image: "splunk/splunk:10.2.0-dev1" # Splunk Enterprise image + image: "" # Splunk Enterprise image + operatorImage: "" # Splunk Operator image (default: docker.io/splunk/splunk-operator:3.0.0) ray: - version: "v1.2.2" # Ray operator version + headImage: "" # Ray head node image + workerImage: "" # Ray worker node image + weaviate: + image: "" # Weaviate vector database image + saia: + apiImage: "" # SAIA API image + dataLoaderImage: "" # SAIA data loader / post-install hook image + fluentBit: + image: "" # Fluent Bit image (default: fluent/fluent-bit:1.9.6) + otelCollector: + image: "" # OpenTelemetry Collector image (default: otel/opentelemetry-collector-contrib:0.122.1) + +operators: + ray: + version: "v1.2.2" # KubeRay operator version + modelVersion: "" # Model version (default: v0.3.14-36-g1549f5a) + rayVersion: "" # Ray runtime version (default: 2.44.0) nvidia: - devicePluginVersion: "v0.17.3" # NVIDIA device plugin version + devicePluginVersion: "v0.17.3" # NVIDIA device plugin version aiPlatform: namespace: "ai-platform" # Kubernetes namespace @@ -927,7 +994,7 @@ aiPlatform: rayHead: "ray-head-sa" rayWorker: "ray-worker-sa" saiaService: "saia-service-sa" - defaultAcceleratorType: "L40S" # Default GPU type + defaultAcceleratorType: "L40S" # Default GPU type (L40S, H100) workerGroupConfig: serviceAccountName: "ray-worker-sa" imageRegistry: "" # Leave empty for default @@ -936,11 +1003,13 @@ aiPlatform: className: "nginx" host: "ai.example.com" tlsSecretName: "ai-platform-tls" + certificate: + issuerName: "platform-issuer" # Cert-manager issuer name splunkStandalone: name: "splunk-standalone" # Splunk Standalone CR name serviceAccount: "saia-service-sa" # 
Service account for S3 access - localAppPath: "" # Optional: local path to Splunk app to upload + localAppPath: "" # Optional: local path to Splunk app to upload to S3 files: splunkOperatorManifest: "./splunk-operator-cluster.yaml" @@ -990,8 +1059,6 @@ storage: vectorDbSize: "20Gi" # Smaller vector DB operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -1014,6 +1081,7 @@ cluster: name: "prod-ai-platform" region: "us-west-2" k8sVersion: "1.31" + preserveVpcOnDelete: true # Don't delete VPC on cleanup subnets: private: # 3 AZs for high availability - id: "subnet-private-2a" @@ -1055,8 +1123,6 @@ storage: vectorDbSize: "200Gi" # Large vector DB operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -1107,8 +1173,6 @@ storage: vectorDbSize: "100Gi" operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -1118,6 +1182,53 @@ aiPlatform: defaultAcceleratorType: "L40S" ``` +#### Example 4: H100 GPU Cluster with Capacity Block + +```yaml +# h100-cluster-config.yaml - H100 instances with EC2 Capacity Blocks + +cluster: + name: "h100-ai-cluster" + region: "us-east-2" + k8sVersion: "1.31" + +nodeGroups: + cpu: + enabled: true + instanceType: "m5.xlarge" + desiredCapacity: 3 + minSize: 2 + maxSize: 6 + volumeSize: 300 + volumeType: "gp3" + + gpu: + enabled: true + instanceType: "p5.48xlarge" # 8x H100 GPUs + desiredCapacity: 2 + minSize: 2 + maxSize: 2 + volumeSize: 2000 + volumeType: "gp3" + capacityReservation: # H100 Capacity Block + id: "cr-0abcdef1234567890" # Your Capacity Reservation ID + az: "us-east-2b" # AZ of the reservation + +storage: + s3Bucket: "h100-ai-platform-data" + storageClass: "gp3" + vectorDbSize: "100Gi" + +operators: + ray: + version: "v1.2.2" + +aiPlatform: + namespace: "ai-platform" + name: "splunk-ai-stack" + defaultAcceleratorType: "H100" # Must be H100 for capacity block +``` + ### Instance Type Selection Guide #### CPU Instance Types (For Ray head, Weaviate, 
general workloads) @@ -1136,11 +1247,13 @@ aiPlatform: | Instance Type | GPUs | GPU Memory | vCPU | Memory | Use Case | Approx Cost/hr | |---------------|------|------------|------|--------|----------|----------------| | g5.xlarge | 1x A10G | 24 GB | 4 | 16 GB | Dev/Small Models | $1.01 | -| g5.2xlarge | 1x A10G | 24 GB | 8 | 32 GB | **Recommended** | $1.21 | +| g5.2xlarge | 1x A10G | 24 GB | 8 | 32 GB | Small Production | $1.21 | | g5.4xlarge | 1x A10G | 24 GB | 16 | 64 GB | Large Single-GPU | $1.62 | | g5.12xlarge | 4x A10G | 96 GB | 48 | 192 GB | Multi-GPU Training | $5.67 | +| g6e.12xlarge | 4x L40S | 192 GB | 48 | 384 GB | **Recommended (L40S)** | $7.77 | | p3.2xlarge | 1x V100 | 16 GB | 8 | 61 GB | ML Training | $3.06 | | p4d.24xlarge | 8x A100 | 320 GB | 96 | 1152 GB | Large-Scale Training | $32.77 | +| p5.48xlarge | 8x H100 | 640 GB | 192 | 2048 GB | H100 (Capacity Block) | $98.32 | **Note:** Prices are approximate for US East/West regions and may vary. Check [AWS Pricing](https://aws.amazon.com/ec2/pricing/on-demand/) for current rates. 
@@ -1151,19 +1264,53 @@ aiPlatform: ### Basic Commands ```bash -# Install EKS cluster and AI Platform +# Install EKS cluster and AI Platform (idempotent - safe to re-run) ./eks_cluster_with_stack.sh install -# Delete entire cluster and all AWS resources +# Delete cluster and ALL AWS resources/roles/policies created by this script ./eks_cluster_with_stack.sh delete -# Full cleanup (including S3 buckets, IAM roles) +# Full cleanup: uninstall CRs/operators then run comprehensive AWS cleanup ./eks_cluster_with_stack.sh delete-full +``` + +#### What `delete` Does (10-Step Cleanup) -# Check AIPlatform status -./eks_cluster_with_stack.sh status +The `delete` command performs a comprehensive, ordered cleanup of all AWS resources created by the script: + +| Step | Action | Details | +|------|--------|---------| +| 1 | Delete IRSA Service Accounts | Removes SA CloudFormation stacks for Cluster Autoscaler, Ray head/worker, SAIA, EBS CSI | +| 2 | Delete IAM Roles | Removes IRSA roles for all service accounts | +| 3 | Clean up EBS CSI addon roles | Finds and deletes any `eksctl-${CLUSTER_NAME}-addon-aws-ebs-csi-driver-*` roles | +| 4 | Delete EKS Addons | Removes `aws-ebs-csi-driver` addon | +| 5 | Delete EKS Cluster | Runs `eksctl delete cluster --wait` and waits for CloudFormation stack deletion | +| 6 | Clean up CloudFormation stacks | Deletes lingering nodegroup, IAMServiceAccount, and addon stacks | +| 7 | Delete IAM Policies | Removes S3 bucket policy (or ECR-only policy if using external object store) | +| 8 | Purge IRSA roles by OIDC | Finds and removes any remaining roles associated with the cluster's OIDC provider | +| 9 | Delete OIDC Provider | Removes the IAM OIDC identity provider | +| 10 | Delete EBS Volumes | Removes all EBS volumes tagged with the cluster name | + +**VPC Preservation:** If `cluster.preserveVpcOnDelete: true` is set, the VPC and subnets are preserved; only EKS and related resources are deleted.
+ +**Verification after delete:** +```bash +# Check for remaining IAM roles +aws iam list-roles --query "Roles[?contains(RoleName, '${CLUSTER_NAME}')].RoleName" + +# Check for remaining CloudFormation stacks +aws cloudformation list-stacks --query "StackSummaries[?contains(StackName, 'eksctl-${CLUSTER_NAME}')].StackName" + +# Check for remaining EBS volumes +aws ec2 describe-volumes --region ${REGION} \ + --filters "Name=tag:kubernetes.io/cluster/${CLUSTER_NAME},Values=owned" \ + --query 'Volumes[].VolumeId' ``` +#### What `delete-full` Does + +The `delete-full` command runs a full teardown: it first uninstalls all Kubernetes CRs and operators (AIPlatform, Splunk Standalone, Splunk Operator, OpenTelemetry, Cluster Autoscaler, KubeRay, kube-prometheus-stack, cert-manager, gp3 StorageClass), then runs the same 10-step `delete` cleanup above. + ### Post-Installation Tasks #### 1. Access the Cluster @@ -1282,7 +1429,7 @@ aws eks update-nodegroup-config \ aws eks describe-cluster --name ${CLUSTER_NAME} --query cluster.version # Update control plane -aws eks update-cluster-version --name ${CLUSTER_NAME} --kubernetes-version 1.29 +aws eks update-cluster-version --name ${CLUSTER_NAME} --kubernetes-version 1.32 # Wait for update to complete (check status) aws eks describe-update --name ${CLUSTER_NAME} --update-id ${UPDATE_ID} @@ -2002,6 +2149,43 @@ aws ecr describe-images --repository-name ray --region us-west-2 ## Advanced Topics +### H100 GPU Nodes with Capacity Blocks + +For H100 instances, the script supports AWS EC2 Capacity Blocks, which guarantee GPU capacity for a reserved time period. When `defaultAcceleratorType: "H100"` and a `capacityReservation.id` is set, GPU nodes are created separately via CloudFormation instead of eksctl managed node groups. + +**How It Works:** +1. CPU node group is created first via eksctl (standard managed node group) +2. The script then creates a CloudFormation stack with a Launch Template that references the Capacity Block reservation +3.
Nodes auto-join the cluster with `nvidia.com/gpu=true` label and taint +4. The CloudFormation stack is idempotent (skipped if already healthy) + +**Configuration:** +```yaml +nodeGroups: + gpu: + enabled: true + instanceType: "p5.48xlarge" # H100 instance type + desiredCapacity: 2 + volumeSize: 2000 + volumeType: "gp3" + capacityReservation: + id: "cr-0abcdef1234567890" # Your Capacity Reservation ID + az: "us-east-2b" # AZ of the reservation + +aiPlatform: + defaultAcceleratorType: "H100" # Must be H100 for capacity block path +``` + +**Requirements:** +- You must have a valid EC2 Capacity Reservation (Capacity Block) purchased in your region +- The AZ of the reservation must match a subnet in your VPC +- `defaultAcceleratorType` must be set to `H100` + +**Cleanup:** +The `delete` and `delete-full` commands automatically clean up the CloudFormation stack (`${CLUSTER_NAME}-gpu-capacity-block`). + +--- + ### Auto Scaling #### Cluster Autoscaler diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md index 9dcbc9b..3d116f9 100644 --- a/tools/cluster_setup/K0S_README.md +++ b/tools/cluster_setup/K0S_README.md @@ -5,7 +5,6 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters. ## Table of Contents - [Overview](#overview) -- [Pure On-Premises Deployments](#pure-on-premises-deployments-no-aws) - [Features](#features) - [Prerequisites](#prerequisites) - [Quick Start](#quick-start) @@ -26,10 +25,12 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters.
The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform on k0s Kubernetes, supporting: - **Bare metal / on-premises deployments** with existing hardware and SSH access -- **AWS EC2 instances** for testing and simulation (auto-creates instances) -- **External S3-compatible object storage** (SeaweedFS, MinIO, or any S3-compatible endpoint) -- **In-cluster MinIO** as a fallback when no external storage is configured +- **External S3-compatible object storage** (SeaweedFS, MinIO, or any S3-compatible endpoint) — customer-managed - **Air-gapped environments** with private registries +- **Session logging** — all output captured to timestamped log files +- **Safety gates** — refuses to wipe a live cluster with Ready nodes + +> **Important:** This script requires pre-provisioned nodes with `existingIPs` in the config YAML. It does **not** auto-create cloud instances. Object storage must be external and customer-managed (no in-cluster MinIO is deployed). ### What is k0s? @@ -41,413 +42,41 @@ The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform o --- -## Pure On-Premises Deployments (No AWS) - -### Does this work for customers in their own data centers? - -**Yes!** The k0s deployment is specifically designed for on-premises deployments where customers have zero AWS presence. Here's what you need to know: - -### What Works Without AWS - -✅ **Complete AI Platform Stack** - All features (SAIA, Slim, SECA) work in pure on-prem environments -✅ **Flexible Object Storage** - External SeaweedFS/MinIO/S3-compatible, or in-cluster MinIO -✅ **No Cloud Dependencies** - No AWS services required -✅ **Air-Gapped Support** - Can run completely disconnected from the internet -✅ **Private Registries** - Use your own container registry instead of ECR - -### What You Need to Provide (On-Premises) - -**1. 
Physical/Virtual Infrastructure:** -- Physical servers or VMs with Ubuntu 22.04 LTS (or similar) -- Minimum 3 nodes (1 controller + 2 workers), recommended 5+ nodes -- Direct SSH access to all nodes -- Root/sudo privileges on all nodes - -**2. Network Infrastructure:** -- **Internal Network**: All nodes must be on the same network segment -- **IP Addressing**: Static IPs or DHCP reservations for all nodes -- **DNS (Optional but recommended)**: Internal DNS for node resolution -- **Internet Access (Initial Setup)**: For downloading k0s binary and container images - - Can be removed after installation for air-gapped operation - -**3. Network Ports (Between Nodes):** - -| Port | Protocol | Source | Destination | Purpose | -|------|----------|--------|-------------|---------| -| 22 | TCP | Admin workstation | All nodes | SSH management | -| 6443 | TCP | All nodes | Controller | Kubernetes API | -| 2380 | TCP | Controllers | Controllers | etcd peer communication | -| 10250 | TCP | All nodes | All nodes | Kubelet API | -| 8132 | TCP | Worker nodes | Controller | Konnectivity agent | -| 179 | TCP | All nodes | All nodes | Calico BGP (if using BGP) | -| 4789 | UDP | All nodes | All nodes | Calico VXLAN overlay | -| 30000-32767 | TCP | User networks | Worker nodes | NodePort services (optional) | - -**4. Storage:** -- Local disk space on each node: - - Controller: 100GB minimum - - CPU Worker: 200GB minimum (for MinIO and workloads) - - GPU Worker: 500GB+ recommended (for models and datasets) - -**5. For Private Container Registry:** -- Your own Docker registry (Harbor, Artifactory, etc.) 
-- Pre-pull and push all required images to your registry -- Configure imagePullSecrets for the registry - -### Network Architecture (Pure On-Premises) - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Your Data Center Network │ -│ (e.g., 10.0.0.0/16) │ -└─────────────────────────────────────────────────────────────┘ - │ - ┌───────────────────┼───────────────────┐ - │ │ │ -┌───────▼──────────┐ ┌──────▼───────────┐ ┌───▼──────────────┐ -│ Controller Node │ │ CPU Worker 1 │ │ GPU Worker 1 │ -│ 10.0.1.10 │ │ 10.0.1.20 │ │ 10.0.1.30 │ -│ :6443 (API) │ │ │ │ │ -│ :8132 (Konnect) │ │ • MinIO │ │ • Ray GPU Pods │ -└──────────────────┘ └──────────────────┘ └──────────────────┘ - │ │ │ - └───────────────────┼───────────────────┘ - │ - ┌─────────▼──────────┐ - │ Calico VXLAN │ - │ Pod Network │ - │ 10.244.0.0/16 │ - └────────────────────┘ -``` - -**Key Points:** -- **Host Network (10.0.0.0/16)**: Your physical data center network -- **Pod Network (10.244.0.0/16)**: Calico VXLAN overlay network -- **Service Network (10.96.0.0/16)**: Kubernetes ClusterIP services -- All pod-to-pod communication happens over VXLAN (no cloud networking) -- Object storage is internal / external to the cluster (SeaweedFS, MinIO, or S3-compatible endpoint) - -### Configuration Example (Pure On-Premises) - -```yaml -cluster: - name: onprem-ai-cluster - sshUser: ubuntu - sshKeyPath: ~/.ssh/onprem-key - -nodes: - controllers: 1 - cpuWorkers: 2 # First 2 workers are CPU - gpuWorkers: 2 # Remaining 2 workers are GPU - - existingIPs: - controllers: - - 10.0.1.10 # Your controller server IP - workers: - - 10.0.1.20 # CPU worker 1 (index 0) - - 10.0.1.21 # CPU worker 2 (index 1) - - 10.0.1.30 # GPU worker 1 (index 2) - - 10.0.1.31 # GPU worker 2 (index 3) - -storage: - storageClass: "local-path" - vectorDbSize: "100Gi" - objectStore: - type: "minio" # External MinIO endpoint - bucket: "ai-platform-data" - endpoint: "http://10.0.1.50:9000" - auth: - rootUser: "minio-admin" - 
rootPassword: "SuperSecurePassword123!" - -images: - registry: "registry.yourcompany.com" - operator: - image: "registry.yourcompany.com/splunk/splunk-ai-operator:v0.1.5" - splunk: - image: "registry.yourcompany.com/splunk/splunk:latest" - operatorImage: "registry.yourcompany.com/splunk/splunk-operator:3.0.0" - ray: - headImage: "registry.yourcompany.com/ray/ray-head:build-v1alpha1" - workerImage: "registry.yourcompany.com/ray/ray-worker-gpu:build-v1alpha1" - weaviate: - image: "registry.yourcompany.com/weaviate:stable-v1.28" - saia: - apiImage: "registry.yourcompany.com/saia/saia-api:build-v1alpha1" - dataLoaderImage: "registry.yourcompany.com/saia/saia-data-loader:build-v1alpha1" - slim: - apiImage: "registry.yourcompany.com/slim/slim-api:v0.0.1" - -kubernetes: - namespace: ai-platform - -imagePullSecrets: - secrets: - - private-registry-secret - autoCreateECR: false - -aiPlatform: - name: "onprem-ai-stack" - features: - - name: "saia" - version: "1.1.0" - - name: "slim" - version: "1.0.0" -``` - -### Installation Steps (Pure On-Premises) - -**1. Prepare Your Nodes:** -```bash -# On each node, ensure: -# - Ubuntu 22.04 LTS installed -# - SSH access configured -# - Passwordless sudo enabled -# - Python 3.8+ installed - -# Example setup on each node: -ssh ubuntu@10.0.1.10 -sudo apt-get update -sudo apt-get install -y python3 curl -``` - -**2. Configure SSH Access:** -```bash -# From your admin workstation -# Test SSH access to all nodes -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.10 "hostname" -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.20 "hostname" -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.21 "hostname" -``` - -**3. Create Configuration File:** -```bash -# Copy template and edit -cp k0s-cluster-config.yaml onprem-config.yaml -vi onprem-config.yaml -# - Set existingIPs to your node IPs -# - Set autoCreateECR: false -# - Configure MinIO credentials -``` - -**4. 
Run Installation:** -```bash -# From your admin workstation (must have internet access for initial download) -CONFIG_FILE=./onprem-config.yaml ./k0s_cluster_with_stack.sh install -``` - -**5. Access Your Cluster:** -```bash -# Kubeconfig is saved to ~/.kube/k0s- -export KUBECONFIG=~/.kube/k0s-onprem-ai-cluster - -# Verify -kubectl get nodes -kubectl get pods -A -``` - -### Private Container Registry Setup - -If using a private registry instead of public Docker Hub: - -**1. Set up your registry** (Harbor, Artifactory, JFrog, etc.) - -**2. Pre-pull and push images:** -```bash -# Pull from public registries -docker pull rayproject/ray:2.9.0 -docker pull semitechnologies/weaviate:1.28.0 -docker pull minio/minio:latest - -# Tag for your registry -docker tag rayproject/ray:2.9.0 registry.yourcompany.com/ray:2.9.0 -docker tag semitechnologies/weaviate:1.28.0 registry.yourcompany.com/weaviate:1.28.0 -docker tag minio/minio:latest registry.yourcompany.com/minio:latest - -# Push to your registry -docker push registry.yourcompany.com/ray:2.9.0 -docker push registry.yourcompany.com/weaviate:1.28.0 -docker push registry.yourcompany.com/minio:latest -``` - -**3. Create registry secret:** -```bash -kubectl create secret docker-registry private-registry-secret \ - --docker-server=registry.yourcompany.com \ - --docker-username=admin \ - --docker-password=secretpassword \ - --namespace=ai-platform -``` - -**4. Configure in k0s-cluster-config.yaml:** -```yaml -imagePullSecrets: - secrets: - - private-registry-secret - autoCreateECR: false - -aiplatform: - ray: - image: "registry.yourcompany.com/ray:2.9.0" - vectordb: - image: "registry.yourcompany.com/weaviate:1.28.0" -``` - -### Air-Gapped Deployment - -For completely disconnected environments: - -**1. Pre-stage on a connected system:** -- Download k0s binary -- Pull all required container images -- Download Helm charts - -**2. 
Transfer to air-gapped environment:** -- Copy k0s binary to all nodes -- Load images into local registry -- Copy Helm charts and manifests - -**3. Configure to use local resources:** -```yaml -imagePullSecrets: - secrets: - - airgap-registry - autoCreateECR: false -``` - -**4. Run installation pointing to local registry** - -### Common On-Premises Scenarios - -#### Scenario 1: Corporate Data Center with Proxy - -```yaml -# Configure nodes to use corporate proxy -# On each node: -export HTTP_PROXY=http://proxy.corp.com:8080 -export HTTPS_PROXY=http://proxy.corp.com:8080 -export NO_PROXY=localhost,127.0.0.1,10.0.0.0/8,.cluster.local - -# Then run installation -``` - -#### Scenario 2: Multiple Data Centers (Multi-Site) - -For multi-site deployments: -- Deploy separate k0s cluster per data center -- Use federation or multi-cluster management (not covered in this script) -- Consider network latency between sites (<10ms recommended for etcd) - -#### Scenario 3: Existing Kubernetes Cluster - -If you already have a Kubernetes cluster: -```yaml -cluster: - useExisting: force # Use existing cluster instead of creating new one -``` - -Then install just the AI Platform stack on your existing cluster. 
- -### Networking Deep Dive - -#### Required Connectivity Matrix - -| From | To | Ports | Purpose | -|------|-----|-------|---------| -| Admin Workstation | All nodes | 22/TCP | SSH management | -| All nodes | Controller | 6443/TCP | Kubernetes API | -| All nodes | Controller | 8132/TCP | Konnectivity | -| All nodes | All nodes | 10250/TCP | Kubelet | -| All nodes | All nodes | 4789/UDP | VXLAN overlay | -| Controllers | Controllers | 2380/TCP | etcd (HA only) | -| User clients | Worker nodes | 30000-32767/TCP | NodePort (optional) | - -#### Firewall Configuration Example (iptables) - -```bash -# On controller node -sudo iptables -A INPUT -p tcp --dport 6443 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 8132 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 2380 -s 10.0.0.0/16 -j ACCEPT - -# On all nodes -sudo iptables -A INPUT -p tcp --dport 10250 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p udp --dport 4789 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 179 -s 10.0.0.0/16 -j ACCEPT -``` - -#### DNS Requirements - -**Optional but Recommended:** -- Internal DNS server resolving node hostnames -- Or: Configure /etc/hosts on all nodes with all node IPs - -```bash -# Example /etc/hosts on each node -10.0.1.10 controller1.corp.local controller1 -10.0.1.20 worker1.corp.local worker1 -10.0.1.21 worker2.corp.local worker2 -``` - -### What About AWS Features? 
- -| AWS Feature | On-Prem Alternative | -|-------------|---------------------| -| S3 Storage | MinIO (S3-compatible) ✅ | -| ECR Registry | Harbor, Artifactory, JFrog ✅ | -| EBS Volumes | Local storage (local-path) ✅ | -| IAM Roles | Kubernetes ServiceAccounts ✅ | -| ELB/ALB | NodePort or MetalLB ✅ | -| VPC Networking | Calico VXLAN ✅ | -| Route53 DNS | Internal DNS server ✅ | -| CloudWatch | Prometheus + Grafana ✅ | - -**Everything works on-premises with alternative solutions!** - ---- - ## Features ### Complete AI Platform Stack The script installs everything needed for the AI Platform: -1. **k0s Kubernetes Cluster** - CNCF certified, single-binary Kubernetes -2. **Calico CNI** - High-performance networking with VXLAN -3. **local-path Storage Provisioner** - Default StorageClass for PVCs -4. **Object Storage** - External S3-compatible (SeaweedFS/MinIO) or in-cluster MinIO -5. **Cert-Manager v1.13.0** - Automated certificate management -6. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -7. **OpenTelemetry Operator** - Distributed tracing and telemetry -8. **NVIDIA Host Drivers + Device Plugin** - GPU support for AI workloads (optional, bare-metal driver install) -9. **KubeRay Operator v1.0.0** - Ray cluster management for distributed AI -10. **Splunk Operator** - Splunk Enterprise management -11. **Splunk AI Platform Operator** - AI platform orchestration (SAIA, Slim, SECA features) -12. 
**AIPlatform CR** - Complete AI deployment with features, scheduling, and secrets - -### Two Deployment Modes - -#### Mode 1: Bare Metal / On-Premises -- Provide existing IP addresses in `nodes.existingIPs` -- Script SSHs into each node and installs k0s, NVIDIA drivers (if GPU), iptables, and PyYAML -- Passwordless SSH with sudo access required -- Production-ready for on-prem deployments -- Air-gapped support with private registries - -#### Mode 2: AWS EC2 (Testing / Simulation) -- Automatically creates EC2 instances (controller, CPU workers, GPU workers) -- Creates or reuses a Security Group with required k0s ports open -- User-data bootstraps nodes with `curl`, `wget`, `jq`, and k0s binary -- Quick setup for testing and validation before on-prem rollout - -### Image Pull Secrets Support 🔐 +1. **k0s Kubernetes Cluster** — CNCF certified, single-binary Kubernetes +2. **Calico CNI** — High-performance networking with VXLAN +3. **local-path Storage Provisioner** — Default StorageClass for PVCs +4. **Cert-Manager v1.13.0** — Automated certificate management +5. **Kube-Prometheus Stack** — Monitoring with Prometheus + Grafana +6. **OpenTelemetry Operator** — Distributed tracing and telemetry +7. **NVIDIA Host Drivers + Device Plugin** — GPU support (RHEL 9/10, AL2023, Debian/Ubuntu) +8. **KubeRay Operator v1.2.2** — Ray cluster management for distributed AI +9. **Splunk Operator** — Splunk Enterprise management +10. **Splunk AI Platform Operator** — AI platform orchestration (SAIA feature) +11. 
**AIPlatform CR** — Complete AI deployment with features, scheduling, and secrets + +### Operational Features + +- **Two-phase parallel installation** — Independent components install concurrently for faster deployments +- **Helm retry with exponential backoff** — Automatic retries on transient errors (timeouts, TLS handshake failures) +- **Preflight validation** — Checks tools, config, SSH connectivity, and disk space before starting +- **Safety gate** — Refuses to wipe a cluster that has Ready nodes (prevents accidental data loss) +- **Session logging** — All stdout/stderr captured to `tools/cluster_setup/logs/k0s-install-YYYY-MM-DD_HH-MM-SS.log` +- **Existing cluster detection** — `useExisting` flag (auto/force/never) to skip k0s install and deploy stack only + +### Image Pull Secrets Support Automatically creates and configures secrets for private container registries: -- **AWS ECR** - Elastic Container Registry (auto-token refresh) -- **Docker Hub** - Docker Hub private repositories -- **GCR** - Google Container Registry -- **ACR** - Azure Container Registry -- **Custom** - Any Docker registry +- **AWS ECR** — Elastic Container Registry (auto-token refresh) +- **Docker Hub** — Docker Hub private repositories +- **GCR** — Google Container Registry +- **ACR** — Azure Container Registry +- **Custom** — Any Docker registry Secrets are automatically propagated through the platform: ``` @@ -458,11 +87,11 @@ AIPlatform CR → AIService → Job/RayCluster → Pods ## Prerequisites -### Required Tools +### Required Tools (on Admin Workstation) ```bash # Install required tools on macOS -brew install kubectl helm git jq yq aws-cli +brew install kubectl helm git jq yq # Install required tools on Ubuntu/Debian sudo apt-get update @@ -478,58 +107,42 @@ jq --version yq --version ``` -### For On-Prem Deployments +### Hardware Requirements + +| Node Type | CPU | RAM | Disk | Notes | +|-----------|-----|-----|------|-------| +| Controller | 4+ | 8GB+ | 100GB+ | Runs API server, etcd, 
scheduler | +| CPU Worker | 8+ | 32GB+ | 200GB+ | Runs Weaviate, Ray head, Splunk | +| GPU Worker | 8+ | 32GB+ | 500GB+ | NVIDIA GPU required for AI inference | -**Hardware Requirements:** -- **Controller Node**: 4 CPU, 8GB RAM, 50GB disk (minimum) -- **CPU Worker**: 8 CPU, 32GB RAM, 100GB disk (recommended for AI) -- **GPU Worker**: 8 CPU, 32GB RAM, 100GB disk + NVIDIA GPU +### Software Requirements (on All Nodes) -**Software Requirements:** -- Ubuntu 22.04 LTS (or similar Linux distribution) -- Passwordless SSH access to all nodes +- RHEL 9/10, Amazon Linux 2023, or Debian/Ubuntu +- Passwordless SSH access from admin workstation - Sudo privileges without password -- Python 3.8+ installed on all nodes +- Python 3.8+ installed + +### Network Requirements -**Network Requirements:** Open the following ports between nodes: | Port | Protocol | Purpose | |------|----------|---------| +| 22 | TCP | SSH management | | 6443 | TCP | Kubernetes API server | -| 2380 | TCP | etcd client | +| 2380 | TCP | etcd peer communication | | 10250 | TCP | Kubelet API | | 8132 | TCP | Konnectivity agent | | 179 | TCP | Calico BGP | | 4789 | UDP | Calico VXLAN | -| 30000-32767 | TCP | NodePort services | - -### For AWS EC2 Deployments - -**AWS Requirements:** -- AWS CLI configured with credentials -- IAM permissions: EC2, VPC, Security Groups -- Existing VPC with internet gateway -- SSH key pair in AWS region -- Sufficient EC2 quotas: - - t3.xlarge (controllers): 1+ instances - - m5.4xlarge (CPU workers): 2+ instances - - g5.2xlarge (GPU workers): 2+ instances +| 30000-32767 | TCP | NodePort services (optional) | -**Verify AWS Access:** -```bash -# Check AWS credentials -aws sts get-caller-identity - -# Check available regions -aws ec2 describe-regions --output table +### External Object Storage -# Check EC2 quotas -aws service-quotas get-service-quota \ - --service-code ec2 \ - --quota-code L-1216C47A \ - --region us-west-2 -``` +You must provide an external S3-compatible object storage 
endpoint: +- **SeaweedFS**, **MinIO**, or any S3-compatible service +- Must be reachable from all cluster nodes +- The script does **not** deploy object storage in-cluster --- @@ -555,17 +168,13 @@ vi my-cluster.yaml ### 3. Deploy the Cluster ```bash -# For on-prem deployment -CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh install - -# For EC2 testing CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh install ``` ### 4. Verify Installation ```bash -# Set kubeconfig +# Set kubeconfig (saved automatically during install) export KUBECONFIG=~/.kube/k0s-my-cluster # Check nodes @@ -587,57 +196,55 @@ kubectl get pods --all-namespaces The `k0s-cluster-config.yaml` file controls all aspects of the deployment: ```yaml -cluster: # Cluster name, useExisting, region, SSH user/key +cluster: # Cluster name, useExisting, SSH user/key nodes: # Controller/worker counts and existingIPs -storage: # storageClass, vectorDbSize, objectStore (type/endpoint/auth) -images: # registry prefix, operator, splunk, ray, weaviate, saia, slim, fluentBit, otelCollector +storage: # storageClass, vectorDbSize, objectStore, minimumDiskSpace +images: # registry prefix, operator, splunk, ray, weaviate, saia, nginx, fluentBit, otelCollector operators: # ray (version/modelVersion/rayVersion), certManager, nvidia devicePluginVersion kubernetes: # namespace files: # splunkOperator, aiPlatform manifest paths splunk: # standaloneName -aiPlatform: # defaultAcceleratorType, workerGroupConfig, features, scheduling +aiPlatform: # defaultAcceleratorType, workerGroupConfig, features, scheduling, serviceTemplate imagePullSecrets: # secrets list, autoCreateECR, dockerHub, gcr, acr, custom ecr: # account, region -ec2: # vpcId, subnetId, keyName (EC2 mode) -instanceTypes: # controller, cpuWorker, gpuWorker (EC2 mode) ``` -### Configuration Examples - -#### Example 1: On-Premises / Bare Metal Production Cluster - -**Use Case:** Production deployment on existing hardware with external object storage 
+### Configuration Example ```yaml cluster: name: prod-ai-platform + useExisting: auto # auto | force | never sshUser: ubuntu sshKeyPath: ~/.ssh/prod-key.pem nodes: controllers: 1 - cpuWorkers: 2 # First 2 workers treated as CPU - gpuWorkers: 2 # Remaining 2 workers treated as GPU - + cpuWorkers: 2 # First 2 workers treated as CPU + gpuWorkers: 2 # Remaining 2 workers treated as GPU existingIPs: controllers: - - 10.0.1.10 # Physical server 1 + - 10.0.1.10 workers: - - 10.0.1.20 # Physical server 2 (CPU - worker index 0) - - 10.0.1.21 # Physical server 3 (CPU - worker index 1) - - 10.0.1.22 # Physical server 4 (GPU - worker index 2) - - 10.0.1.23 # Physical server 5 (GPU - worker index 3) + - 10.0.1.20 # CPU (worker index 0) + - 10.0.1.21 # CPU (worker index 1) + - 10.0.1.22 # GPU (worker index 2) + - 10.0.1.23 # GPU (worker index 3) storage: storageClass: "local-path" vectorDbSize: "200Gi" + minimumDiskSpace: # Preflight disk checks (GB) + controller: 100 + cpuWorker: 200 + gpuWorker: 500 objectStore: - type: "seaweedfs" - bucket: "ai-platform-production" - endpoint: "http://10.0.1.50:8333" + type: "seaweedfs" # aws | s3compat | minio | seaweedfs + bucket: "ai-platform-data" + endpoint: "http://10.0.1.50:8333" # REQUIRED for s3compat/minio/seaweedfs auth: rootUser: "admin" - rootPassword: "Change-This-Strong-Password-123!" + rootPassword: "Change-This-Strong-Password!" 
images: registry: "registry.corp.com" @@ -653,9 +260,24 @@ images: image: "docker.io/semitechnologies/weaviate:stable-v1.28" saia: apiImage: "registry.corp.com/saia/saia-api:build-v1alpha1" + apiV2Image: "registry.corp.com/saia/saia-api-v2:build-v1alpha1" dataLoaderImage: "registry.corp.com/saia/saia-data-loader:build-v1alpha1" - slim: - apiImage: "registry.corp.com/slim/slim-api:v0.0.1" + nginx: + image: "docker.io/library/nginx:1.27-alpine" + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + +operators: + ray: + version: "v1.2.2" + modelVersion: "v0.3.14-36-g1549f5a" + rayVersion: "2.44.0" + certManager: + installCRDs: true + nvidia: + devicePluginVersion: "v0.17.3" kubernetes: namespace: ai-platform @@ -663,332 +285,178 @@ kubernetes: splunk: standaloneName: splunk-prod -imagePullSecrets: - secrets: - - private-registry-secret - autoCreateECR: false - aiPlatform: name: "prod-ai-stack" + defaultAcceleratorType: "L40S" # GPU tier: L40S, H100, or "" + workerGroupConfig: + imageRegistry: "" # Override registry for Ray worker images features: - name: "saia" version: "1.1.0" - - name: "slim" - version: "1.0.0" -``` - -#### Example 2: AWS EC2 Testing Cluster - -**Use Case:** Quick testing/validation before on-prem deployment - -```yaml -cluster: - name: test-ai-platform - region: us-west-2 - sshUser: ec2-user - sshKeyPath: ~/.ssh/test-key.pem - -nodes: - controllers: 1 - cpuWorkers: 2 - gpuWorkers: 1 - existingIPs: - controllers: [] # Empty = auto-create EC2 - workers: [] # Empty = auto-create EC2 - -ec2: - vpcId: vpc-0123456789abcdef0 - subnetId: "" - keyName: test-key - -instanceTypes: - controller: t3.xlarge - cpuWorker: m5.2xlarge - gpuWorker: g5.xlarge - -storage: - storageClass: "local-path" - vectorDbSize: "50Gi" - objectStore: - type: "minio" - bucket: "ai-platform-test" - endpoint: "http://minio-host:9000" - auth: - rootUser: "minioadmin" - rootPassword: "minioadmin123" 
- -images: - registry: "123456789012.dkr.ecr.us-west-2.amazonaws.com" - operator: - image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk-ai-operator:latest" - -ecr: - account: "123456789012" # Your AWS account ID - region: us-west-2 + serviceAccountName: "" + cpuScheduling: + nodeSelector: + splunk.ai/workload-type: cpu + tolerations: [] + gpuScheduling: + nodeSelector: + splunk.ai/workload-type: gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + serviceTemplate: # Optional: expose SAIA externally + type: "NodePort" # NodePort | LoadBalancer + nodePort: 30080 # Port for NodePort type imagePullSecrets: secrets: [] autoCreateECR: true + dockerHub: + enabled: false + username: "" + password: "" + email: "" + gcr: + enabled: false + jsonKey: "" + acr: + enabled: false + registry: "" + username: "" + password: "" + custom: + enabled: false + name: "custom-registry-secret" + server: "" + username: "" + password: "" + email: "" -kubernetes: - namespace: ai-platform -``` - -#### Example 3: Hybrid Cluster (Some Existing, Some New) - -**Use Case:** Mix existing on-prem nodes with cloud nodes - -```yaml -cluster: - name: hybrid-cluster - region: us-east-1 - sshUser: ubuntu - sshKeyPath: ~/.ssh/hybrid-key.pem - -nodes: - controllers: 1 - cpuWorkers: 2 # First 2 workers are CPU (on-prem), + 2 EC2 CPU workers created - gpuWorkers: 2 # Remaining 2 on-prem workers are GPU - - existingIPs: - controllers: - - 192.168.1.10 # Existing on-prem controller - workers: - - 192.168.1.20 # Existing on-prem worker (CPU - index 0) - - 192.168.1.21 # Existing on-prem worker (CPU - index 1) - - 192.168.1.30 # Existing on-prem worker (GPU - index 2) - - 192.168.1.31 # Existing on-prem worker (GPU - index 3) - -ec2: - vpcId: vpc-0123456789abcdef0 - keyName: hybrid-key - -instanceTypes: - cpuWorker: m5.2xlarge # For new EC2 workers - -imagePullSecrets: - autoCreateECR: true -``` - -#### Example 4: Air-Gapped On-Prem Cluster - -**Use 
Case:** Secure environment with no internet access - -```yaml -cluster: - name: airgap-cluster - sshUser: admin - sshKeyPath: ~/.ssh/secure-key.pem - -nodes: - controllers: 3 # HA setup - cpuWorkers: 2 # First 2 workers are CPU - gpuWorkers: 1 # Last worker is GPU - - existingIPs: - controllers: - - 172.16.0.10 - - 172.16.0.11 - - 172.16.0.12 - workers: - - 172.16.0.20 # CPU - - 172.16.0.21 # CPU - - 172.16.0.22 # GPU - -storage: - storageClass: "local-path" - vectorDbSize: "100Gi" - objectStore: - type: "minio" - bucket: "airgap-storage" - endpoint: "http://172.16.0.50:9000" - auth: - rootUser: "secure-admin" - rootPassword: "Very-Long-Secure-Password-456!" - -images: - registry: "registry.airgap.local" - operator: - image: "registry.airgap.local/splunk-ai-operator:v0.1.5" - -imagePullSecrets: - secrets: - - private-registry-secret # Pre-created manually - autoCreateECR: false - -# Note: Pre-pull all images to local registry before installation -# See the "Internet Dependencies" section for the full list of images +ecr: + account: "123456789012" + region: us-east-2 ``` ### Configuration Reference #### Cluster Section -```yaml -cluster: - # Cluster name (used for tagging, kubeconfig, etc.) - name: my-cluster - - # Use existing cluster instead of creating new one - # Options: auto (detect), force (fail if not found), never (always create) - useExisting: auto - - # AWS region. Required for EC2 mode. Also used as fallback for ecr.region - # when pulling images from ECR (even in bare-metal mode). - # Not needed for pure on-prem with no AWS. 
- region: us-west-2 - - # SSH configuration - sshUser: ubuntu # SSH username - sshKeyPath: ~/.ssh/my-key.pem # Path to private key -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `cluster.name` | Yes | — | Cluster identifier (used for kubeconfig, labels) | +| `cluster.useExisting` | No | `never` | `auto` = detect existing cluster, `force` = fail if not found, `never` = always create new | +| `cluster.sshUser` | Yes | `ubuntu` | SSH username for all nodes | +| `cluster.sshKeyPath` | Yes | — | Path to SSH private key | #### Nodes Section -```yaml -nodes: - # Number of controller nodes (1 or 3 for HA) - controllers: 1 - - # Number of CPU workers. In EC2 mode: instances to create. - # In bare-metal mode: first N entries in workers[] are CPU, rest are GPU. - # Controls node labeling, NVIDIA driver install, and GPU device plugin. - cpuWorkers: 2 - - # Number of GPU workers. In EC2 mode: instances to create. - # In bare-metal mode: workers after the first cpuWorkers are treated as GPU. 
- gpuWorkers: 1 - - # Existing IP addresses (bare-metal / on-prem mode) - existingIPs: - controllers: [] # Leave empty for EC2 auto-creation - workers: [] # Leave empty for EC2 auto-creation -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `nodes.controllers` | No | `1` | Number of controller nodes (1 or 3 for HA) | +| `nodes.cpuWorkers` | No | `2` | First N workers in the list are labeled as CPU | +| `nodes.gpuWorkers` | No | `1` | Remaining workers after cpuWorkers are labeled as GPU | +| `nodes.existingIPs.controllers` | **Yes** | — | List of controller node IPs | +| `nodes.existingIPs.workers` | **Yes** | — | List of worker node IPs | #### Storage Section -```yaml -storage: - storageClass: "local-path" # Kubernetes StorageClass for PVCs - vectorDbSize: "50Gi" # Weaviate PersistentVolume size - - objectStore: - type: "seaweedfs" # aws | s3compat | minio | seaweedfs - bucket: "ai-platform-bucket" # S3 bucket name - endpoint: "http://host:8333" # S3-compatible endpoint URL - auth: - rootUser: "admin" # Access key / root user - rootPassword: "password" # Secret key / root password -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `storage.storageClass` | No | `local-path` | Kubernetes StorageClass for PVCs | +| `storage.vectorDbSize` | No | `50Gi` | Weaviate PersistentVolume size | +| `storage.minimumDiskSpace.controller` | No | `100` | Minimum disk (GB) for controller preflight check | +| `storage.minimumDiskSpace.cpuWorker` | No | `200` | Minimum disk (GB) for CPU worker preflight check | +| `storage.minimumDiskSpace.gpuWorker` | No | `500` | Minimum disk (GB) for GPU worker preflight check | +| `storage.objectStore.type` | No | `minio` | `aws`, `s3compat`, `minio`, or `seaweedfs` | +| `storage.objectStore.bucket` | No | `ai-platform-data` | S3 bucket name | +| `storage.objectStore.endpoint` | **Yes*** | — | S3-compatible endpoint URL (*required for 
s3compat/minio/seaweedfs) | +| `storage.objectStore.auth.rootUser` | Yes | — | Access key / root user | +| `storage.objectStore.auth.rootPassword` | Yes | — | Secret key / root password | #### Images Section Short image paths (without a FQDN) are automatically prefixed with `images.registry`. -```yaml -images: - registry: "myregistry.com" # Prefix applied to short image paths - operator: - image: "myregistry.com/splunk-ai-operator:v0.1.5" - splunk: - image: "myregistry.com/splunk:latest" - operatorImage: "docker.io/splunk/splunk-operator:3.0.0" - ray: - headImage: "ray/ray-head:build-v1alpha1" - workerImage: "ray/ray-worker-gpu:build-v1alpha1" - weaviate: - image: "docker.io/semitechnologies/weaviate:stable-v1.28" - saia: - apiImage: "saia/saia-api:build-v1alpha1" - dataLoaderImage: "saia/saia-data-loader:build-v1alpha1" - slim: - apiImage: "myregistry.com/slim-api:v0.0.1" - fluentBit: - image: "docker.io/fluent/fluent-bit:1.9.6" - otelCollector: - image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" -``` - -**Image patching chain:** The script reads these config values, resolves them via `build_image_url()` (prepends registry if needed), then uses `sed` to patch the corresponding `RELATED_IMAGE_*` env vars in the manifest files: +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `images.registry` | No | `""` | Registry prefix for short image paths | +| `images.operator.image` | **Yes** | — | Splunk AI Operator image | +| `images.splunk.image` | **Yes** | — | Splunk Enterprise image | +| `images.splunk.operatorImage` | No | `docker.io/splunk/splunk-operator:3.0.0` | Splunk Operator image | +| `images.ray.headImage` | **Yes** | — | Ray head node image | +| `images.ray.workerImage` | **Yes** | — | Ray GPU worker image | +| `images.weaviate.image` | **Yes** | — | Weaviate vector DB image | +| `images.saia.apiImage` | **Yes** | — | SAIA API v1 image | +| `images.saia.apiV2Image` | **Yes** | — | SAIA API v2 image | 
+| `images.saia.dataLoaderImage` | **Yes** | — | SAIA data loader / post-install hook image | +| `images.nginx.image` | No | `docker.io/library/nginx:1.27-alpine` | Nginx reverse proxy for SAIA v1/v2 routing | +| `images.fluentBit.image` | No | `fluent/fluent-bit:1.9.6` | Fluent Bit log forwarder | +| `images.otelCollector.image` | No | `otel/opentelemetry-collector-contrib:0.122.1` | OpenTelemetry Collector | + +**Image patching chain:** The script reads these config values, resolves them via `build_image_url()` (prepends registry if needed), then uses `sed` to patch the corresponding `RELATED_IMAGE_*` env vars in manifest files: | Config field | Env var patched | Target file | |---|---|---| | `images.operator.image` | Container `image:` field | `artifacts.yaml` | -| `images.splunk.image` | `RELATED_IMAGE_SPLUNK_ENTERPRISE` | `splunk-operator-cluster.yaml` only | +| `images.splunk.image` | `RELATED_IMAGE_SPLUNK_ENTERPRISE` | `splunk-operator-cluster.yaml` | | `images.splunk.operatorImage` | Container `image:` field | `splunk-operator-cluster.yaml` | | `images.ray.headImage` | `RELATED_IMAGE_RAY_HEAD` | `artifacts.yaml` | | `images.ray.workerImage` | `RELATED_IMAGE_RAY_WORKER` | `artifacts.yaml` | | `images.weaviate.image` | `RELATED_IMAGE_WEAVIATE` | `artifacts.yaml` | | `images.saia.apiImage` | `RELATED_IMAGE_SAIA_API` | `artifacts.yaml` | +| `images.saia.apiV2Image` | `RELATED_IMAGE_SAIA_API_V2` | `artifacts.yaml` | | `images.saia.dataLoaderImage` | `RELATED_IMAGE_POST_INSTALL_HOOK` | `artifacts.yaml` | -| `images.slim.apiImage` | `RELATED_IMAGE_SLIM_API` | `artifacts.yaml` | +| `images.nginx.image` | `RELATED_IMAGE_NGINX` | `artifacts.yaml` | | `images.fluentBit.image` | `RELATED_IMAGE_FLUENT_BIT` | `artifacts.yaml` | | `images.otelCollector.image` | `RELATED_IMAGE_OTEL_COLLECTOR` | `artifacts.yaml` | | `operators.ray.modelVersion` | `MODEL_VERSION` | `artifacts.yaml` | | `operators.ray.rayVersion` | `RAY_VERSION` | `artifacts.yaml` | -> **Note:** 
`RELATED_IMAGE_SPLUNK_ENTERPRISE` also exists in `artifacts.yaml` but is only -> patched in `splunk-operator-cluster.yaml`. `SPLUNK_METRICS_INDEX_NAME` in `artifacts.yaml` -> is not configurable from the config file. - -#### Operators Section - -```yaml -operators: - ray: - version: "v1.2.2" # KubeRay operator Helm chart version - modelVersion: "v0.3.14-36-g1549f5a" # Model version label for Ray - rayVersion: "2.44.0" # Ray runtime version - certManager: - installCRDs: true # Install cert-manager CRDs - nvidia: - devicePluginVersion: "v0.17.3" # NVIDIA k8s device plugin version -``` - #### AI Platform Section -```yaml -aiPlatform: - defaultAcceleratorType: "L40S" # GPU tier: L40S, H100_NVL, or "" - workerGroupConfig: - imageRegistry: "" # Override registry for Ray worker images - # Note: name, features, cpuScheduling, gpuScheduling are defined - # for reference but currently hardcoded in the script's CR template -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `aiPlatform.name` | No | `${CLUSTER_NAME}-ai-platform` | Base name for the AIPlatform CR | +| `aiPlatform.defaultAcceleratorType` | No | `""` | GPU tier label: `L40S`, `H100`, or empty | +| `aiPlatform.workerGroupConfig.imageRegistry` | No | `""` | Override registry for Ray worker images | +| `aiPlatform.features` | Yes | — | Array of features to deploy (read dynamically from config) | +| `aiPlatform.features[].name` | Yes | — | Feature name (e.g., `saia`) | +| `aiPlatform.features[].version` | Yes | — | Feature version | +| `aiPlatform.features[].serviceAccountName` | No | `""` | Service account override | +| `aiPlatform.cpuScheduling.nodeSelector` | No | auto-generated | Node selector for CPU workloads | +| `aiPlatform.cpuScheduling.tolerations` | No | `[]` | Tolerations for CPU workloads | +| `aiPlatform.gpuScheduling.nodeSelector` | No | auto-generated | Node selector for GPU workloads | +| `aiPlatform.gpuScheduling.tolerations` | No | GPU toleration | 
Tolerations for GPU workloads | +| `aiPlatform.serviceTemplate.type` | No | — | Service type for SAIA exposure: `NodePort` or `LoadBalancer` | +| `aiPlatform.serviceTemplate.nodePort` | No | — | Node port number (only when type=NodePort) | #### Image Pull Secrets Section -The `secrets` list is **not consumed** by the script. Instead, the script auto-detects -which secrets exist in the namespace by checking for hardcoded names: `ecr-registry-secret`, -`docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`. +The `secrets` list is **not consumed** by the script. Instead, the script auto-detects which secrets exist in the namespace by checking for hardcoded names: `ecr-registry-secret`, `docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`. ```yaml imagePullSecrets: - secrets: # Pre-existing secret names; NOT consumed; script auto-detects secrets in namespace - - ecr-registry-secret - - docker-hub-secret - autoCreateECR: true # Auto-create ECR secret from AWS creds + secrets: [] # NOT consumed; script auto-detects in namespace + autoCreateECR: true # Consumed → creates ECR secret from AWS creds - # Docker Hub (optional) dockerHub: enabled: false username: "" password: "" email: "" - # Google Container Registry (optional) gcr: enabled: false - jsonKey: "" # GCP service account JSON key + jsonKey: "" - # Azure Container Registry (optional) acr: enabled: false - registry: "" # e.g. 
myregistry.azurecr.io + registry: "" username: "" password: "" - # Custom Docker-compatible registry (optional) custom: enabled: false name: "custom-registry-secret" @@ -1002,106 +470,81 @@ imagePullSecrets: ## Usage -### Basic Commands +### Commands ```bash -# Install cluster with custom config +# Install cluster and full AI Platform stack CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install -# Delete entire cluster +# Delete entire cluster (stop k0s, remove services) CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh delete # Clean all k0s state from bare-metal nodes (stop/reset/remove) CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh clean-all -# Join additional workers to an existing cluster +# Join additional workers to an existing cluster (or rejoin failed workers) CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` -### Advanced Commands - -```bash -# Install without confirmation prompts -AUTO_APPROVE=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install - -# Use existing cluster (skip k0s installation, deploy stack only) -USE_EXISTING=force CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install -``` - -### Post-Installation Tasks +### Environment Variables -#### 1. 
Access the Cluster +| Variable | Default | Description | +|----------|---------|-------------| +| `CONFIG_FILE` | `./k0s-cluster-config.yaml` | Path to configuration file | +| `AUTO_APPROVE` | `false` | Skip confirmation prompts | +| `USE_EXISTING` | (from config) | Override `cluster.useExisting` (`auto`/`force`/`never`) | +| `LOG_DIR` | `./logs` | Directory for session log files | -```bash -# Set kubeconfig environment variable -export KUBECONFIG=~/.kube/k0s-my-cluster +### Session Logging -# Or copy to default location -cp ~/.kube/k0s-my-cluster ~/.kube/config +All script output (stdout and stderr) is automatically captured to a timestamped log file: -# Verify cluster access -kubectl cluster-info -kubectl get nodes ``` - -#### 2. Check Installation Status - -```bash -# Check all namespaces -kubectl get pods --all-namespaces - -# Check AI Platform specifically -kubectl get aiplatform -n ai-platform -o wide - -# Check AIServices -kubectl get aiservice -n ai-platform - -# Check RayCluster -kubectl get rayservice -n ai-platform +tools/cluster_setup/logs/k0s-install-2026-04-29_14-30-00.log ``` -#### 3. Access MinIO Console - +Override the log directory: ```bash -# Port forward MinIO console -kubectl port-forward -n minio-system svc/minio 9001:9001 - -# Open in browser: http://localhost:9001 -# Login with credentials from config file +LOG_DIR=/var/log/k0s CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install ``` -#### 4. Access Splunk +### Install Flow -```bash -# Get Splunk admin password -SPLUNK_PASSWORD=$(kubectl get secret \ - splunk--standalone-secret-v1 \ - -n ai-platform \ - -o jsonpath='{.data.password}' | base64 -d) +The `install` command executes these steps in order: -echo "Splunk password: $SPLUNK_PASSWORD" +1. **Load config** — Parse YAML, validate existingIPs +2. **Validate images** — Ensure all required image fields are set +3. **Configure images** — Patch `RELATED_IMAGE_*` env vars in manifest files +4. 
**Preflight checks** — Validate tools, SSH connectivity, disk space, config +5. **Install k0s cluster** — Safety gate check → clean state → install controller → join workers → label nodes +6. **Install AI Platform stack** (two-phase parallel): + - Phase 1 (parallel): cert-manager, kube-prometheus, NVIDIA host drivers + - Between phases: Ensure S3 credentials secret + - Phase 2 (parallel): OTel operator, Ray operator, Splunk operator, NVIDIA device plugin + - Sequential: Image pull secrets → Splunk standalone → AI operator → AIPlatform CR +7. **Health checks** — Verify all components are running +8. **Access info** — Display kubeconfig path and service endpoints -# Port forward Splunk web UI -kubectl port-forward -n ai-platform \ - svc/splunk--standalone-service 8000:8000 +### join-workers Command -# Access at http://localhost:8000 -# Username: admin -# Password: (from above command) -``` +The `join-workers` command is used to: +- Add new worker nodes to an existing cluster +- Rejoin workers that were disconnected or failed -#### 5. Access Prometheus/Grafana +It: +1. Loads config and identifies which workers are not yet joined +2. Generates a fresh worker token from the controller +3. Installs k0s worker on each missing node +4. Waits for nodes to become Ready +5. Labels nodes with `splunk.ai/*` labels based on CPU/GPU role -```bash -# Prometheus -kubectl port-forward -n monitoring svc/prometheus-operated 9090:9090 -# Access at http://localhost:9090 - -# Grafana -kubectl port-forward -n monitoring svc/grafana 3000:80 -# Access at http://localhost:3000 -# Default credentials: admin/admin -``` +### useExisting Flag + +| Value | Behavior | +|-------|----------| +| `never` | Always creates a new k0s cluster (default). Fails if nodes have a live cluster (safety gate). | +| `auto` | Checks if a running k0s cluster exists on the controller. If yes, skips cluster creation and deploys stack only. If no, creates new cluster. | +| `force` | Assumes an existing cluster. 
Fails if no running cluster is found on the controller. | --- @@ -1130,9 +573,9 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 ┌─▼───────▼──────┐ ┌─────────▼────────┐ ┌───────▼─────────┐ │ CPU Worker 1 │ │ CPU Worker 2 │ │ GPU Worker │ │ │ │ │ │ │ -│ • MinIO │ │ • Weaviate │ │ • Ray GPU Pods │ -│ • Ray Head │ │ • Ray CPU Pods │ │ • AI Training │ -│ • Monitoring │ │ • AI Inference │ │ │ +│ • Ray Head │ │ • Weaviate │ │ • Ray GPU Pods │ +│ • Splunk │ │ • Ray CPU Pods │ │ • AI Inference │ +│ • Monitoring │ │ • AI Services │ │ │ └────────────────┘ └──────────────────┘ └─────────────────┘ ``` @@ -1141,7 +584,6 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 **Pod Network (Calico VXLAN):** - CIDR: `10.244.0.0/16` - Overlay network across all nodes -- Isolated from host network **Service Network:** - CIDR: `10.96.0.0/16` @@ -1149,55 +591,44 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 - NodePort range: `30000-32767` **Host Network:** -- Controller API: `:6443` -- Konnectivity: `:8132` -- SSH: `:22` +- Controller API: `:6443` +- Konnectivity: `:8132` +- SSH: `:22` ### Storage Architecture ``` ┌──────────────────────────────────────────────────────────┐ -│ MinIO Object Storage │ -│ (S3-Compatible, Running in Kubernetes) │ +│ External S3-Compatible Object Storage │ +│ (Customer-Managed: SeaweedFS / MinIO / S3) │ │ │ -│ Endpoint: http://minio.minio-system.svc.cluster.local │ -│ Port: 9000 (API), 9001 (Console) │ +│ Endpoint: http://: │ │ │ │ Buckets: │ -│ ├─ ai-platform-bucket/ │ -│ │ ├─ artifacts/ (Build artifacts) │ -│ │ ├─ models/ (ML models) │ -│ │ ├─ datasets/ (Training data) │ -│ │ └─ tasks/ (Task outputs) │ -│ │ │ -│ └─ splunk-index/ (Splunk SmartStore indexes) │ +│ └─ ai-platform-data/ │ +│ ├─ artifacts/ (Build artifacts) │ +│ ├─ models/ (ML models) │ +│ ├─ datasets/ (Training data) │ +│ └─ tasks/ (Task outputs) │ │ │ -│ Persistence: │ -│ └─ PVC: minio-storage (local-path) │ -│ Size: 100Gi (configurable) │ +│ Credentials stored 
in-cluster as: │ +│ └─ Secret: s3-secret (namespace: ai-platform) │ +│ Keys: s3_access_key, s3_secret_key │ └──────────────────────────────────────────────────────────┘ ``` **Access Patterns:** ```yaml -# From pods in cluster -endpoint: http://minio.minio-system.svc.cluster.local:9000 - -# From outside cluster (via port-forward) -endpoint: http://localhost:9000 - # AIPlatform CR reference objectStorage: - path: s3://ai-platform-bucket/artifacts - endpoint: http://minio.minio-system.svc.cluster.local:9000 - region: us-east-1 # Ignored by MinIO, but required + path: s3://<bucket>/artifacts + endpoint: http://<host>:<port> + region: us-east-1 secretRef: s3-secret ``` ### Component Architecture -#### Operator and Resource Hierarchy - ```mermaid graph TB subgraph "Control Plane Operators" @@ -1210,7 +641,7 @@ graph TB subgraph "AI Platform Namespace" AIPLATFORM[AIPlatform CR
Custom Resource] - AISERVICE[AIService CRs
saia, slim, seca] + AISERVICE[AIService CRs
saia] RAYSERVICE[RayService
Ray Serve + Cluster] RAYCLUSTER[RayCluster
Head + Workers] WEAVIATE[Weaviate
Vector Database] @@ -1219,7 +650,7 @@ graph TB end subgraph "Infrastructure" - MINIO[MinIO
Object Storage] + OBJSTORE[External Object Storage
S3-Compatible] PROMETHEUS[Prometheus
Metrics] GRAFANA[Grafana
Dashboards] STORAGE[Persistent Volumes
local-path] @@ -1235,14 +666,14 @@ graph TB RAYCLUSTER -->|provisions| RAYWORKER[Ray Worker Pods
CPU + GPU] SPLOP -->|watches & reconciles| SPLUNK - SPLUNK -->|stores logs| MINIO + SPLUNK -->|stores logs| OBJSTORE CERTMGR -->|issues certs| RAYSERVICE OTELOP -->|watches & creates| OTELCOL OTELCOL -->|sends traces| SPLUNK - AIPLATFORM -->|references| MINIO + AIPLATFORM -->|references| OBJSTORE AIPLATFORM -->|references| SPLUNK WEAVIATE -->|stores vectors| STORAGE @@ -1258,232 +689,10 @@ graph TB style OTELOP fill:#e1f5ff style AIPLATFORM fill:#fff3e0 style AISERVICE fill:#fff3e0 - style MINIO fill:#f3e5f5 + style OBJSTORE fill:#f3e5f5 style STORAGE fill:#f3e5f5 ``` -#### Data Flow and Interactions - -```mermaid -graph LR - subgraph "User Interface" - USER[User] - SPLUNKUI[Splunk UI
Search Head] - SAIAAPP[SAIA App
Splunk Application] - end - - subgraph "AI Platform Services" - SAIASERVICE[SAIA Service
AI Service CR] - RAYHEAD[Ray Head
Ray Serve API] - RAYWORKER_CPU[Ray Workers
CPU Nodes] - RAYWORKER_GPU[Ray Workers
GPU Nodes] - WEAVIATE[Weaviate
Vector DB] - end - - subgraph "Storage Layer" - MINIO[MinIO
S3-Compatible
Models & Artifacts] - PV[Persistent Volumes
Vector Data] - end - - subgraph "Observability" - SPLUNK[Splunk Enterprise
Logs & Events] - OTEL[OpenTelemetry
Traces] - PROM[Prometheus
Metrics] - end - - USER -->|uses| SPLUNKUI - SPLUNKUI -->|runs| SAIAAPP - SAIAAPP -->|sends prompts| SAIASERVICE - SAIASERVICE -->|connects to| RAYHEAD - RAYHEAD -->|distributes tasks| RAYWORKER_CPU - RAYHEAD -->|distributes tasks| RAYWORKER_GPU - RAYHEAD -->|vector search| WEAVIATE - - WEAVIATE -->|returns results| RAYHEAD - RAYHEAD -->|inference results| SAIASERVICE - SAIASERVICE -->|prompt results| SAIAAPP - SAIAAPP -->|displays to| USER - - RAYWORKER_CPU -->|load models| MINIO - RAYWORKER_GPU -->|load models| MINIO - RAYHEAD -->|store results| MINIO - - WEAVIATE -->|persist vectors| PV - - RAYHEAD -->|send logs| SPLUNK - RAYWORKER_CPU -->|send logs| SPLUNK - RAYWORKER_GPU -->|send logs| SPLUNK - WEAVIATE -->|send logs| SPLUNK - SAIASERVICE -->|send logs| SPLUNK - - RAYHEAD -->|send traces| OTEL - RAYWORKER_CPU -->|send traces| OTEL - SAIASERVICE -->|send traces| OTEL - OTEL -->|forward| SPLUNK - - RAYHEAD -->|expose metrics| PROM - RAYWORKER_CPU -->|expose metrics| PROM - RAYWORKER_GPU -->|expose metrics| PROM - WEAVIATE -->|expose metrics| PROM - SAIASERVICE -->|expose metrics| PROM - - style USER fill:#e8f5e9 - style SPLUNKUI fill:#fff9c4 - style SAIAAPP fill:#fff3e0 - style SAIASERVICE fill:#e1f5ff - style RAYHEAD fill:#e1f5ff - style RAYWORKER_CPU fill:#e1f5ff - style RAYWORKER_GPU fill:#e1f5ff - style WEAVIATE fill:#f3e5f5 - style MINIO fill:#fce4ec - style PV fill:#fce4ec - style SPLUNK fill:#fff9c4 - style OTEL fill:#fff9c4 - style PROM fill:#fff9c4 -``` - -#### Complete Platform Deployment - -```mermaid -graph TB - subgraph "Kubernetes Cluster - k0s" - subgraph "kube-system Namespace" - K8S_API[Kubernetes API Server] - CALICO[Calico CNI
VXLAN Networking] - end - - subgraph "cert-manager Namespace" - CERTMGR[Cert Manager
Certificate Controller] - ISSUER[Issuers & Certificates] - end - - subgraph "monitoring Namespace" - PROM[Prometheus
Metrics Collection] - GRAFANA[Grafana
Visualization] - ALERTMGR[Alert Manager
Alerting] - end - - subgraph "opentelemetry-operator-system" - OTELOP[OpenTelemetry Operator] - end - - subgraph "ray-system Namespace" - RAYOP[KubeRay Operator
Ray Management] - end - - subgraph "splunk-operator Namespace" - SPLOP[Splunk Operator
Splunk Management] - end - - subgraph "splunk-ai-operator-system" - AIOP[Splunk AI Operator
AI Platform Controller] - WEBHOOK[Admission Webhooks
Validation] - end - - subgraph "minio-system Namespace" - MINIO[MinIO Deployment
Object Storage] - MINIOPVC[MinIO PVC
200Gi] - end - - subgraph "ai-platform Namespace" - AIPLATFORM[AIPlatform CR
Main Resource] - - subgraph "AI Services" - SAIA[AIService: saia
Splunk AI Assistant] - SLIM[AIService: slim
Slim API] - end - - subgraph "Ray Infrastructure" - RAYSERVICE[RayService
Ray Serve] - RAYCLUSTER[RayCluster
Distributed Cluster] - RAYHEAD[Ray Head Pod
8 CPU, 32GB RAM] - RAYWORKER1[Ray Worker Pod
16 CPU, 64GB RAM] - RAYWORKER2[Ray Worker GPU Pod
8 CPU, 32GB, 1x GPU] - end - - subgraph "Data Services" - WEAVIATE[Weaviate StatefulSet
Vector Database] - WEAVIATEPVC[Weaviate PVC
50Gi] - end - - subgraph "Splunk Services" - SPLUNK[Splunk Standalone
Enterprise] - SPLUNKETC[Splunk etc PVC] - SPLUNKVAR[Splunk var PVC] - end - - subgraph "Observability" - OTELCOL[OpenTelemetry Collector
Traces] - end - - subgraph "Networking" - RAYSVC[Ray Head Service
ClusterIP] - WEAVIATESVC[Weaviate Service
ClusterIP] - SPLUNKSVC[Splunk Service
ClusterIP] - end - end - - subgraph "kube-system (GPU)" - GPUPLUGIN[NVIDIA Device Plugin
DaemonSet] - end - end - - K8S_API -->|manages| AIOP - K8S_API -->|manages| SPLOP - K8S_API -->|manages| RAYOP - - AIOP -->|reconciles| AIPLATFORM - AIPLATFORM -->|creates| SAIA - SAIA -->|creates| RAYSERVICE - RAYOP -->|reconciles| RAYSERVICE - RAYSERVICE -->|creates| RAYCLUSTER - RAYCLUSTER -->|provisions| RAYHEAD - RAYCLUSTER -->|provisions| RAYWORKER1 - RAYCLUSTER -->|provisions| RAYWORKER2 - - AIPLATFORM -->|creates| WEAVIATE - WEAVIATE -->|claims| WEAVIATEPVC - - SPLOP -->|reconciles| SPLUNK - SPLUNK -->|claims| SPLUNKETC - SPLUNK -->|claims| SPLUNKVAR - - CERTMGR -->|provisions certs| RAYSERVICE - - OTELOP -->|creates| OTELCOL - - RAYHEAD -->|exposes| RAYSVC - WEAVIATE -->|exposes| WEAVIATESVC - SPLUNK -->|exposes| SPLUNKSVC - - RAYHEAD -->|reads/writes| MINIO - RAYWORKER1 -->|reads/writes| MINIO - RAYWORKER2 -->|reads/writes| MINIO - SPLUNK -->|reads apps| MINIO - - MINIO -->|stores on| MINIOPVC - - PROM -->|scrapes| RAYHEAD - PROM -->|scrapes| RAYWORKER1 - PROM -->|scrapes| RAYWORKER2 - PROM -->|scrapes| WEAVIATE - GRAFANA -->|queries| PROM - - RAYHEAD -->|sends traces| OTELCOL - RAYWORKER1 -->|sends traces| OTELCOL - OTELCOL -->|forwards to| SPLUNK - - GPUPLUGIN -->|provides GPUs to| RAYWORKER2 - - style AIOP fill:#e1f5ff,stroke:#01579b,stroke-width:3px - style AIPLATFORM fill:#fff3e0,stroke:#e65100,stroke-width:3px - style RAYSERVICE fill:#f3e5f5,stroke:#4a148c,stroke-width:2px - style RAYCLUSTER fill:#f3e5f5,stroke:#4a148c,stroke-width:2px - style MINIO fill:#fce4ec,stroke:#880e4f,stroke-width:2px - style SPLUNK fill:#fff9c4,stroke:#f57f17,stroke-width:2px - style WEAVIATE fill:#e0f2f1,stroke:#004d40,stroke-width:2px -``` - --- ## Image Pull Secrets @@ -1500,15 +709,13 @@ The platform supports automatic creation and propagation of image pull secrets f ### Automatic ECR Configuration -The easiest way to use private ECR images: - ```yaml -# In k0s-cluster-config.yaml ecr: - account: "123456789012" # Your AWS account ID + account: "123456789012" + region: 
us-east-2 imagePullSecrets: - autoCreateECR: true # Enable automatic ECR secret creation + autoCreateECR: true ``` **What happens automatically:** @@ -1525,14 +732,12 @@ imagePullSecrets: ### Manual Secret Creation -For air-gapped or custom registries: - ```bash # ECR secret kubectl create secret docker-registry ecr-registry-secret \ - --docker-server=123456789012.dkr.ecr.us-west-2.amazonaws.com \ + --docker-server=123456789012.dkr.ecr.us-east-2.amazonaws.com \ --docker-username=AWS \ - --docker-password=$(aws ecr get-login-password --region us-west-2) \ + --docker-password=$(aws ecr get-login-password --region us-east-2) \ --namespace=ai-platform # Docker Hub secret @@ -1543,24 +748,13 @@ kubectl create secret docker-registry docker-hub-secret \ --namespace=ai-platform # Private registry secret -kubectl create secret docker-registry private-registry \ +kubectl create secret docker-registry custom-registry-secret \ --docker-server=registry.example.com \ --docker-username=admin \ --docker-password=secret123 \ --namespace=ai-platform ``` -Then reference in config: - -```yaml -imagePullSecrets: - secrets: - - ecr-registry-secret - - docker-hub-secret - - private-registry - autoCreateECR: false -``` - ### Image Pull Secret Propagation Secrets are automatically propagated through the platform: @@ -1589,20 +783,6 @@ Pods (Ray head, Ray workers, Weaviate, etc.) 
- name: ecr-registry-secret ``` -### Using Private Images - -Once secrets are configured, specify private images in your config: - -```yaml -# In k0s-cluster-config.yaml or AIPlatform CR -aiplatform: - ray: - image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/ray:2.9.0" - - vectordb: - image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/weaviate:1.28.0" -``` - ### Troubleshooting Image Pull Issues ```bash @@ -1613,11 +793,7 @@ kubectl get secret ecr-registry-secret -n ai-platform kubectl get secret ecr-registry-secret -n ai-platform -o jsonpath='{.type}' # Should output: kubernetes.io/dockerconfigjson -# Check secret content -kubectl get secret ecr-registry-secret -n ai-platform \ - -o jsonpath='{.data.\.dockerconfigjson}' | base64 -d | jq - -# Check pod events +# Check pod events for pull errors kubectl describe pod -n ai-platform | grep -A10 Events # Common errors: @@ -1661,7 +837,7 @@ nvidia.com/gpu: "true" nvidia.com/gpu.count: "1" # Auto-detected ``` -#### Taints +#### GPU Taints GPU nodes are automatically tainted to prevent non-GPU workloads: ```yaml @@ -1674,62 +850,31 @@ taints: #### Viewing Labels ```bash -# Show all labels -kubectl get nodes --show-labels - # Show specific labels kubectl get nodes -L splunk.ai/workload-type,splunk.ai/node-role -# Filter by label +# Filter by type kubectl get nodes -l splunk.ai/workload-type=gpu kubectl get nodes -l splunk.ai/workload-type=cpu - -# Count by type -echo "GPU nodes: $(kubectl get nodes -l splunk.ai/workload-type=gpu --no-headers | wc -l)" -echo "CPU nodes: $(kubectl get nodes -l splunk.ai/workload-type=cpu --no-headers | wc -l)" ``` -#### Custom Scheduling in AIPlatform CR +### NVIDIA GPU Support -```yaml -apiVersion: ai.splunk.com/v1 -kind: AIPlatform -metadata: - name: my-platform -spec: - # CPU workloads (Weaviate, Ray head, etc.) 
- cpuSchedulingSpec: - nodeSelector: - splunk.ai/workload-type: cpu - tolerations: [] - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: splunk.ai/workload-type - operator: In - values: - - cpu - - # GPU workloads (Ray GPU workers) - gpuSchedulingSpec: - nodeSelector: - splunk.ai/workload-type: gpu - nvidia.com/gpu: "true" - tolerations: - - key: nvidia.com/gpu - operator: Equal - value: "true" - effect: NoSchedule - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.count - operator: Exists -``` +The script installs NVIDIA host drivers directly on GPU nodes (not the GPU Operator). + +**Supported distributions:** +- RHEL 9 +- RHEL 10 +- Amazon Linux 2023 +- Debian/Ubuntu + +**What happens on GPU nodes:** +1. Kernel headers installed +2. NVIDIA CUDA repository configured +3. `cuda-drivers` package installed (falls back to `nvidia-driver-550` on Debian) +4. NVIDIA Container Toolkit installed and configured +5. `nvidia-smi` verification run +6. 
NVIDIA device plugin DaemonSet applied cluster-wide with RuntimeClass ### High Availability Setup @@ -1737,7 +882,7 @@ For production deployments, use 3 controller nodes: ```yaml nodes: - controllers: 3 # HA etcd cluster + controllers: 3 existingIPs: controllers: - 10.0.1.10 @@ -1750,55 +895,52 @@ nodes: - etcd quorum maintained - Zero downtime for API server -**Requirements:** -- Odd number of controllers (1, 3, 5) -- Same datacenter/region for low latency -- Reliable network between controllers +### Service Template (SAIA Public Exposure) -### Custom CA Certificates +To expose the SAIA v2 chat UI externally: -For air-gapped or secure environments: - -```bash -# Create custom CA secret -kubectl create secret generic custom-ca \ - --from-file=ca.crt=/path/to/ca.crt \ - -n cert-manager - -# Update cert-manager to use custom CA -kubectl patch deployment cert-manager -n cert-manager \ - --patch '{"spec":{"template":{"spec":{"volumes":[{"name":"custom-ca","secret":{"secretName":"custom-ca"}}],"containers":[{"name":"cert-manager","volumeMounts":[{"name":"custom-ca","mountPath":"/etc/ssl/certs/custom-ca.crt","subPath":"ca.crt"}]}]}}}}' +```yaml +aiPlatform: + serviceTemplate: + type: "NodePort" # or "LoadBalancer" + nodePort: 30080 # only for NodePort ``` -### Resource Quotas +This generates a Kubernetes Service exposing port 8080 on the specified NodePort across all worker nodes. 
-Set resource limits per namespace: +### Air-Gapped Deployment -```bash -kubectl apply -f - < 8132 -``` - -#### DNS Resolution Failures +If install fails with "k0s cluster has Ready nodes — refusing to wipe": ```bash -# Test DNS from a pod -kubectl run -it --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default +# Option 1: Use existing cluster (deploy stack only) +# Set useExisting: auto in config, then re-run install -# If fails, check CoreDNS -kubectl get pods -n kube-system | grep coredns -kubectl logs -n kube-system deployment/coredns +# Option 2: Tear down first +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh delete +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install ``` ### Storage Issues -#### MinIO Not Starting +#### Object Storage Connectivity ```bash -# Check MinIO pods -kubectl get pods -n minio-system - -# View MinIO logs -kubectl logs -n minio-system deployment/minio - -# Common issues: -# 1. PVC not bound -kubectl get pvc -n minio-system - -# 2. Storage class not available -kubectl get sc +# Test endpoint from a node +ssh ubuntu@worker-ip +curl -s http://:/minio/health/live -# 3. Insufficient disk space -kubectl describe node | grep -A5 "Allocated resources" +# Verify S3 secret exists +kubectl get secret s3-secret -n ai-platform -o yaml ``` #### PVC Stuck in Pending @@ -1958,9 +1040,6 @@ kubectl describe node | grep -A5 "Allocated resources" # Check PVC status kubectl get pvc -n ai-platform -# Describe PVC for events -kubectl describe pvc -n ai-platform - # Check storage class kubectl get sc @@ -1973,8 +1052,6 @@ kubectl logs -n local-path-storage deployment/local-path-provisioner #### GPU Not Detected -The script installs NVIDIA host drivers and the device plugin DaemonSet directly (not the GPU Operator). 
- ```bash # Check NVIDIA device plugin pods kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds @@ -1993,14 +1070,8 @@ nvidia-smi # Check if GPU nodes are tainted kubectl describe node | grep Taints -# Should have: -# nvidia.com/gpu=true:NoSchedule - # Check if pods have tolerations kubectl get pod -n ai-platform -o yaml | grep -A5 tolerations - -# Manually label GPU node if needed -kubectl label nodes nvidia.com/gpu=true --overwrite ``` ### Application Issues @@ -2017,86 +1088,18 @@ kubectl describe aiplatform -n ai-platform # Check operator logs kubectl logs -n splunk-ai-operator-system \ deployment/splunk-ai-operator-controller-manager - -# Common issues: -# 1. Missing dependencies (MinIO, Splunk) -kubectl get all -n minio-system -kubectl get standalone -n ai-platform - -# 2. Invalid configuration -kubectl get aiplatform -n ai-platform -o yaml -``` - -#### RayCluster Pods ImagePullBackOff - -```bash -# Check pod events -kubectl describe pod -n ai-platform | grep -A10 Events - -# Common causes: -# 1. Image doesn't exist -# Verify image exists in registry - -# 2. Missing imagePullSecrets -kubectl get pod -n ai-platform -o yaml | grep -A5 imagePullSecrets - -# 3. 
Invalid ECR token -kubectl get secret ecr-registry-secret -n ai-platform - -# Recreate ECR secret if expired (tokens expire after 12 hours) -kubectl delete secret ecr-registry-secret -n ai-platform -# Re-run installation or create manually -``` - -#### Weaviate Pod Stuck Pending - -```bash -# Check pod status -kubectl describe pod -n ai-platform - -# Common issue: No CPU nodes labeled -kubectl get nodes -l splunk.ai/workload-type=cpu - -# If no nodes found, label manually: -kubectl label nodes splunk.ai/workload-type=cpu - -# Or remove CPU nodeSelector from AIPlatform: -kubectl patch aiplatform -n ai-platform --type=json \ - -p='[{"op": "remove", "path": "/spec/cpuScheduler/nodeSelector"}]' ``` -### Performance Issues - -#### Slow Pod Startup - -```bash -# Check image pull time -kubectl describe pod -n ai-platform | grep -A20 Events - -# If pulling large images (GB+): -# 1. Pre-pull images to nodes -# 2. Use local registry mirror -# 3. Enable image pull parallelization - -# Check node resources -kubectl top nodes -kubectl describe node | grep -A10 "Allocated resources" -``` +### Session Logs -#### High Memory Usage +All install output is captured in timestamped log files: ```bash -# Check memory usage per node -kubectl top nodes - -# Check memory usage per pod -kubectl top pods -n ai-platform - -# Check pod limits -kubectl get pods -n ai-platform -o json | \ - jq '.items[] | {name: .metadata.name, limits: .spec.containers[].resources.limits}' +# View the latest log +ls -lt tools/cluster_setup/logs/ | head -5 -# If needed, adjust resource limits in AIPlatform CR +# Tail a running install +tail -f tools/cluster_setup/logs/k0s-install-*.log ``` ### Debugging Commands @@ -2117,15 +1120,6 @@ kubectl exec -it -n ai-platform -- /bin/bash # Check pod logs (all containers) kubectl logs -n ai-platform --all-containers=true --tail=100 - -# Check previous container logs (if crashed) -kubectl logs -n ai-platform --previous - -# Port forward for testing -kubectl port-forward -n 
ai-platform svc/ 8080:80 - -# Create debug pod -kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash ``` --- @@ -2134,7 +1128,7 @@ kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash ### Production Security Checklist -- [ ] Change default MinIO credentials +- [ ] Use strong object storage credentials (not defaults) - [ ] Enable TLS for all services - [ ] Configure network policies - [ ] Use unique SSH keys per environment @@ -2144,104 +1138,13 @@ kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash - [ ] Configure secrets encryption at rest - [ ] Set up backup and disaster recovery - [ ] Enable monitoring and alerting -- [ ] Harden SSH configuration -- [ ] Disable root SSH access +- [ ] Harden SSH configuration (disable root login) - [ ] Enable firewall on all nodes - [ ] Regular security updates -### Changing MinIO Credentials - -```bash -# 1. Create new secret -kubectl create secret generic minio-creds-new \ - --from-literal=accesskey='new-strong-access-key' \ - --from-literal=secretkey='new-strong-secret-key-123!' \ - --namespace=minio-system \ - --dry-run=client -o yaml | kubectl apply -f - - -# 2. Update MinIO deployment -kubectl patch deployment minio -n minio-system \ - --patch '{"spec":{"template":{"spec":{"containers":[{"name":"minio","env":[{"name":"MINIO_ROOT_USER","valueFrom":{"secretKeyRef":{"name":"minio-creds-new","key":"accesskey"}}},{"name":"MINIO_ROOT_PASSWORD","valueFrom":{"secretKeyRef":{"name":"minio-creds-new","key":"secretkey"}}}]}]}}}}' - -# 3. Update s3-secret in ai-platform namespace -kubectl create secret generic s3-secret \ - --from-literal=s3_access_key='new-strong-access-key' \ - --from-literal=s3_secret_key='new-strong-secret-key-123!' \ - --namespace=ai-platform \ - --dry-run=client -o yaml | kubectl apply -f - - -# 4. 
Restart affected pods -kubectl rollout restart deployment -n minio-system -kubectl delete pods -n ai-platform -l app=splunk -``` - -### Enabling TLS with Cert-Manager - -```bash -# 1. Create ClusterIssuer for Let's Encrypt -kubectl apply -f - < aiplatform-backup.yaml -# Export Splunk Standalone -kubectl get standalone -n ai-platform -o yaml > splunk-backup.yaml - -# Backup MinIO/S3 data +# Backup S3 data aws s3 sync s3://my-ai-bucket ./s3-backup/ ``` @@ -2364,23 +1256,13 @@ aws s3 sync s3://my-ai-bucket ./s3-backup/ CONFIG_FILE=./k0s-config.yaml ./k0s_cluster_with_stack.sh install ``` -**3. Restore Data to MinIO** +**3. Restore Data to Object Storage** ```bash -# Copy data to MinIO -mc mirror ./s3-backup/ k0s-minio/ai-platform-bucket/ -``` - -**4. Update AIPlatform CR** -```yaml -# Change objectStorage from S3 to MinIO -objectStorage: - path: s3://ai-platform-bucket/artifacts - endpoint: http://minio.minio-system.svc.cluster.local:9000 - region: us-east-1 - secretRef: s3-secret +# Copy data to your S3-compatible endpoint +mc mirror ./s3-backup/ my-storage/ai-platform-bucket/ ``` -**5. Apply Resources** +**4. 
Apply Resources** ```bash kubectl apply -f aiplatform-backup.yaml ``` @@ -2404,52 +1286,12 @@ sudo k0s start --- -## Comparison with EKS - -| Feature | EKS | k0s | -|---------|-----|-----| -| **Infrastructure** | -| Control Plane | AWS Managed | Self-managed | -| Worker Nodes | EC2 Auto Scaling Groups | Manual or EC2 | -| High Availability | Multi-AZ | Multi-node etcd | -| **Storage** | -| Object Storage | S3 (managed) | MinIO (self-hosted) | -| Block Storage | EBS CSI | local-path/Longhorn | -| Storage Costs | Pay per GB | Included in nodes | -| **Networking** | -| CNI | AWS VPC CNI | Calico VXLAN | -| Load Balancer | AWS ELB/ALB | NodePort/MetalLB | -| Ingress | AWS ALB Controller | NGINX Ingress | -| **Security** | -| IAM Integration | IRSA for pods | Service accounts only | -| Encryption | KMS | Manual cert-manager | -| Network Isolation | VPC Security Groups | Calico policies | -| **Operations** | -| Upgrades | Automated | Manual | -| Monitoring | CloudWatch | Self-hosted Prometheus | -| Logging | CloudWatch Logs | Self-hosted Loki | -| Backup | AWS Backup | Manual scripts | -| **Cost** | -| Control Plane | $0.10/hour | Included | -| Worker Nodes | EC2 pricing | EC2 or free (on-prem) | -| Storage | S3 pricing | Included in nodes | -| Networking | Data transfer fees | Free (on-prem) | -| **Use Cases** | -| Production Cloud | ✅ Excellent | ⚠️ Possible | -| On-Premises | ❌ Not possible | ✅ Excellent | -| Air-Gapped | ❌ Not possible | ✅ Excellent | -| Cost Optimization | ⚠️ Can be expensive | ✅ Lower cost | -| Quick Testing | ✅ Fast setup | ✅ Fast setup | - ---- - ## Support and Resources ### Documentation - k0s Official Docs: https://docs.k0sproject.io/ - Splunk AI Operator: https://github.com/splunk/splunk-ai-operator -- MinIO Docs: https://min.io/docs/ - KubeRay: https://docs.ray.io/en/latest/cluster/kubernetes/ ### Getting Help @@ -2458,221 +1300,8 @@ sudo k0s start - **Splunk Community**: https://community.splunk.com/ - **k0s Slack**: 
https://k8slens.slack.com -### Contributing - -Contributions are welcome! Please: -1. Fork the repository -2. Create a feature branch -3. Submit a pull request - -### License - -See the main repository LICENSE file. - ---- - -## Appendix - -### Complete Config File Reference - -```yaml -# Full k0s-cluster-config.yaml with all options -cluster: - name: my-cluster # Cluster identifier - useExisting: auto # auto|force|never - region: us-west-2 # EC2 mode + ECR fallback region (not needed for pure on-prem) - sshUser: ubuntu # SSH username for node access - sshKeyPath: ~/.ssh/key.pem # SSH private key path - -nodes: - controllers: 1 # 1 or 3 for HA - cpuWorkers: 2 # EC2: create count. Bare metal: first N workers = CPU - gpuWorkers: 1 # EC2: create count. Bare metal: remaining workers = GPU - existingIPs: - controllers: [] # Empty = create EC2, or list of IPs (bare metal) - workers: [] # Empty = create EC2, or list of IPs (bare metal) - -# --- Storage --- -storage: - storageClass: "local-path" # StorageClass for PVCs - vectorDbSize: "50Gi" # Weaviate PV size - objectStore: - type: "seaweedfs" # aws | s3compat | minio | seaweedfs - bucket: "ai-platform-bucket" # S3 bucket name - endpoint: "http://host:8333" # S3-compatible endpoint - auth: - rootUser: "admin" # Access key - rootPassword: "password" # Secret key - -# --- Container Images --- -images: - registry: "myregistry.com" # Registry prefix for short image paths - operator: - image: "myregistry.com/splunk-ai-operator:v0.1.5" - splunk: - image: "myregistry.com/splunk:latest" - operatorImage: "docker.io/splunk/splunk-operator:3.0.0" - ray: - headImage: "myregistry.com/ray/ray-head:build-v1alpha1" - workerImage: "myregistry.com/ray/ray-worker-gpu:build-v1alpha1" - weaviate: - image: "docker.io/semitechnologies/weaviate:stable-v1.28" - saia: - apiImage: "myregistry.com/saia/saia-api:build-v1alpha1" - dataLoaderImage: "myregistry.com/saia/saia-data-loader:build-v1alpha1" - slim: - apiImage: 
"myregistry.com/slim-api:v0.0.1" - fluentBit: - image: "docker.io/fluent/fluent-bit:1.9.6" - otelCollector: - image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" - -# --- Operator Versions --- -operators: - ray: - version: "v1.2.2" # KubeRay operator chart version - modelVersion: "v0.3.14-36-g1549f5a" # Model version label - rayVersion: "2.44.0" # Ray runtime version - certManager: - installCRDs: true - nvidia: - devicePluginVersion: "v0.17.3" # NVIDIA k8s device plugin tag - -# --- Kubernetes --- -kubernetes: - namespace: ai-platform # AI Platform namespace - -# --- File Paths --- -files: - splunkOperator: "./splunk-operator-cluster.yaml" # Splunk Operator manifest path - aiPlatform: "./artifacts.yaml" # AI Operator manifest path - -# --- Splunk --- -splunk: - standaloneName: splunk-standalone # Splunk Standalone CR name - -# --- AI Platform --- -# NOTE: defaultAcceleratorType and workerGroupConfig.imageRegistry are consumed -# by the script. The remaining fields are NOT consumed and are hardcoded in -# the AIPlatform CR template inside the script: -# - name: hardcoded as "${CLUSTER_NAME}-ai-platform" -# - features: hardcoded to only "saia" (slim/seca must be added manually) -# - cpuScheduling/gpuScheduling: hardcoded with node selectors -# - objectStorage.region: hardcoded to "us-east-1" -aiPlatform: - name: "splunk-ai-stack" # Reference only; NOT consumed; CR name = ${CLUSTER_NAME}-ai-platform - defaultAcceleratorType: "L40S" # Consumed → AIPlatform CR spec - workerGroupConfig: - imageRegistry: "" # Override registry for Ray worker images - features: # Reference only (hardcoded in script) - - name: "saia" - version: "1.1.0" - - name: "slim" - version: "1.0.0" - cpuScheduling: # Reference only (hardcoded in script) - nodeSelector: {} - tolerations: [] - gpuScheduling: # Reference only (hardcoded in script) - nodeSelector: {} - tolerations: - - key: "nvidia.com/gpu" - operator: "Equal" - value: "true" - effect: "NoSchedule" - -# --- EC2 Mode (optional) 
--- -ec2: - vpcId: vpc-xxx # Required for EC2 mode - subnetId: subnet-xxx # Optional, auto-selects first available - keyName: my-key # AWS key pair name - -instanceTypes: - controller: t3.xlarge # 4 CPU, 16GB RAM - cpuWorker: m5.4xlarge # 16 CPU, 64GB RAM - gpuWorker: g5.2xlarge # 8 CPU, 24GB RAM, A10G GPU - -# --- Image Pull Secrets --- -# NOTE: secrets[] list is NOT consumed by the script. The script auto-detects -# which secrets exist in the namespace by checking hardcoded names: -# ecr-registry-secret, docker-hub-secret, gcr-secret, acr-secret, custom-registry-secret. -imagePullSecrets: - secrets: [] # NOT consumed; script auto-detects in namespace - autoCreateECR: true # Consumed → creates ECR secret from AWS creds - - # Docker Hub private registry - dockerHub: - enabled: false - username: "" - password: "" # Use token, not plaintext password - email: "" - - # Google Container Registry - gcr: - enabled: false - jsonKey: "" # GCP service account JSON key - - # Azure Container Registry - acr: - enabled: false - registry: "" # e.g. 
myregistry.azurecr.io - username: "" - password: "" - - # Custom Docker-compatible registry - custom: - enabled: false - name: "custom-registry-secret" # Secret name to create - server: "" # Registry URL - username: "" - password: "" - email: "" - -ecr: - account: "123456789012" # AWS account ID - region: us-east-2 # ECR region -``` - -### Environment Variables - -```bash -# Override config file location -CONFIG_FILE=./my-config.yaml - -# Skip confirmation prompts -AUTO_APPROVE=true - -# Use existing cluster (skip k0s installation) -USE_EXISTING=force -``` - -### Common Recipes - -**Minimal Test Cluster:** -```bash -# Single CPU node, no GPU -CONFIG_FILE=minimal.yaml ./k0s_cluster_with_stack.sh install -``` - -**Production Cluster:** -```bash -# 3 controllers (HA), 5 workers, GPU support -CONFIG_FILE=production.yaml ./k0s_cluster_with_stack.sh install -``` - -**Air-Gapped Cluster:** -```bash -# Pre-pull all images, no internet access -# See air-gapped setup guide -``` - -**Development Cluster:** -```bash -# Quick setup for testing -CONFIG_FILE=dev.yaml AUTO_APPROVE=true ./k0s_cluster_with_stack.sh install -``` - --- -**Version:** 2.0 -**Last Updated:** February 2026 +**Version:** 3.0 +**Last Updated:** April 2026 **Maintainer:** Splunk AI Platform Team diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 66b9b28..c6953e7 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5688,11 +5688,11 @@ spec: - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-010 + value: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT @@ -5705,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.28 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index 58287ae..c0ed83a 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -57,12 +57,6 @@ cluster: # availabilityZones: required — must match capacityReservation.az # maxSize: must equal desiredCapacity (capacity reservations are fixed-size) # -# H100_NVL: -# instanceType: p4de.24xlarge (8x H100 NVL GPUs, 94 GB VRAM each) -# defaultAcceleratorType: H100_NVL -# capacityReservation: not required -# availabilityZones: not required -# nodeGroups: cpu: enabled: true # Set to false to skip CPU node group @@ -232,7 +226,6 @@ aiPlatform: # Must be changed in sync with nodeGroups.gpu.instanceType (see GPU TYPE QUICK REFERENCE above). 
# L40S → instanceType: g6e.12xlarge # H100 → instanceType: p5.4xlarge (also uncomment capacityReservation + availabilityZones) - # H100_NVL → instanceType: p4de.24xlarge defaultAcceleratorType: "L40S" # Features to enable diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 7426ae1..93df2bd 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -810,7 +810,7 @@ generate_node_groups() { k8s.io/cluster-autoscaler/${CLUSTER_NAME}: owned" fi # H100 with capacity reservation: node group created separately via CloudFormation - # All other GPU types (L40S, H100_NVL): standard eksctl managed node group + # All other GPU types (L40S): standard eksctl managed node group if [[ "$ENABLE_GPU" == "true" && "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then log "GPU nodes will be created separately with capacity reservation ${GPU_CAPACITY_RESERVATION_ID}" elif [[ "$ENABLE_GPU" == "true" ]]; then @@ -822,7 +822,7 @@ generate_node_groups() { maxSize: ${GPU_MAX} volumeSize: ${GPU_VOLUME_SIZE} volumeType: ${GPU_VOLUME_TYPE}" - # Lock to specific AZ when availabilityZones are specified (e.g. for H100_NVL) + # Lock to specific AZ when availabilityZones are specified if [[ ${#GPU_AVAILABILITY_ZONES[@]} -gt 0 ]]; then nodes+=" availabilityZones:" diff --git a/tools/cluster_setup/k0s-cluster-config-h100.yaml b/tools/cluster_setup/k0s-cluster-config-h100.yaml deleted file mode 100644 index 7abfd33..0000000 --- a/tools/cluster_setup/k0s-cluster-config-h100.yaml +++ /dev/null @@ -1,244 +0,0 @@ -# =================================================================== -# k0s Cluster Configuration for Splunk AI Platform -# =================================================================== -# Mirrors cluster-config.yaml (EKS) but adapted for k0s on bare-metal / EC2. -# -# Quick Start: -# 1. Copy: cp k0s-cluster-config.yaml my-k0s-config.yaml -# 2. 
Edit: vi my-k0s-config.yaml -# 3. Replace all values marked with "CHANGE THIS" -# 4. Run: CONFIG_FILE=./my-k0s-config.yaml ./k0s_cluster_with_stack.sh install -# =================================================================== - -# ---------- Cluster Configuration ---------- -cluster: - name: airgap-cluster - # region: us-east-2 # Ignored for on-prem, but required in config - sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes - sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key - -# ---------- Node Configuration ---------- -nodes: - controllers: 1 - cpuWorkers: 1 # Not used with existingIPs - gpuWorkers: 2 # Not used with existingIPs - - existingIPs: - controllers: - - 10.0.0.1 # CHANGE THIS: Your controller server IP - workers: - - 10.0.0.2 # CHANGE THIS: CPU worker 1 - - 10.0.0.3 # CHANGE THIS: GPU worker 1 - - 10.0.0.4 # CHANGE THIS: GPU worker 2 - -# ---------- Storage Configuration ---------- -# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). -# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). -storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws - storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) - vectorDbSize: "50Gi" # VectorDB persistent volume size - - objectStore: - # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns - # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a - # missing key. The SAIA v2 S3ConversationStore (added by Tony in - # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002) - # calls GetObjectTagging on the conversation key *before* the first - # PutObject, so every brand-new draft: conversation hit a 502 from the - # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and - # hosts the same bucket name at :9000, so swapping the endpoint is - # sufficient. 
Fallback: flip back by setting type: "seaweedfs" and - # endpoint to :8333 (but note the 502 on every draft conversation). - type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) - bucket: "ai-platform-bucket-minio-us-east-2" - # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) - endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint - auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" - -# ---------- Container Images Configuration ---------- -images: - # Registry prefix - applied to images without a full registry path - registry: "" # CHANGE THIS: Your ECR/Docker/Harbor registry (e.g. 123456789012.dkr.ecr.us-east-2.amazonaws.com) - - operator: - # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - # v0.1.21 added ServiceTemplate propagation (AIPlatform → AIService) so the - # aiPlatform.serviceTemplate block in this config actually takes effect. - # v0.1.21 also sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + - # AWS_ENDPOINT_URL on the v2 API and v2 worker pods (Confluence ERD 3.8.1.2) - # and wires AWS_ACCESS_KEY_ID/SECRET from TaskVolume.SecretRef so boto3 can - # actually auth to the S3-compatible endpoint (otherwise field-description - # lookups silently return empty and sourcetype metadata is degraded). - # v0.1.24 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. - # The saia-v2 IngestionWorker only refreshes its heartbeat at the top of - # each poll iteration, and the liveness probe kills the pod at 120s stale. - # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time - # the tenant lock was busy or the queue was empty. 
- # v0.1.25 adds the nginx CORS preflight short-circuit (SAIA v2's - # TenantConversationKeyMiddleware rejects unauthenticated OPTIONS with - # 400 before CORSMiddleware can respond, so nginx has to answer the - # preflight itself with 204 + CORS headers) AND sets - # DISABLE_RESPONSES_API_REDIS=True on the GptOss120b and GptOss20b Ray - # Serve apps (pair with ray-head/ray-worker-gpu:build-v2-001 which - # includes the NoOpOpenAIServingResponses implementation — see - # ai-platform-models commits c1f9aef3, da7628ea, b6ff101e). Without the - # env var the vLLM RedisOpenAIServingResponses constructor raises - # RuntimeError('Responses Redis URL not set') on every /v1/responses - # call and the SAIA v2 /query path fails with SearchStreamError. - # v0.1.26 switches the SAIA v2 conversation store from the ephemeral - # "filesystem" default to "s3" (S3ConversationStore) by setting - # CONVERSATION_STORE=s3 and CONVERSATION_S3_BUCKET= on both the - # v2 API and v2 worker pods. Before this, chat history lived on the - # pod's container overlay and every pod restart produced spurious - # "Conversation not found" 404s on GET /conversations//items - # (the Splunk UI's saia_v2_audit_index_log_proxy flow surfaced them as - # "Failed to fetch SAIA V2 conversation items"). Reuses the same - # AWS_ENDPOINT_URL + AWS_ACCESS_KEY_ID/SECRET that v0.1.21 wired for - # the FieldDescription S3 adapter. Pairs with saia-service image - # build-v2-002 which ships Tony's S3ConversationStore (commits - # 3d3756f3, 8e2a9f40, merged via 9efe1fce into ai-tier-v2.0). 
- # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ - # make docker-build-amd64 docker-push - image: "splunk-ai-operator:latest" # CHANGE THIS: Your operator image - - splunk: - image: "splunk/splunk:10.2.0" # CHANGE THIS: Your Splunk Enterprise image - operatorImage: "docker.io/splunk/splunk-operator:3.0.0" - - ray: - # headImage: "ml-platform/ray/ray-head:build-v1alpha1" - # headImage: "ml-platform/ray/ray-head:087e40e" - # headImage: "ml-platform/ray/ray-head:build-010" - # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main - # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-010" # tony redis changes + fixes - - # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" - # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" - # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" - # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main - # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" # tony redis changes + fixes - - weaviate: - image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" - - saia: - # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" - # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 - # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 - # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-010" #saia v2 + tony changes - - # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 - # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-010" #saia v2 + tony changes - - # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" - # 
dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 - # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-009" #saia v2 + tony changes - - fluentBit: - image: "docker.io/fluent/fluent-bit:1.9.6" - - otelCollector: - image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" - - # Reverse proxy used by the SAIA reconciler to route v1 / v2 requests by - # path. Consumed via RELATED_IMAGE_NGINX. Point this at an internal mirror - # for airgapped clusters. - nginx: - image: "docker.io/library/nginx:1.27-alpine" - -# ---------- Operator Versions ---------- -operators: - ray: - version: "v1.2.2" - modelVersion: "v0.3.14-36-g1549f5a" - rayVersion: "2.53.0" - - certManager: - installCRDs: true - - nvidia: - devicePluginVersion: "v0.17.3" - -# ---------- Kubernetes ---------- -kubernetes: - namespace: ai-platform - -# ---------- File Paths ---------- -files: - splunkOperator: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/splunk-operator-cluster.yaml" - aiPlatform: "/Users/mohaari2/Files/repos/AI/splunk-ai-operator/tools/cluster_setup/artifacts.yaml" - -# ---------- Splunk Configuration ---------- -splunk: - standaloneName: splunk-standalone - -# ---------- AI Platform Configuration ---------- -aiPlatform: - name: "splunk-ai-stack" - defaultAcceleratorType: "L40S" - # defaultAcceleratorType: "H100" - - workerGroupConfig: - imageRegistry: "" - - # ---------- SAIA public exposure (OPTIONAL) ---------- - # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) - # defaults to ClusterIP, meaning it is only reachable from inside the cluster. 
- # - # Two call patterns hit this Service: - # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) - # (B) End user's browser → saia-service (needs external exposure) - # - # Pattern B is used by the v2 chat UI (/query streaming, conversations, - # feedback, admin endpoints). Without external exposure the v2 chat UI - # breaks for users, even though v1 one-shot SPL features still work. - # - # To DISABLE external exposure (use ClusterIP only), either: - # * Delete / comment-out the entire `serviceTemplate:` block below, OR - # * Set `type: ClusterIP` explicitly. - # Either is treated identically — the installer skips emitting serviceTemplate - # into the AIPlatform CR and the operator falls through to the ClusterIP - # default in reconcileSAIAService(). - # - # To ENABLE external exposure for on-prem / airgap customers, NodePort is the - # recommended default: any k8s node IP + the configured nodePort yields a - # reachable endpoint from VPN-connected users. No cloud LB / cert-manager - # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. - serviceTemplate: - type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) - nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. - - features: - - name: "saia" - version: "1.1.0" - - cpuScheduling: - nodeSelector: {} - tolerations: [] - - gpuScheduling: - nodeSelector: {} - tolerations: - - key: "nvidia.com/gpu" - operator: "Equal" - value: "true" - effect: "NoSchedule" - -# ---------- Image Pull Secrets ---------- -imagePullSecrets: - secrets: - - ecr-registry-secret - autoCreateECR: true - -ecr: - account: "" # CHANGE THIS: Your AWS account ID (e.g. 
123456789012) - region: us-east-2 # CHANGE THIS: Your AWS region diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml index aecec3f..9faa669 100644 --- a/tools/cluster_setup/k0s-cluster-config.yaml +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -41,7 +41,6 @@ nodes: # Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). # Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: - s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws storageClass: "local-path" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size @@ -54,16 +53,6 @@ storage: # gpuWorker: 500 # model weights (60-240 GB each), ray-worker-gpu image (~30 GB) objectStore: - # 2026-04-21: switched from seaweedfs to minio because SeaweedFS returns - # S3 InternalError/500 (not NoSuchKey/404) for GetObjectTagging on a - # missing key. The SAIA v2 S3ConversationStore (added by Tony in - # saia-service commits 3d3756f3/8e2a9f40, shipped in image build-v2-002) - # calls GetObjectTagging on the conversation key *before* the first - # PutObject, so every brand-new draft: conversation hit a 502 from the - # SDK's 5-retry backoff. MinIO is AWS-spec compliant (NoSuchKey/404) and - # hosts the same bucket name at :9000, so swapping the endpoint is - # sufficient. Fallback: flip back by setting type: "seaweedfs" and - # endpoint to :8333 (but note the 502 on every draft conversation). type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) bucket: "ai-platform-bucket-minio-us-east-2" # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) @@ -78,45 +67,6 @@ images: registry: "" # CHANGE THIS: Your ECR/Docker/Harbor registry (e.g. 
123456789012.dkr.ecr.us-east-2.amazonaws.com) operator: - # image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.29" - # v0.1.21 added ServiceTemplate propagation (AIPlatform → AIService) so the - # aiPlatform.serviceTemplate block in this config actually takes effect. - # v0.1.21 also sets FIELD_DESCRIPTION_BACKEND=s3 + FIELD_DESCRIPTION_S3_KEY + - # AWS_ENDPOINT_URL on the v2 API and v2 worker pods (Confluence ERD 3.8.1.2) - # and wires AWS_ACCESS_KEY_ID/SECRET from TaskVolume.SecretRef so boto3 can - # actually auth to the S3-compatible endpoint (otherwise field-description - # lookups silently return empty and sourcetype metadata is degraded). - # v0.1.24 lowers RUN_TASKS_DELAY_S from 600s to 10s on the v2 worker. - # The saia-v2 IngestionWorker only refreshes its heartbeat at the top of - # each poll iteration, and the liveness probe kills the pod at 120s stale. - # At 600s (the v1 scheduler cadence) the worker got SIGKILLed every time - # the tenant lock was busy or the queue was empty. - # v0.1.25 adds the nginx CORS preflight short-circuit (SAIA v2's - # TenantConversationKeyMiddleware rejects unauthenticated OPTIONS with - # 400 before CORSMiddleware can respond, so nginx has to answer the - # preflight itself with 204 + CORS headers) AND sets - # DISABLE_RESPONSES_API_REDIS=True on the GptOss120b and GptOss20b Ray - # Serve apps (pair with ray-head/ray-worker-gpu:build-v2-001 which - # includes the NoOpOpenAIServingResponses implementation — see - # ai-platform-models commits c1f9aef3, da7628ea, b6ff101e). Without the - # env var the vLLM RedisOpenAIServingResponses constructor raises - # RuntimeError('Responses Redis URL not set') on every /v1/responses - # call and the SAIA v2 /query path fails with SearchStreamError. - # v0.1.26 switches the SAIA v2 conversation store from the ephemeral - # "filesystem" default to "s3" (S3ConversationStore) by setting - # CONVERSATION_STORE=s3 and CONVERSATION_S3_BUCKET= on both the - # v2 API and v2 worker pods. 
Before this, chat history lived on the - # pod's container overlay and every pod restart produced spurious - # "Conversation not found" 404s on GET /conversations//items - # (the Splunk UI's saia_v2_audit_index_log_proxy flow surfaced them as - # "Failed to fetch SAIA V2 conversation items"). Reuses the same - # AWS_ENDPOINT_URL + AWS_ACCESS_KEY_ID/SECRET that v0.1.21 wired for - # the FieldDescription S3 adapter. Pairs with saia-service image - # build-v2-002 which ships Tony's S3ConversationStore (commits - # 3d3756f3, 8e2a9f40, merged via 9efe1fce into ai-tier-v2.0). - # Build & push with: - # IMG=658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.26 \ - # make docker-build-amd64 docker-push image: "splunk-ai-operator:latest" # CHANGE THIS: Your operator image splunk: @@ -124,39 +74,16 @@ images: operatorImage: "docker.io/splunk/splunk-operator:3.0.0" ray: - # headImage: "ml-platform/ray/ray-head:build-v1alpha1" - # headImage: "ml-platform/ray/ray-head:087e40e" - # headImage: "ml-platform/ray/ray-head:build-010" - # headImage: "ml-platform/ray/ray-head:9a24502-ai-tier" # arif rebase to main - # headImage: "ml-platform/ray/ray-head:build-v2-001" # tony redis changes - headImage: "ml-platform/ray/ray-head:build-v2-010" # tony redis changes + fixes - - # workerImage: "ml-platform/ray/ray-worker-gpu:build-v1alpha1" - # workerImage: "ml-platform/ray/ray-worker-gpu:087e40e" - # workerImage: "ml-platform/ray/ray-worker-gpu:build-010" - # workerImage: "ml-platform/ray/ray-worker-gpu:9a24502-ai-tier" # arif rebase to main - # workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-001" # tony redis changes - workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" # tony redis changes + fixes + headImage: "ml-platform/ray/ray-head:build-v2-010" + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" weaviate: image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" saia: - # apiImage: "ml-platform/saia/saia-api:build-v1alpha1" 
- # apiImage: "ml-platform/saia/saia-api:build-006" #saia v1.5 - # apiImage: "ml-platform/saia/saia-api:v2.0.4-23-g2fc91e9" #saia v2 - # apiImage: "ml-platform/saia/saia-api:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiImage: "ml-platform/saia/saia-api:build-v2-010" #saia v2 + tony changes - - # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-23-g2fc91e9" #saia v2 - # apiV2Image: "ml-platform/saia/saia-api-v2:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-010" #saia v2 + tony changes - - # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v1alpha1" - # dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" #saia v1.5 - # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-23-g2fc91e9" #saia v2 - # dataLoaderImage: "ml-platform/saia/saia-data-loader:v2.0.4-31-g9efe1fc" #saia v2 + tony changes - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-010" #saia v2 + tony changes + personalization fix + apiImage: "ml-platform/saia/saia-api:build-v2-012" + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-012" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-012" fluentBit: image: "docker.io/fluent/fluent-bit:1.9.6" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index b0ac10a..2adcffe 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -2757,7 +2757,7 @@ spec: images: ${image_pull_secrets} - # GPU accelerator type (determines Ray worker tiers: L40S, H100_NVL, or empty for no workers) + # GPU accelerator type (determines Ray worker tiers: L40S, H100, or empty for no workers) defaultAcceleratorType: ${DEFAULT_ACCELERATOR} # Features from config (aiPlatform.features)