diff --git a/.env b/.env index 69a2de5..59af144 100644 --- a/.env +++ b/.env @@ -1,6 +1,6 @@ OPERATOR_SDK_VERSION=v1.31.0 REVIEWERS=vivekr-splunk,rlieberman-splunk,patrykw-splunk,Igor-splunk,kasiakoziol -GO_VERSION=1.24.0 +GO_VERSION=1.25.0 AWSCLI_URL=https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.8.6.zip KUBECTL_VERSION=v1.29.1 AZ_CLI_VERSION=2.30.0 diff --git a/.gitignore b/.gitignore index 3cc0b27..d87d947 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ bin testbin/* examplecodebase/* Dockerfile.cross +tmp/* # Test binary, build with `go test -c` *.test @@ -30,7 +31,17 @@ Dockerfile.cross skaffold.env.local .skaffold/ +# Logs +tools/cluster_setup/logs/ + # Helm build artifacts *.tgz helm-chart/**/charts/ !helm-chart/**/charts/.gitkeep + +# Cluster-setup script byproducts (*.original): pristine-snapshot backups +# written by tools/cluster_setup/k0s_cluster_with_stack.sh on first run and +# reused as a reset point on subsequent runs (see configure_images() +# → "Restoring from clean originals"). Needed locally for idempotent +# re-installs; never committed. +tools/cluster_setup/*.original diff --git a/Dockerfile b/Dockerfile index 25c47bc..25a756c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # Build the manager binary -FROM docker.io/golang:1.24 AS builder +ARG GO_VERSION=1.25.0 +FROM docker.io/golang:${GO_VERSION} AS builder ARG TARGETOS ARG TARGETARCH @@ -43,7 +44,11 @@ COPY LICENSE LICENSE-2.0.txt COPY --from=builder /certs/tls.crt /certs/tls.crt COPY --from=builder /certs/tls.key /certs/tls.key -USER 65532:65532 +# Run as non-root UID with GID 0 (root group). GID 0 is required on +# RHEL / OpenShift / k0s nodes: the container runtime assigns a random +# UID at launch and only grants group-read/write to GID 0. Without it +# the process cannot read /manager or the config files copied above. 
+USER 1001:0 ENV INSTANCE_FILE=/instance.yaml ENV APPLICATION_FILE=/applications.yaml ENTRYPOINT ["/manager"] diff --git a/Dockerfile.debug b/Dockerfile.debug index c5fac22..e9cdffd 100644 --- a/Dockerfile.debug +++ b/Dockerfile.debug @@ -1,5 +1,6 @@ # Build the manager binary with debug symbols -FROM docker.io/golang:1.24 AS builder +ARG GO_VERSION=1.25.0 +FROM docker.io/golang:${GO_VERSION} AS builder ARG TARGETOS ARG TARGETARCH diff --git a/Dockerfile.k0s-runner b/Dockerfile.k0s-runner new file mode 100644 index 0000000..aa0eda2 --- /dev/null +++ b/Dockerfile.k0s-runner @@ -0,0 +1,19 @@ +FROM registry.access.redhat.com/ubi9/ubi:latest + +RUN dnf install -y --allowerasing openssh-clients git jq && dnf clean all + +ARG TARGETARCH + +# kubectl +RUN curl -fsSL "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/${TARGETARCH}/kubectl" \ + -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl + +# helm +RUN curl -fsSL "https://get.helm.sh/helm-v3.17.1-linux-${TARGETARCH}.tar.gz" | tar xz -C /tmp \ + && mv /tmp/linux-${TARGETARCH}/helm /usr/local/bin/helm && rm -rf /tmp/linux-${TARGETARCH} + +# yq +RUN curl -fsSL "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${TARGETARCH}" \ + -o /usr/local/bin/yq && chmod +x /usr/local/bin/yq + +WORKDIR /workspace diff --git a/Makefile b/Makefile index 203cb3d..7c69fe1 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,9 @@ endif # tools. (i.e. podman) CONTAINER_TOOL ?= docker +# GO_VERSION is read from .env if not already set, and passed as a build-arg to docker builds. +GO_VERSION ?= $(shell grep '^GO_VERSION=' .env | cut -d= -f2) + # Setting SHELL to bash allows bash commands to be executed by recipes. # Options are set to exit when a recipe line exits non-zero or a piped command fails. SHELL = /usr/bin/env bash -o pipefail @@ -215,7 +218,11 @@ run: manifests generate fmt vet ## Run a controller from your host. 
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: ## Build docker image with the manager. - $(CONTAINER_TOOL) build -t ${IMG} . + $(CONTAINER_TOOL) build --build-arg GO_VERSION=$(GO_VERSION) -t ${IMG} . + +.PHONY: docker-build-amd64 +docker-build-amd64: ## Build docker image for linux/amd64 (e.g. for x86_64 servers/EC2). + $(CONTAINER_TOOL) build --platform=linux/amd64 --build-arg GO_VERSION=$(GO_VERSION) -t ${IMG} . .PHONY: docker-push docker-push: ## Push docker image with the manager. @@ -234,7 +241,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross - $(CONTAINER_TOOL) buildx create --name splunk-ai-operator-builder $(CONTAINER_TOOL) buildx use splunk-ai-operator-builder - - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . + - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --build-arg GO_VERSION=$(GO_VERSION) --tag ${IMG} -f Dockerfile.cross . 
- $(CONTAINER_TOOL) buildx rm splunk-ai-operator-builder rm Dockerfile.cross diff --git a/api/v1/aiplatform_types.go b/api/v1/aiplatform_types.go index 344c6ce..3d5ba37 100644 --- a/api/v1/aiplatform_types.go +++ b/api/v1/aiplatform_types.go @@ -364,13 +364,13 @@ type SidecarSpec struct { // ObjectStorageSpec defines object storage configuration for AI artifacts, tasks, and models type ObjectStorageSpec struct { // Remote volume URI in the format s3://bucketname/, gs://bucketname/, - // azure://containername/, or minio://bucketname/ + // azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// // +kubebuilder:validation:Required - // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$` + // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$` Path string `json:"path"` - // Optional override endpoint (only needed for S3-compatible services like MinIO) - // Must be a valid HTTP/HTTPS URL + // Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + // Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) // +kubebuilder:validation:Optional // +kubebuilder:validation:Pattern=`^https?://.*$` Endpoint string `json:"endpoint,omitempty"` @@ -380,11 +380,17 @@ type ObjectStorageSpec struct { // +kubebuilder:validation:MinLength=1 Region string `json:"region"` - // Secret name containing storage credentials + // Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible backends) // +kubebuilder:validation:Optional // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 SecretRef string `json:"secretRef,omitempty"` + + // Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. 
+ // Values: aws, minio, seaweedfs, s3compat, gcs, azure + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Enum=aws;minio;seaweedfs;s3compat;gcs;azure + Provider string `json:"provider,omitempty"` } // IngressSpec defines Ingress configuration for external access to platform services diff --git a/api/v1/aiservice_types.go b/api/v1/aiservice_types.go index 41914fa..f4965e2 100644 --- a/api/v1/aiservice_types.go +++ b/api/v1/aiservice_types.go @@ -53,6 +53,12 @@ type AIServiceSpec struct { // +kubebuilder:validation:Optional AIPlatformUrl string `json:"aiPlatformUrl,omitempty"` + // AIPlatformScheme specifies the URL scheme for the AI Platform service ("http" or "https") + // +kubebuilder:validation:Optional + // +kubebuilder:default="http" + // +kubebuilder:validation:Enum=http;https + AIPlatformScheme string `json:"aiPlatformScheme,omitempty"` + // AIPlatformRef is a reference to the AIPlatform resource // +kubebuilder:validation:Required AIPlatformRef corev1.ObjectReference `json:"aiPlatformRef"` @@ -117,6 +123,45 @@ type AIServiceSpec struct { // +kubebuilder:default="cluster.local" // +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` ClusterDomain string `json:"clusterDomain,omitempty"` + + // V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + // Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + // +kubebuilder:validation:Optional + V2 SAIAv2Config `json:"v2,omitempty"` + + // V2Worker configures the v2 SAIA worker deployment (same v2 image, command=run-worker.sh). + // +kubebuilder:validation:Optional + V2Worker SAIAWorkerConfig `json:"v2Worker,omitempty"` +} + +// SAIAv2Config defines the configuration for the SAIA v2 API deployment. 
+type SAIAv2Config struct { + // Image is the container image for the v2 API pod + // +kubebuilder:validation:Optional + Image string `json:"image,omitempty"` + + // Replicas is the number of v2 API replicas + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas,omitempty"` + + // Resources defines the compute resources for the v2 API pods + // +kubebuilder:validation:Optional + Resources corev1.ResourceRequirements `json:"resources,omitempty"` +} + +// SAIAWorkerConfig defines the configuration for a SAIA worker deployment. +type SAIAWorkerConfig struct { + // Replicas is the number of worker replicas + // +kubebuilder:validation:Optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas,omitempty"` + + // Resources defines the compute resources for the worker pods + // +kubebuilder:validation:Optional + Resources corev1.ResourceRequirements `json:"resources,omitempty"` } // MetricsConfig defines the metrics configuration for monitoring diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index f63b7c9..987e478 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -247,6 +247,8 @@ func (in *AIServiceSpec) DeepCopyInto(out *AIServiceSpec) { out.Metrics = in.Metrics in.MTLS.DeepCopyInto(&out.MTLS) in.ServiceTemplate.DeepCopyInto(&out.ServiceTemplate) + in.V2.DeepCopyInto(&out.V2) + in.V2Worker.DeepCopyInto(&out.V2Worker) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AIServiceSpec. @@ -526,6 +528,38 @@ func (in *ReplicasSpec) DeepCopy() *ReplicasSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *SAIAWorkerConfig) DeepCopyInto(out *SAIAWorkerConfig) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SAIAWorkerConfig. +func (in *SAIAWorkerConfig) DeepCopy() *SAIAWorkerConfig { + if in == nil { + return nil + } + out := new(SAIAWorkerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SAIAv2Config) DeepCopyInto(out *SAIAv2Config) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SAIAv2Config. +func (in *SAIAv2Config) DeepCopy() *SAIAv2Config { + if in == nil { + return nil + } + out := new(SAIAv2Config) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SchedulingSpec) DeepCopyInto(out *SchedulingSpec) { *out = *in diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index fe8f28d..23a5274 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -1,13 +1,19 @@ applications: - name: Entrypoint - import_path: splunkai_models_apps.custom.deployments.entrypoint.main:SERVE_APP + import_path: main:SERVE_APP route_prefix: / runtime_env: + working_dir: "file:///home/ray/ray/applications/entrypoint.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -15,6 +21,166 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" + - args: + application_name: GptOss120b + deployment_configs: + LLMDeployment: + gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 1 + L40S: + ray_actor_options: + num_gpus: 2 + options: + autoscaling_config: + max_replicas: {{.Replicas.GptOss120b}} + min_replicas: {{.Replicas.GptOss120b}} + deployment_type: text_gen_model_deployment + gpu_types: '["{{.AcceleratorType}}"]' + model_definition: + gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 1 + L40S: + engine_args: + gpu_memory_utilization: 0.90 + tensor_parallel_size: 2 + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_120b + model_loader: + 
blob_storage: + blob_prefix: model_artifacts/gpt-oss-120b + tokenizer_definition: + model_id: gpt_oss_120b + model_loader: + blob_storage: + artifacts_list: + - chat_template.jinja + - config.json + - tokenizer_config.json + - tokenizer.json + blob_prefix: model_artifacts/gpt-oss-120b + name: GptOss120b + import_path: main:create_serve_app + route_prefix: /gpt_oss_120b + runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" + env_vars: + API_VERSION: "v1" + APPLICATION_NAME: gpt_oss_120b + VLLM_ATTENTION_BACKEND: TRITON_ATTN + ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" + CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + ENABLE_AUTHN: "false" + ENABLE_AUTHZ: "false" + SERVICE_EXTERNAL_NAME: "ai-platform-models" + SERVICE_INTERNAL_NAME: "ai_platform_models" + SERVICE_NAME: "ai_platform_models" + SKIP_VERIFICATION: "true" + USE_SYSTEM_PERMISSIONS: "true" + VLLM_WORKER_MULTIPROC_METHOD: spawn + # Disable the Redis-backed Responses API store (see ai-platform-models + # commit c1f9aef3: "feat: add a no-op store"). When True, the vLLM + # TextGen deployment constructs NoOpOpenAIServingResponses instead of + # RedisOpenAIServingResponses, so /v1/responses works without a Redis + # infra. Without this flag the deployment raises + # RuntimeError: Responses Redis URL not set + # on every request, which surfaces as an empty SSE stream and the SAIA + # v2 /query path fails with "An error occurred processing your request". + # Airgap k0s has no Redis; cloud sets this to "False" and wires + # RESPONSES_REDIS_ADDRESS to its in-namespace Redis StatefulSet. 
+ DISABLE_RESPONSES_API_REDIS: "True" + - args: + application_name: GptOss20b + deployment_configs: + LLMDeployment: + gpu_type_options_override: + H100: + ray_actor_options: + num_gpus: 0.5 + L40S: + ray_actor_options: + num_gpus: 1 + options: + autoscaling_config: + max_replicas: {{.Replicas.GptOss20b}} + min_replicas: {{.Replicas.GptOss20b}} + deployment_type: text_gen_model_deployment + gpu_types: '["{{.AcceleratorType}}"]' + model_definition: + gpu_type_model_config_override: + H100: + engine_args: + gpu_memory_utilization: 0.5 + tensor_parallel_size: 1 + L40S: + engine_args: + gpu_memory_utilization: 0.95 + tensor_parallel_size: 1 + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_20b + model_loader: + blob_storage: + blob_prefix: model_artifacts/gpt-oss-20b + tokenizer_definition: + model_id: gpt_oss_20b + model_loader: + blob_storage: + artifacts_list: + - chat_template.jinja + - config.json + - tokenizer_config.json + - tokenizer.json + blob_prefix: model_artifacts/gpt-oss-20b + name: GptOss20b + import_path: main:create_serve_app + route_prefix: /gpt_oss_20b + runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" + env_vars: + API_VERSION: "v1" + APPLICATION_NAME: gpt_oss_20b + VLLM_ATTENTION_BACKEND: TRITON_ATTN + ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" + CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + ENABLE_AUTHN: "false" + ENABLE_AUTHZ: "false" + SERVICE_EXTERNAL_NAME: "ai-platform-models" + SERVICE_INTERNAL_NAME: "ai_platform_models" + SERVICE_NAME: "ai_platform_models" + SKIP_VERIFICATION: 
"true" + USE_SYSTEM_PERMISSIONS: "true" + VLLM_WORKER_MULTIPROC_METHOD: spawn + # See GptOss120b above for rationale. Must be "True" in airgap (no + # Redis) so vLLM uses NoOpOpenAIServingResponses. + DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: UaeLarge deployment_configs: @@ -22,42 +188,48 @@ applications: gpu_type_options_override: H100: ray_actor_options: - num_gpus: 0.025 + num_gpus: 0.0375 L40S: ray_actor_options: - num_gpus: 0.05 + num_gpus: 0.075 options: autoscaling_config: max_replicas: {{.Replicas.UaeLarge}} min_replicas: {{.Replicas.UaeLarge}} ray_actor_options: - num_gpus: 0.1 + num_gpus: 0.15 deployment_type: embedding_model_deployment model_definition: gpu_type_model_config_override: H100: engine_args: - gpu_memory_utilization: 0.025 + gpu_memory_utilization: 0.0375 L40S: engine_args: - gpu_memory_utilization: 0.05 + gpu_memory_utilization: 0.075 model_config: engine_args: - gpu_memory_utilization: 0.1 + gpu_memory_utilization: 0.15 tensor_parallel_size: 1 model_id: uae_large model_loader: - object_storage: - prefix: model_artifacts/uae-large + blob_storage: + blob_prefix: model_artifacts/uae-large name: UaeLarge - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /uae_large runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -91,17 +263,23 @@ applications: tensor_parallel_size: 1 model_id: all_minilm_l6_v2 
model_loader: - object_storage: - prefix: model_artifacts/all-minilm-l6-v2 + blob_storage: + blob_prefix: model_artifacts/all-minilm-l6-v2 name: AllMinilmL6V2 - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -135,17 +313,23 @@ applications: tensor_parallel_size: 1 model_id: bi_encoder model_loader: - object_storage: - prefix: model_artifacts/bi-encoder + blob_storage: + blob_prefix: model_artifacts/bi-encoder name: BiEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -172,20 +356,21 @@ applications: ray_actor_options: num_gpus: 0.2 deployment_type: custom_deployment - 
model_definition: - model_id: mbart_translator - model_loader: - object_storage: - prefix: model_artifacts/mbart-translator name: MbartTranslator - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -222,21 +407,26 @@ applications: model_config: engine_args: gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: xlm_roberta_language_classifier model_loader: - object_storage: - prefix: model_artifacts/xlm-roberta-language-classifier + blob_storage: + blob_prefix: model_artifacts/xlm-roberta-language-classifier name: XlmRobertaLanguageClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" 
ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -249,14 +439,20 @@ applications: custom_deployment_import_path: prompt_injection_tfidf:PromptInjectionTfidfDeployment deployment_type: custom_deployment name: PromptInjectionTfidf - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -290,18 +486,24 @@ applications: tensor_parallel_size: 1 model_id: cross_encoder model_loader: - object_storage: - prefix: model_artifacts/cross-encoder + blob_storage: + blob_prefix: model_artifacts/cross-encoder model_type: vllm_scoring_model name: CrossEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" 
SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -309,78 +511,6 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - - args: - application_name: Llama31Instruct - deployment_configs: - LLMDeployment: - gpu_type_options_override: - A10G: - ray_actor_options: - num_gpus: 2 - H100: - ray_actor_options: - num_gpus: 0.5 - L40S: - ray_actor_options: - num_gpus: 1 - T4: - ray_actor_options: - num_gpus: 4 - runtime_env: - pip: - - triton==3.2.0 - options: - autoscaling_config: - max_replicas: {{.Replicas.Llama31Instruct}} - min_replicas: {{.Replicas.Llama31Instruct}} - deployment_type: text_gen_model_deployment - gpu_types: '["L40S"]' - model_definition: - gpu_type_model_config_override: - A10G: - engine_args: - tensor_parallel_size: 2 - H100: - engine_args: - gpu_memory_utilization: 0.5 - tensor_parallel_size: 1 - L40S: - engine_args: - tensor_parallel_size: 1 - T4: - engine_args: - dtype: half - tensor_parallel_size: 4 - model_id: llama31_instruct - model_loader: - object_storage: - prefix: model_artifacts/llama31-8b-instruct - tokenizer_definition: - model_id: llama31_instruct - model_loader: - object_storage: - artifacts_list: - - config.json - - tokenizer_config.json - - tokenizer.json - prefix: model_artifacts/llama31-8b-instruct - name: Llama31Instruct - import_path: splunkai_models_apps.main:create_serve_app - route_prefix: /llama31_instruct - runtime_env: - env_vars: - API_VERSION: "v1" - APPLICATION_NAME: llama31_instruct - ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" - CLOUD_PROVIDER: "{{.CloudProvider}}" - ENABLE_AUTHN: "false" - ENABLE_AUTHZ: "false" - SERVICE_EXTERNAL_NAME: "ai-platform-models" - SERVICE_INTERNAL_NAME: "ai_platform_models" - SERVICE_NAME: "ai_platform_models" - SKIP_VERIFICATION: "true" - USE_SYSTEM_PERMISSIONS: "true" - VLLM_WORKER_MULTIPROC_METHOD: spawn - args: application_name: E5LanguageClassifier deployment_configs: @@ -410,21 +540,26 @@ applications: model_config: engine_args: 
gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: e5_language_classifier model_loader: - object_storage: - prefix: model_artifacts/e5-language-classifier + blob_storage: + blob_prefix: model_artifacts/e5-language-classifier name: E5LanguageClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: e5_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -432,87 +567,6 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - - args: - application_name: Llama3170bInstructAwq - deployment_configs: - LLMDeployment: - gpu_type_options_override: - A100: - ray_actor_options: - num_gpus: 4 - A10G: - ray_actor_options: - num_gpus: 4 - H100: - ray_actor_options: - num_gpus: 1 - L40S: - ray_actor_options: - num_gpus: 2 - T4: - ray_actor_options: - num_gpus: 8 - runtime_env: - pip: - - triton==3.2.0 - options: - autoscaling_config: - max_replicas: {{.Replicas.Llama3170bInstructAwq}} - min_replicas: {{.Replicas.Llama3170bInstructAwq}} - max_ongoing_requests: 4 - deployment_type: text_gen_model_deployment - gpu_types: '["L40S"] ' - model_definition: - gpu_type_model_config_override: - A100: - engine_args: - tensor_parallel_size: 4 - A10G: - engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 4 - H100: - engine_args: - 
gpu_memory_utilization: 0.95 - tensor_parallel_size: 1 - L40S: - engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 2 - T4: - engine_args: - dtype: half - tensor_parallel_size: 8 - model_id: llama31_70b_instruct_awq - model_loader: - object_storage: - prefix: model_artifacts/llama31-70b-instruct-awq - tokenizer_definition: - model_id: llama31_70b_instruct_awq - model_loader: - object_storage: - artifacts_list: - - config.json - - tokenizer_config.json - - tokenizer.json - prefix: model_artifacts/llama31-70b-instruct-awq - name: Llama3170bInstructAwq - import_path: splunkai_models_apps.main:create_serve_app - route_prefix: /llama31_70b_instruct_awq - runtime_env: - env_vars: - API_VERSION: "v1" - APPLICATION_NAME: llama31_70b_instruct_awq - ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" - CLOUD_PROVIDER: "{{.CloudProvider}}" - ENABLE_AUTHN: "false" - ENABLE_AUTHZ: "false" - SERVICE_EXTERNAL_NAME: "ai-platform-models" - SERVICE_INTERNAL_NAME: "ai_platform_models" - SERVICE_NAME: "ai_platform_models" - SKIP_VERIFICATION: "true" - USE_SYSTEM_PERMISSIONS: "true" - VLLM_WORKER_MULTIPROC_METHOD: spawn - args: application_name: PromptInjectionCrossEncoder deployment_configs: @@ -538,14 +592,20 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-cross-encoder-1114 model_type: sentence_transformer_cross_encoder name: PromptInjectionCrossEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: 
"{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -564,14 +624,20 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-classifier-01052025 model_type: custom_model name: PromptInjectionClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 0fec5fc..a9192da 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,8 +4,8 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 - Llama31Instruct: 1 - Llama3170bInstructAwq: 1 + GptOss20b: 1 + GptOss120b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 PromptInjectionCrossEncoder: 1 @@ -17,6 +17,9 @@ instanceScale: l40s-0-gpu: 1 l40s-1-gpu: 2 l40s-2-gpu: 1 + H100: + h100-0-gpu: 1 + h100-1-gpu: 2 H100_NVL: h100-nvl-0-gpu: 1 h100-nvl-1-gpu: 2 \ No newline at end of file diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index 46518de..e704fd7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -18,8 +18,8 @@ L40S: cpu: "4" limits: cpu: "16" 
- memory: "16Gi" - ephemeral-storage: "50Gi" + memory: "64Gi" + ephemeral-storage: "200Gi" nvidia.com/gpu: "1" - tier: l40s-2-gpu gpusPerPod: 2 @@ -31,6 +31,30 @@ L40S: memory: "48Gi" ephemeral-storage: "100Gi" nvidia.com/gpu: "2" +H100: + - tier: h100-0-gpu + gpusPerPod: 0 + env: + NVIDIA_VISIBLE_DEVICES: void + resources: + limits: + cpu: "16" + memory: "32Gi" + ephemeral-storage: "10Gi" + nvidia.com/gpu: "0" + requests: + cpu: "4" + - tier: h100-1-gpu + gpusPerPod: 1 + # No NVIDIA_VISIBLE_DEVICES here - GPUs must be visible for vLLM + resources: + requests: + cpu: "4" + limits: + cpu: "16" + memory: "48Gi" + ephemeral-storage: "100Gi" + nvidia.com/gpu: "1" H100_NVL: - tier: h100-nvl-0-gpu gpusPerPod: 0 diff --git a/config/crd/bases/ai.splunk.com_aiplatforms.yaml b/config/crd/bases/ai.splunk.com_aiplatforms.yaml index 98675dc..67fc505 100644 --- a/config/crd/bases/ai.splunk.com_aiplatforms.yaml +++ b/config/crd/bases/ai.splunk.com_aiplatforms.yaml @@ -2227,15 +2227,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. 
+ Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2255,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/config/crd/bases/ai.splunk.com_aiservices.yaml b/config/crd/bases/ai.splunk.com_aiservices.yaml index f9c3493..f203f3c 100644 --- a/config/crd/bases/ai.splunk.com_aiservices.yaml +++ b/config/crd/bases/ai.splunk.com_aiservices.yaml @@ -1818,15 +1818,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. 
Required for @@ -1834,7 +1846,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -1882,6 +1895,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). + properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. 
+ properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 415896e..6cf0049 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -7,16 +7,16 @@ resources: patches: - patch: "- op: add\n path: /spec/template/spec/containers/0/env\n value: \n - name: WATCH_NAMESPACE\n value: WATCH_NAMESPACE_VALUE\n - name: RELATED_IMAGE_SPLUNK_ENTERPRISE\n - \ value: splunk/splunk:10.2.0-dev1\n - name: OPERATOR_NAME\n value: - splunk-operator\n - name: POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: - metadata.name\n - name: RELATED_IMAGE_RAY_HEAD\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-head:build-17\"\n + \ value: splunk/splunk:10.2.0-dev1\n - name: OPERATOR_NAME\n value: splunk-operator\n + \ - name: POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n + \ - name: RELATED_IMAGE_RAY_HEAD\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-head:build-17\"\n \ - name: RELATED_IMAGE_RAY_WORKER\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-17\"\n \ - name: RELATED_IMAGE_WEAVIATE\n value: \"semitechnologies/weaviate:stable-v1.28-007846a\"\n \ - name: RELATED_IMAGE_SAIA_API\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-api:build-1\"\n \ - name: RELATED_IMAGE_POST_INSTALL_HOOK\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-1\"\n - \ - name: RELATED_IMAGE_FLUENT_BIT\n value: \"fluent/fluent-bit:1.9.6\"\n - \ - name: RELATED_IMAGE_OTEL_COLLECTOR\n value: \"otel/opentelemetry-collector-contrib:0.122.1\"\n - - name: MODEL_VERSION\n value: \"v0.3.14-36-g1549f5a\"\n - name: RAY_VERSION\n + \ - name: 
RELATED_IMAGE_FLUENT_BIT\n value: \"fluent/fluent-bit:1.9.6\"\n - + name: RELATED_IMAGE_OTEL_COLLECTOR\n value: \"otel/opentelemetry-collector-contrib:0.122.1\"\n + \ - name: MODEL_VERSION\n value: \"v0.3.14-36-g1549f5a\"\n - name: RAY_VERSION\n \ value: \"2.44.0\"" target: kind: Deployment @@ -26,4 +26,4 @@ kind: Kustomization images: - name: controller newName: docker.io/splunk/splunk-ai-operator - newTag: 0.1.0 + newTag: v0.0.1 diff --git a/docs/configuration/object-storage.md b/docs/configuration/object-storage.md new file mode 100644 index 0000000..70a1f7a --- /dev/null +++ b/docs/configuration/object-storage.md @@ -0,0 +1,106 @@ +# Object Storage Selection + +This document describes how the Splunk AI Operator chooses the object storage backend and how to configure AWS S3, MinIO, SeaweedFS, or any S3-compatible storage. + +## How the operator decides the backend + +The operator selects the storage backend **only by the path scheme** in `spec.objectStorage.path`: + +| Path scheme | Backend behavior | cloudProvider | artifactsProvider | +|-----------------|-------------------------------------|---------------|-------------------| +| `s3://` | **AWS S3** (region, IRSA, no custom endpoint) | `aws` | `s3` | +| `s3compat://` | **S3-compatible** (generic; requires endpoint + secretRef) | `s3compat` | `s3` | +| `minio://` | **MinIO** (alias for S3-compatible) | `s3compat` | `s3` | +| `seaweedfs://` | **SeaweedFS** (alias for S3-compatible) | `s3compat` | `s3` | +| `gs://` / `gcs://` | **GCP Cloud Storage** | `gcp` | `gcs` | +| `azure://` | **Azure Blob Storage** | `azure` | `azure` | + +- **Path scheme** is the only decision input; there is no separate "provider type" switch in the operator logic. +- For **S3-compatible** backends (MinIO, SeaweedFS, Ceph, or any custom S3 API), use **`s3compat://bucket/prefix`** with `endpoint` and `secretRef` set. 
You can also use `minio://` or `seaweedfs://` as aliases; all use the same implementation (AWS S3 SDK with custom endpoint and path-style). + +## cloudProvider vs artifactsProvider + +- **cloudProvider**: Identifies the *platform* (e.g. `aws` for native AWS S3, `s3compat` for MinIO/SeaweedFS/other S3-compatible). Used for telemetry and any logic that needs to distinguish "real AWS" from "custom S3-compatible". +- **artifactsProvider**: The *protocol* used to access artifacts. For all S3 API backends (AWS S3, MinIO, SeaweedFS) the protocol is the S3 API, so `artifactsProvider` is always `s3` for those. Only GCS and Azure use different protocols (`gcs`, `azure`). + +## Path schemes and required fields + +- **`s3://bucket/prefix`** + - Use for **AWS S3** only. + - Set `region`. Optionally use `secretRef` for static credentials; otherwise IRSA or default AWS credential chain is used. Do **not** set `endpoint` for native S3. + +- **`s3compat://bucket/prefix`** + - Use for **any S3-compatible** backend (MinIO, SeaweedFS, Ceph, etc.). + - **Required:** `endpoint` (e.g. `http://minio.namespace.svc:9000` or `http://seaweedfs-s3:8333`), `region` (any value), `secretRef` with `s3_access_key` and `s3_secret_key`. + +- **`minio://bucket/prefix`** + - Alias for S3-compatible; use for **MinIO** (in-cluster or external). Same requirements as `s3compat://`. + +- **`seaweedfs://bucket/prefix`** + - Alias for S3-compatible; use for **SeaweedFS** (bring your own). Same requirements as `s3compat://`. + +## Optional provider field + +`spec.objectStorage.provider` is an optional hint for documentation and tooling. Allowed values: `aws`, `minio`, `seaweedfs`, `s3compat`, `gcs`, `azure`. The operator **does not** use this field to select the backend; behavior is derived only from the path scheme (and for `s3://`, absence of endpoint). Use it for clarity in manifests or scripts. 
+ +## YAML examples + +### AWS S3 + +```yaml +spec: + objectStorage: + path: s3://my-ai-bucket/artifacts + region: us-east-2 + # secretRef optional when using IRSA +``` + +### MinIO (in-cluster) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://minio.minio.svc.cluster.local:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### MinIO (external, e.g. EC2) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://10.0.1.50:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### SeaweedFS + +```yaml +spec: + objectStorage: + path: seaweedfs://my-bucket/artifacts + endpoint: http://seaweedfs-s3.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +### Generic S3-compatible (e.g. Ceph, custom endpoint) + +```yaml +spec: + objectStorage: + path: s3compat://my-bucket/artifacts + endpoint: http://s3-gateway.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +The same Kubernetes secret format is used for all S3-compatible backends: keys `s3_access_key` and `s3_secret_key`. Pods receive **`S3COMPAT_OBJECT_STORE_ENDPOINT_URL`** (when endpoint is set), **`S3COMPAT_OBJECT_STORE_ACCESS_KEY`**, and **`S3COMPAT_OBJECT_STORE_SECRET_KEY`** from the operator. + +## Adding new S3-compatible backends + +Any storage that exposes an S3-compatible API (e.g. Ceph, DigitalOcean Spaces) can be used by using **`s3compat://bucket`** with the appropriate `endpoint` and `secretRef`. No new client code or scheme is required; `minio://` and `seaweedfs://` remain as optional aliases for clarity. 
diff --git a/docs/configuration/storage-artifacts.md b/docs/configuration/storage-artifacts.md index 58ae8f9..4584e28 100644 --- a/docs/configuration/storage-artifacts.md +++ b/docs/configuration/storage-artifacts.md @@ -6,10 +6,17 @@ The Splunk AI team has provided global artifact storage in a publicly readable S ## Prerequisites Utilizing the AI Platform requires one of the following remote storage providers: - * An Amazon S3 or S3-API-compliant remote object storage location + * **AWS S3** – Native Amazon S3 (use path scheme `s3://`) + * **MinIO** – S3-compatible, in-cluster or external (use path scheme `s3compat://` or `minio://` with endpoint and credentials) + * **SeaweedFS** – S3-compatible (use path scheme `s3compat://` or `seaweedfs://` with endpoint and credentials) + * Any other **S3-API-compatible** storage (use `s3compat://` with endpoint and secretRef; `minio://` and `seaweedfs://` are optional aliases) * Azure blob storage * GCP Cloud Storage +### Object storage selection + +The operator chooses the backend **by the path scheme** in `spec.objectStorage.path`. Use `s3://` for AWS S3 only; use `s3compat://` (or `minio://` / `seaweedfs://` as aliases) with `endpoint` and `secretRef` for MinIO, SeaweedFS, or any S3-compatible backend. See [Object Storage Selection](object-storage.md) for the full decision table, path schemes, and YAML examples. + ### Prerequisites common to all remote storage providers * Read-write access to the path used to host the files. * Connections to the remote object storage endpoint need to be secured using a minimum version of TLS 1.2. 
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 871dbc7..57854d5 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -196,6 +196,167 @@ kubectl logs -l ray.io/node-type=worker -n | grep - `CUDA_VISIBLE_DEVICES is set to empty string` → GPU configuration issue - `RuntimeError: CUDA out of memory` → Increase GPU resources +#### "Invalid repository ID or local directory" (e.g. Llama31Instruct / VLLMTextGenModel) + +If you see a validation error like: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file. + - For Hugging Face models: ensure the presence of a 'config.json'. +``` + +the model loader is trying to use a **local path** where the model should have been downloaded from object storage (S3/MinIO). That path is either missing or does not contain the required files (e.g. `config.json`). Common causes: + +1. **Model not in object storage** + The prefix `model_artifacts/llama31-8b-instruct` must exist in your bucket with a full Hugging Face–style layout (including `config.json` and weight files). + - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` + - Upload to MinIO/S3-compatible: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials as in the [artifacts README](../tools/artifacts_download_upload_scripts/README.md); `MINIO_*` env vars are also accepted). + +2. **Ray workers cannot reach MinIO/S3** + - For **external MinIO** (e.g. EC2): ensure the MinIO endpoint in `cluster-config.yaml` (`storage.minio.endpoint`) is reachable from EKS (security groups, VPC, and if using a public IP, that nodes can egress to it). 
+ - From a Ray worker pod: + `kubectl exec -it <ray-worker-pod> -n <namespace> -- env | grep -E 'OBJECT_STORE|ARTIFACTS|S3'` + then test connectivity (e.g. curl to the object store endpoint or use the same client the SDK uses). + +3. **Wrong or missing credentials** + AIPlatform must have `objectStorage.secretRef` pointing to a secret with `s3_access_key` and `s3_secret_key` (the operator passes these as `S3COMPAT_OBJECT_STORE_ACCESS_KEY` / `S3COMPAT_OBJECT_STORE_SECRET_KEY` to Ray). Verify the secret exists and matches the S3-compatible account that can read the bucket: + - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'` + +4. **Bucket/prefix mismatch** + The bucket name in AIPlatform `objectStorage.path` (e.g. `minio://<bucket>`) and the prefix in the application config (`model_artifacts/llama31-8b-instruct`) must match where you uploaded the model. + +**Quick checks:** + +- List objects in the object store for the model prefix (from a host with `mc` or AWS CLI configured): + - `mc ls myminio/<bucket>/model_artifacts/llama31-8b-instruct/` + You should see at least `config.json` and the model weight files. +- From a Ray worker pod, confirm that the artifact cache path exists and is populated: + - `kubectl exec -it <ray-worker-pod> -n <namespace> -- ls -la /home/ray/.cache/s3/artifacts/model_artifacts/ 2>/dev/null || echo "path missing or empty"` + If the directory is missing or empty, the download from object storage failed (network, credentials, or missing objects). + +**Full reset when the deployment keeps failing (e.g. Llama31Instruct / LLMDeploymentL40S):** + +If the model is correct in object storage and credentials are in the serve config but the replica still fails with "Invalid repository ID or local directory", clear the artifact cache and restart Ray so replicas run a fresh download and load. + +1. 
**Clear the artifact cache on all workers** + Either remove only the failing model prefix or the entire `model_artifacts` tree (more thorough): + + ```bash + export AI_NS="${AI_NS:-ai-platform}" + + # Option A: clear only the failing model (e.g. llama31-8b-instruct) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct + done + + # Option B: clear entire model_artifacts (use if multiple models or unknown state) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts + done + ``` + +2. **Restart worker pods** so new replicas run and download from object storage: + + ```bash + kubectl delete pods -n "$AI_NS" -l ray.io/node-type=worker + ``` + +3. **Optional: restart the Ray head** to force a full Ray Serve redeploy (new replica placement and startup): + + ```bash + kubectl delete pod -n "$AI_NS" -l ray.io/node-type=head + ``` + +4. **Wait 10–15 minutes** for workers (and head) to be Running and for the deployment replica to download the model and start. The first download can be large (e.g. ~16 GB for Llama 3.1 8B); if the replica is restarted too soon (e.g. after a few quick failures), the download may never complete. + +5. 
**Verify** the deployment status and, if needed, that a worker has the model: + + ```bash + kubectl get rayservice -n "$AI_NS" -o yaml | grep -A 30 'Llama31Instruct:' + WORKER=$(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[0].metadata.name}') + kubectl exec -n "$AI_NS" "$WORKER" -c ray-worker -- sh -c 'ls /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct/*.safetensors 2>/dev/null || echo "No safetensors"' + ``` + +### Object store credentials and serve config verification + +When using S3-compatible object storage (MinIO, SeaweedFS, etc.), the operator injects credentials from the object storage secret into the Ray Serve config so replicas can download model artifacts. Use these steps to verify the secret and that the updated serve config is applied. + +**1. Check that the AIPlatform object storage secret exists and has the required keys** + +Replace `<namespace>` with your AIPlatform namespace (e.g. `ai-platform`) and `<secret-name>` with the value of `spec.objectStorage.secretRef` from your AIPlatform (e.g. `minio-credentials`). + +```bash +# Get AIPlatform namespace and secretRef (optional: discover from the CR) +kubectl get aiplatform -A -o custom-columns=NAME:.metadata.name,NS:.metadata.namespace,SECRET:.spec.objectStorage.secretRef + +# Confirm the secret exists in the same namespace as the AIPlatform +kubectl get secret <secret-name> -n <namespace> + +# List secret keys (names only; values are base64-encoded and must not be logged) +kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]' + +# Verify required keys are present (expect s3_access_key and s3_secret_key) +kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' +``` + +If either `s3_access_key` or `s3_secret_key` is missing, create or update the secret, for example: + +```bash +kubectl -n <namespace> create secret generic <secret-name> \ + --from-literal=s3_access_key="<access-key>" \ + --from-literal=s3_secret_key="<secret-key>" \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +**2. 
Reconcile or restart the operator with the new image** + +After updating the operator image (with the change that injects object store credentials into the serve config), either trigger a reconcile or restart the operator so it rewrites `RayService.spec.serveConfigV2`. + +- **Option A – Restart the operator deployment** (simplest; causes one reconcile when the pod comes back): + + ```bash + # Replace <operator-namespace> with the namespace where the operator runs (e.g. splunk-ai-operator-system) + kubectl rollout restart deployment splunk-ai-operator-controller-manager -n <operator-namespace> + kubectl rollout status deployment splunk-ai-operator-controller-manager -n <operator-namespace> + ``` + +- **Option B – Trigger reconcile by touching the AIPlatform** (no operator restart): + + ```bash + kubectl annotate aiplatform <aiplatform-name> -n <namespace> \ + reconcile-$(date +%s)=triggered --overwrite + ``` + + The operator will reconcile and regenerate the RayService; ensure the operator is already running the new image before doing this. + +**3. Confirm RayService.spec.serveConfigV2 includes S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY** + +The serve config is a JSON string in `RayService.spec.serveConfigV2`. Check that it contains the object store env vars for the apps (e.g. after the operator has reconciled). + +```bash +# Set your AIPlatform namespace and RayService name (often the same as AIPlatform name, e.g. 
splunk-ai-stack) +NAMESPACE="<namespace>" +RAY_SERVICE_NAME="<rayservice-name>" + +# Count occurrences of S3COMPAT_OBJECT_STORE_ACCESS_KEY in the serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | jq -Rs 'split("S3COMPAT_OBJECT_STORE_ACCESS_KEY") | length - 1' + +# Confirm the keys are present (matches the key names only, so credential values are never printed) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_ACCESS_KEY"' | head -1 +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_SECRET_KEY"' | head -1 +``` + +If the count is 0, the operator may not be using the new image, or `objectStorage.secretRef` may be unset. Ensure: + +- The AIPlatform has `spec.objectStorage.path` with scheme `s3compat://`, `minio://`, or `seaweedfs://` and `spec.objectStorage.secretRef` set to the secret name. +- The secret exists in the AIPlatform namespace and contains `s3_access_key` and `s3_secret_key`. +- The operator deployment has been restarted (or reconciled) with the image that injects object store credentials into the applications template. + +After confirming, restart Ray workers if needed so they pick up the new env (e.g. scale down and up the Ray cluster or wait for rolling restart), then re-check replica logs and the cache path `/home/ray/.cache/s3/artifacts/model_artifacts/...`. 
+ ### Weaviate Errors ```bash diff --git a/go.mod b/go.mod index 8860ea8..8af8e88 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/splunk/splunk-ai-operator -go 1.24.0 +go 1.25.0 godebug default=go1.23 @@ -10,7 +10,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.1 github.com/aws/aws-sdk-go v1.55.7 - github.com/cert-manager/cert-manager v1.18.0 + github.com/cert-manager/cert-manager v1.18.5 github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.22.2 @@ -22,6 +22,7 @@ require ( github.com/stretchr/testify v1.11.1 google.golang.org/api v0.235.0 gopkg.in/yaml.v2 v2.4.0 + gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.33.1 k8s.io/apiextensions-apiserver v0.33.1 k8s.io/apimachinery v0.33.1 @@ -31,7 +32,7 @@ require ( ) require ( - cel.dev/expr v0.24.0 // indirect + cel.dev/expr v0.25.1 // indirect cloud.google.com/go v0.121.1 // indirect cloud.google.com/go/auth v0.16.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect @@ -40,7 +41,7 @@ require ( cloud.google.com/go/monitoring v1.24.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect - github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 // indirect + github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect @@ -48,11 +49,11 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect + 
github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.12.1 // indirect - github.com/envoyproxy/go-control-plane/envoy v1.35.0 // indirect - github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect + github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect + github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect @@ -74,7 +75,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.14.2 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -97,38 +98,37 @@ require ( github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect + go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.40.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect - go.opentelemetry.io/otel/metric v1.40.0 // indirect - go.opentelemetry.io/otel/sdk v1.40.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.40.0 // indirect - go.opentelemetry.io/otel/trace v1.40.0 
// indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk v1.43.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.47.0 // indirect + golang.org/x/crypto v0.49.0 // indirect golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3 // indirect - golang.org/x/net v0.49.0 // indirect - golang.org/x/oauth2 v0.34.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.40.0 // indirect - golang.org/x/term v0.39.0 // indirect - golang.org/x/text v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.35.0 // indirect golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.40.0 // indirect + golang.org/x/tools v0.42.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/grpc v1.78.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect google.golang.org/protobuf v1.36.11 // indirect 
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.33.1 // indirect k8s.io/component-base v0.33.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect diff --git a/go.sum b/go.sum index c6c9fa9..1f95e16 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= -cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cloud.google.com/go v0.121.1 h1:S3kTQSydxmu1JfLRLpKtxRPA7rSrYPRPEUmL/PavVUw= cloud.google.com/go v0.121.1/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= cloud.google.com/go/auth v0.16.1 h1:XrXauHMd30LhQYVRHLGvJiYeczweKQXZxsTbV9TiguU= @@ -36,8 +36,8 @@ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 h1:oygO0locgZJe7PpYPXT5A29ZkwJaPqcva7BVeemZOZs= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 h1:sBEjpZlNHzK1voKq9695PJSX2o5NEXl7/OL3coiIY0c= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 h1:DHa2U07rk8syqvCge0QIGMCE1WxGj9njT44GH7zNJLQ= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 
h1:fYE9p3esPxA/C0rQ0AHhP0drtPXDRhaWiwg1DPqO7IU= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0/go.mod h1:BnBReJLvVYx2CS/UHOgVz2BXKXD9wsQPxZug20nZhd0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.51.0 h1:OqVGm6Ei3x5+yZmSJG1Mh2NwHvpVmZ08CB5qJhT9Nuk= @@ -54,12 +54,12 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= -github.com/cert-manager/cert-manager v1.18.0 h1:v7vxC1Mx5tkDz1oGOAktB88zA6TbGKcmpLM92+AIXRc= -github.com/cert-manager/cert-manager v1.18.0/go.mod h1:icDJx4kG9BCNpGjBvrmsFd99d+lXUvWdkkcrSSQdIiw= +github.com/cert-manager/cert-manager v1.18.5 h1:Gx4FSpSPYcSC4MQf43QjbxDfyTEbwZgfZQs5Lq9QlBs= +github.com/cert-manager/cert-manager v1.18.5/go.mod h1:HbPSO5MW/44wu19t84eY/K4c4/WwyPB4bA3uffOH92s= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -69,14 +69,14 @@ 
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329 h1:K+fnvUM0VZ7ZFJf0n4L/BRlnsb9pL/GuDG6FqaH+PwM= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329/go.mod h1:Alz8LEClvR7xKsrq3qzoc4N0guvVNSS8KmSChGYr9hs= -github.com/envoyproxy/go-control-plane/envoy v1.35.0 h1:ixjkELDE+ru6idPxcHLj8LBVc2bFP7iBytj353BoHUo= -github.com/envoyproxy/go-control-plane/envoy v1.35.0/go.mod h1:09qwbGVuSWWAyN5t/b3iyVfz5+z8QWGrzkoqm/8SbEs= +github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA= +github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= +github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g= +github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= -github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= -github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= +github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= +github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= 
github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -134,8 +134,8 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= github.com/googleapis/gax-go/v2 v2.14.2 h1:eBLnkZ9635krYIPD+ag1USrOAI0Nr0QYF3+/3GqO0k0= github.com/googleapis/gax-go/v2 v2.14.2/go.mod h1:ON64QhlJkhVtSqp4v1uaK92VyZ2gmvDQsweuyLV+8+w= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 h1:X+2YciYSxvMQK0UZ7sg45ZVabVZBeBuvMkmuI2V3Fak= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7/go.mod h1:lW34nIZuQ8UDPdkon5fmfp2l3+ZkQ2me/+oecHYLOII= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 h1:liMMTbpW34dhU4az1GN0pTPADwNmvoRSeoZ6PItiqnY= @@ -225,30 +225,30 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0 h1:ZoYbqX7OaA/TAikspPl3ozPI6iY6LiIY9I8cUfm+pJs= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0/go.mod h1:SU+iU7nu5ud4oCb3LQOhIZ3nRLj6FNVrKgtflbaf2ts= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0 h1:kWRNZMsfBHZ+uHjiH4y7Etn2FK26LAGkNFw7RHv1DhE= 
+go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= -go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 h1:QKdN8ly8zEMrByybbQgv8cWBcdAarwmIPZ6FThrWXJs= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0/go.mod h1:bTdK1nhqF76qiPoCCdyFIV+N/sRHYXYCTQc+3VCi3MI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0 
h1:rixTyDGXFxRy1xzhKrotaHy3/KXdPhlWARrCgK+eqUY= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0/go.mod h1:dowW6UsM9MKbJq5JTz2AMVp3/5iW5I/TStsk8S+CfHw= -go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= -go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= -go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= -go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= -go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= -go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= -go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= -go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod 
h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -258,8 +258,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= -golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3 h1:qNgPs5exUA+G0C96DrPwNrvLSj7GT/9D+3WMWUcUg34= golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -268,53 +268,53 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= -golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= -golang.org/x/oauth2 
v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= +golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod 
h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= -golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod 
h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= google.golang.org/api v0.235.0 h1:C3MkpQSRxS1Jy6AkzTGKKrpSCOd2WOGrezZ+icKSkKo= google.golang.org/api v0.235.0/go.mod h1:QpeJkemzkFKe5VCE/PMv7GsUfn9ZF+u+q1Q7w6ckxTg= google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 h1:1tXaIXCracvtsRxSBsYDiSBN0cuJvM7QYW+MrpIRY78= google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2/go.mod h1:49MsLSx0oWMOZqcpB3uL8ZOkAh1+TndpJ8ONoCBWiZk= -google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1:merA0rdPeUV3YIIfHHcH4qBkiQAc1nfCKSI7lB4cV2M= -google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 h1:H86B94AW+VfJWDqFeEbBPhEtHzJwJfTbgE2lZa54ZAQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= -google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= -google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 
h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/helm-chart/splunk-ai-operator/Chart.lock b/helm-chart/splunk-ai-operator/Chart.lock index 0136098..e80b049 100644 --- a/helm-chart/splunk-ai-operator/Chart.lock +++ b/helm-chart/splunk-ai-operator/Chart.lock @@ -13,6 +13,6 @@ dependencies: version: 72.4.0 - name: splunk-operator repository: https://splunk.github.io/splunk-operator - version: 3.0.0 -digest: sha256:41032e66994109109208bc66b07b6f10890c9c8dafe019aa480d73d4effe915a -generated: "2025-12-11T11:23:06.233099-08:00" + version: 3.1.0 +digest: sha256:bc5e962d5c6b465b26a13a91660d7fa45687c394e124abe2beb96e4a2e3760df +generated: "2026-03-21T00:24:00.448397+05:30" diff --git a/helm-chart/splunk-ai-operator/Chart.yaml b/helm-chart/splunk-ai-operator/Chart.yaml index 782cb8e..6101ae9 100644 --- a/helm-chart/splunk-ai-operator/Chart.yaml +++ b/helm-chart/splunk-ai-operator/Chart.yaml @@ -86,6 +86,6 @@ dependencies: # Splunk Operator - Required for managing Splunk Enterprise instances - name: splunk-operator - version: "3.0.0" + version: "3.1.0" repository: "https://splunk.github.io/splunk-operator" condition: splunk-operator.enabled diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml index 98675dc..67fc505 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml @@ -2227,15 +2227,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) 
- Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2255,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml index f9c3493..f203f3c 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiservices.yaml @@ -1818,15 +1818,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. 
When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -1834,7 +1846,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -1882,6 +1895,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. + properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. 
It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). 
+ properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database diff --git a/helm-chart/splunk-ai-operator/templates/deployment.yaml b/helm-chart/splunk-ai-operator/templates/deployment.yaml index 579e800..4f17067 100644 --- a/helm-chart/splunk-ai-operator/templates/deployment.yaml +++ b/helm-chart/splunk-ai-operator/templates/deployment.yaml @@ -40,7 +40,7 @@ spec: {{- toYaml .Values.securityContext | nindent 8 }} containers: - name: manager - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}" + image: "{{ if .Values.image.digest }}{{ .Values.image.repository }}@{{ .Values.image.digest }}{{ else }}{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}{{ end }}" imagePullPolicy: {{ .Values.image.pullPolicy }} args: - --metrics-bind-address=:8443 @@ -76,10 +76,14 @@ spec: value: {{ .Values.weaviateImage }} - name: RELATED_IMAGE_SAIA_API value: {{ .Values.saiaApiImage }} + - name: RELATED_IMAGE_SAIA_API_V2 + value: {{ .Values.saiaApiV2Image }} - name: RELATED_IMAGE_POST_INSTALL_HOOK value: {{ .Values.saiaSchemaImage }} - name: RELATED_IMAGE_OTEL_COLLECTOR value: {{ .Values.otelCollectorImage }} + - name: RELATED_IMAGE_NGINX + value: {{ .Values.nginxImage }} - name: MODEL_VERSION value: v0.3.14-36-g1549f5a - name: RAY_VERSION diff --git a/helm-chart/splunk-ai-operator/values.yaml b/helm-chart/splunk-ai-operator/values.yaml index 2a4f660..f16dc4a 100644 --- a/helm-chart/splunk-ai-operator/values.yaml +++ b/helm-chart/splunk-ai-operator/values.yaml @@ -107,11 +107,16 @@ weaviateImage: 
"docker.io/semitechnologies/weaviate:stable-v1.28-007846a" # SAIA (Splunk AI Assistant) images saiaApiImage: "docker.io/splunk/saia-api:1.1.0" +saiaApiV2Image: "docker.io/splunk/saia-api-v2:1.1.0" saiaSchemaImage: "docker.io/splunk/saia-data-loader:1.1.0" # OpenTelemetry Collector sidecar image otelCollectorImage: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" +# Nginx reverse proxy image used by the SAIA reconciler to route v1/v2 traffic. +# Override this in airgapped installs to point at your internal mirror. +nginxImage: "nginx:1.27-alpine" + # Set security context for Splunk Operator pod # reference: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#podsecuritycontext-v1-core securityContext: diff --git a/internal/webhook/v1/aiplatform_webhook.go b/internal/webhook/v1/aiplatform_webhook.go index 7e8ceb2..674471f 100644 --- a/internal/webhook/v1/aiplatform_webhook.go +++ b/internal/webhook/v1/aiplatform_webhook.go @@ -195,14 +195,7 @@ func (v *AIPlatformCustomValidator) ValidateUpdate(ctx context.Context, oldObj, warnings = append(warnings, createWarnings...) } - // Validate immutable fields - if oldPlatform.Spec.ObjectStorage.Path != aiplatform.Spec.ObjectStorage.Path { - allErrs = append(allErrs, field.Forbidden( - field.NewPath("spec").Child("objectStorage").Child("path"), - "objectStorage.path is immutable", - )) - } - + // Validate immutable fields (path is mutable to allow switching storage backends, e.g. 
MinIO to SeaweedFS) if oldPlatform.Spec.ObjectStorage.Region != aiplatform.Spec.ObjectStorage.Region { allErrs = append(allErrs, field.Forbidden( field.NewPath("spec").Child("objectStorage").Child("region"), @@ -237,8 +230,8 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec if objStorage.Path == "" { allErrs = append(allErrs, field.Required(fldPath.Child("path"), "objectStorage.path must be specified")) } else { - // Validate path format (s3://, gs://, azure://, minio://) - validPrefixes := []string{"s3://", "gs://", "azure://", "minio://"} + // Validate path format (s3://, gs://, azure://, s3compat://, minio://, seaweedfs://) + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(objStorage.Path, prefix) { @@ -250,7 +243,7 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec allErrs = append(allErrs, field.Invalid( fldPath.Child("path"), objStorage.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } } diff --git a/internal/webhook/v1/aiservice_webhook.go b/internal/webhook/v1/aiservice_webhook.go index 69a0f46..7d81d77 100644 --- a/internal/webhook/v1/aiservice_webhook.go +++ b/internal/webhook/v1/aiservice_webhook.go @@ -275,7 +275,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto } else { // Validate path format /* - validPrefixes := []string{"s3://", "gs://", "azure://", "minio://"} + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(taskVolume.Path, prefix) { @@ -287,7 +287,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto allErrs = append(allErrs, 
field.Invalid( fldPath.Child("path"), taskVolume.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } */ diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index b3106b1..75d9d0a 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -61,6 +61,12 @@ func (r *SaiaReconciler) Reconcile(ctx context.Context, aiservice *aiv1.AIServic {"Certificate", r.reconcileCertificate}, {"PostInstallHook", r.reconcilePostInstallHook}, {"SAIADeployment", r.reconcileSAIADeployment}, + {"SAIAv2Deployment", r.reconcileSAIAv2Deployment}, + {"SAIAv2Worker", r.reconcileSAIAv2Worker}, + {"NginxConfigMap", r.reconcileNginxConfigMap}, + {"NginxDeployment", r.reconcileNginxDeployment}, + {"SAIAv1Service", r.reconcileSAIAv1Service}, + {"SAIAv2Service", r.reconcileSAIAv2Service}, {"SAIAService", r.reconcileSAIAService}, {"ServiceMonitor", r.reconcileServiceMonitor}, } @@ -143,8 +149,12 @@ func (r *SaiaReconciler) validateAIService( clusterDomain = "cluster.local" } if ai.Spec.AIPlatformUrl == "" { - ai.Spec.AIPlatformUrl = fmt.Sprintf("%s.%s.svc.%s:8000", - aiPlatform.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, clusterDomain) + scheme := ai.Spec.AIPlatformScheme + if scheme == "" { + scheme = "http" + } + ai.Spec.AIPlatformUrl = fmt.Sprintf("%s://%s.%s.svc.%s:8000", + scheme, aiPlatform.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, clusterDomain) } if ai.Spec.VectorDbUrl == "" { ai.Spec.VectorDbUrl = fmt.Sprintf("%s.%s.svc.%s", @@ -162,17 +172,19 @@ func (r *SaiaReconciler) validateAIService( return fmt.Errorf("VectorDbUrl must be set (either from AIPlatformRef or explicitly)") } - // Default resources + // Default resources — SAIA API needs headroom beyond 2Gi or the kubelet OOMKills during startup. 
if ai.Spec.Resources.Requests == nil { ai.Spec.Resources.Requests = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), } } if ai.Spec.Resources.Limits == nil { ai.Spec.Resources.Limits = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("1"), - corev1.ResourceMemory: resource.MustParse("2Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"), } } if ai.Spec.TaskVolume.Path == "" { @@ -183,6 +195,18 @@ func (r *SaiaReconciler) validateAIService( ai.Spec.Replicas = 1 } + // V2 image is required (v2 is always deployed alongside v1) + if ai.Spec.V2.Image == "" { + r.Recorder.Event(ai, corev1.EventTypeWarning, "InvalidSpec", "v2.image must be set for SAIA v2 deployment") + return fmt.Errorf("v2.image must be set for SAIA v2 deployment") + } + if ai.Spec.V2.Replicas == 0 { + ai.Spec.V2.Replicas = 1 + } + if ai.Spec.V2Worker.Replicas == 0 { + ai.Spec.V2Worker.Replicas = 1 + } + if ai.Spec.SplunkConfiguration.Endpoint == "" && ai.Spec.SplunkConfiguration.SplunkCustomResourceRef.Name == "" { r.Recorder.Event(ai, corev1.EventTypeWarning, "SplunkConfigMissing", "Splunk configuration is missing assuming no logging") return nil @@ -300,18 +324,31 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap( cmName := fmt.Sprintf("%s-saia-config", ai.Name) // Defaults for static keys (override in user-managed CM if desired). + // + // ENABLE_AUTHZ MUST be "true" for SAIAAuthorizer.authorize() to run its + // CMP interactive-token validation branch, which is the ONLY code path + // that sets request.state.cmp_splunk_url on a successful token. 
The admin + // endpoints (AdminCapabilityAuthorizer) read that attribute to bridge the + // Splunk.interactive bearer into an EC-equivalent token. With "false" the + // main authorizer early-returns, the attribute is never set, and every + // /admin/* request fails with: + // 403 {"detail":"Admin endpoints require an authenticated EC user token."} + // There is no authorization-skip value that also preserves CMP bridging — + // the value IS "true" even in airgap CMP mode. defaults := map[string]string{ // previously hardcoded "SERVICE_NAME": "splunk_ai_assistant", "SERVICE_INTERNAL_NAME": "SAIA", "SPLUNK_ISSUERS": "https://splunk-splunk-standalone-standalone-service:8089", "SPLUNK_AI_ASSISTANT_SERVICE_CMP": "true", - "ENABLE_AUTHZ": "false", // FIXME remove when ready + "ENABLE_AUTHZ": "true", "FEATURE_CONFIG_FILE_LOCATION": "/etc/config/features_config.yaml", "PLATFORM_VERSION": "0.3.0", // TODO make configurable "SAIA_API_VERSION": "0.3.1", // TODO make configurable "TELEMETRY_ENV": "NOTLOCAL", // TODO make configurable "LOG_LEVEL": "info", + "USE_GPT_OSS": "true", + "SCS_TOKEN": "no-auth-required", } found := &corev1.ConfigMap{} @@ -542,22 +579,64 @@ func (r *SaiaReconciler) reconcilePostInstallHook( } } uri := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) + backoffLimit := int32(1) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: ai.Name + "-vector-db-setup-posthook", Namespace: ai.Namespace, }, Spec: batchv1.JobSpec{ + BackoffLimit: &backoffLimit, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ RestartPolicy: corev1.RestartPolicyNever, + // Wait for Weaviate to accept connections before running + // the schema setup container. This eliminates the + // error-pod churn that occurred when the Job was created + // before Weaviate was fully serving (the operator-level + // condition check can race with actual endpoint readiness). 
+ InitContainers: []corev1.Container{ + { + Name: "wait-for-weaviate", + Image: hookImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"python3", "-c", fmt.Sprintf( + `import urllib.request, time, sys +url = "http://%s:80/v1/.well-known/ready" +for i in range(120): + try: + r = urllib.request.urlopen(url, timeout=5) + if r.status == 200: + print("weaviate ready"); sys.exit(0) + except Exception as e: + print(f"attempt {i+1}/120: {e}") + time.sleep(5) +print("timed out waiting for weaviate"); sys.exit(1)`, + ai.Spec.VectorDbUrl, + )}, + }, + }, Containers: []corev1.Container{ { Name: "vector-db-setup-container", Image: hookImage, - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, + // The v2 data-loader image (>= v2.0.4-13-g3b677604) uses the + // Weaviate v4 Python client, which performs a gRPC health + // check on connect and requires explicit gRPC host/port. Its + // URL-compat shim defaults to the Splunk production naming + // (grpc.:443 TLS) if these are unset — wrong for k0s + // airgap where Weaviate exposes gRPC on the same Service + // (port 50051, plaintext). Always set these explicitly so + // the shim's setdefault() calls are no-ops. Env: []corev1.EnvVar{ {Name: "VECTOR_DB_URL", Value: uri}, + {Name: "VECTOR_DB_HOST", Value: ai.Spec.VectorDbUrl}, + {Name: "VECTOR_DB_PORT", Value: "80"}, + {Name: "VECTOR_DB_GRPC_HOST", Value: ai.Spec.VectorDbUrl}, + {Name: "VECTOR_DB_GRPC_PORT", Value: "50051"}, + {Name: "VECTOR_DB_SECURE", Value: "false"}, + {Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"}, {Name: "SPLUNK_AI_ASSISTANT_SERVICE_CMP", Value: "true"}, }, }, @@ -582,6 +661,258 @@ func (r *SaiaReconciler) reconcilePostInstallHook( return fmt.Errorf("created Job %q, waiting for completion", job.Name) } +// buildSAIABaseEnv returns the common environment variables shared across all SAIA pods +// (v1 API, v1 worker, v2 API, v2 worker). Callers append pod-specific vars. 
+func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar {
+	taskVolume := &ai.Spec.TaskVolume
+	bucket := extractBucketName(taskVolume.Path)
+
+	// secretEnv exposes one key of the TaskVolume credentials Secret as an env var.
+	secretEnv := func(name, key string) corev1.EnvVar {
+		return corev1.EnvVar{
+			Name: name,
+			ValueFrom: &corev1.EnvVarSource{
+				SecretKeyRef: &corev1.SecretKeySelector{
+					LocalObjectReference: corev1.LocalObjectReference{Name: taskVolume.SecretRef},
+					Key:                  key,
+				},
+			},
+		}
+	}
+
+	vars := []corev1.EnvVar{
+		{Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl},
+		// WEAVIATE_PLATFORM_URL targets the native Weaviate service directly.
+		// Because the value carries a scheme ("://"), the SAIA v1 pipeline
+		// takes it verbatim instead of applying the cloud ML-Platform
+		// "/weaviate" path convention.
+		{Name: "WEAVIATE_PLATFORM_URL", Value: fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl)},
+		{Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl},
+		{Name: "S3_BUCKET", Value: bucket},
+	}
+
+	// A custom endpoint means an S3-compatible store: publish endpoint and bucket.
+	if endpoint := taskVolume.Endpoint; endpoint != "" {
+		vars = append(vars,
+			corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: endpoint},
+			corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_BUCKET", Value: bucket},
+		)
+	}
+
+	// Credentials are only wired when the CR references a Secret.
+	if taskVolume.SecretRef != "" {
+		vars = append(vars,
+			secretEnv("S3COMPAT_OBJECT_STORE_ACCESS_KEY", "s3_access_key"),
+			secretEnv("S3COMPAT_OBJECT_STORE_SECRET_KEY", "s3_secret_key"),
+		)
+	}
+
+	return vars
+}
+
+// buildV2ExtraEnv returns additional env vars needed by the SAIA v2 image.
+// v2 uses different env var names: VECTOR_DB_HOST (not VECTOR_DB_URL),
+// ML_PLATFORM_URL (not PLATFORM_URL), and needs vector DB TLS/auth disabled.
+//
+// This also switches the conversation store from the ephemeral filesystem
+// default to the S3 backend added in saia-service commit 3d3756f3 (Tony,
+// merged into ai-tier-v2.0 via 9efe1fce on 2026-04-20, shipped in image
+// build-v2-002). See the CONVERSATION_STORE block below for the full
+// rationale; without this the v2 API returns 404 on GET /conversations/
+// <id>/items after every pod restart.
+//
+// SAIA V2 FieldDescription backend selection (required by both v2 API and v2
+// worker, else FieldDescriptionRepositoryFactory.get() raises ValueError at
+// startup and the worker enters a restart loop).
+//
+// Per Confluence ERD "ERD - AI Tier v0.2 - Bare Metal - SAIA 2.0", section
+// 3.8.1.2 + decision A.3: Option B (clean architecture) — use the new `s3`
+// backend that reads the global field-descriptions JSON from the same
+// S3-compatible object store (SeaweedFS/MinIO/CVFS) that SAIA already uses
+// for tenant data. The alternatives:
+//   - `dynamodb` — ERD assumption 2.1 explicitly disallows DynamoDB in AI Tier.
+//   - `file` — requires the saia-v2 Dockerfile to `COPY dataset/`, which
+//     the current image (v2.0.4-31-g9efe1fc) does NOT do.
+//
+// The JSON object must be pre-uploaded to S3_BUCKET/FIELD_DESCRIPTION_S3_KEY
+// before the worker runs; the data-loader Job is the canonical bootstrap step
+// for this (see scripts/data_loader/ in saia-service).
+//
+// AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY: the v2
+// S3StorageAdapter (used by S3FieldDescriptionRepository, see
+// app/repositories/field_description/factory.py) constructs boto3 directly and
+// reads the canonical AWS_* names. v1's S3COMPAT_OBJECT_STORE_* env vars are
+// already set in buildSAIABaseEnv but are NOT read by boto3, so without
+// AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY the worker would silently fall back
+// to no-credentials (NoCredentialsError caught by the repository as
+// StorageAdapterError, returning an empty cache and degraded search results).
+// Sourcing them from the same secret keys as the S3-compat creds keeps a
+// single source of truth for object-store auth.
+func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar {
+	bucketName := extractBucketName(ai.Spec.TaskVolume.Path)
+	env := []corev1.EnvVar{
+		{Name: "ML_PLATFORM_URL", Value: ai.Spec.AIPlatformUrl},
+		{Name: "VECTOR_DB_AUTH_ENABLED", Value: "false"},
+		{Name: "VECTOR_DB_GRPC_HOST", Value: ai.Spec.VectorDbUrl},
+		{Name: "VECTOR_DB_GRPC_PORT", Value: "50051"},
+		{Name: "VECTOR_DB_HOST", Value: ai.Spec.VectorDbUrl},
+		{Name: "VECTOR_DB_PORT", Value: "80"},
+		{Name: "VECTOR_DB_SECURE", Value: "false"},
+		// FieldDescription S3 backend (see doc-comment above).
+		{Name: "FIELD_DESCRIPTION_BACKEND", Value: "s3"},
+		{Name: "FIELD_DESCRIPTION_S3_KEY", Value: "field-descriptions/global-field-descriptions.json"},
+	}
+	// Conversation persistence backend.
+	//
+	// SAIA v2 defaults conversation_store to "filesystem" which writes to
+	// /home/splunk/.local_storage/conversations on the pod's ephemeral
+	// container overlay. Every v2 pod restart (worker crash-loop, operator
+	// reconfigure, Kuberay zero-downtime upgrade, node drain) wipes the full
+	// chat history and produces user-visible "Conversation ... not found"
+	// 404s on GET /conversations/<id>/items whenever the Splunk UI tries to
+	// re-hydrate a chat (incl. the saia_v2_audit_index_log_proxy flow).
+	//
+	// Tony's saia-service commits 3d3756f3 + 8e2a9f40 (merged into
+	// ai-tier-v2.0 via 9efe1fce on 2026-04-20, and present in image
+	// build-v2-002) added an S3ConversationStore that reuses the same
+	// S3-compatible object store already configured for TaskVolume
+	// (SeaweedFS / MinIO / CVFS / real AWS S3). Turning it on for the SAIA
+	// v2 API and v2 worker makes chat history survive pod restarts.
+	//
+	// Activation contract (saia-v2/app/core/config.py::Settings):
+	//   - CONVERSATION_STORE=s3
+	//   - CONVERSATION_S3_BUCKET=<bucket> (validator raises ValueError at
+	//     startup if CONVERSATION_STORE=s3 and this is empty)
+	//   - AWS_ENDPOINT_URL, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY:
+	//     already wired below for the FieldDescription S3 adapter and
+	//     transparently reused by S3ConversationStore via boto3.
+	//
+	// We only emit these when we can derive a bucket name from
+	// TaskVolume.Path (the canonical source of truth for SAIA object
+	// storage). Leaving the defaults alone in the pathological "no path"
+	// case avoids a v2 pod startup crash-loop on misconfigured CRs.
+	if bucketName != "" {
+		env = append(env,
+			corev1.EnvVar{Name: "CONVERSATION_STORE", Value: "s3"},
+			corev1.EnvVar{Name: "CONVERSATION_S3_BUCKET", Value: bucketName},
+		)
+	}
+	// Only expose AWS_ENDPOINT_URL when the operator was configured with an
+	// explicit S3-compatible endpoint (SeaweedFS/MinIO). Omitting it lets the
+	// v2 adapter use the default AWS S3 endpoint when running in a real cloud
+	// deployment.
+	if ai.Spec.TaskVolume.Endpoint != "" {
+		env = append(env, corev1.EnvVar{
+			Name:  "AWS_ENDPOINT_URL",
+			Value: ai.Spec.TaskVolume.Endpoint,
+		})
+	}
+	// boto3-canonical credentials for the v2 S3StorageAdapter. Mirrors the
+	// S3COMPAT_OBJECT_STORE_ACCESS_KEY/_SECRET_KEY plumbing in buildSAIABaseEnv;
+	// see s3compat secret schema in raybuilder/builder.go and ai.Spec.TaskVolume.SecretRef.
+	if ai.Spec.TaskVolume.SecretRef != "" {
+		env = append(env,
+			corev1.EnvVar{
+				Name: "AWS_ACCESS_KEY_ID",
+				ValueFrom: &corev1.EnvVarSource{
+					SecretKeyRef: &corev1.SecretKeySelector{
+						LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef},
+						Key:                  "s3_access_key",
+					},
+				},
+			},
+			corev1.EnvVar{
+				Name: "AWS_SECRET_ACCESS_KEY",
+				ValueFrom: &corev1.EnvVarSource{
+					SecretKeyRef: &corev1.SecretKeySelector{
+						LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef},
+						Key:                  "s3_secret_key",
+					},
+				},
+			},
+		)
+	}
+	return env
+}
+
+// buildSAIATLSEnv extends the given env, volumes, mounts and ports with the
+// TLS wiring for a SAIA container and returns all four slices.
+//
+// When mTLS is enabled with "operator" termination, the TLS Secret is mounted
+// read-only at /etc/tls, TLS_CERT_FILE/TLS_KEY_FILE point at it, and an
+// "https" container port (8443) is added. In every other case the only change
+// is TLS_DISABLED=true in env.
+func buildSAIATLSEnv(ai *aiv1.AIService, env []corev1.EnvVar, volumes []corev1.Volume, mounts []corev1.VolumeMount, ports []corev1.ContainerPort) ([]corev1.EnvVar, []corev1.Volume, []corev1.VolumeMount, []corev1.ContainerPort) {
+	// Guard clause: anything other than operator-terminated mTLS runs without TLS.
+	if !ai.Spec.MTLS.Enabled || ai.Spec.MTLS.Termination != "operator" {
+		env = append(env, corev1.EnvVar{Name: "TLS_DISABLED", Value: "true"})
+		return env, volumes, mounts, ports
+	}
+
+	tlsSecret := corev1.VolumeSource{
+		Secret: &corev1.SecretVolumeSource{SecretName: ai.Spec.MTLS.SecretName},
+	}
+	volumes = append(volumes, corev1.Volume{Name: "tls", VolumeSource: tlsSecret})
+	mounts = append(mounts, corev1.VolumeMount{Name: "tls", MountPath: "/etc/tls", ReadOnly: true})
+	env = append(env,
+		corev1.EnvVar{Name: "TLS_CERT_FILE", Value: "/etc/tls/tls.crt"},
+		corev1.EnvVar{Name: "TLS_KEY_FILE", Value: "/etc/tls/tls.key"},
+	)
+	ports = append(ports, corev1.ContainerPort{Name: "https", ContainerPort: 8443})
+
+	return env, volumes, mounts, ports
+}
+
+// saiaEnvFrom returns the EnvFromSource for the SAIA ConfigMap.
+func saiaEnvFrom(ai *aiv1.AIService) []corev1.EnvFromSource {
+	// The ConfigMap is addressed by name only: "<AIService name>-saia-config".
+	configMapName := fmt.Sprintf("%s-saia-config", ai.Name)
+	source := corev1.EnvFromSource{
+		ConfigMapRef: &corev1.ConfigMapEnvSource{
+			LocalObjectReference: corev1.LocalObjectReference{Name: configMapName},
+		},
+	}
+	return []corev1.EnvFromSource{source}
+}
+
+// saiaVolumes builds the single "config-volume" volume, backed by the
+// "splunk-<AIService name>-feature-config" ConfigMap, together with its mount
+// at /etc/config.
+func saiaVolumes(ai *aiv1.AIService) ([]corev1.Volume, []corev1.VolumeMount) {
+	// Volume and mount must agree on this name.
+	const volumeName = "config-volume"
+
+	featureConfig := &corev1.ConfigMapVolumeSource{
+		LocalObjectReference: corev1.LocalObjectReference{
+			Name: fmt.Sprintf("splunk-%s-feature-config", ai.Name),
+		},
+	}
+	configVolume := corev1.Volume{
+		Name:         volumeName,
+		VolumeSource: corev1.VolumeSource{ConfigMap: featureConfig},
+	}
+	configMount := corev1.VolumeMount{Name: volumeName, MountPath: "/etc/config"}
+
+	return []corev1.Volume{configVolume}, []corev1.VolumeMount{configMount}
+}
+
+// saiaLabelsAndAnnotations computes the label and annotation maps for the
+// given SAIA component. Built-in defaults are laid down first; labels and
+// annotations set on the AIService are copied on top and win on key
+// collisions. Two kubectl bookkeeping annotations are never copied.
+func saiaLabelsAndAnnotations(ai *aiv1.AIService, component string) (map[string]string, map[string]string) {
+	mergedLabels := map[string]string{
+		"app":       ai.Name,
+		"component": component,
+		"area":      "ml",
+		"team":      "ml",
+	}
+	for key, value := range ai.Labels {
+		mergedLabels[key] = value
+	}
+
+	mergedAnnotations := map[string]string{
+		"prometheus.io/port":   "8088",
+		"prometheus.io/path":   "/metrics",
+		"prometheus.io/scheme": "http",
+	}
+	for key, value := range ai.Annotations {
+		switch key {
+		case "kubectl.kubernetes.io/last-applied-configuration",
+			"kubectl.kubernetes.io/restartedAt":
+			// Skipped: these two keys are not propagated.
+		default:
+			mergedAnnotations[key] = value
+		}
+	}
+	return mergedLabels, mergedAnnotations
+}
+
 // reconcileSAIADeployment ensures the main Deployment exists and is configured.
 func (r *SaiaReconciler) reconcileSAIADeployment(
 	ctx context.Context,
@@ -610,25 +941,30 @@ func (r *SaiaReconciler) reconcileSAIADeployment(
 	}
 
 	// Base env: keep ONLY dynamic values here.
+ weaviatePlatformURL := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) env := []corev1.EnvVar{ // Dynamic or runtime-derived values: {Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, + {Name: "WEAVIATE_PLATFORM_URL", Value: weaviatePlatformURL}, {Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl}, // SAIA uses /tasks subdirectory within its feature path // Extract just the bucket name from the full path (e.g., "s3://bucket-name" -> "bucket-name") {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, } - // MinIO support: Add MinIO-specific environment variables if endpoint is configured - if strings.HasPrefix(ai.Spec.TaskVolume.Path, "minio") && ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, corev1.EnvVar{Name: "MINIO_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) + // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL and S3COMPAT_OBJECT_STORE_BUCKET for custom endpoint (MinIO, SeaweedFS, etc.). + if ai.Spec.TaskVolume.Endpoint != "" { + env = append(env, + corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}, + corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, + ) } - // MinIO credentials: If secretRef is provided, add MINIO_ACCESS_KEY and MINIO_SECRET_KEY from secret + // S3-compatible object store credentials from secretRef (S3COMPAT_OBJECT_STORE_ACCESS_KEY, S3COMPAT_OBJECT_STORE_SECRET_KEY). 
if ai.Spec.TaskVolume.SecretRef != "" { env = append(env, corev1.EnvVar{ - Name: "MINIO_ACCESS_KEY", + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, @@ -637,7 +973,7 @@ func (r *SaiaReconciler) reconcileSAIADeployment( }, }, corev1.EnvVar{ - Name: "MINIO_SECRET_KEY", + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, @@ -740,7 +1076,9 @@ func (r *SaiaReconciler) reconcileSAIADeployment( Containers: []corev1.Container{{ Name: ai.Name, Image: os.Getenv("RELATED_IMAGE_SAIA_API"), - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sh", "-c"}, + Args: []string{"python -m uvicorn --host 0.0.0.0 server.main:metrics_app --port 8088 & exec python -m uvicorn --host 0.0.0.0 server.main:app --port 8080"}, Ports: ports, VolumeMounts: mounts, Resources: ai.Spec.Resources, @@ -784,7 +1122,564 @@ func (r *SaiaReconciler) reconcileSAIADeployment( return nil } -// reconcileSAIAService ensures the Service for SAIA is created/updated. // remove me +// reconcileSAIAv2Deployment creates the v2 API Deployment and its internal Service. +func (r *SaiaReconciler) reconcileSAIAv2Deployment( + ctx context.Context, + ai *aiv1.AIService, +) error { + volumes, mounts := saiaVolumes(ai) + ports := []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8000}, + {Name: "metrics", ContainerPort: 8088}, + } + + env := buildSAIABaseEnv(ai) + env = append(env, buildV2ExtraEnv(ai)...) 
+	env = append(env, corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"})
+	env, volumes, mounts, ports = buildSAIATLSEnv(ai, env, volumes, mounts, ports)
+	// Sort by name so the rendered env list has a stable order.
+	sort.Slice(env, func(i, j int) bool { return env[i].Name < env[j].Name })
+
+	component := ai.Name + "-v2-api"
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v2-deployment",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v2 Deployment: %w", err)
+	}
+
+	// Apply the built-in sizing only when the CR sets neither Requests nor
+	// Limits. The previous check looked at Requests alone and then replaced the
+	// whole struct, which silently discarded user-supplied Limits (and Claims)
+	// whenever Requests was omitted. With Limits set and Requests omitted we now
+	// leave the spec untouched; per the resources field description, omitted
+	// Requests default to the explicitly specified Limits.
+	// v2Resources is a copy, so assigning its fields does not mutate ai.Spec.
+	v2Resources := ai.Spec.V2.Resources
+	if v2Resources.Requests == nil && v2Resources.Limits == nil {
+		v2Resources.Requests = corev1.ResourceList{
+			corev1.ResourceCPU:              resource.MustParse("2"),
+			corev1.ResourceMemory:           resource.MustParse("4Gi"),
+			corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"),
+		}
+		v2Resources.Limits = corev1.ResourceList{
+			corev1.ResourceCPU:              resource.MustParse("2"),
+			corev1.ResourceMemory:           resource.MustParse("4Gi"),
+			corev1.ResourceEphemeralStorage: resource.MustParse("10Gi"),
+		}
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
+		deployment.Spec.Replicas = &ai.Spec.V2.Replicas
+
+		// The selector is only set when absent; an existing one is left as-is.
+		if deployment.Spec.Selector == nil {
+			deployment.Spec.Selector = &metav1.LabelSelector{
+				MatchLabels: map[string]string{"app": ai.Name, "component": component},
+			}
+		}
+
+		deployment.Spec.Template = corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels:      map[string]string{"app": ai.Name, "component": component},
+				Annotations: annotations,
+			},
+			Spec: corev1.PodSpec{
+				ServiceAccountName: ai.Spec.ServiceAccountName,
+				Containers: []corev1.Container{{
+					Name:            "saia-v2-api",
+					Image:           ai.Spec.V2.Image,
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					Command: []string{"/bin/sh",
"-c"}, + Args: []string{". /home/splunk/init-prometheus.sh && python -m uvicorn --host 0.0.0.0 app.main:metrics_app --port 8088 & exec python -m uvicorn --host 0.0.0.0 app.main:app --port 8000"}, + Ports: ports, + VolumeMounts: mounts, + Resources: v2Resources, + Env: env, + EnvFrom: saiaEnvFrom(ai), + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)}, + }, + PeriodSeconds: 30, + FailureThreshold: 5, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)}, + }, + PeriodSeconds: 30, + FailureThreshold: 5, + }, + StartupProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: intstr.FromInt(8000)}, + }, + InitialDelaySeconds: 10, + PeriodSeconds: 30, + FailureThreshold: 5, + }, + }}, + Volumes: volumes, + Affinity: &ai.Spec.Affinity, + Tolerations: ai.Spec.Tolerations, + ImagePullSecrets: ai.Spec.ImagePullSecrets, + }, + } + return nil + }); err != nil { + return fmt.Errorf("create/update v2 Deployment: %w", err) + } + return nil +} + +// reconcileSAIAv2Worker creates the v2 worker Deployment (same v2 image, command=run-worker.sh). +func (r *SaiaReconciler) reconcileSAIAv2Worker( + ctx context.Context, + ai *aiv1.AIService, +) error { + volumes, mounts := saiaVolumes(ai) + ports := []corev1.ContainerPort{ + {Name: "metrics", ContainerPort: 8088}, + } + + env := buildSAIABaseEnv(ai) + env = append(env, buildV2ExtraEnv(ai)...) + // Keep heartbeat path in sync with saia-v2's default (app/core/config.py: + // worker_heartbeat_path = "/tmp/ingestion_worker_heartbeat"). The ingestion + // worker writes a floating-point unix timestamp to this file every poll cycle. + // + // RUN_TASKS_DELAY_S (run_tasks_delay_s) is the per-iteration sleep in + // IngestionWorker.run() when the queue is empty OR the tenant lock is busy. 
+	// The heartbeat is written only at the top of process_next(), so this sleep
+	// directly controls heartbeat cadence. The liveness probe rejects heartbeats
+	// older than 1200s, so we MUST keep this well under that threshold — 600s
+	// matches the saia-v2 helm default (see Settings.run_tasks_delay_s). Do NOT
+	// conflate with the v1 worker APScheduler cron (which uses 600s for weekly
+	// jobs); v2 reuses the same env name for a different purpose.
+	env = append(env,
+		corev1.EnvVar{Name: "RUN_TASKS_DELAY_S", Value: "600"},
+		corev1.EnvVar{Name: "VAULT_TEMPLATE_DISABLED", Value: "true"},
+		corev1.EnvVar{Name: "WORKER_HEARTBEAT_PATH", Value: "/tmp/ingestion_worker_heartbeat"},
+	)
+	// The returned ports are dropped: the worker keeps only its metrics port.
+	env, volumes, mounts, _ = buildSAIATLSEnv(ai, env, volumes, mounts, nil)
+	// Sort by name so the rendered env list has a stable order.
+	sort.Slice(env, func(i, j int) bool { return env[i].Name < env[j].Name })
+
+	component := ai.Name + "-v2-worker"
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v2-worker",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+		return fmt.Errorf("ownerref on v2 worker Deployment: %w", err)
+	}
+
+	// Apply the built-in sizing only when the CR sets neither Requests nor
+	// Limits. The previous check looked at Requests alone and then replaced the
+	// whole struct, which silently discarded user-supplied Limits (and Claims)
+	// whenever Requests was omitted. With Limits set and Requests omitted we now
+	// leave the spec untouched; per the resources field description, omitted
+	// Requests default to the explicitly specified Limits.
+	// v2WorkerResources is a copy, so assigning its fields does not mutate ai.Spec.
+	v2WorkerResources := ai.Spec.V2Worker.Resources
+	if v2WorkerResources.Requests == nil && v2WorkerResources.Limits == nil {
+		v2WorkerResources.Requests = corev1.ResourceList{
+			corev1.ResourceCPU:              resource.MustParse("2"),
+			corev1.ResourceMemory:           resource.MustParse("16Gi"),
+			corev1.ResourceEphemeralStorage: resource.MustParse("25Gi"),
+		}
+		v2WorkerResources.Limits = corev1.ResourceList{
+			corev1.ResourceCPU:              resource.MustParse("2"),
+			corev1.ResourceMemory:           resource.MustParse("16Gi"),
+			corev1.ResourceEphemeralStorage: resource.MustParse("25Gi"),
+		}
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
deployment.Spec.Replicas = &ai.Spec.V2Worker.Replicas + + if deployment.Spec.Selector == nil { + deployment.Spec.Selector = &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": ai.Name, "component": component}, + } + } + + deployment.Spec.Template = corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": ai.Name, "component": component}, + Annotations: annotations, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: ai.Spec.ServiceAccountName, + Containers: []corev1.Container{{ + Name: "saia-v2-worker", + Image: ai.Spec.V2.Image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sh", "-c"}, + Args: []string{". /home/splunk/init-prometheus.sh && python -m uvicorn --host 0.0.0.0 app.main:metrics_app --port 8088 & exec python -m app.workers.ingestion_worker"}, + Ports: ports, + VolumeMounts: mounts, + Resources: v2WorkerResources, + Env: env, + EnvFrom: saiaEnvFrom(ai), + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + Exec: &corev1.ExecAction{ + // The saia-v2 base image (python3-debian13-vault:4.1.3) is a minimal + // Python runtime that lacks coreutils like `date`, `cat`, `cut`. Use + // python3 directly, which is guaranteed to exist. The heartbeat file + // contains a float "secs.usec\n" written by ingestion_worker. 
+ Command: []string{ + "python3", "-c", + "import os,sys,time\n" + + "p=os.environ.get('WORKER_HEARTBEAT_PATH','/tmp/ingestion_worker_heartbeat')\n" + + "sys.exit(0 if os.path.exists(p) and (time.time()-float(open(p).read().strip()))<1200 else 1)", + }, + }, + }, + PeriodSeconds: 60, + FailureThreshold: 3, + InitialDelaySeconds: 30, + }, + }}, + Volumes: volumes, + Affinity: &ai.Spec.Affinity, + Tolerations: ai.Spec.Tolerations, + ImagePullSecrets: ai.Spec.ImagePullSecrets, + }, + } + return nil + }); err != nil { + return fmt.Errorf("create/update v2 worker Deployment: %w", err) + } + return nil +} + +// reconcileNginxConfigMap creates the ConfigMap with nginx.conf for path-based routing. +func (r *SaiaReconciler) reconcileNginxConfigMap( + ctx context.Context, + ai *aiv1.AIService, +) error { + v1ServiceName := ai.Name + "-saia-v1-service" + v2ServiceName := ai.Name + "-saia-v2-service" + + nginxConf := fmt.Sprintf(`worker_processes auto; +error_log /dev/stderr warn; +pid /tmp/nginx.pid; + +events { + worker_connections 1024; +} + +http { + log_format routing '$remote_addr - [$time_local] "$request" ' + 'status=$status upstream=$upstream_addr ' + 'rt=$request_time uct=$upstream_connect_time urt=$upstream_response_time'; + + access_log /dev/stdout routing; + + upstream saia_v1 { + server %s:8080; + } + + upstream saia_v2 { + server %s:8000; + } + + # Reflect Access-Control-Request-Headers back on preflight. If the browser + # didn't send any (rare), fall back to a broad default. Safer than a + # hardcoded allowlist because spl-copilot (and future clients) may add + # custom headers like x-requested-with, x-csrf-token, x-splunk-*, etc. 
+ map $http_access_control_request_headers $cors_allow_headers { + default $http_access_control_request_headers; + "" "authorization, content-type, x-ec-token, x-es-tenant-bearer, x-stack-url, x-stack-url-legacy, splunk-client, x-conversation-key, x-request-id, x-admin-preferences-filename, x-requested-with"; + } + + server { + listen 8080; + + # Nginx health/status endpoints MUST be declared before the v2 regex + # match; otherwise nginx's longest-prefix-before-regex rule would let + # exact matches win only if explicitly marked with "^~" or "=", and we + # don't want a stray /saia-api-v2/nginx_status to ever hit the backend. + location = /nginx_health { + return 200 'ok'; + add_header Content-Type text/plain; + } + + location = /nginx_status { + # stub_status exposes counters (active connections, reqs/s, etc.). + # k8s probes use /nginx_health — NOT /nginx_status — so restricting + # this to the nginx pod's loopback is safe. Operators needing to + # scrape should exec into the pod (kubectl exec curl 127.0.0.1/nginx_status). + stub_status on; + allow 127.0.0.1; + deny all; + } + + # v2: any path containing /saia-api-v2/ (with or without a tenant + # prefix). Using "search anywhere" avoids the ^// requirement + # that would silently fall through to v1 for tenant-less callers. + # Word boundary via "/saia-api-v2/" (not "saia-api-v2" substring) + # prevents accidental matches like /foo/saia-api-v2-legacy/. + location ~ /saia-api-v2/ { + # CORS preflight short-circuit. Browser preflights are + # unauthenticated by spec; SAIA v2's TenantConversationKeyMiddleware + # rejects them with 400 before FastAPI's CORSMiddleware can respond, + # which makes the browser block the real request with "No + # Access-Control-Allow-Origin header present". Answer preflight + # here and never proxy OPTIONS upstream. + # + # IMPORTANT: Do NOT emit Access-Control-Allow-Origin on non-OPTIONS + # responses — FastAPI's CORSMiddleware already sets it on real + # responses. 
A second ACAO from nginx would produce duplicate + # "*, http://origin" values that browsers reject. + if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin $http_origin always; + add_header Access-Control-Allow-Credentials true always; + add_header Access-Control-Allow-Methods 'GET, POST, PUT, DELETE, PATCH, OPTIONS' always; + add_header Access-Control-Allow-Headers $cors_allow_headers always; + add_header Access-Control-Max-Age 3600 always; + add_header Content-Length 0 always; + add_header Content-Type 'text/plain charset=UTF-8' always; + return 204; + } + + proxy_pass http://saia_v2; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + proxy_send_timeout 300s; + proxy_buffering off; + chunked_transfer_encoding on; + } + + # v1: everything else (including /health, /{tenant}/saia-api/v1alpha1/...) + location / { + # Mirror the CORS preflight short-circuit for v1 routes; spl-copilot's + # Pattern B (direct browser fetch) may hit v1 admin endpoints too. Same + # rationale as v2: SAIA v1 middlewares authenticate on OPTIONS and would + # reject the preflight before CORS headers are emitted. 
+ if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin $http_origin always; + add_header Access-Control-Allow-Credentials true always; + add_header Access-Control-Allow-Methods 'GET, POST, PUT, DELETE, PATCH, OPTIONS' always; + add_header Access-Control-Allow-Headers $cors_allow_headers always; + add_header Access-Control-Max-Age 3600 always; + add_header Content-Length 0 always; + add_header Content-Type 'text/plain charset=UTF-8' always; + return 204; + } + + proxy_pass http://saia_v1; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + proxy_send_timeout 300s; + proxy_buffering off; + } + } +} +`, v1ServiceName, v2ServiceName) + + cmName := ai.Name + "-saia-nginx-config" + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: ai.Namespace, + }, + } + + if err := controllerutil.SetControllerReference(ai, cm, r.Scheme); err != nil { + return fmt.Errorf("ownerref on nginx ConfigMap: %w", err) + } + + if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, cm, func() error { + cm.Data = map[string]string{"nginx.conf": nginxConf} + return nil + }); err != nil { + return fmt.Errorf("create/update nginx ConfigMap: %w", err) + } + return nil +} + +// reconcileNginxDeployment creates the nginx reverse proxy Deployment. 
+//
+// The Deployment is named "<name>-saia-nginx" and runs a single replica. The
+// image is taken from RELATED_IMAGE_NGINX when set and falls back to
+// "nginx:1.27-alpine". The pod mounts the nginx.conf key of the
+// "<name>-saia-nginx-config" ConfigMap at /etc/nginx/nginx.conf, listens on
+// 8080, and is probed on /nginx_health. It returns a wrapped error when the
+// create/update call (including setting the owner reference) fails.
+func (r *SaiaReconciler) reconcileNginxDeployment(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	component := ai.Name + "-nginx"
+	labels, annotations := saiaLabelsAndAnnotations(ai, component)
+	podLabels := map[string]string{"app": ai.Name, "component": component}
+
+	deployment := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-nginx",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	var replicas int32 = 1
+
+	// Resolve nginx image. Allow an override via RELATED_IMAGE_NGINX so airgapped
+	// installs can pull the image from a private mirror. Fall back to a stable
+	// upstream tag so `make run` / default helm deploys still work.
+	nginxImage := os.Getenv("RELATED_IMAGE_NGINX")
+	if nginxImage == "" {
+		nginxImage = "nginx:1.27-alpine"
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, deployment, func() error {
+		// CreateOrUpdate first GETs the live object into `deployment`, which
+		// discards anything set on it beforehand. Setting the owner reference
+		// here (rather than before the call) makes it part of the reconciled
+		// state on both the create and the update path.
+		if err := controllerutil.SetControllerReference(ai, deployment, r.Scheme); err != nil {
+			return fmt.Errorf("ownerref on nginx Deployment: %w", err)
+		}
+
+		deployment.ObjectMeta.Labels = labels
+		deployment.ObjectMeta.Annotations = annotations
+		deployment.Spec.Replicas = &replicas
+
+		// The selector is immutable once the Deployment exists, so only set
+		// it when it has never been populated.
+		if deployment.Spec.Selector == nil {
+			deployment.Spec.Selector = &metav1.LabelSelector{
+				MatchLabels: map[string]string{"app": ai.Name, "component": component},
+			}
+		}
+
+		deployment.Spec.Template = corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels:      podLabels,
+				Annotations: annotations,
+			},
+			Spec: corev1.PodSpec{
+				Containers: []corev1.Container{{
+					Name:            "nginx",
+					Image:           nginxImage,
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					Ports: []corev1.ContainerPort{
+						{Name: "http", ContainerPort: 8080},
+					},
+					VolumeMounts: []corev1.VolumeMount{
+						{Name: "nginx-config", MountPath: "/etc/nginx/nginx.conf", SubPath: "nginx.conf"},
+					},
+					Resources: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceCPU:    resource.MustParse("100m"),
+							corev1.ResourceMemory: resource.MustParse("64Mi"),
+						},
+						Limits: corev1.ResourceList{
+							corev1.ResourceCPU:    resource.MustParse("500m"),
+							corev1.ResourceMemory: resource.MustParse("128Mi"),
+						},
+					},
+					LivenessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/nginx_health", Port: intstr.FromInt(8080)},
+						},
+						PeriodSeconds:    30,
+						FailureThreshold: 3,
+					},
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{Path: "/nginx_health", Port: intstr.FromInt(8080)},
+						},
+						PeriodSeconds:    10,
+						FailureThreshold: 3,
+					},
+				}},
+				Volumes: []corev1.Volume{
+					{
+						Name: "nginx-config",
+						VolumeSource: corev1.VolumeSource{
+							ConfigMap: &corev1.ConfigMapVolumeSource{
+								LocalObjectReference: corev1.LocalObjectReference{
+									Name: ai.Name + "-saia-nginx-config",
+								},
+							},
+						},
+					},
+				},
+				// Propagate the AIService tolerations, as the v2 worker pod
+				// does. Tolerations only widen where a pod may schedule, so
+				// this is a no-op when none are configured, and it lets the
+				// proxy run on clusters whose nodes are all tainted.
+				Tolerations:      ai.Spec.Tolerations,
+				ImagePullSecrets: ai.Spec.ImagePullSecrets,
+			},
+		}
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update nginx Deployment: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAv1Service creates the internal v1 ClusterIP Service.
+//
+// The Service is named "<name>-saia-v1-service", selects pods labelled
+// app=<name>, component=<name>, and exposes http (8080) and metrics (8088).
+// It returns a wrapped error when the create/update call (including setting
+// the owner reference) fails.
+func (r *SaiaReconciler) reconcileSAIAv1Service(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	component := ai.Name // v1 API uses "app: {name}, component: {name}" from reconcileSAIADeployment
+	svc := &corev1.Service{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v1-service",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error {
+		// CreateOrUpdate first GETs the live object into `svc`, which discards
+		// anything set on it beforehand. Labels and the owner reference are
+		// therefore applied here so they are enforced on update as well as on
+		// create. The "app" label is merged rather than replacing the map so
+		// labels added by other tooling are left alone.
+		if svc.Labels == nil {
+			svc.Labels = map[string]string{}
+		}
+		svc.Labels["app"] = ai.Name
+		if err := controllerutil.SetControllerReference(ai, svc, r.Scheme); err != nil {
+			return fmt.Errorf("ownerref on v1 Service: %w", err)
+		}
+
+		svc.Spec.Selector = map[string]string{"app": ai.Name, "component": component}
+		svc.Spec.Ports = []corev1.ServicePort{
+			{Name: "http", Port: 8080, TargetPort: intstr.FromInt(8080)},
+			{Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)},
+		}
+		svc.Spec.Type = corev1.ServiceTypeClusterIP
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update v1 Service: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAv2Service creates the internal v2 ClusterIP Service.
+//
+// The Service is named "<name>-saia-v2-service", selects pods labelled
+// app=<name>, component=<name>-v2-api, and exposes http (8000) and metrics
+// (8088). It returns a wrapped error when the create/update call (including
+// setting the owner reference) fails.
+func (r *SaiaReconciler) reconcileSAIAv2Service(
+	ctx context.Context,
+	ai *aiv1.AIService,
+) error {
+	component := ai.Name + "-v2-api"
+	svc := &corev1.Service{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ai.Name + "-saia-v2-service",
+			Namespace: ai.Namespace,
+		},
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error {
+		// CreateOrUpdate first GETs the live object into `svc`, which discards
+		// anything set on it beforehand. Labels and the owner reference are
+		// therefore applied here so they are enforced on update as well as on
+		// create. The "app" label is merged rather than replacing the map so
+		// labels added by other tooling are left alone.
+		if svc.Labels == nil {
+			svc.Labels = map[string]string{}
+		}
+		svc.Labels["app"] = ai.Name
+		if err := controllerutil.SetControllerReference(ai, svc, r.Scheme); err != nil {
+			return fmt.Errorf("ownerref on v2 Service: %w", err)
+		}
+
+		svc.Spec.Selector = map[string]string{"app": ai.Name, "component": component}
+		svc.Spec.Ports = []corev1.ServicePort{
+			{Name: "http", Port: 8000, TargetPort: intstr.FromInt(8000)},
+			{Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)},
+		}
+		svc.Spec.Type = corev1.ServiceTypeClusterIP
+		return nil
+	}); err != nil {
+		return fmt.Errorf("create/update v2 Service: %w", err)
+	}
+	return nil
+}
+
+// reconcileSAIAService ensures the public-facing Service routes to nginx.
func (r *SaiaReconciler) reconcileSAIAService( ctx context.Context, ai *aiv1.AIService, @@ -793,9 +1688,11 @@ func (r *SaiaReconciler) reconcileSAIAService( serviceTemplate := ai.Spec.ServiceTemplate.DeepCopy() cleanServiceTemplate(serviceTemplate) + // Public service points to nginx (which routes to v1/v2 by path) + nginxComponent := ai.Name + "-nginx" + ports := []corev1.ServicePort{ {Name: "http", Port: 8080, TargetPort: intstr.FromInt(8080)}, - {Name: "metrics", Port: 8088, TargetPort: intstr.FromInt(8088)}, } if ai.Spec.MTLS.Enabled && ai.Spec.MTLS.Termination == "operator" { ports = append(ports, corev1.ServicePort{ @@ -804,12 +1701,13 @@ func (r *SaiaReconciler) reconcileSAIAService( } svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: ai.Name + "-saia-service", - Namespace: ai.Namespace, - Labels: map[string]string{"app": ai.Name}, + Name: ai.Name + "-saia-service", + Namespace: ai.Namespace, + Labels: map[string]string{"app": ai.Name}, + Annotations: map[string]string{}, }, Spec: corev1.ServiceSpec{ - Selector: map[string]string{"app": ai.Name, "component": ai.Name}, + Selector: map[string]string{"app": ai.Name, "component": nginxComponent}, Ports: ports, Type: corev1.ServiceTypeClusterIP, }, @@ -849,10 +1747,8 @@ func (r *SaiaReconciler) reconcileSAIAService( return fmt.Errorf("ownerref on Service: %w", err) } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error { - // Update mutable fields - svc.Spec.Selector = map[string]string{"app": ai.Name, "component": ai.Name} + svc.Spec.Selector = map[string]string{"app": ai.Name, "component": nginxComponent} svc.Spec.Ports = ports - // Type is already set above based on ServiceTemplate return nil }); err != nil { r.Recorder.Event(ai, corev1.EventTypeWarning, "InvalidSpec", "create/update Service failed") @@ -932,14 +1828,16 @@ func (r *SaiaReconciler) createOrUpdateConfigMap( } // extractBucketName extracts the bucket name from an object storage path. 
-// Supports s3://, minio://, gs://, and azure:// prefixes. +// Supports s3://, s3compat://, minio://, seaweedfs://, gs://, and azure:// prefixes. // Examples: // - "s3://my-bucket/path/to/dir" -> "my-bucket" +// - "s3compat://bucket-name" -> "bucket-name" // - "minio://bucket-name" -> "bucket-name" +// - "seaweedfs://my-bucket/prefix" -> "my-bucket" // - "gs://my-bucket" -> "my-bucket" func extractBucketName(path string) string { // Remove supported prefixes - prefixes := []string{"s3://", "minio://", "gs://", "azure://"} + prefixes := []string{"s3://", "s3compat://", "minio://", "seaweedfs://", "gs://", "azure://"} for _, prefix := range prefixes { if strings.HasPrefix(path, prefix) { path = strings.TrimPrefix(path, prefix) diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index b5aec6c..e368531 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -2,16 +2,22 @@ package saia import ( "context" - //"errors" + "fmt" "os" + "strings" "testing" aiv1 "github.com/splunk/splunk-ai-operator/api/v1" "github.com/splunk/splunk-ai-operator/pkg/ai/features/common" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -102,6 +108,7 @@ func Test_validateAIService_defaults(t *testing.T) { Spec: aiv1.AIServiceSpec{ AIPlatformRef: corev1.ObjectReference{Name: "plat", Namespace: "ns"}, TaskVolume: aiv1.ObjectStorageSpec{Path: "/data"}, + V2: aiv1.SAIAv2Config{Image: "saia-v2:latest"}, }, } @@ -109,10 +116,20 @@ func Test_validateAIService_defaults(t *testing.T) { err := r.validateAIService(context.Background(), ai) assert.NoError(t, err) assert.Equal(t, int32(1), 
ai.Spec.Replicas) - assert.NotNil(t, ai.Spec.Resources.Requests) - assert.NotNil(t, ai.Spec.Resources.Limits) - assert.Equal(t, "ray.ns.svc.cluster.local:8000", ai.Spec.AIPlatformUrl) + assert.Equal(t, resource.MustParse("2"), ai.Spec.Resources.Requests[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("4Gi"), ai.Spec.Resources.Requests[corev1.ResourceMemory]) + assert.Equal(t, resource.MustParse("10Gi"), ai.Spec.Resources.Requests[corev1.ResourceEphemeralStorage]) + assert.Equal(t, resource.MustParse("2"), ai.Spec.Resources.Limits[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("4Gi"), ai.Spec.Resources.Limits[corev1.ResourceMemory]) + assert.Equal(t, resource.MustParse("10Gi"), ai.Spec.Resources.Limits[corev1.ResourceEphemeralStorage]) + // AIPlatformUrl is built as "://..svc.:8000". + // When AIPlatformScheme is unset, the operator defaults to "http" (see + // validateAIService). This makes the URL usable directly by httpx/openai + // clients in SAIA v2 without a second string-concat step. 
+ assert.Equal(t, "http://ray.ns.svc.cluster.local:8000", ai.Spec.AIPlatformUrl) assert.Equal(t, "vec.ns.svc.cluster.local", ai.Spec.VectorDbUrl) + assert.Equal(t, int32(1), ai.Spec.V2.Replicas) + assert.Equal(t, int32(1), ai.Spec.V2Worker.Replicas) } func Test_getAIPlatform_success(t *testing.T) { @@ -154,3 +171,817 @@ func Test_getAIPlatform_error(t *testing.T) { assert.Error(t, err) assert.Nil(t, got) } + +func Test_validateAIService_missingV2Image(t *testing.T) { + os.Setenv("RELATED_IMAGE_POST_INSTALL_HOOK", "dummy") + defer os.Unsetenv("RELATED_IMAGE_POST_INSTALL_HOOK") + + r := &SaiaReconciler{ + Recorder: record.NewFakeRecorder(10), + Client: fake.NewClientBuilder().WithScheme(buildTestScheme(t)).Build(), + } + + ai := &aiv1.AIService{ + Spec: aiv1.AIServiceSpec{ + AIPlatformUrl: "http://platform:8000", + VectorDbUrl: "weaviate:80", + TaskVolume: aiv1.ObjectStorageSpec{Path: "s3://bucket"}, + }, + } + err := r.validateAIService(context.Background(), ai) + assert.ErrorContains(t, err, "v2.image must be set") +} + +// buildFullTestScheme creates a scheme that includes apps/v1 for Deployment testing. +func buildFullTestScheme(t *testing.T) *runtime.Scheme { + s := buildTestScheme(t) + require.NoError(t, appsv1.AddToScheme(s)) + return s +} + +// newTestAIService returns a minimal AIService for reconciliation tests. 
+func newTestAIService() *aiv1.AIService { + return &aiv1.AIService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "default", + UID: "uid-123", + }, + Spec: aiv1.AIServiceSpec{ + AIPlatformUrl: "http://platform:8000", + VectorDbUrl: "weaviate.ai-platform.svc.cluster.local", + Replicas: 1, + ServiceAccountName: "test-sa", + TaskVolume: aiv1.ObjectStorageSpec{ + Path: "s3://test-bucket/saia", + Endpoint: "http://seaweedfs:8333", + SecretRef: "s3-creds", + }, + V2: aiv1.SAIAv2Config{ + Image: "saia-v2:latest", + Replicas: 1, + }, + V2Worker: aiv1.SAIAWorkerConfig{Replicas: 1}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *mustParseQuantity("2"), + corev1.ResourceMemory: *mustParseQuantity("4Gi"), + corev1.ResourceEphemeralStorage: *mustParseQuantity("10Gi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *mustParseQuantity("2"), + corev1.ResourceMemory: *mustParseQuantity("4Gi"), + corev1.ResourceEphemeralStorage: *mustParseQuantity("10Gi"), + }, + }, + }, + } +} + +func mustParseQuantity(s string) *resource.Quantity { + q := resource.MustParse(s) + return &q +} + +func Test_reconcilePostInstallHook_SetsGRPCEnvForV2DataLoader(t *testing.T) { + // Regression: the saia-data-loader v2 image (>= v2.0.4-13-g3b677604) uses + // the Weaviate v4 Python client, which performs a gRPC health check on + // connect. Its url_compat shim defaults VECTOR_DB_GRPC_HOST to + // "grpc.{host}" and VECTOR_DB_GRPC_PORT to "443" (Splunk production + // convention). In k0s airgap, Weaviate exposes gRPC on the same Service at + // :50051. The operator MUST pass these vars explicitly so the shim's + // setdefault() calls are no-ops. 
+ t.Setenv("RELATED_IMAGE_POST_INSTALL_HOOK", "dummy-hook-image:latest") + + scheme := buildFullTestScheme(t) + require.NoError(t, batchv1.AddToScheme(scheme)) + ai := newTestAIService() + ai.Spec.VectorDbUrl = "weaviate.ai-platform.svc.cluster.local" + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + // First call creates the Job and returns "waiting" as a sentinel error. + err := r.reconcilePostInstallHook(context.Background(), ai) + require.Error(t, err) + assert.Contains(t, err.Error(), "created Job") + + job := &batchv1.Job{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-vector-db-setup-posthook", Namespace: "default"}, job)) + + // BackoffLimit must be 1 to avoid error-pod churn. + require.NotNil(t, job.Spec.BackoffLimit) + assert.Equal(t, int32(1), *job.Spec.BackoffLimit) + + // InitContainer must poll Weaviate readiness before the main container runs. + require.Len(t, job.Spec.Template.Spec.InitContainers, 1) + initC := job.Spec.Template.Spec.InitContainers[0] + assert.Equal(t, "wait-for-weaviate", initC.Name) + assert.Equal(t, "dummy-hook-image:latest", initC.Image) + require.NotEmpty(t, initC.Command) + assert.Equal(t, "python3", initC.Command[0]) + assert.Contains(t, initC.Command[2], "weaviate.ai-platform.svc.cluster.local") + assert.Contains(t, initC.Command[2], "/v1/.well-known/ready") + + // Collect env var names/values. + envMap := envToMap(job.Spec.Template.Spec.Containers[0].Env) + + assert.Equal(t, "http://weaviate.ai-platform.svc.cluster.local:80", envMap["VECTOR_DB_URL"]) + assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_HOST"]) + assert.Equal(t, "80", envMap["VECTOR_DB_PORT"]) + // Critical: GRPC host must NOT be "grpc."; it's the same Service. 
+ assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_GRPC_HOST"]) + assert.Equal(t, "50051", envMap["VECTOR_DB_GRPC_PORT"]) + assert.Equal(t, "false", envMap["VECTOR_DB_SECURE"]) + assert.Equal(t, "false", envMap["VECTOR_DB_AUTH_ENABLED"]) + assert.Equal(t, "true", envMap["SPLUNK_AI_ASSISTANT_SERVICE_CMP"]) +} + +func Test_reconcileSAIAConfigMap_EnablesAuthzForCMPBridging(t *testing.T) { + // Regression: ENABLE_AUTHZ=true is REQUIRED for the SAIAAuthorizer's + // CMP interactive-token path to run. That path sets request.state.cmp_splunk_url, + // which AdminCapabilityAuthorizer needs to bridge a Splunk.interactive bearer + // into an EC-equivalent token. ENABLE_AUTHZ=false early-returns before the + // attribute is set, and /admin/* requests then fail with: + // 403 {"detail":"Admin endpoints require an authenticated EC user token."} + // Even in airgap CMP mode, ENABLE_AUTHZ must be "true" — there's no value + // that both skips authorization AND preserves the CMP bridge. + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-config", Namespace: "default"}, cm)) + + assert.Equal(t, "true", cm.Data["ENABLE_AUTHZ"], + "ENABLE_AUTHZ must default to 'true' so CMP interactive-token bridging works on /admin/* routes") + assert.Equal(t, "true", cm.Data["SPLUNK_AI_ASSISTANT_SERVICE_CMP"], + "CMP mode flag must be set alongside ENABLE_AUTHZ so the authorizer picks the interactive-token branch") +} + +func Test_reconcileSAIAConfigMap_PreservesUserOverride(t *testing.T) { + // If an operator explicitly disables authz on an existing ConfigMap + // (e.g. 
for development/debugging), our reconcile must NOT clobber that + // value back to the "true" default. The merge logic fills in missing or + // empty keys only. + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + existing := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-saia-config", + Namespace: "default", + }, + Data: map[string]string{"ENABLE_AUTHZ": "false"}, + } + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai, existing).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-config", Namespace: "default"}, cm)) + + assert.Equal(t, "false", cm.Data["ENABLE_AUTHZ"], + "user-set ENABLE_AUTHZ=false must be preserved across reconciles") +} + +func Test_reconcileSAIAv2Deployment(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Deployment(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v2-deployment", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "saia-v2:latest", container.Image) + assert.Equal(t, "saia-v2-api", container.Name) + + // v2 API listens on 8000 + assert.Equal(t, int32(8000), container.Ports[0].ContainerPort) + assert.Equal(t, "/health", container.ReadinessProbe.HTTPGet.Path) + assert.Equal(t, 8000, container.ReadinessProbe.HTTPGet.Port.IntValue()) + + envMap := envToMap(container.Env) + assert.Equal(t, 
"http://platform:8000", envMap["PLATFORM_URL"]) + assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) + assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) + + // SAIA V2 FieldDescription backend is REQUIRED (worker and API both call + // FieldDescriptionRepositoryFactory.get() which raises ValueError on empty + // backend). Per Confluence ERD section 3.8.1.2 decision A.3 we use the + // S3-compatible backend for AI Tier. + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + // AWS_ENDPOINT_URL is what the v2 S3StorageAdapter reads (vs v1's + // S3COMPAT_OBJECT_STORE_ENDPOINT_URL). Only set when the AIService has + // an explicit endpoint — e.g. for SeaweedFS/MinIO. + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) +} + +func Test_reconcileSAIAv2Worker(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Worker(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v2-worker", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "saia-v2:latest", container.Image) + assert.Equal(t, "saia-v2-worker", container.Name) + assert.Equal(t, []string{"/bin/sh", "-c"}, container.Command) + assert.Contains(t, container.Args[0], "app.workers.ingestion_worker") + + envMap := envToMap(container.Env) + // RUN_TASKS_DELAY_S controls the v2 worker's poll sleep (saia-v2 + // IngestionWorker.run). 
The value MUST stay well under the liveness probe + // threshold (1200s) because the heartbeat file is only refreshed at the top + // of each iteration. 600s matches saia-v2's helm default. + assert.Equal(t, "600", envMap["RUN_TASKS_DELAY_S"]) + // Heartbeat path must match saia-v2's default (app/core/config.py). + assert.Equal(t, "/tmp/ingestion_worker_heartbeat", envMap["WORKER_HEARTBEAT_PATH"]) + assert.Equal(t, "true", envMap["VAULT_TEMPLATE_DISABLED"]) + + // SAIA V2 FieldDescription backend is REQUIRED — without this, the worker + // immediately raises ValueError and enters a restart loop. Ref Confluence + // ERD 3.8.1.2 + A.3: Option B (S3-compatible object store). These three + // vars are the minimum to make the worker bootstrap cleanly. + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) + + // Liveness uses exec (heartbeat file check), not HTTP + assert.NotNil(t, container.LivenessProbe.Exec) + assert.Nil(t, container.LivenessProbe.HTTPGet) + // Probe must use python3 (not coreutils) because the saia-v2 base image lacks date/cat/cut. 
+ assert.Equal(t, "python3", container.LivenessProbe.Exec.Command[0]) + assert.Contains(t, container.LivenessProbe.Exec.Command[2], "WORKER_HEARTBEAT_PATH") + + // Only metrics port, no HTTP API port + assert.Len(t, container.Ports, 1) + assert.Equal(t, int32(8088), container.Ports[0].ContainerPort) +} + +func Test_reconcileNginxConfigMap(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileNginxConfigMap(context.Background(), ai) + require.NoError(t, err) + + cm := &corev1.ConfigMap{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-nginx-config", Namespace: "default"}, cm) + require.NoError(t, err) + + conf := cm.Data["nginx.conf"] + assert.NotEmpty(t, conf) + + // v2 routing: ANY path containing "/saia-api-v2/" — with or without a + // tenant prefix — must be sent to the v2 upstream. The regex must NOT + // require a path segment before "saia-api-v2" (that would silently route + // tenant-less probes to v1). + assert.Contains(t, conf, "location ~ /saia-api-v2/") + assert.Contains(t, conf, "proxy_pass http://saia_v2") + + // v1 is the default + assert.Contains(t, conf, "location /") + assert.Contains(t, conf, "proxy_pass http://saia_v1") + + // Upstream names reference the correct internal service names + assert.Contains(t, conf, "test-saia-v1-service:8080") + assert.Contains(t, conf, "test-saia-v2-service:8000") + + // SSE/streaming friendliness + assert.Contains(t, conf, "proxy_buffering off") + assert.Contains(t, conf, "proxy_http_version 1.1") + + // Health and status endpoints — stub_status must be loopback-only. 
+ assert.Contains(t, conf, "location = /nginx_health") + assert.Contains(t, conf, "location = /nginx_status") + assert.Contains(t, conf, "deny all;") +} + +func Test_reconcileNginxConfigMap_CORSPreflight(t *testing.T) { + // Regression: saia-v2's TenantConversationKeyMiddleware rejects + // unauthenticated CORS preflight OPTIONS requests with 400 before + // FastAPI's CORSMiddleware can respond, causing browsers to block the + // subsequent real request with "No Access-Control-Allow-Origin header + // present". The nginx reverse proxy MUST short-circuit OPTIONS at the + // proxy layer and respond with permissive CORS headers so the browser + // accepts the preflight. See: + // saia-service/saia-v2/app/middleware/tenant_conversation_key.py + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileNginxConfigMap(context.Background(), ai)) + + cm := &corev1.ConfigMap{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-nginx-config", Namespace: "default"}, cm)) + + conf := cm.Data["nginx.conf"] + + // OPTIONS short-circuit must be present on BOTH v1 (/) and v2 + // (/saia-api-v2/) locations. Without it, v1 admin routes (Pattern B + // direct browser fetch) would also fail the same way. + assert.Equal(t, 2, strings.Count(conf, "if ($request_method = OPTIONS)"), + "OPTIONS short-circuit must exist in both v1 and v2 location blocks") + assert.Contains(t, conf, "return 204", + "preflight must return 204 No Content") + + // 'map' directive dynamically reflects Access-Control-Request-Headers + // so any custom header the client sends is auto-allowed (avoids drift + // between nginx allowlist and client's evolving header set). 
+ assert.Contains(t, conf, "map $http_access_control_request_headers $cors_allow_headers", + "must use 'map' to reflect Access-Control-Request-Headers back to client") + assert.Contains(t, conf, "add_header Access-Control-Allow-Headers $cors_allow_headers", + "preflight response must echo the requested headers via $cors_allow_headers") + + // ACAO must be reflected from Origin (not a hardcoded wildcard) so that + // Access-Control-Allow-Credentials=true is valid (browsers reject + // Allow-Origin="*" + Allow-Credentials=true). + assert.Contains(t, conf, "add_header Access-Control-Allow-Origin $http_origin", + "preflight ACAO must be reflected from Origin to support Allow-Credentials=true") + + // CRITICAL: ACAO must ONLY appear in OPTIONS branches. FastAPI's + // CORSMiddleware already sets ACAO on real responses; adding it again + // from nginx produces duplicate "*, http://origin" values that browsers + // reject ("The 'Access-Control-Allow-Origin' header contains multiple + // values '*, http://localhost:18000', but only one is allowed"). + assert.Equal(t, 2, strings.Count(conf, "add_header Access-Control-Allow-Origin"), + "ACAO must appear EXACTLY TWICE (once per OPTIONS branch). Adding it "+ + "on real responses duplicates FastAPI's header and breaks the browser.") +} + +func Test_reconcileNginxDeployment(t *testing.T) { + // Ensure no env override leaks from other tests in the package. 
+ os.Unsetenv("RELATED_IMAGE_NGINX") + + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileNginxDeployment(context.Background(), ai) + require.NoError(t, err) + + dep := &appsv1.Deployment{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-nginx", Namespace: "default"}, dep) + require.NoError(t, err) + + container := dep.Spec.Template.Spec.Containers[0] + assert.Equal(t, "nginx:1.27-alpine", container.Image) + assert.Equal(t, "nginx", container.Name) + assert.Equal(t, int32(8080), container.Ports[0].ContainerPort) + + // ConfigMap volume mount + assert.Equal(t, "/etc/nginx/nginx.conf", container.VolumeMounts[0].MountPath) + assert.Equal(t, "nginx.conf", container.VolumeMounts[0].SubPath) + + // Health probes use nginx_health + assert.Equal(t, "/nginx_health", container.LivenessProbe.HTTPGet.Path) + assert.Equal(t, "/nginx_health", container.ReadinessProbe.HTTPGet.Path) +} + +func Test_reconcileNginxDeployment_imageOverride(t *testing.T) { + os.Setenv("RELATED_IMAGE_NGINX", "private.registry.example.com/nginx:1.29-alpine") + defer os.Unsetenv("RELATED_IMAGE_NGINX") + + scheme := buildFullTestScheme(t) + ai := newTestAIService() + ai.Name = "override" + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileNginxDeployment(context.Background(), ai)) + + dep := &appsv1.Deployment{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "override-saia-nginx", Namespace: "default"}, dep)) + + assert.Equal(t, "private.registry.example.com/nginx:1.29-alpine", + dep.Spec.Template.Spec.Containers[0].Image) +} + +func 
Test_reconcileSAIAService_handlesAnnotationsWithoutPanic(t *testing.T) { + // Regression: the pre-existing code did not initialize svc.Annotations, so + // any user-provided annotation on the AIService caused a "assignment to + // entry in nil map" panic when reconciling the public service. + scheme := buildFullTestScheme(t) + ai := newTestAIService() + ai.Annotations = map[string]string{ + "operator.splunk.com/example": "v1", + "kubectl.kubernetes.io/restartedAt": "should-be-skipped", + "kubectl.kubernetes.io/last-applied-configuration": "should-be-skipped", + } + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NotPanics(t, func() { + err := r.reconcileSAIAService(context.Background(), ai) + require.NoError(t, err) + }) + + svc := &corev1.Service{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: "test-saia-service", Namespace: "default"}, svc)) + + assert.Equal(t, "v1", svc.Annotations["operator.splunk.com/example"]) + assert.NotContains(t, svc.Annotations, "kubectl.kubernetes.io/restartedAt") + assert.NotContains(t, svc.Annotations, "kubectl.kubernetes.io/last-applied-configuration") +} + +func Test_reconcileSAIAv1Service(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv1Service(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v1-service", Namespace: "default"}, svc) + require.NoError(t, err) + + assert.Equal(t, map[string]string{"app": "test", "component": "test"}, svc.Spec.Selector) + assert.Equal(t, int32(8080), 
svc.Spec.Ports[0].Port) +} + +func Test_reconcileSAIAv2Service(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAv2Service(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-v2-service", Namespace: "default"}, svc) + require.NoError(t, err) + + assert.Equal(t, map[string]string{"app": "test", "component": "test-v2-api"}, svc.Spec.Selector) + assert.Equal(t, int32(8000), svc.Spec.Ports[0].Port) +} + +func Test_reconcileSAIAService_pointsToNginx(t *testing.T) { + scheme := buildFullTestScheme(t) + ai := newTestAIService() + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + err := r.reconcileSAIAService(context.Background(), ai) + require.NoError(t, err) + + svc := &corev1.Service{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "test-saia-service", Namespace: "default"}, svc) + require.NoError(t, err) + + // Public service must target nginx, not v1 directly + assert.Equal(t, map[string]string{"app": "test", "component": "test-nginx"}, svc.Spec.Selector) + assert.Equal(t, int32(8080), svc.Spec.Ports[0].Port) +} + +func Test_reconcileSAIAService_ServiceTypeVariations(t *testing.T) { + // Lock in the contract that the customer's k0s-cluster-config.yaml can + // omit / empty / explicitly-set serviceTemplate and get the expected + // Service.Type. Without this test, a future refactor could silently break + // the "just omit the block = ClusterIP" escape hatch documented in + // tools/cluster_setup/k0s-cluster-config.yaml. 
+ scheme := buildFullTestScheme(t) + + cases := []struct { + name string + template corev1.Service + wantType corev1.ServiceType + wantNodePort int32 // 0 = don't check + }{ + { + name: "omitted/empty template → ClusterIP", + template: corev1.Service{}, // zero value, what yq-absent produces + wantType: corev1.ServiceTypeClusterIP, + }, + { + name: "explicit ClusterIP → ClusterIP", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeClusterIP}, + }, + wantType: corev1.ServiceTypeClusterIP, + }, + { + name: "NodePort without explicit port → NodePort auto-allocated", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeNodePort}, + }, + wantType: corev1.ServiceTypeNodePort, + // wantNodePort == 0 means we don't assert a specific value + }, + { + name: "NodePort with explicit 30080 → NodePort 30080", + template: corev1.Service{ + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + Ports: []corev1.ServicePort{ + {Name: "http", NodePort: 30080}, + }, + }, + }, + wantType: corev1.ServiceTypeNodePort, + wantNodePort: 30080, + }, + { + name: "LoadBalancer → LoadBalancer", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeLoadBalancer}, + }, + wantType: corev1.ServiceTypeLoadBalancer, + }, + { + name: "Unknown garbage type → ClusterIP (safe default)", + template: corev1.Service{ + Spec: corev1.ServiceSpec{Type: corev1.ServiceType("Bogus")}, + }, + wantType: corev1.ServiceTypeClusterIP, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + ai := newTestAIService() + ai.Name = "svctype-" + sanitize(tc.name) + ai.Spec.ServiceTemplate = tc.template + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ai).Build() + r := &SaiaReconciler{Client: fakeClient, Scheme: scheme, Recorder: record.NewFakeRecorder(10)} + + require.NoError(t, r.reconcileSAIAService(context.Background(), ai)) + + svc := &corev1.Service{} + require.NoError(t, 
fakeClient.Get(context.Background(), + types.NamespacedName{Name: ai.Name + "-saia-service", Namespace: "default"}, svc)) + + assert.Equal(t, tc.wantType, svc.Spec.Type) + if tc.wantNodePort != 0 { + require.NotEmpty(t, svc.Spec.Ports) + assert.Equal(t, tc.wantNodePort, svc.Spec.Ports[0].NodePort) + } + }) + } +} + +// sanitize turns a free-form subtest name into a valid k8s resource name. +func sanitize(s string) string { + s = strings.ToLower(s) + out := make([]byte, 0, len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + switch { + case c >= 'a' && c <= 'z', c >= '0' && c <= '9': + out = append(out, c) + default: + if len(out) > 0 && out[len(out)-1] != '-' { + out = append(out, '-') + } + } + } + // Trim trailing hyphen + for len(out) > 0 && out[len(out)-1] == '-' { + out = out[:len(out)-1] + } + return string(out) +} + +func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { + // Explicit AIService with seaweedfs-style endpoint → AWS_ENDPOINT_URL is set. + t.Run("with S3-compatible endpoint", func(t *testing.T) { + ai := newTestAIService() // already sets TaskVolume.Endpoint = "http://seaweedfs:8333" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) + }) + + // No explicit endpoint (= real AWS S3 deployment) → AWS_ENDPOINT_URL must + // be omitted so boto3 falls back to the default AWS regional endpoint. 
+ t.Run("without S3-compatible endpoint", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Endpoint = "" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) + assert.Equal(t, "field-descriptions/global-field-descriptions.json", + envMap["FIELD_DESCRIPTION_S3_KEY"]) + _, has := envMap["AWS_ENDPOINT_URL"] + assert.False(t, has, + "AWS_ENDPOINT_URL must be omitted when TaskVolume.Endpoint is empty (cloud S3 case)") + }) + + // SecretRef present → AWS_ACCESS_KEY_ID/SECRET sourced from same keys as + // the S3COMPAT_* envs in buildSAIABaseEnv. Required so that the v2 + // S3StorageAdapter (used by S3FieldDescriptionRepository) can authenticate + // to SeaweedFS / MinIO. + t.Run("AWS credentials sourced from SecretRef", func(t *testing.T) { + ai := newTestAIService() // already sets SecretRef = "s3-creds" + env := buildV2ExtraEnv(ai) + + var foundID, foundSecret bool + for _, e := range env { + if e.Name == "AWS_ACCESS_KEY_ID" { + foundID = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) + } + } + if e.Name == "AWS_SECRET_ACCESS_KEY" { + foundSecret = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_secret_key", e.ValueFrom.SecretKeyRef.Key) + } + } + } + assert.True(t, foundID, "AWS_ACCESS_KEY_ID must be present so boto3 can auth to S3-compat endpoint") + assert.True(t, foundSecret, "AWS_SECRET_ACCESS_KEY must be present so boto3 can auth to S3-compat endpoint") + }) + + // No SecretRef → AWS_* must be omitted (cloud deployments use IAM role, + // not env-var creds; setting empty values would otherwise mask the IAM + // chain inside boto3). 
+ t.Run("AWS credentials omitted when SecretRef empty", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.SecretRef = "" + env := buildV2ExtraEnv(ai) + + for _, e := range env { + assert.NotEqual(t, "AWS_ACCESS_KEY_ID", e.Name, + "AWS_ACCESS_KEY_ID must be omitted in cloud (IAM-role) case") + assert.NotEqual(t, "AWS_SECRET_ACCESS_KEY", e.Name, + "AWS_SECRET_ACCESS_KEY must be omitted in cloud (IAM-role) case") + } + }) +} + +// Test_buildV2ExtraEnv_ConversationStore verifies the switch from the +// ephemeral "filesystem" default (which lives on the pod's container overlay +// and loses all chat history on restart) to the "s3" backend introduced in +// saia-service by Tony's commits 3d3756f3 / 8e2a9f40 (merged into +// ai-tier-v2.0 via 9efe1fce on 2026-04-20, shipped in image build-v2-002). +// +// Contract (from saia-v2/app/core/config.py::Settings and +// app/repositories/conversation/store_factory.py): +// - CONVERSATION_STORE=s3 selects S3ConversationStore +// - CONVERSATION_S3_BUCKET must be non-empty (validator raises +// ValueError at startup otherwise, crash-looping the v2 pod) +// - AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are +// reused from the FieldDescription S3 wiring below +func Test_buildV2ExtraEnv_ConversationStore(t *testing.T) { + t.Run("enables s3 backend with bucket extracted from TaskVolume.Path", func(t *testing.T) { + ai := newTestAIService() // TaskVolume.Path = "s3://test-bucket/saia" + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["CONVERSATION_STORE"], + "CONVERSATION_STORE must be 's3' so S3ConversationStore is selected over the ephemeral filesystem default") + assert.Equal(t, "test-bucket", envMap["CONVERSATION_S3_BUCKET"], + "CONVERSATION_S3_BUCKET must be the extracted bucket name so SAIA v2's Settings validator passes at startup") + }) + + t.Run("handles all supported TaskVolume.Path prefixes", func(t *testing.T) { + cases := []struct { + path string + wantBucket 
string + }{ + {"s3://my-bucket/path", "my-bucket"}, + {"s3compat://bucket-name", "bucket-name"}, + {"minio://minio-bucket", "minio-bucket"}, + {"seaweedfs://sw-bucket/prefix", "sw-bucket"}, + {"gs://gcs-bucket", "gcs-bucket"}, + } + for _, tc := range cases { + t.Run(tc.path, func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Path = tc.path + envMap := envToMap(buildV2ExtraEnv(ai)) + + assert.Equal(t, "s3", envMap["CONVERSATION_STORE"]) + assert.Equal(t, tc.wantBucket, envMap["CONVERSATION_S3_BUCKET"]) + }) + } + }) + + // An empty TaskVolume.Path indicates a misconfigured CR. We must NOT + // emit CONVERSATION_STORE=s3 in that case, because CONVERSATION_S3_BUCKET + // would be empty and the v2 pod would crash-loop on the Pydantic + // validator. Leaving the defaults in place gives a clearer failure mode + // (ephemeral filesystem store) than a startup crash. + t.Run("omits conversation-store envs when TaskVolume.Path is empty", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Path = "" + envMap := envToMap(buildV2ExtraEnv(ai)) + + _, hasStore := envMap["CONVERSATION_STORE"] + _, hasBucket := envMap["CONVERSATION_S3_BUCKET"] + assert.False(t, hasStore, + "CONVERSATION_STORE must be omitted when no bucket can be derived, to avoid the SAIA v2 startup validator crashing the pod") + assert.False(t, hasBucket, + "CONVERSATION_S3_BUCKET must be omitted when no bucket can be derived") + }) +} + +func Test_buildSAIABaseEnv(t *testing.T) { + ai := newTestAIService() + env := buildSAIABaseEnv(ai) + envMap := envToMap(env) + + assert.Equal(t, "http://platform:8000", envMap["PLATFORM_URL"]) + assert.Equal(t, "http://weaviate.ai-platform.svc.cluster.local:80", envMap["WEAVIATE_PLATFORM_URL"]) + assert.Equal(t, "weaviate.ai-platform.svc.cluster.local", envMap["VECTOR_DB_URL"]) + assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["S3COMPAT_OBJECT_STORE_ENDPOINT_URL"]) + assert.Equal(t, 
"test-bucket", envMap["S3COMPAT_OBJECT_STORE_BUCKET"]) + + // S3 creds come from secretRef + found := false + for _, e := range env { + if e.Name == "S3COMPAT_OBJECT_STORE_ACCESS_KEY" { + found = true + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) + } + } + assert.True(t, found, "S3COMPAT_OBJECT_STORE_ACCESS_KEY should be present") +} + +func Test_extractBucketName(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"s3://my-bucket/path", "my-bucket"}, + {"s3compat://bucket-name", "bucket-name"}, + {"minio://bucket-name", "bucket-name"}, + {"seaweedfs://my-bucket/prefix", "my-bucket"}, + {"gs://my-bucket", "my-bucket"}, + {"plain-bucket", "plain-bucket"}, + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + assert.Equal(t, tt.want, extractBucketName(tt.input)) + }) + } +} + +// envToMap converts a slice of EnvVar to a map for easy assertion. +// Only includes env vars with direct values (not ValueFrom). 
+func envToMap(envs []corev1.EnvVar) map[string]string { + m := make(map[string]string) + for _, e := range envs { + if e.ValueFrom == nil { + m[e.Name] = e.Value + } + } + return m +} + +// Suppress unused import warnings +var _ = fmt.Sprintf +var _ = strings.Contains diff --git a/pkg/ai/features/seca/seca.go b/pkg/ai/features/seca/seca.go index 04351bc..5915418 100644 --- a/pkg/ai/features/seca/seca.go +++ b/pkg/ai/features/seca/seca.go @@ -104,7 +104,11 @@ func (r *SecaReconciler) validateAIService(ctx context.Context, ai *aiv1.AIServi ); err != nil { return fmt.Errorf("fetching AIPlatform: %w", err) } - ai.Spec.AIPlatformUrl = fmt.Sprintf("%s.%s.svc.%s:8000", plat.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") + scheme := ai.Spec.AIPlatformScheme + if scheme == "" { + scheme = "http" + } + ai.Spec.AIPlatformUrl = fmt.Sprintf("%s://%s.%s.svc.%s:8000", scheme, plat.Status.RayServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") ai.Spec.VectorDbUrl = fmt.Sprintf("%s.%s.svc.%s", plat.Status.VectorDbServiceName, ai.Spec.AIPlatformRef.Namespace, "cluster.local") } if ai.Spec.AIPlatformRef.Name == "" && ai.Spec.AIPlatformUrl == "" { diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index e29a1a7..a50b8b7 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -44,9 +44,15 @@ type Builder struct { } type ApplicationParams struct { - ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` - CloudProvider string `yaml:"CLOUD_PROVIDER"` - Replicas map[string]int32 `yaml:"REPLICAS"` + ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` + ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` + CloudProvider string `yaml:"CLOUD_PROVIDER"` + S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` + S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` + S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` + Replicas 
map[string]int32 `yaml:"REPLICAS"` + ModelVersion string `yaml:"MODEL_VERSION"` + AcceleratorType string `yaml:"ACCELERATOR_TYPE"` } type WorkerConfigs map[string][]InstanceDetail @@ -73,6 +79,14 @@ func New(ai *enterpriseApi.AIPlatform, client client.Client, scheme *runtime.Sch } } +// effectiveAcceleratorType returns spec.defaultAcceleratorType or L40S when unset, matching instance.yaml keys (L40S, H100_NVL). +func (b *Builder) effectiveAcceleratorType() string { + if s := strings.TrimSpace(b.ai.Spec.DefaultAcceleratorType); s != "" { + return s + } + return "L40S" +} + // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { logger := log.FromContext(ctx) // Define logger @@ -89,15 +103,30 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl return err } - // Set CloudProvider based on URL scheme - var cloudProvider string + // Set CloudProvider and artifacts provider/bucket from URL scheme (for SDK model loaders). + // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio/seaweedfs/s3compat -> "s3", gs/gcs -> "gcs", azure -> "azure". + // S3 (AWS) uses cloudProvider "aws" when no custom endpoint; s3compat/minio/seaweedfs use "s3compat". 
+ var cloudProvider, artifactsProvider string switch u.Scheme { case "s3": - cloudProvider = "aws" - case "gs": + if p.Spec.ObjectStorage.Endpoint != "" { + cloudProvider = "s3compat" + } else { + cloudProvider = "aws" + } + artifactsProvider = "s3" + case "s3compat", "minio", "seaweedfs": + cloudProvider = "s3compat" + artifactsProvider = "s3" + case "gs", "gcs": cloudProvider = "gcp" + artifactsProvider = "gcs" + case "azure": + cloudProvider = "azure" + artifactsProvider = "azure" default: - cloudProvider = "azure" // TODO: FIX THIS, need to support minio + cloudProvider = "azure" + artifactsProvider = "azure" } // Initialize the replicas map by iterating through features @@ -135,10 +164,39 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + // S3-compatible backends (s3compat, minio, seaweedfs) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. + s3CompatScheme := (u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs") + s3CompatObjectStoreEndpoint := "" + if s3CompatScheme && p.Spec.ObjectStorage.Endpoint != "" { + s3CompatObjectStoreEndpoint = p.Spec.ObjectStorage.Endpoint + } + + var s3CompatObjectStoreAccessKey, s3CompatObjectStoreSecretKey string + if p.Spec.ObjectStorage.SecretRef != "" && s3CompatScheme { + var secret corev1.Secret + secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} + if err := b.Get(ctx, secretRef, &secret); err != nil { + logger.Error(err, "Failed to get object storage secret for S3-compatible credentials", "secret", p.Spec.ObjectStorage.SecretRef) + return err + } + if raw, ok := secret.Data["s3_access_key"]; ok { + s3CompatObjectStoreAccessKey = string(raw) + } + if raw, ok := secret.Data["s3_secret_key"]; ok { + s3CompatObjectStoreSecretKey = string(raw) + } + } + param := ApplicationParams{ - ArtifactBucketName: u.Host, - CloudProvider: cloudProvider, - Replicas: replicasMap, + ArtifactBucketName: u.Host, + 
ArtifactsProvider: artifactsProvider, + CloudProvider: cloudProvider, + S3CompatObjectStoreEndpointUrl: s3CompatObjectStoreEndpoint, + S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, + S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, + Replicas: replicasMap, + ModelVersion: os.Getenv("MODEL_VERSION"), + AcceleratorType: b.effectiveAcceleratorType(), } // Use embedded applications.yaml content @@ -578,6 +636,7 @@ func (b *Builder) Build(ctx context.Context) (*rayv1.RayService, error) { } func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec, error) { + acceleratorType := b.effectiveAcceleratorType() annotations, labels := buildHeadAnnotationsAndLabels(b.ai) head := rayv1.HeadGroupSpec{ RayStartParams: map[string]string{ @@ -628,7 +687,7 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec if err != nil { return nil, fmt.Errorf("failed to parse feature YAML file %s: %v", fileName, err) } - for k, val := range featureConfig.InstanceScale[b.ai.Spec.DefaultAcceleratorType] { + for k, val := range featureConfig.InstanceScale[acceleratorType] { old_val, ok := instanceScale[k] if ok { instanceScale[k] = old_val + val @@ -639,17 +698,29 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec } var workers []rayv1.WorkerGroupSpec - var gpuConfigs = instanceMap[b.ai.Spec.DefaultAcceleratorType] + gpuConfigs := instanceMap[acceleratorType] + if len(gpuConfigs) == 0 { + return nil, fmt.Errorf("instance.yaml has no worker tiers for defaultAcceleratorType %q; keys must match exactly (e.g. 
L40S, H100_NVL)", acceleratorType) + } for _, cfg := range gpuConfigs { annotations, labels := buildWorkerAnnotationsAndLabels(b.ai, cfg) cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] + replicas := instanceScale[cfg.Tier] + + maxReplicas := replicas + 5 + if cfg.GPUsPerPod > 0 { + maxReplicas = replicas + } + wg := rayv1.WorkerGroupSpec{ - GroupName: cfg.Tier, - Replicas: int32Ptr(instanceScale[cfg.Tier]), + GroupName: cfg.Tier, + Replicas: int32Ptr(replicas), + MinReplicas: int32Ptr(replicas), + MaxReplicas: int32Ptr(maxReplicas), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), - "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, b.ai.Spec.DefaultAcceleratorType, cfg.GPUsPerPod), + "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -662,20 +733,105 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec workers = append(workers, wg) } + idleTimeout := int32Ptr(600) return &rayv1.RayClusterSpec{ RayVersion: os.Getenv("RAY_VERSION"), EnableInTreeAutoscaling: boolPtr(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{IdleTimeoutSeconds: idleTimeout}, HeadGroupSpec: head, WorkerGroupSpecs: workers, }, nil } +// objectStorageSecretEnv returns env vars for S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY from +// the objectStorage secret (s3_access_key/s3_secret_key) for S3-compatible object storage. 
+func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return nil + } + secretName := b.ai.Spec.ObjectStorage.SecretRef + return []corev1.EnvVar{ + { + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_access_key", + }, + }, + }, + { + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_secret_key", + }, + }, + }, + } +} + +// rayS3DownloadEnv sets AWS_* variables so application code and Ray's runtime_env S3 fetch use the +// configured S3-compatible endpoint (via AWS_ENDPOINT_URL) and credentials when present. +func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { + u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) + if err != nil { + return nil + } + endpoint := strings.TrimSpace(b.ai.Spec.ObjectStorage.Endpoint) + s3CompatScheme := u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs" + s3WithCustomEndpoint := u.Scheme == "s3" && endpoint != "" + if (!s3CompatScheme && !s3WithCustomEndpoint) || endpoint == "" { + return nil + } + var out []corev1.EnvVar + out = append(out, corev1.EnvVar{Name: "AWS_ENDPOINT_URL", Value: endpoint}) + if r := strings.TrimSpace(b.ai.Spec.ObjectStorage.Region); r != "" { + out = append(out, + corev1.EnvVar{Name: "AWS_DEFAULT_REGION", Value: r}, + corev1.EnvVar{Name: "AWS_REGION", Value: r}, + ) + } + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return out + } + sn := b.ai.Spec.ObjectStorage.SecretRef + out = append(out, + corev1.EnvVar{ + Name: "AWS_ACCESS_KEY_ID", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_access_key", + }, + }, + }, + corev1.EnvVar{ + 
Name: "AWS_SECRET_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_secret_key", + }, + }, + }, + ) + return out +} + func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { + headEnv := []corev1.EnvVar{ + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, + {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME + } + headEnv = append(headEnv, b.rayS3DownloadEnv()...) + headEnv = append(headEnv, b.objectStorageSecretEnv()...) spec := corev1.PodSpec{ Containers: []corev1.Container{{ Name: "ray-head", Image: SetImageRegistry("RELATED_IMAGE_RAY_HEAD", b.ai.Spec.Images.RayHeadGroupImage), - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD", }, @@ -684,10 +840,7 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { "-lc", "--", }, - Env: []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, - {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME - }, + Env: headEnv, Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ @@ -756,13 +909,13 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec { defaultEnv := []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, {Name: "RAY_HEAD_SERVICE_HOST", Value: fmt.Sprintf("%s.%s.svc.%s", b.ai.Name+"-head-svc", b.ai.Namespace, os.Getenv("CLUSTER_DOMAIN"))}, {Name: "SERVICE_NAME", Value: b.ai.Name}, {Name: "SERVICE_INTERNAL_NAME", Value: b.ai.Name}, {Name: "USE_SYSTEM_PERMISSIONS", Value: "true"}, {Name: "GPG_PUBLICKEY_PATH", Value: "kv-splunk/al-platform.ray-worker-sa/gpgkey"}, // FIXME - {Name: "GPU_TYPE", Value: 
b.ai.Spec.DefaultAcceleratorType}, // FIXME + {Name: "GPU_TYPE", Value: b.effectiveAcceleratorType()}, // FIXME } // Combine defaultEnv with cfg.Env to create combinedEnv @@ -783,11 +936,14 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec combinedEnv = append(combinedEnv, corev1.EnvVar{Name: key, Value: value}) } } + // S3-compatible: boto3 for Ray runtime_env working_dir + app-level S3COMPAT_* keys + combinedEnv = append(combinedEnv, b.rayS3DownloadEnv()...) + combinedEnv = append(combinedEnv, b.objectStorageSecretEnv()...) rayCommand := fmt.Sprintf(`echo %s worker; ulimit -n 65536; export PATH="/home/ray/anaconda3/bin:$PATH"; KUBERAY_GEN_RAY_START_CMD=$(echo $KUBERAY_GEN_RAY_START_CMD | sed -e 's/"{/{/g' -e 's/}"/}/g' -e 's/\\\"/"/g'); - $KUBERAY_GEN_RAY_START_CMD;`, cfg.Tier) + $KUBERAY_GEN_RAY_START_CMD`, cfg.Tier) spec := corev1.PodSpec{ Affinity: b.ai.Spec.GPUSchedulingSpec.Affinity, Tolerations: b.ai.Spec.GPUSchedulingSpec.Tolerations, @@ -796,7 +952,7 @@ func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec Containers: []corev1.Container{{ Name: "ray-worker", Image: SetImageRegistry("RELATED_IMAGE_RAY_WORKER", b.ai.Spec.WorkerGroupConfig.ImageRegistry), - ImagePullPolicy: corev1.PullAlways, + ImagePullPolicy: corev1.PullIfNotPresent, Command: []string{ "/bin/bash", "-lc", diff --git a/pkg/ai/raybuilder/builder_additional_test.go b/pkg/ai/raybuilder/builder_additional_test.go index 4a39746..22e0da7 100644 --- a/pkg/ai/raybuilder/builder_additional_test.go +++ b/pkg/ai/raybuilder/builder_additional_test.go @@ -543,7 +543,7 @@ func TestBuilder_makeWorkerTemplate(t *testing.T) { // Verify ray-worker container (first container is always ray-worker) rayWorker := template.Spec.Containers[0] assert.Equal(t, "ray-worker", rayWorker.Name) - assert.Equal(t, corev1.PullAlways, rayWorker.ImagePullPolicy) + assert.Equal(t, corev1.PullIfNotPresent, rayWorker.ImagePullPolicy) assert.Contains(t, rayWorker.Command, 
// goTemplateToken matches one `{{ ... }}` Go template action. It is compiled
// once at package scope so that callers of maskGoTemplates do not pay for a
// regexp compilation on every invocation.
//
// The pattern requires at least one non-`}` character between the braces, so
// an empty `{{}}` and actions that contain a literal `}` are not matched.
var goTemplateToken = regexp.MustCompile(`\{\{[^}]+\}\}`)

// maskGoTemplates returns s with every `{{ ... }}` token replaced by the
// literal string "PLACEHOLDER", so that a file which embeds Go template
// actions can be parsed as plain YAML without rendering the templates.
// Text outside the tokens is returned unchanged.
func maskGoTemplates(s string) string {
	return goTemplateToken.ReplaceAllString(s, "PLACEHOLDER")
}
Each vLLM TextGen Ray Serve deployment constructs +// a RedisOpenAIServingResponses on the first /v1/responses request; that +// class's __init__ raises RuntimeError if neither RESPONSES_REDIS_URL nor +// RESPONSES_REDIS_ADDRESS is configured, and the resulting empty SSE stream +// bubbles up to SAIA v2's search pipeline as "No generations found in stream" +// → SearchStreamError → "An error occurred processing your request" to the +// end user. See ai-platform-models commits c1f9aef3, da7628ea, b6ff101e. +// +// The fix (set DISABLE_RESPONSES_API_REDIS=True) switches vLLM to the new +// NoOpOpenAIServingResponses class that skips Redis entirely. It MUST be set +// on every app whose deployment_type is text_gen_model_deployment — that's +// the only Ray Serve deployment type that instantiates the Responses API +// serving class. Other deployment types (embedding_model_deployment, +// scoring_model_deployment, classification_model_deployment, custom_deployment) +// do not call /v1/responses and do not need this flag. +func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { + masked := maskGoTemplates(readApplicationsYAMLFromRepo(t)) + + // Parse just enough structure to traverse apps; keep the rest loose so + // unrelated config churn doesn't break this test. + type envVars = map[string]string + type runtimeEnv struct { + EnvVars envVars `yaml:"env_vars"` + } + type args struct { + DeploymentType string `yaml:"deployment_type"` + } + type app struct { + Name string `yaml:"name"` + Args args `yaml:"args"` + RuntimeEnv runtimeEnv `yaml:"runtime_env"` + } + var doc struct { + Applications []app `yaml:"applications"` + } + require.NoError(t, yaml.Unmarshal([]byte(masked), &doc)) + require.NotEmpty(t, doc.Applications, "applications.yaml parsed as empty") + + // Collect the set of text-gen apps (must-set) and everything else (must-not-set). 
+ var textGenApps []app + var otherApps []app + for _, a := range doc.Applications { + if a.Args.DeploymentType == "text_gen_model_deployment" { + textGenApps = append(textGenApps, a) + } else { + otherApps = append(otherApps, a) + } + } + + // We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If + // this count changes, someone added a new text-gen model; they MUST also + // add DISABLE_RESPONSES_API_REDIS to the new app. + require.Len(t, textGenApps, 2, + "expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+ + "found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.", + len(textGenApps)) + + for _, a := range textGenApps { + assert.Equal(t, "True", a.RuntimeEnv.EnvVars["DISABLE_RESPONSES_API_REDIS"], + "app %q (deployment_type=text_gen_model_deployment) must set "+ + "DISABLE_RESPONSES_API_REDIS=\"True\" in runtime_env.env_vars. Without this, "+ + "vLLM's RedisOpenAIServingResponses constructor raises "+ + "RuntimeError('Responses Redis URL not set') and /v1/responses calls fail "+ + "(surfaces to SAIA v2 /query as \"An error occurred processing your request\").", + a.Name) + } + + // Sanity: assert the two canonical app names we expect. Keeps the test + // readable if someone renames an app and forgets to re-check this. + var names []string + for _, a := range textGenApps { + names = append(names, a.Name) + } + assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names, + "unexpected set of text_gen_model_deployment apps: %v", names) + + // Hygiene check: non-text-gen apps should NOT carry this env (it's a + // no-op for them and misleading if present). 
+ for _, a := range otherApps { + if _, ok := a.RuntimeEnv.EnvVars["DISABLE_RESPONSES_API_REDIS"]; ok { + t.Errorf("app %q (deployment_type=%q) should NOT set "+ + "DISABLE_RESPONSES_API_REDIS — it's only read by "+ + "vllm_text_gen_model.VLLMTextGenModel.", a.Name, a.Args.DeploymentType) + } + } +} + +// Test_ApplicationsYAML_IsWellFormed is a tiny smoke test that the bundled +// applications.yaml parses correctly after Go-template masking. Catches +// accidental structural breakage (e.g. un-indented env_vars, stray tabs). +func Test_ApplicationsYAML_IsWellFormed(t *testing.T) { + masked := maskGoTemplates(readApplicationsYAMLFromRepo(t)) + var raw map[string]any + require.NoError(t, yaml.Unmarshal([]byte(masked), &raw), + "applications.yaml does not parse as YAML (after masking Go templates)") + apps, ok := raw["applications"].([]any) + require.True(t, ok, "applications.yaml missing top-level 'applications' list") + require.NotEmpty(t, apps, "applications list is empty") + + // Spot-check: every app entry must have a 'name' key. 'args' is optional + // — the Entrypoint router app omits it, model apps carry deployment config + // there. + for i, a := range apps { + m, ok := a.(map[string]any) + require.True(t, ok, "app at index %d is not a mapping", i) + _, hasName := m["name"] + require.True(t, hasName, + "app at index %d missing 'name': keys=%v", i, keys(m)) + } +} + +func keys(m map[string]any) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + // Stable-ish for readability in failure messages. 
// keys returns the keys of m in ascending byte-wise order (as defined by
// strings.Compare). The result is always a non-nil slice, empty when m has
// no entries. It exists to make test failure messages deterministic.
func keys(m map[string]any) []string {
	sorted := make([]string, 0, len(m))
	for name := range m {
		// Walk back from the end to find the slot where name belongs among
		// the keys that are already in order.
		at := len(sorted)
		for at > 0 && strings.Compare(name, sorted[at-1]) < 0 {
			at--
		}
		// Grow by one, shift the tail right, and drop name into the gap.
		sorted = append(sorted, "")
		copy(sorted[at+1:], sorted[at:])
		sorted[at] = name
	}
	return sorted
}
+ if preservedServiceTemplate.Spec.Type != "" { + svc.Spec.ServiceTemplate = preservedServiceTemplate + } + // Merge labels if svc.Labels == nil { svc.Labels = map[string]string{} @@ -189,6 +211,10 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * return nil } +func resourceRequirementsNonEmpty(r corev1.ResourceRequirements) bool { + return len(r.Requests) > 0 || len(r.Limits) > 0 +} + func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiApi.AIPlatform, feature aiApi.FeatureSpec, name string) *aiApi.AIService { vectorDbUrl := platform.Status.VectorDbServiceName @@ -198,7 +224,7 @@ func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiA taskObjectStorage := platform.Spec.ObjectStorage // Don't append feature name - just pass the bucket path directly // taskObjectStorage.Path is already set from platform.Spec.ObjectStorage - return &aiApi.AIService{ + svc := &aiApi.AIService{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: platform.Namespace, @@ -227,10 +253,24 @@ func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiA Path: "/metrics", }, MTLS: platform.Spec.MTLS, - // Propagate imagePullSecrets from AIPlatform to AIService + // Propagate public-exposure preference from AIPlatform. Customers deploy + // the higher-level AIPlatform CR, so any NodePort / LoadBalancer setting + // they configure at that level must flow down to the AIService. Without + // this copy, the spec lands on AIPlatform and is silently ignored. + // Deep-copy because corev1.Service is a value type with nested + // slices/maps; a shallow copy would share state across children. 
+ ServiceTemplate: *platform.Spec.ServiceTemplate.DeepCopy(), ImagePullSecrets: platform.Spec.Images.ImagePullSecrets, }, } + + // SAIA v2: populate from operator env var if set + if v2Image := os.Getenv("RELATED_IMAGE_SAIA_API_V2"); v2Image != "" { + svc.Spec.V2 = aiApi.SAIAv2Config{Image: v2Image, Replicas: 1} + svc.Spec.V2Worker = aiApi.SAIAWorkerConfig{Replicas: 1} + } + + return svc } // CheckAIServiceStatus verifies that all AIService children have successful conditions. diff --git a/pkg/ai/reconciler_test.go b/pkg/ai/reconciler_test.go index d53ad90..dfbcc46 100644 --- a/pkg/ai/reconciler_test.go +++ b/pkg/ai/reconciler_test.go @@ -75,6 +75,48 @@ func TestBuildAIService_PopulatesExpectedFields(t *testing.T) { assert.Equal(t, "feature1", service.Labels["feature"]) } +func TestBuildAIService_PropagatesServiceTemplate(t *testing.T) { + // Customers configure public exposure (NodePort / LoadBalancer) at the + // AIPlatform level. Without propagation, the setting is silently dropped + // and SAIA is never reachable outside the cluster. This test locks in the + // contract that AIPlatform.spec.serviceTemplate flows into AIService. 
+ scheme := buildTestScheme(t) + + platform := &aiApi.AIPlatform{ + ObjectMeta: metav1.ObjectMeta{Name: "my-ai", Namespace: "default"}, + Spec: aiApi.AIPlatformSpec{ + ObjectStorage: aiApi.ObjectStorageSpec{Path: "/data"}, + SplunkConfiguration: aiApi.SplunkConfigurationSpec{ + Endpoint: "splunk-endpoint", + }, + ServiceTemplate: corev1.Service{ + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + Ports: []corev1.ServicePort{ + {Name: "http", NodePort: 30080}, + }, + }, + }, + }, + } + feature := aiApi.FeatureSpec{Name: "saia", Version: "v1"} + r := &AIPlatformReconciler{Scheme: scheme} + + service := r.buildAIService(context.Background(), platform, feature, "my-ai-saia") + + assert.Equal(t, corev1.ServiceTypeNodePort, service.Spec.ServiceTemplate.Spec.Type, + "NodePort selection must propagate so customers can expose SAIA") + if assert.Len(t, service.Spec.ServiceTemplate.Spec.Ports, 1) { + assert.Equal(t, int32(30080), service.Spec.ServiceTemplate.Spec.Ports[0].NodePort, + "explicit NodePort must propagate") + } + + // Mutating the child spec must not affect the parent (deep-copy check). 
+ service.Spec.ServiceTemplate.Spec.Ports[0].NodePort = 31234 + assert.Equal(t, int32(30080), platform.Spec.ServiceTemplate.Spec.Ports[0].NodePort, + "buildAIService must deep-copy ServiceTemplate to avoid shared state") +} + func TestReconcileFeatures_CreatesNewAIService(t *testing.T) { ctx := context.Background() scheme := buildTestScheme(t) diff --git a/pkg/ai/weaviate.go b/pkg/ai/weaviate.go index 23a0007..d10eb39 100644 --- a/pkg/ai/weaviate.go +++ b/pkg/ai/weaviate.go @@ -192,19 +192,22 @@ func (r *AIPlatformReconciler) ReconcileWeaviateDatabase(ctx context.Context, in // Container definition sts.Spec.Template.Spec.Containers = []corev1.Container{{ - Name: "weaviate", - Image: weaviateImage, - Resources: resources, - VolumeMounts: volumeMounts, - Ports: []corev1.ContainerPort{{ - Name: "http", - ContainerPort: 8080, - }}, + Name: "weaviate", + Image: weaviateImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Resources: resources, + VolumeMounts: volumeMounts, + Ports: []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8080}, + {Name: "grpc", ContainerPort: 50051}, + }, Env: []corev1.EnvVar{ - { - Name: "PERSISTENCE_DATA_PATH", - Value: "/var/lib/weaviate", - }, + {Name: "PERSISTENCE_DATA_PATH", Value: "/var/lib/weaviate"}, + // gRPC server is enabled by default in Weaviate v1.19+. Setting GRPC_PORT + // explicitly matches the Splunk vector-db reference chart and makes the + // port contract explicit for the 50051 containerPort/service declared below. + // Required by SAIA v2 which uses the Weaviate python v4 gRPC client. 
+ {Name: "GRPC_PORT", Value: "50051"}, }, }} return nil @@ -229,11 +232,18 @@ func (r *AIPlatformReconciler) ReconcileWeaviateDatabase(ctx context.Context, in } if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, svc, func() error { svc.Spec.Selector = labels - svc.Spec.Ports = []corev1.ServicePort{{ - Name: "http", - Port: 80, - TargetPort: intstr.FromInt(8080), - }} + svc.Spec.Ports = []corev1.ServicePort{ + { + Name: "http", + Port: 80, + TargetPort: intstr.FromInt(8080), + }, + { + Name: "grpc", + Port: 50051, + TargetPort: intstr.FromInt(50051), + }, + } return nil }); err != nil { return err diff --git a/pkg/ai/weaviate_test.go b/pkg/ai/weaviate_test.go index a397d6a..6f60038 100644 --- a/pkg/ai/weaviate_test.go +++ b/pkg/ai/weaviate_test.go @@ -150,11 +150,33 @@ func TestReconcileWeaviateDatabase(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "weaviate:test", sts.Spec.Template.Spec.Containers[0].Image) - // Verify Service created + // Verify container exposes both http (8080) and grpc (50051) ports + containerPorts := sts.Spec.Template.Spec.Containers[0].Ports + portNames := map[string]int32{} + for _, p := range containerPorts { + portNames[p.Name] = p.ContainerPort + } + assert.Equal(t, int32(8080), portNames["http"]) + assert.Equal(t, int32(50051), portNames["grpc"]) + + // Verify container has GRPC_PORT env var (gRPC server is enabled by default in + // Weaviate v1.19+, GRPC_PORT is set explicitly to make the port contract clear). 
+ envMap := map[string]string{} + for _, e := range sts.Spec.Template.Spec.Containers[0].Env { + envMap[e.Name] = e.Value + } + assert.Equal(t, "50051", envMap["GRPC_PORT"]) + + // Verify Service created with both http and grpc ports svc := &corev1.Service{} err = fc.Get(ctx, types.NamespacedName{Name: platformName + "-weaviate", Namespace: ns}, svc) assert.NoError(t, err) - assert.Equal(t, int32(80), svc.Spec.Ports[0].Port) + svcPorts := map[string]int32{} + for _, p := range svc.Spec.Ports { + svcPorts[p.Name] = p.Port + } + assert.Equal(t, int32(80), svcPorts["http"]) + assert.Equal(t, int32(50051), svcPorts["grpc"]) }) } diff --git a/pkg/storage/azure.go b/pkg/storage/azure.go index fa5f0ba..abbde0c 100644 --- a/pkg/storage/azure.go +++ b/pkg/storage/azure.go @@ -31,6 +31,9 @@ func NewAzureClient( namespace, container, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { + if container == "" { + return nil, fmt.Errorf("Azure Blob storage requires a container name; use path format azure://container-name/prefix (e.g. azure://my-container/model_artifacts). Without it, model deployments fail with 'Please specify a container name'") + } var cred azcore.TokenCredential var err error diff --git a/pkg/storage/minio.go b/pkg/storage/minio.go index f55a4ba..d8a2abd 100644 --- a/pkg/storage/minio.go +++ b/pkg/storage/minio.go @@ -3,44 +3,17 @@ package storage import ( "context" - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/s3" ai "github.com/splunk/splunk-ai-operator/api/v1" - corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) +// NewMinioClient creates a StorageClient for MinIO (S3-compatible). It delegates to NewS3CompatibleClient. +// Deprecated: Prefer NewS3CompatibleClient for MinIO, SeaweedFS, or any S3-compatible backend. 
func NewMinioClient( ctx context.Context, k8sClient client.Client, namespace, bucket, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { - awsCfg := &aws.Config{ - Endpoint: aws.String(vs.Endpoint), - Region: aws.String(vs.Region), - S3ForcePathStyle: aws.Bool(true), - } - if vs.SecretRef != "" { - secret := &corev1.Secret{} - if err := k8sClient.Get(ctx, - client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, - secret, - ); err != nil { - return nil, err - } - awsCfg.Credentials = credentials.NewStaticCredentials( - string(secret.Data["s3_access_key"]), - string(secret.Data["s3_secret_key"]), - "", - ) - } - // no SecretRef → AWS SDK default chain (IRSA, env, etc) - sess, err := session.NewSession(awsCfg) - if err != nil { - return nil, err - } - return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil + return NewS3CompatibleClient(ctx, k8sClient, namespace, bucket, prefix, vs) } diff --git a/pkg/storage/s3compat.go b/pkg/storage/s3compat.go new file mode 100644 index 0000000..b50a735 --- /dev/null +++ b/pkg/storage/s3compat.go @@ -0,0 +1,47 @@ +package storage + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + ai "github.com/splunk/splunk-ai-operator/api/v1" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// NewS3CompatibleClient creates a StorageClient for any S3-compatible backend (MinIO, SeaweedFS, etc.). +// Endpoint must be set on vs; credentials come from vs.SecretRef (s3_access_key, s3_secret_key) if set. 
+func NewS3CompatibleClient( + ctx context.Context, + k8sClient client.Client, + namespace, bucket, prefix string, + vs ai.ObjectStorageSpec, +) (StorageClient, error) { + awsCfg := &aws.Config{ + Endpoint: aws.String(vs.Endpoint), + Region: aws.String(vs.Region), + S3ForcePathStyle: aws.Bool(true), + } + if vs.SecretRef != "" { + secret := &corev1.Secret{} + if err := k8sClient.Get(ctx, + client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, + secret, + ); err != nil { + return nil, err + } + awsCfg.Credentials = credentials.NewStaticCredentials( + string(secret.Data["s3_access_key"]), + string(secret.Data["s3_secret_key"]), + "", + ) + } + sess, err := session.NewSession(awsCfg) + if err != nil { + return nil, err + } + return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil +} diff --git a/pkg/storage/storageclient.go b/pkg/storage/storageclient.go index 7dbea32..cc04209 100644 --- a/pkg/storage/storageclient.go +++ b/pkg/storage/storageclient.go @@ -43,15 +43,44 @@ func NewStorageClient( switch u.Scheme { case "s3": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3 path must include bucket name (e.g. s3://bucket-name/prefix)", vs.Path) + } return NewS3Client(ctx, k8sClient, namespace, u.Host, prefix, vs) case "gs", "gcs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: GCS path must include bucket name (e.g. gs://bucket-name/prefix)", vs.Path) + } return NewGCSClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "azure": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: Azure path must include container name (e.g. azure://container-name/prefix). Without it, model deployments fail with 'Please specify a container name'", vs.Path) + } return NewAzureClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "s3compat": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3-compatible path must include bucket name (e.g. 
s3compat://bucket-name/prefix)", vs.Path) + } + if vs.Endpoint == "" { + return nil, fmt.Errorf("s3compat:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "minio": - // everything after "//" is host (bucket) and path. We treat u.Host as bucket, - // vs.Endpoint *must* be set to our MinIO URL for this case. - return NewMinioClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: MinIO path must include bucket name (e.g. minio://bucket-name/prefix)", vs.Path) + } + if vs.Endpoint == "" { + return nil, fmt.Errorf("minio:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "seaweedfs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: SeaweedFS path must include bucket name (e.g. seaweedfs://bucket-name/prefix)", vs.Path) + } + if vs.Endpoint == "" { + return nil, fmt.Errorf("seaweedfs:// scheme requires spec.objectStorage.endpoint to be set (otherwise the AWS SDK targets real AWS S3)") + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "fixture": // fixture:// is a special scheme for testing purposes, using a fake client. // It does not require any credentials or endpoint. 
diff --git a/pkg/storage/storageclient_test.go b/pkg/storage/storageclient_test.go index c97dcc2..87742d2 100644 --- a/pkg/storage/storageclient_test.go +++ b/pkg/storage/storageclient_test.go @@ -75,12 +75,52 @@ func TestNewStorageClient(t *testing.T) { }, }, { - name: "MinIO storage", + name: "MinIO storage (S3-compatible)", volumeSpec: ai.ObjectStorageSpec{ Path: "minio://my-bucket/prefix", Endpoint: "http://minio.default.svc:9000", + Region: "us-east-1", }, - wantType: "minio", + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3-compatible storage (generic s3compat scheme)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat://my-bucket/prefix", + Endpoint: "http://s3compat.default.svc:9000", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS storage (S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "seaweedfs://my-bucket/prefix", + Endpoint: "http://seaweedfs.default.svc:8333", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 with custom endpoint (S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3://my-bucket/prefix", + Endpoint: "http://custom-s3.example.com:9000", + Region: "us-east-1", + }, + wantType: "s3", wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -91,7 +131,7 @@ func TestNewStorageClient(t *testing.T) { volumeSpec: ai.ObjectStorageSpec{ Path: "fixture://test-bucket/prefix", }, - wantType: "fixture", + wantType: "s3", // fixtureClient.GetProvider() returns "s3" for artifact compatibility wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -117,6 
+157,52 @@ func TestNewStorageClient(t *testing.T) { return fake.NewClientBuilder().WithScheme(s) }, }, + { + name: "Azure path without container name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "azure:///model_artifacts", + Region: "eastus", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3:///prefix", + Region: "us-west-2", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3-compatible path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat:///prefix", + Endpoint: "http://s3compat:9000", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "seaweedfs:///prefix", + Endpoint: "http://seaweedfs:8333", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, } for _, tt := range tests { @@ -134,7 +220,7 @@ func TestNewStorageClient(t *testing.T) { // Verify provider matches expected type provider := client.GetProvider() - assert.NotEmpty(t, provider) + assert.Equal(t, tt.wantType, provider, "GetProvider() should match wantType") // Verify bucket/container is extracted bucket := client.GetBucket() diff --git a/tools/artifacts_download_upload_scripts/README.md b/tools/artifacts_download_upload_scripts/README.md index 98a5ce6..f847483 100755 --- a/tools/artifacts_download_upload_scripts/README.md +++ b/tools/artifacts_download_upload_scripts/README.md @@ -70,13 +70,14 @@ sudo ./download_from_huggingface.sh - Script returns non-zero exit code on failure (suitable for CI/CD pipelines) ### 2. 
`upload_to_minio.sh` -Uploads downloaded artifacts to MinIO storage. +Uploads downloaded artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). **Features:** - Automatically uploads **all artifacts** from `./model_artifacts/` directory - No config file needed - just uploads everything found - **Auto-creates bucket** if it doesn't exist - Uses native MinIO Client (mc) for optimal performance +- Works with **MinIO, SeaweedFS, or any S3-compatible** backend; set endpoint and credentials to match your store. - Comprehensive dependency installation: - MinIO Client via **Homebrew on macOS** or **direct download on Linux** - Supports macOS (Intel & Apple Silicon) and Linux (amd64 & arm64) @@ -92,16 +93,109 @@ Or with sudo if dependency installation fails: sudo ./upload_to_minio.sh ``` +**Environment variables (S3-compatible target):** +Preferred generic names; `MINIO_*` are accepted for backward compatibility. + +| Preferred (generic) | Fallback | Description | +|---------------------|----------|-------------| +| `OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. 
http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | +| `OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | +| `OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | +| `OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | + +Example for SeaweedFS: `OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` + **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing MinIO Client (mc) -- Configure MinIO settings in the script or use environment variables: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: personal) - - `MINIO_ROOT_USER` (default: minioadmin) - - `MINIO_ROOT_PASSWORD` (default: minioadmin) +- Set endpoint, bucket, and credentials via the env vars above (defaults point to a local MinIO). + +### 3. `upload_to_seaweedfs.sh` +Uploads downloaded artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running at the endpoint, the script can **install and start it** (downloads the `weed` binary from GitHub releases, no Docker). If you run SeaweedFS via **systemd** (see **§4 `install_seaweedfs_systemd.sh`** below), ensure the service is up (`sudo systemctl start seaweedfs`) before running the upload script so the script doesn’t start a second instance. + +**Features:** +- **Auto-install SeaweedFS** when not reachable: downloads latest `weed` for Linux/macOS (amd64/arm64), installs to `/usr/local/bin` or `~/.local/bin`, and starts `weed server -s3` in the background (S3 gateway on port 8333). +- Auto-install only runs when the endpoint is local (`127.0.0.1` or `localhost`). For remote endpoints, SeaweedFS must already be running. +- Creates configured buckets (from `SEAWEEDFS_BUCKETS` or primary bucket), then uploads all of `./model_artifacts/` to the primary bucket. +- Uses MinIO Client (mc); installs mc if missing. 
+ +**Usage:** +```bash +./upload_to_seaweedfs.sh +``` + +With a remote SeaweedFS: +```bash +S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_seaweedfs.sh +``` + +To skip auto-install and only fail if unreachable: +```bash +SEAWEEDFS_SKIP_INSTALL=1 ./upload_to_seaweedfs.sh +``` + +**Volume limit:** When the script starts SeaweedFS it uses `-volume.max=100` (set `SEAWEEDFS_VOLUME_MAX`; use `0` for auto). The default (~7) can cause "0 node candidates" once the volume server is "full." + +**Environment variables:** `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:8333), `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY`, `SEAWEEDFS_BUCKETS`, `SEAWEEDFS_SKIP_INSTALL`, `SEAWEEDFS_UPLOAD_RETRIES`, `SEAWEEDFS_UPLOAD_RETRY_DELAY`, `SEAWEEDFS_PARALLEL_JOBS`, `SEAWEEDFS_ERROR_LOG`, `SEAWEEDFS_SKIP_EXISTING`, `SEAWEEDFS_WAIT_VOLUME_SERVER`, `SEAWEEDFS_MASTER`, `SEAWEEDFS_VOLUME_MAX` (default 100). -### 3. `upload_to_minio_aws.sh` +**SeaweedFS credentials:** SeaweedFS S3 has no built-in users (unlike MinIO’s default `minioadmin`). If you start SeaweedFS yourself, it must be configured to accept the same access key/secret the script uses (defaults: `minioadmin`/`minioadmin`). Options: (1) Start with env vars: `AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin weed server -s3`; (2) Use a JSON config file with `weed s3 -config=/path/to/s3.json` (see [SeaweedFS S3 Credentials](https://github.com/seaweedfs/seaweedfs/wiki/S3-Credentials)). If you see *"The access key ID you provided does not exist in our records"*, restart SeaweedFS with the same credentials as `S3COMPAT_OBJECT_STORE_ACCESS_KEY`/`S3COMPAT_OBJECT_STORE_SECRET_KEY` (or set those env vars to match your SeaweedFS config). 
+ +**Volume server readiness:** After SeaweedFS has just started (or restarted), the master may not see a volume server yet, so uploads can fail with "Not enough data nodes found". The script can **wait for a volume server** (when endpoint is local and `weed` is available): it polls `weed shell -master=... cluster.ps` for up to `SEAWEEDFS_WAIT_VOLUME_SERVER` seconds (default 60) before starting uploads. Set `SEAWEEDFS_WAIT_VOLUME_SERVER=0` to skip. + +**Parallel uploads and error log:** Uploads run in parallel (up to `SEAWEEDFS_PARALLEL_JOBS` at a time, default 3). Directory artifacts are uploaded **file-by-file** with per-file retries, so one failed file (e.g. a single `.safetensors` shard) only retries that file, not the whole artifact. Failed files/artifacts are appended to `SEAWEEDFS_ERROR_LOG` (default `./seaweedfs_upload_errors.log`) with artifact id and relative path; at the end the script prints that file and exits with code 1 if any failed. + +**Large artifacts (e.g. LLaMA 70B):** Uploads of very large files (multi-GB `.safetensors` shards) can fail with *"We encountered an internal error, please try again"*. The script retries each artifact up to `SEAWEEDFS_UPLOAD_RETRIES` (default 3) with `SEAWEEDFS_UPLOAD_RETRY_DELAY` seconds between attempts. If failures persist, check SeaweedFS host memory and disk (`/tmp/seaweedfs.log` or volume server logs), ensure enough free space for the full object, and consider increasing retries: `SEAWEEDFS_UPLOAD_RETRIES=5 SEAWEEDFS_UPLOAD_RETRY_DELAY=30 ./upload_to_seaweedfs.sh`. + +**"0 node candidates" / "Not enough data nodes":** Usually the volume server hit its max volume count (default ~7), disk is near full (read-only), heartbeat timeouts, or OOM. The script and systemd unit use `-volume.max=100` by default. When the error happens: `curl -s http://localhost:9333/cluster/status | jq` (master view); `curl -s http://127.0.0.1:8080/status | jq` (volume server; if Max==Count, increase `SEAWEEDFS_VOLUME_MAX`). 
See `tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md` for full troubleshooting. + +**Prerequisites:** +- Run `download_from_huggingface.sh` first to download artifacts +- For auto-install: curl, tar; optional sudo for `/usr/local/bin` +- No Docker required + +**Create standard folders:** To create the platform folders (`apps/`, `artifacts/`, `config/`, `job_groups/`, `model_artifacts/`, `tasks/`) in SeaweedFS, run `./create_seaweedfs_folders.sh` after SeaweedFS is up. It uses the same endpoint and credentials as `upload_to_seaweedfs.sh`. + +**Upload Splunk AI Assistant app:** To upload `Splunk_AI_Assistant_Cloud.tgz` to `bucket/apps/`, run `./upload_splunk_app_to_seaweedfs.sh`. Put the .tgz in the current directory or set `SPLUNK_APP_LOCAL_PATH=/path/to/Splunk_AI_Assistant_Cloud.tgz`. Same endpoint/credentials as above. + +### 4. `install_seaweedfs_systemd.sh` +Installs SeaweedFS as a **systemd service** so it starts on boot and restarts on failure. Run this on the host where SeaweedFS should run (e.g. EC2), after the `weed` binary is installed. + +**Features:** +- Copies `seaweedfs.service` from this directory into `/etc/systemd/system/` +- Enables and starts the `seaweedfs` service (master, volume, filer, S3 gateway) +- Service runs as `ec2-user` (configurable in the unit file); data directory is `/home/ec2-user/data` by default +- Handles SELinux: on Enforcing systems, labels `/usr/local/bin/weed` so the service can execute it +- Requires the `weed` binary at `/usr/local/bin/weed` (install it first via `upload_to_seaweedfs.sh` or manually from [SeaweedFS releases](https://github.com/seaweedfs/seaweedfs/releases)) + +**Usage:** +```bash +# 1. Install weed first (e.g. run upload_to_seaweedfs.sh once, or download weed and put it in /usr/local/bin) +# 2. 
Then install the systemd service (requires sudo) +sudo ./install_seaweedfs_systemd.sh +``` + +**Prerequisites:** +- `weed` at `/usr/local/bin/weed` (run `./upload_to_seaweedfs.sh` once to auto-install it, or download and extract from GitHub releases) +- Run the script as root: `sudo ./install_seaweedfs_systemd.sh` +- The `seaweedfs.service` unit file must be in the same directory as the script + +**After install:** +- **Status:** `sudo systemctl status seaweedfs` +- **Logs:** `journalctl -u seaweedfs -f` +- **Stop:** `sudo systemctl stop seaweedfs` +- **Restart:** `sudo systemctl restart seaweedfs` +- **S3 endpoint:** http://127.0.0.1:8333 (default credentials: minioadmin/minioadmin) +- **Data directory:** `/home/ec2-user/data` (edit the unit file or use a drop-in to change) + +**Unit file details (`seaweedfs.service`):** +- `ExecStart`: `/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100` +- `Restart=on-failure`, `RestartSec=5` +- S3 credentials are set via `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in the unit (default minioadmin/minioadmin); override with `/etc/default/seaweedfs` or a systemd drop-in if needed +- To use a different user or data dir, copy the unit to a drop-in or edit `/etc/systemd/system/seaweedfs.service` after install + +**Troubleshooting:** If the service fails to start, check `sudo systemctl status seaweedfs` and `journalctl -u seaweedfs -n 50`. Ensure `/home/ec2-user/data` exists and is writable by `ec2-user`, and that `/usr/local/bin/weed` is executable. On SELinux systems, the script runs `chcon -t bin_t /usr/local/bin/weed` to allow execution. + +### 5. `upload_to_minio_aws.sh` Uploads downloaded artifacts to MinIO using AWS CLI (S3-compatible API). 
**Features:** @@ -128,18 +222,18 @@ sudo ./upload_to_minio_aws.sh **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing AWS CLI -- Configure MinIO settings in the script: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: ml-platform-artifacts) - - `MINIO_ACCESS_KEY` (default: minioadmin) - - `MINIO_SECRET_KEY` (default: minioadmin) +- Use generic env vars (MINIO_* accepted for backward compatibility): + - `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:9000) + - `S3COMPAT_OBJECT_STORE_BUCKET` (default: ai-platform-artifacts-bucket) + - `S3COMPAT_OBJECT_STORE_ACCESS_KEY` (default: minioadmin) + - `S3COMPAT_OBJECT_STORE_SECRET_KEY` (default: minioadmin) **When to use this vs `upload_to_minio.sh`:** - Use this if you prefer AWS CLI over MinIO Client (mc) - Use this if you already have AWS CLI installed - Use `upload_to_minio.sh` for better MinIO native support -### 4. `upload_to_s3.sh` +### 6. `upload_to_s3.sh` Uploads downloaded artifacts to AWS S3 storage. **Features:** @@ -181,12 +275,12 @@ sudo S3_BUCKET=your-bucket-name ./upload_to_s3.sh - Set `S3_BUCKET` environment variable - Optional: Set `S3_REGION` (default: us-east-1) and `S3_PREFIX` (default: model_artifacts) -### 5. `test_minio_connection.sh` -Diagnostic script to test MinIO connectivity and troubleshoot issues. +### 7. `test_minio_connection.sh` +Diagnostic script to test S3-compatible object store connectivity (MinIO, SeaweedFS, etc.) and troubleshoot issues. **Features:** - Tests MinIO Client (mc) installation -- Verifies MinIO endpoint connectivity +- Verifies endpoint connectivity - Tests authentication with credentials - Lists all existing buckets - Tests bucket creation permissions @@ -197,9 +291,9 @@ Diagnostic script to test MinIO connectivity and troubleshoot issues. 
./test_minio_connection.sh ``` -Or with custom settings: +Or with custom settings (use generic names; MINIO_* also accepted): ```bash -MINIO_ENDPOINT=http://localhost:9000 MINIO_BUCKET=nexus ./test_minio_connection.sh +S3COMPAT_OBJECT_STORE_ENDPOINT=http://localhost:9000 S3COMPAT_OBJECT_STORE_BUCKET=nexus ./test_minio_connection.sh ``` Or with sudo if dependency installation fails: @@ -213,7 +307,7 @@ sudo ./test_minio_connection.sh **When to use:** - Before running upload scripts for the first time - When bucket creation fails -- To diagnose MinIO connectivity issues +- To diagnose object store connectivity issues - To verify credentials and permissions ## Configuration @@ -333,19 +427,18 @@ All artifacts in the list will be downloaded and uploaded automatically. ### For Download Script: - No additional environment variables needed (reads from `model_artifacts_configs.yaml`) -### For MinIO Upload Script (using mc): +### For MinIO / S3-compatible Upload Script (using mc, `upload_to_minio.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: personal) -- `MINIO_ROOT_USER`: MinIO access key (default: minioadmin) -- `MINIO_ROOT_PASSWORD`: MinIO secret key (default: minioadmin) +- Works with MinIO, SeaweedFS, or any S3-compatible backend. 
+- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD` (or `MINIO_ACCESS_KEY`/`MINIO_SECRET_KEY`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-bucket, minioadmin/minioadmin -### For MinIO Upload Script (using AWS CLI): +### For S3-compatible Upload Script (using AWS CLI, `upload_to_minio_aws.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: ml-platform-artifacts) -- `MINIO_ACCESS_KEY`: MinIO access key (default: minioadmin) -- `MINIO_SECRET_KEY`: MinIO secret key (default: minioadmin) +- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` (or `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-artifacts-bucket, minioadmin/minioadmin ### For S3 Upload Script: - No config file needed - automatically uploads all artifacts from `./model_artifacts/` diff --git a/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md new file mode 100644 index 0000000..f542ba3 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md @@ -0,0 +1,134 @@ +# SeaweedFS as a systemd service + +Run SeaweedFS as a systemd service so it **restarts on failure** and **starts on boot**. + +## Prerequisites + +- **weed** binary at `/usr/local/bin/weed`. 
If missing, run the upload script once from the artifacts directory (it installs weed), or [download a release](https://github.com/seaweedfs/seaweedfs/releases) and copy `weed` to `/usr/local/bin/`. +- **Root/sudo** on the host to install the service. + +## Quick install (EC2 or single host) + +On the host where SeaweedFS should run: + +```bash +cd /path/to/splunk-ai-operator/tools/artifacts_download_upload_scripts +sudo ./install_seaweedfs_systemd.sh +``` + +This copies `seaweedfs.service` to `/etc/systemd/system/`, enables and starts the service. + +## Manual install + +1. Copy the unit file: + ```bash + sudo cp tools/artifacts_download_upload_scripts/seaweedfs.service /etc/systemd/system/ + sudo systemctl daemon-reload + ``` + +2. Optionally override credentials or data dir via a drop-in or env file: + ```bash + sudo mkdir -p /etc/systemd/system/seaweedfs.service.d + echo -e '[Service]\nEnvironment="AWS_ACCESS_KEY_ID=mykey"\nEnvironment="AWS_SECRET_ACCESS_KEY=mysecret"' | sudo tee /etc/systemd/system/seaweedfs.service.d/override.conf + sudo systemctl daemon-reload + ``` + +3. Enable and start: + ```bash + sudo systemctl enable seaweedfs + sudo systemctl start seaweedfs + ``` + +## Service details + +- **User:** `ec2-user` (change in the unit if needed). +- **Data dir:** `/home/ec2-user/data` (hardcoded in `ExecStart`; override via a systemd drop-in that replaces `ExecStart` if needed). +- **Volume max:** `100` in `ExecStart` (override via drop-in if needed). +- **S3 credentials:** `minioadmin` / `minioadmin` by default; override with `Environment=` or `EnvironmentFile=-/etc/default/seaweedfs` in a drop-in. +- **Restart:** `on-failure` with 5s delay. 
+- **Logs:** `journalctl -u seaweedfs -f` + +## Useful commands + +| Command | Description | +|--------|-------------| +| `sudo systemctl status seaweedfs` | Show status | +| `journalctl -u seaweedfs -f` | Follow logs | +| `sudo systemctl restart seaweedfs` | Restart | +| `sudo systemctl stop seaweedfs` | Stop | +| `sudo systemctl disable seaweedfs` | Disable start on boot | + +## After install + +- S3 endpoint: **http://127.0.0.1:8333** (or the host’s IP if accessing remotely). +- Use the same credentials in the upload script or set `OBJECT_STORE_ACCESS_KEY` / `OBJECT_STORE_SECRET_KEY` to match the service. + +## Troubleshooting: "0 node candidates" / "Not enough data nodes found" + +When the Master has no writable volume servers, uploads fail with those errors. Common causes and fixes: + +| Cause | Fix | +|-------|-----| +| **1. Max volumes reached** | Volume server default `-max` is often 7–8. The unit passes `-volume.max=100` in `ExecStart`; the `SEAWEEDFS_VOLUME_MAX` variable it also sets is not referenced there, so changing the variable alone has no effect. To increase: override `ExecStart` in a drop-in with a higher `-volume.max` (e.g. `200`), run `sudo systemctl daemon-reload`, and restart. | +| **2. Disk space** | At ~95% usage the volume server reports read-only. Check `df -h` on the host; free space or add storage. | +| **3. Heartbeat / gRPC timeouts** | Under heavy load the volume server may miss heartbeats and be marked dead. Check `journalctl -u seaweedfs` for "heartbeat" or "connection refused" around the failure time. | +| **4. OOM** | On small instances the process may be killed. Run `dmesg -T \| grep -i oom` on the host. | + +**When the error is happening, run:** + +```bash +# Master's view of nodes (look for empty Nodes or IsReadOnly: true) +curl -s http://localhost:9333/cluster/status | jq + +# Volume server status (check if Max and Count are equal = full) +curl -s http://127.0.0.1:8080/status | jq +``` + +If `Max == Count` on the volume server, raise the volume limit (`-volume.max` in the unit's `ExecStart`, or `SEAWEEDFS_VOLUME_MAX` when the upload script starts SeaweedFS) and restart the service. + +### "Permission denied" when starting the service (status=203/EXEC) + +The service runs as `ec2-user`.
Common causes: + +1. **File permissions** – Ensure the binary is executable by all: + ```bash + sudo chmod 755 /usr/local/bin/weed + ``` + +2. **SELinux (Enforcing)** – On RHEL/Amazon Linux, SELinux can block execution. Fix by labeling the binary: + ```bash + sudo chcon -t bin_t /usr/local/bin/weed + sudo systemctl restart seaweedfs + ``` + To confirm SELinux is the cause: `sudo setenforce 0`, restart the service; if it then runs, re-enable with `sudo setenforce 1` and apply the `chcon` above. + +The install script runs `chmod 755` and, when SELinux is Enforcing, `chcon -t bin_t` automatically. + +### Connect timeout from EKS / Ray pods (Connection to <host> timed out) + +Ray workers (and other pods) in the cluster need to reach the SeaweedFS S3 endpoint to download model artifacts. If you see: + +- `Connect timeout on endpoint URL: "http://<host>:8333/..."` +- `Connection to <host> timed out. (connect timeout=60)` + +then **pods cannot reach the SeaweedFS host** on port 8333. + +**Fix:** + +1. **Security group on the SeaweedFS EC2** + Allow **inbound TCP port 8333** from the EKS cluster: + - **Option A:** From the **EKS worker node security group** (so any pod on those nodes can reach SeaweedFS). + - **Option B:** From the **VPC CIDR** (e.g. `10.0.0.0/16` or `192.168.0.0/16`) so all pods in the VPC can reach SeaweedFS. + + In AWS Console: EC2 → Security Groups → select the security group attached to the SeaweedFS instance → Edit inbound rules → Add rule: Type = Custom TCP, Port = 8333, Source = node SG or VPC CIDR. + +2. **Prefer private IP when in the same VPC** + If SeaweedFS and EKS are in the same VPC, set `storage.objectStore.endpoint` in `cluster-config.yaml` to the **private IP** and port (e.g. `http://172.31.23.74:8333`). Then: + - Traffic stays inside the VPC (no internet path). + - The security group still must allow 8333 from the node SG or VPC CIDR as above. + +3.
**Verify from a pod** (optional): + ```bash + kubectl run -it --rm curl --image=curlimages/curl --restart=Never -- curl -s -o /dev/null -w "%{http_code}" "http://<host>:8333" + ``` + Replace `<host>` with the same IP (public or private) as in your config, and use the same port. A 200/403/400 means the pod can reach SeaweedFS. diff --git a/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh new file mode 100755 index 0000000..823c7eb --- /dev/null +++ b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Create standard folder prefixes in SeaweedFS (S3-compatible). Uses the same +# OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh. Run after +# SeaweedFS is up (e.g. systemd service or upload script has started it). + +set -e + +# Same endpoint/credentials as upload_to_seaweedfs.sh +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# Standard folders expected by the platform (create by uploading .keep) +FOLDERS=(apps artifacts config job_groups model_artifacts tasks) + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +# Install mc if needed +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..."
+ OS="$(uname -s)" + ARCH="$(uname -m)" + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +echo "Creating folders in ${OBJECT_STORE_BUCKET}: ${FOLDERS[*]}" +for dir in "${FOLDERS[@]}"; do + echo "placeholder" | mc pipe "${MC_ALIAS}/${OBJECT_STORE_BUCKET}/${dir}/.keep" 2>/dev/null || true + echo " ${dir}/" +done +echo "Done. 
Folders: apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/" diff --git a/tools/artifacts_download_upload_scripts/install_minio_ec2.sh b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh new file mode 100755 index 0000000..fcf93a1 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh @@ -0,0 +1,335 @@ +#!/usr/bin/env bash +# ----------------------------------------------------------------------------- +# MinIO on EC2 for Splunk AI Platform (EKS) +# +# Mode 1 - Install on this machine (run ON the EC2 instance after SSH, as root): +# sudo ./install_minio_ec2.sh [--bucket NAME] [--user USER] [--password PASSWORD] +# +# Mode 2 - Launch EC2 in same VPC as EKS, then install MinIO (run from laptop): +# CONFIG_FILE=./cluster-config.yaml ./install_minio_ec2.sh --launch-ec2 +# Then SSH to the instance and run: ./install_minio_ec2.sh (with same bucket/user/password) +# +# Prerequisites: aws CLI, same VPC as EKS (or provide VPC/subnet). For --launch-ec2: jq, yq (optional). 
+# ----------------------------------------------------------------------------- +set -euo pipefail + +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform}" +MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" +MINIO_DATA_DIR="${MINIO_DATA_DIR:-/data/minio}" +MINIO_PORT="${MINIO_PORT:-9000}" + +# Launch-EC2 options (when --launch-ec2) +MINIO_EC2_INSTANCE_TYPE="${MINIO_EC2_INSTANCE_TYPE:-t3.xlarge}" +MINIO_EC2_AMI_QUERY="${MINIO_EC2_AMI_QUERY:-Amazon Linux 2023}" +MINIO_EC2_KEY_NAME="${MINIO_EC2_KEY_NAME:-}" +MINIO_EC2_VOLUME_SIZE="${MINIO_EC2_VOLUME_SIZE:-150}" + +log() { echo "[minio-ec2] $*"; } +err() { echo "[minio-ec2] ERROR: $*" >&2; } + +# ---------- Parse args ---------- +LAUNCH_EC2=false +while [[ $# -gt 0 ]]; do + case "$1" in + --launch-ec2) LAUNCH_EC2=true; shift ;; + --bucket) MINIO_BUCKET="$2"; shift 2 ;; + --user) MINIO_ROOT_USER="$2"; shift 2 ;; + --password) MINIO_ROOT_PASSWORD="$2"; shift 2 ;; + --data-dir) MINIO_DATA_DIR="$2"; shift 2 ;; + --port) MINIO_PORT="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# ---------- Mode 2: Launch EC2 in EKS VPC ---------- +launch_ec2_in_eks_vpc() { + need_file "${CONFIG_FILE:-}" + local cfg="${CONFIG_FILE}" + local cluster_name region vpc_id subnet_id sg_id instance_id private_ip + + if command -v yq &>/dev/null; then + cluster_name="$(yq eval '.cluster.name' "$cfg")" + region="$(yq eval '.cluster.region' "$cfg")" + else + cluster_name="$(grep -A1 'cluster:' "$cfg" | grep 'name:' | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" + region="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" + fi + [[ -z "$cluster_name" || -z "$region" ]] && { err "Could not read cluster.name and cluster.region from $cfg"; exit 1; } + + log "Cluster: $cluster_name, Region: $region" + if ! aws eks describe-cluster --name "$cluster_name" --region "$region" &>/dev/null; then + err "EKS cluster '$cluster_name' not found. 
Create the cluster first or provide VPC/subnet via MINIO_EC2_VPC_ID and MINIO_EC2_SUBNET_ID." + exit 1 + fi + + vpc_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.vpcId' --output text)" + # Prefer private subnet for MinIO + subnet_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.subnetIds[0]' --output text)" + [[ -z "$vpc_id" || "$vpc_id" == "None" ]] && { err "No VPC from cluster"; exit 1; } + [[ -z "$subnet_id" || "$subnet_id" == "None" ]] && { err "No subnet from cluster"; exit 1; } + + local vpc_cidr + vpc_cidr="$(aws ec2 describe-vpcs --vpc-ids "$vpc_id" --region "$region" --query 'Vpcs[0].CidrBlock' --output text 2>/dev/null || echo "10.0.0.0/8")" + + log "VPC: $vpc_id, Subnet: $subnet_id, CIDR: $vpc_cidr" + + # Security group: SSH (22) from anywhere; MinIO (9000) from VPC (reuse if exists) + local sg_name="minio-ec2-${cluster_name}" + sg_id="$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$sg_name" "Name=vpc-id,Values=$vpc_id" --region "$region" --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null)" + if [[ -z "$sg_id" || "$sg_id" == "None" ]]; then + sg_id="$(aws ec2 create-security-group --group-name "$sg_name" --description "MinIO EC2 for EKS" --vpc-id "$vpc_id" --region "$region" --query 'GroupId' --output text)" + fi + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region "$region" 2>/dev/null || true + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port "$MINIO_PORT" --cidr "$vpc_cidr" --region "$region" 2>/dev/null || true + log "Security group: $sg_id (22 from 0.0.0.0/0, ${MINIO_PORT} from $vpc_cidr)" + + # Key pair: use existing or create (idempotent: reuse same key name per cluster) + local key_name="$MINIO_EC2_KEY_NAME" + local key_file="" + if [[ -z "$key_name" ]]; then + key_name="minio-ec2-${cluster_name}" + 
key_file="/tmp/minio-ec2-${cluster_name}.pem" + if aws ec2 describe-key-pairs --key-names "$key_name" --region "$region" &>/dev/null; then + log "Using existing key pair: $key_name (if you lost the .pem, set MINIO_EC2_KEY_NAME to another key)" + elif aws ec2 create-key-pair --key-name "$key_name" --query 'KeyMaterial' --output text --region "$region" > "$key_file" 2>/dev/null; then + chmod 600 "$key_file" + log "Key pair created: $key_name (saved to $key_file)" + else + err "Create key pair failed. Set MINIO_EC2_KEY_NAME to an existing key name in this region." + exit 1 + fi + fi + + # AMI: Amazon Linux 2023 + local ami_id + ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=al2023-ami-*-x86_64" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + [[ -z "$ami_id" || "$ami_id" == "None" ]] && ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=amzn2-ami-hvm-*-x86_64-gp2" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + + instance_id="$(aws ec2 run-instances \ + --image-id "$ami_id" \ + --instance-type "$MINIO_EC2_INSTANCE_TYPE" \ + --subnet-id "$subnet_id" \ + --security-group-ids "$sg_id" \ + --key-name "$key_name" \ + --block-device-mappings "[{\"DeviceName\":\"/dev/xvda\",\"Ebs\":{\"VolumeSize\":${MINIO_EC2_VOLUME_SIZE},\"VolumeType\":\"gp3\"}}]" \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=minio-ec2-${cluster_name}},{Key=Cluster,Value=${cluster_name}}]" \ + --region "$region" \ + --query 'Instances[0].InstanceId' --output text)" + log "Launched instance: $instance_id (key: $key_name)" + + log "Waiting for instance to get private IP..." 
+ aws ec2 wait instance-running --instance-ids "$instance_id" --region "$region" + private_ip="$(aws ec2 describe-instances --instance-ids "$instance_id" --region "$region" --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text)" + [[ -z "$private_ip" || "$private_ip" == "None" ]] && private_ip="(check console)" + + echo "" + log "=== MinIO EC2 instance ready ===" + echo " Instance ID: $instance_id" + echo " Private IP: $private_ip" + echo " Region: $region" + echo " Key name: $key_name" + [[ -n "$key_file" && -f "$key_file" ]] && echo " Key file: $key_file" + echo "" + echo "Next steps:" + echo " 1. SSH to the instance: ssh -i ${key_file:-/path/to/$key_name.pem} ec2-user@${private_ip}" + echo " 2. On the instance, copy and run this script (install-only mode, requires sudo):" + echo " sudo ./install_minio_ec2.sh --bucket ${MINIO_BUCKET} --user ${MINIO_ROOT_USER} --password ''" + echo " 3. Add to cluster-config.yaml (storage.minio):" + echo " enabled: true" + echo " external: true" + echo " endpoint: \"http://${private_ip}:${MINIO_PORT}\"" + echo " bucket: \"${MINIO_BUCKET}\"" + echo " auth: { rootUser: \"${MINIO_ROOT_USER}\", rootPassword: \"\" }" + echo "" +} + +need_file() { [[ -n "${1:-}" && -f "${1}" ]] || { err "File required: $1"; exit 1; }; } + +# ---------- Entry ---------- +if [[ "$LAUNCH_EC2" == "true" ]]; then + launch_ec2_in_eks_vpc + exit 0 +fi + +# ---------- Mode 1: Install MinIO on this machine ---------- +# Require root (for /usr/local/bin, /etc/default/minio, systemd) +if [[ "$(id -u)" -ne 0 ]]; then + err "This script must be run as root (or with sudo)." 
+ err "Run: sudo $0 ${*:-}" + exit 1 +fi + +# Generate password if not set +if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + MINIO_ROOT_PASSWORD="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" + log "Generated MINIO_ROOT_PASSWORD (save it for cluster-config.yaml)" +fi + +# Install MinIO binary (use stable "latest" URL; archive URLs can 404 and return HTML) +install_minio_binary() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) err "Unsupported arch: $arch"; exit 1 ;; + esac + local url="https://dl.min.io/server/minio/release/linux-${arch}/minio" + local tmp="/tmp/minio.$$" + log "Downloading MinIO (linux-${arch})..." + if ! curl -sSL -o "$tmp" "$url"; then + err "Download failed. Check network or try: curl -sSL -o /tmp/minio '$url'" + rm -f "$tmp" + exit 1 + fi + # Reject HTML/error pages (e.g. 404); binary should not start with < or "Not" + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "Download returned HTML/error instead of binary. URL may be wrong or blocked." + head -1 "$tmp" + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/minio + minio --version +} + +install_mc() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) arch=amd64 ;; + esac + local tmp="/tmp/mc.$$" + log "Downloading MinIO Client (mc)..." + if ! curl -sSL -o "$tmp" "https://dl.min.io/client/mc/release/linux-${arch}/mc"; then + err "Download failed for mc." + rm -f "$tmp" + exit 1 + fi + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "mc download returned HTML/error instead of binary." + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/mc + mc --version +} + +# Stop MinIO so we can replace the binary without restart loop (e.g. after wrong-arch fix). +systemctl stop minio 2>/dev/null || true +# Always (re)install MinIO binary so we get the correct architecture for this host. 
+# A wrong-arch binary (e.g. amd64 on arm64 EC2) causes "Exec format error" and crash-loop. +install_minio_binary +if ! command -v mc &>/dev/null; then + install_mc +else + log "mc already present: $(mc --version 2>/dev/null || true)" +fi + +mkdir -p "$MINIO_DATA_DIR" +chmod 755 "$MINIO_DATA_DIR" +ENV_FILE="/etc/default/minio" +cat > "$ENV_FILE" < /etc/systemd/system/minio.service </dev/null | grep -q 200; then + minio_ok=true + break + fi + sleep 2 +done +if [[ "$minio_ok" != "true" ]]; then + err "MinIO did not respond on port ${MINIO_PORT} within 60s. Service may be failing or crash-looping." + echo "" >&2 + systemctl status minio --no-pager 2>&1 || true + echo "" >&2 + journalctl -u minio -n 30 --no-pager 2>&1 || true + exit 1 +fi +# Verify port is actually listening +if ! ( ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null ) | grep -qE "[.:]${MINIO_PORT}([^0-9]|$)"; then + err "MinIO health passed but port ${MINIO_PORT} is not listening. Showing service status:" + systemctl status minio --no-pager 2>&1 || true + exit 1 +fi +sleep 2 + +export MC_HOST_local="http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@127.0.0.1:${MINIO_PORT}" +mc mb "local/${MINIO_BUCKET}" --ignore-existing 2>/dev/null || true +for prefix in apps artifacts config job_groups model_artifacts tasks; do + echo -n | mc pipe "local/${MINIO_BUCKET}/${prefix}/.keep" 2>/dev/null || true +done +log "Bucket '${MINIO_BUCKET}' and prefixes apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/ ready" + +if command -v firewall-cmd &>/dev/null && systemctl is-active --quiet firewalld 2>/dev/null; then + firewall-cmd --permanent --add-port="${MINIO_PORT}/tcp" 2>/dev/null || true + firewall-cmd --reload 2>/dev/null || true +elif command -v ufw &>/dev/null && ufw status 2>/dev/null | grep -q "Status: active"; then + ufw allow "${MINIO_PORT}/tcp" 2>/dev/null || true + ufw reload 2>/dev/null || true +fi + +PRIVATE_IP="" +if command -v hostname &>/dev/null; then + PRIVATE_IP="$(hostname -I 
2>/dev/null | awk '{print $1}')" +fi +[[ -z "$PRIVATE_IP" ]] && PRIVATE_IP="$(curl -s --connect-timeout 2 http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || echo 'MINIO_EC2_PRIVATE_IP')" +ENDPOINT="http://${PRIVATE_IP}:${MINIO_PORT}" + +echo "" +log "=== MinIO on EC2 is ready ===" +echo " Endpoint: ${ENDPOINT}" +echo " Bucket: ${MINIO_BUCKET}" +echo " Root user: ${MINIO_ROOT_USER}" +echo " Root pass: ${MINIO_ROOT_PASSWORD}" +echo "" +echo "Add to cluster-config.yaml (storage.minio):" +echo " minio:" +echo " enabled: true" +echo " external: true" +echo " endpoint: \"${ENDPOINT}\"" +echo " bucket: \"${MINIO_BUCKET}\"" +echo " auth:" +echo " rootUser: \"${MINIO_ROOT_USER}\"" +echo " rootPassword: \"${MINIO_ROOT_PASSWORD}\"" +echo "" +echo "Ensure EC2 security group allows inbound TCP ${MINIO_PORT} from your EKS node security group or VPC CIDR." +echo "" +echo "If MinIO is not reachable, check: systemctl status minio && ss -tlnp | grep ${MINIO_PORT}" +echo "" diff --git a/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh new file mode 100755 index 0000000..2f21090 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Install SeaweedFS as a systemd service (restart on failure, start on boot). +# Run with sudo on the host where SeaweedFS should run (e.g. EC2). +# Prereqs: weed binary at /usr/local/bin/weed (run upload_to_seaweedfs.sh once to install, or install manually). + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVICE_NAME="seaweedfs" +UNIT_FILE="${SCRIPT_DIR}/seaweedfs.service" + +if [[ "$(id -u)" -ne 0 ]]; then + echo "Run with sudo to install the systemd service." + exit 1 +fi + +if [[ ! -f /usr/local/bin/weed ]]; then + echo "weed not found at /usr/local/bin/weed. 
Install it first, e.g.:" + echo " Run ./upload_to_seaweedfs.sh once (it will install weed), or" + echo " download from https://github.com/seaweedfs/seaweedfs/releases and extract weed to /usr/local/bin/" + exit 1 +fi + +# Service runs as ec2-user; ensure the binary is executable by that user (fixes "Permission denied" on EXEC). +chmod 755 /usr/local/bin/weed +# On SELinux systems (e.g. RHEL, Amazon Linux), label the binary so the service can execute it. +if command -v getenforce &>/dev/null && [[ "$(getenforce 2>/dev/null)" == "Enforcing" ]]; then + if command -v chcon &>/dev/null; then + chcon -t bin_t /usr/local/bin/weed 2>/dev/null || true + fi +fi + +echo "Installing ${SERVICE_NAME}.service..." +cp "$UNIT_FILE" /etc/systemd/system/"${SERVICE_NAME}.service" +chmod 644 /etc/systemd/system/"${SERVICE_NAME}.service" +systemctl daemon-reload + +echo "Enabling ${SERVICE_NAME} to start on boot..." +systemctl enable "${SERVICE_NAME}" + +echo "Starting ${SERVICE_NAME} now..." +systemctl start "${SERVICE_NAME}" + +sleep 2 +if ! systemctl is-active --quiet "${SERVICE_NAME}"; then + echo "Warning: ${SERVICE_NAME} did not stay running. Check: sudo systemctl status ${SERVICE_NAME} && journalctl -u ${SERVICE_NAME} -n 30" + exit 1 +fi + +echo "" +echo "SeaweedFS is running as a systemd service." 
+echo " status: sudo systemctl status ${SERVICE_NAME}" +echo " logs: journalctl -u ${SERVICE_NAME} -f" +echo " stop: sudo systemctl stop ${SERVICE_NAME}" +echo " restart: sudo systemctl restart ${SERVICE_NAME}" +echo "" +echo "S3 endpoint: http://127.0.0.1:8333 (default credentials minioadmin/minioadmin)" +echo "Data dir: /home/ec2-user/data (edit SEAWEEDFS_DIR in the unit to change)" diff --git a/tools/artifacts_download_upload_scripts/seaweedfs.service b/tools/artifacts_download_upload_scripts/seaweedfs.service new file mode 100644 index 0000000..1dc4079 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/seaweedfs.service @@ -0,0 +1,39 @@ +# SeaweedFS all-in-one server (master, volume, filer, S3). +# Install: see tools/cluster_setup/SEAWEEDFS_SYSTEMD.md or run install_seaweedfs_systemd.sh +# Credentials: taken from the Environment= lines in [Service]; /etc/default/seaweedfs is read only if the EnvironmentFile line there is uncommented. + +[Unit] +Description=SeaweedFS server (master, volume, filer, S3) +Documentation=https://github.com/seaweedfs/seaweedfs +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ec2-user +Group=ec2-user +# Data directory (must exist and be writable by User). ExecStart below passes -dir explicitly, so a drop-in must also override ExecStart to change it. +Environment="SEAWEEDFS_DIR=/home/ec2-user/data" +# Max volumes per volume server. ExecStart below passes -volume.max explicitly, so a drop-in must also override ExecStart to change it. +Environment="SEAWEEDFS_VOLUME_MAX=100" +# S3 credentials (must match upload script / mc alias) +Environment="AWS_ACCESS_KEY_ID=minioadmin" +Environment="AWS_SECRET_ACCESS_KEY=minioadmin" +# To load overrides from /etc/default/seaweedfs, uncomment the next line (or supply a systemd drop-in): +# EnvironmentFile=-/etc/default/seaweedfs + +# Use explicit paths so ExecStart works even if env expansion is not applied (e.g. after copy from Windows). 
+ExecStart=/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100 +WorkingDirectory=/home/ec2-user +Restart=on-failure +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=seaweedfs + +# Security: no new privileges, restrict to usual caps +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/tools/artifacts_download_upload_scripts/test_minio_connection.sh b/tools/artifacts_download_upload_scripts/test_minio_connection.sh index 6d90525..ada3233 100755 --- a/tools/artifacts_download_upload_scripts/test_minio_connection.sh +++ b/tools/artifacts_download_upload_scripts/test_minio_connection.sh @@ -4,7 +4,7 @@ MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" -MINIO_BUCKET="${MINIO_BUCKET:-personal}" +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform-bucket}" echo "==========================================" echo "MinIO Connection Test" diff --git a/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh new file mode 100755 index 0000000..eafda79 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Upload Splunk_AI_Assistant_Cloud.tgz to SeaweedFS at bucket/apps/Splunk_AI_Assistant_Cloud.tgz. +# Uses the same OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh and create_seaweedfs_folders.sh. 
+ +set -e + +APP_FILENAME="${SPLUNK_APP_FILENAME:-Splunk_AI_Assistant_Cloud.tgz}" +LOCAL_PATH="${SPLUNK_APP_LOCAL_PATH:-./${APP_FILENAME}}" + +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if [[ ! -f "$LOCAL_PATH" ]]; then + echo "Error: App file not found: $LOCAL_PATH" + echo "Set SPLUNK_APP_LOCAL_PATH to the path of Splunk_AI_Assistant_Cloud.tgz, or put the file in the current directory." + exit 1 +fi + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +if ! command -v mc &>/dev/null; then + echo "MinIO Client (mc) is required. Install it or run create_seaweedfs_folders.sh first (it installs mc)." + exit 1 +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +DEST="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" +echo "Uploading ${LOCAL_PATH} to ${DEST}..." +mc cp "$LOCAL_PATH" "$DEST" +echo "Done. 
App is at ${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" diff --git a/tools/artifacts_download_upload_scripts/upload_to_minio.sh b/tools/artifacts_download_upload_scripts/upload_to_minio.sh index 826e275..3b314ac 100755 --- a/tools/artifacts_download_upload_scripts/upload_to_minio.sh +++ b/tools/artifacts_download_upload_scripts/upload_to_minio.sh @@ -1,12 +1,18 @@ #!/bin/bash -# Script to upload model artifacts to MinIO +# Script to upload model artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). +# Prefer generic env vars; MINIO_* are accepted for backward compatibility. SOURCE_DIR="./model_artifacts" -MINIO_ENDPOINT="http://127.0.0.1:9000" -# Change the bucket name to the one you want to use. It will be created if it doesn't exist. -MINIO_BUCKET="ai-platform-artifacts-bucket" -MINIO_ROOT_USER="minioadmin" -MINIO_ROOT_PASSWORD="minioadmin" +# Generic names (preferred); fallback to MINIO_* for backward compatibility +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${MINIO_ENDPOINT:-http://127.0.0.1:9000}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${MINIO_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${MINIO_ROOT_USER:-${MINIO_ACCESS_KEY:-minioadmin}}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${MINIO_ROOT_PASSWORD:-${MINIO_SECRET_KEY:-minioadmin}}}" +# Internal use (script uses one set) +MINIO_ENDPOINT="${OBJECT_STORE_ENDPOINT}" +MINIO_BUCKET="${OBJECT_STORE_BUCKET}" +MINIO_ROOT_USER="${OBJECT_STORE_ACCESS_KEY}" +MINIO_ROOT_PASSWORD="${OBJECT_STORE_SECRET_KEY}" # Convert bucket name to lowercase (S3/MinIO requirement) ORIGINAL_BUCKET="$MINIO_BUCKET" @@ -176,7 +182,7 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo "" # Check for specific error types - if echo "$CONNECTION_TEST" | grep -q "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch"; then + if echo "$CONNECTION_TEST" | grep -qi "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch\|signature.*does not match"; then echo 
"Error: Authentication failed - Invalid credentials" echo "" echo "Current configuration:" @@ -189,7 +195,8 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo " 3. Default MinIO credentials are usually:" echo " - Username: minioadmin" echo " - Password: minioadmin" - echo " 4. If you changed MinIO credentials, update them in this script" + echo " 4. If you installed MinIO with a custom password (e.g. install_minio_ec2.sh --password 'xxx'), run:" + echo " MINIO_ROOT_PASSWORD='your-password' ./upload_to_minio.sh" elif echo "$CONNECTION_TEST" | grep -q "dial tcp\|connection refused\|no such host"; then echo "Error: Cannot reach MinIO endpoint" echo "" @@ -252,10 +259,10 @@ for artifact_path in "$SOURCE_DIR"/*; do echo "Processing: $id" if [[ -d "$artifact_path" ]]; then - # It's a directory - upload recursively + # It's a directory - upload recursively (trailing slash on source = copy contents, not directory as single object) echo "Uploading directory to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id/" - mc cp --recursive "$artifact_path" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" + mc cp --recursive "$artifact_path/" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" else # It's a file - upload directly echo "Uploading file to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id" diff --git a/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh new file mode 100644 index 0000000..8f4bf08 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# Upload model artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running, +# the script can install and start it (weed binary, no Docker). Creates configured +# buckets and uploads from ./model_artifacts. Use OBJECT_STORE_* or SEAWEEDFS_* env vars. 
+ +set -e + +SOURCE_DIR="./model_artifacts" +SEAWEEDFS_PORT="${SEAWEEDFS_PORT:-8333}" # Only shown in the startup message; not passed to `weed server`, and the default endpoint below hard-codes 8333. + +# Endpoint and credentials (prefer generic OBJECT_STORE_*, then SEAWEEDFS_*). +# SeaweedFS S3 has no built-in users: if the server is started with credentials (env or -config), +# they must match these values. This script exports them as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY when it auto-starts SeaweedFS. +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" +# Comma-separated list of buckets to create. If unset, only the primary bucket is created; the primary bucket is always created in addition to this list. +SEAWEEDFS_BUCKETS="${SEAWEEDFS_BUCKETS:-$OBJECT_STORE_BUCKET}" +# Set to 1 to skip auto-install and fail immediately if SeaweedFS is not reachable. +SEAWEEDFS_SKIP_INSTALL="${SEAWEEDFS_SKIP_INSTALL:-0}" +# Total attempts per file upload, and the delay in seconds between attempts (large files can trigger transient "internal error"). +SEAWEEDFS_UPLOAD_RETRIES="${SEAWEEDFS_UPLOAD_RETRIES:-3}" +SEAWEEDFS_UPLOAD_RETRY_DELAY="${SEAWEEDFS_UPLOAD_RETRY_DELAY:-15}" +# Max concurrent artifact uploads per batch (1 = sequential; values below 1 are treated as 1). +SEAWEEDFS_PARALLEL_JOBS="${SEAWEEDFS_PARALLEL_JOBS:-1}" +# Path of the failure log: truncated at the start of each run, then one line appended per failed artifact/file. +SEAWEEDFS_ERROR_LOG="${SEAWEEDFS_ERROR_LOG:-./seaweedfs_upload_errors.log}" +# Set to 1 to skip uploading a file if it already exists at destination (avoids re-uploading on script re-runs). +SEAWEEDFS_SKIP_EXISTING="${SEAWEEDFS_SKIP_EXISTING:-0}" +# Wait up to this many seconds for a volume server to appear in the cluster before uploading (avoids "0 node candidates"). +# Set to 0 to skip. Only used when endpoint is local and weed is available. +SEAWEEDFS_WAIT_VOLUME_SERVER="${SEAWEEDFS_WAIT_VOLUME_SERVER:-60}" +# Master address for cluster.ps (when empty, the wait step falls back to 127.0.0.1:9333). 
+SEAWEEDFS_MASTER="${SEAWEEDFS_MASTER:-}" +# Max volumes per volume server (default 100; 0 = auto from disk). Avoids "0 node candidates" when default (e.g. 7) is reached. +SEAWEEDFS_VOLUME_MAX="${SEAWEEDFS_VOLUME_MAX:-100}" + +# Normalize primary bucket to lowercase +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# ---- Check SeaweedFS is reachable ---- +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] && return 0 + return 1 +} + +# ---- Install and start SeaweedFS (weed binary from GitHub releases) ---- +install_and_start_seaweedfs() { + local os arch tag asset url tmpdir bindir + os="$(uname -s)" + arch="$(uname -m)" + case "$os" in + Linux) case "$arch" in x86_64|amd64) asset="linux_amd64.tar.gz";; aarch64|arm64) asset="linux_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + Darwin) case "$arch" in x86_64|amd64) asset="darwin_amd64.tar.gz";; arm64) asset="darwin_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + *) echo "Unsupported OS: $os"; return 1 ;; + esac + echo "Installing SeaweedFS (weed) for $os $arch..." + tag=$(curl -sL https://api.github.com/repos/seaweedfs/seaweedfs/releases/latest | grep '"tag_name":' | sed -E 's/.*"tag_name":\s*"([^"]+)".*/\1/') + [[ -z "$tag" ]] && { echo "Could not get latest SeaweedFS release tag."; return 1; } + url="https://github.com/seaweedfs/seaweedfs/releases/download/${tag}/${asset}" + tmpdir="$(mktemp -d)" + if ! curl -sSL -o "$tmpdir/weed.tar.gz" "$url"; then + echo "Download failed: $url"; rm -rf "$tmpdir"; return 1 + fi + tar -xzf "$tmpdir/weed.tar.gz" -C "$tmpdir" + [[ ! 
-f "$tmpdir/weed" ]] && { echo "weed binary not found in archive."; rm -rf "$tmpdir"; return 1; } + chmod +x "$tmpdir/weed" + if [[ "$(id -u)" -eq 0 ]] && [[ -d /usr/local/bin ]]; then + mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + elif command -v sudo &>/dev/null && [[ -d /usr/local/bin ]]; then + sudo mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + else + mkdir -p ~/.local/bin + mv "$tmpdir/weed" ~/.local/bin/weed + bindir="$HOME/.local/bin" + export PATH="$PATH:$bindir" + echo "Note: weed installed to $bindir (ensure it is in your PATH)" + fi + rm -rf "$tmpdir" + echo "Installed: $bindir/weed" + "$bindir/weed" version 2>/dev/null || true + echo "Starting SeaweedFS (master, volume, filer, S3 on port ${SEAWEEDFS_PORT}, volume.max=${SEAWEEDFS_VOLUME_MAX})..." + # SeaweedFS S3 validates credentials when provided; use script defaults so mc alias works. + export AWS_ACCESS_KEY_ID="${OBJECT_STORE_ACCESS_KEY:-minioadmin}" + export AWS_SECRET_ACCESS_KEY="${OBJECT_STORE_SECRET_KEY:-minioadmin}" + nohup env AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" "$bindir/weed" server -s3 -ip.bind=0.0.0.0 -volume.max="$SEAWEEDFS_VOLUME_MAX" > /tmp/seaweedfs.log 2>&1 & + echo $! > /tmp/seaweedfs.pid + local i + for i in {1..30}; do + sleep 2 + if seaweedfs_ok; then echo "SeaweedFS is up."; return 0; fi + done + echo "Timeout waiting for SeaweedFS. Check /tmp/seaweedfs.log" + return 1 +} + +if ! seaweedfs_ok; then + if [[ "$SEAWEEDFS_SKIP_INSTALL" == "1" ]]; then + echo "Error: SeaweedFS S3 gateway is not reachable at $OBJECT_STORE_ENDPOINT" + echo "Set OBJECT_STORE_ENDPOINT or start SeaweedFS manually (weed server -s3)." 
+ exit 1 + fi + # Only auto-install when endpoint is local (otherwise we'd start local server while user meant a remote one) + if [[ "$OBJECT_STORE_ENDPOINT" != *"127.0.0.1"* ]] && [[ "$OBJECT_STORE_ENDPOINT" != *"localhost"* ]]; then + echo "Error: SeaweedFS is not reachable at $OBJECT_STORE_ENDPOINT" + echo "For a remote endpoint, start SeaweedFS on that host or set OBJECT_STORE_ENDPOINT=http://127.0.0.1:8333 and run again to install locally." + exit 1 + fi + echo "SeaweedFS not reachable at $OBJECT_STORE_ENDPOINT. Attempting to install and start..." + if ! install_and_start_seaweedfs; then + echo "" + echo "Install failed or SeaweedFS did not start. You can:" + echo " 1. Install manually: https://github.com/seaweedfs/seaweedfs/releases" + echo " 2. Run: weed server -s3" + echo " 3. Or set OBJECT_STORE_ENDPOINT=http://:8333 if SeaweedFS runs elsewhere" + exit 1 + fi +fi +echo "SeaweedFS reachable at $OBJECT_STORE_ENDPOINT" + +# ---- Wait for volume server (avoids "Not enough data nodes found" right after restart) ---- +if [[ "$SEAWEEDFS_WAIT_VOLUME_SERVER" -gt 0 ]] && command -v weed &>/dev/null; then + if [[ "$OBJECT_STORE_ENDPOINT" == *"127.0.0.1"* ]] || [[ "$OBJECT_STORE_ENDPOINT" == *"localhost"* ]]; then + master="${SEAWEEDFS_MASTER}" + [[ -z "$master" ]] && master="127.0.0.1:9333" + echo "Waiting up to ${SEAWEEDFS_WAIT_VOLUME_SERVER}s for a volume server in the cluster..." + waited=0 + while [[ $waited -lt "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; do + out=$(echo -e "cluster.ps\nexit" | weed shell -master="$master" 2>/dev/null) || true + if echo "$out" | grep -q "volume servers" && echo "$out" | grep -q ":8080"; then + echo "Volume server is ready." + break + fi + sleep 2 + waited=$((waited + 2)) + done + if [[ $waited -ge "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; then + echo "Warning: no volume server seen after ${SEAWEEDFS_WAIT_VOLUME_SERVER}s. Upload may fail with 'Not enough data nodes'. Wait longer and re-run, or set SEAWEEDFS_WAIT_VOLUME_SERVER=0 to skip." 
+ fi + fi +fi +echo "" + +# ---- Install mc if needed (same pattern as upload_to_minio.sh) ---- +OS="$(uname -s)" +ARCH="$(uname -m)" +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..." + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi +mc --version +echo "" + +# ---- Source dir and count ---- +[[ ! -d "$SOURCE_DIR" ]] && { echo "Error: $SOURCE_DIR not found. Run ./download_from_huggingface.sh first."; exit 1; } +artifact_count=$(find "$SOURCE_DIR" -mindepth 1 -maxdepth 1 | wc -l | tr -d ' ') +[[ "$artifact_count" -eq 0 ]] && { echo "No artifacts in $SOURCE_DIR."; exit 1; } +echo "Found $artifact_count artifacts to upload." 
+echo "" + +# ---- Configure mc alias ---- +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 + +# ---- Create buckets (from list + primary) ---- +for b in $(echo "$SEAWEEDFS_BUCKETS" | tr ',' '\n'); do + b=$(echo "$b" | tr '[:upper:]' '[:lower:]' | tr -d ' ') + [[ -z "$b" ]] && continue + mc mb "${MC_ALIAS}/${b}" --ignore-existing 2>/dev/null || true +done +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true +echo "" + +# ---- Upload with retries (single file; large files can trigger "internal error") ---- +do_upload_file() { + local src="$1" dest="$2" attempt=1 + if [[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]]; then + mc stat "$dest" &>/dev/null && return 0 + fi + while [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]]; do + mc cp "$src" "$dest" && return 0 + echo "Attempt $attempt/$SEAWEEDFS_UPLOAD_RETRIES failed. Retrying in ${SEAWEEDFS_UPLOAD_RETRY_DELAY}s..." + attempt=$((attempt + 1)) + [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]] && sleep "$SEAWEEDFS_UPLOAD_RETRY_DELAY" + done + return 1 +} + +# Upload a directory artifact file-by-file (per-file retries; one failed file doesn't re-upload the rest). +upload_artifact_dir() { + local artifact_path="$1" dest_base="$2" id="$3" failed=0 f rel + while IFS= read -r -d '' f; do + rel="${f#${artifact_path}/}" + if ! 
do_upload_file "$f" "${dest_base}/${rel}"; then + echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id $rel" >> "$SEAWEEDFS_ERROR_LOG" + failed=1 + fi + done < <(find "$artifact_path" -type f -print0) + return $failed +} + +# Clear error log from previous runs +: > "$SEAWEEDFS_ERROR_LOG" + +# Build list of artifacts for parallel upload +artifact_paths=() +for artifact_path in "$SOURCE_DIR"/*; do + [[ -e "$artifact_path" ]] || continue + artifact_paths+=("$artifact_path") +done + +parallel_jobs="$SEAWEEDFS_PARALLEL_JOBS" +[[ "$parallel_jobs" -lt 1 ]] && parallel_jobs=1 +idx=0 +total=${#artifact_paths[@]} +echo "Uploading $total artifacts (per-file) with up to $parallel_jobs parallel job(s). Errors logged to: $SEAWEEDFS_ERROR_LOG" +[[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]] && echo "Skip-existing is ON: files already present at destination will be skipped." +echo "" + +while [[ $idx -lt $total ]]; do + batch=0 + while [[ $batch -lt $parallel_jobs && $idx -lt $total ]]; do + artifact_path="${artifact_paths[$idx]}" + id=$(basename "$artifact_path") + dest_base="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/model_artifacts/$id" + ( + if [[ -d "$artifact_path" ]]; then + upload_artifact_dir "$artifact_path" "$dest_base" "$id" || exit 1 + else + do_upload_file "$artifact_path" "$dest_base" || { echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id" >> "$SEAWEEDFS_ERROR_LOG"; exit 1; } + fi + echo "Completed: $id" + ) & + batch=$((batch + 1)) + idx=$((idx + 1)) + done + wait || true +done + +if [[ -s "$SEAWEEDFS_ERROR_LOG" ]]; then + echo "" + echo "One or more artifacts failed. See $SEAWEEDFS_ERROR_LOG:" + cat "$SEAWEEDFS_ERROR_LOG" + exit 1 +fi +echo "Upload complete. 
Uploaded $artifact_count artifacts to ${OBJECT_STORE_ENDPOINT}/${OBJECT_STORE_BUCKET}/model_artifacts/" \ No newline at end of file diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index f160c1a..6ef9b75 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -50,16 +50,17 @@ The script installs everything needed for the AI Platform: 1. **EKS Cluster** (Kubernetes 1.31-1.34) - AWS-managed control plane 2. **VPC CNI** - Native AWS VPC networking for pods 3. **S3 Bucket** - Object storage for AI artifacts and models -4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS -5. **Cluster Autoscaler** - Automatic node scaling based on demand +4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS (IRSA-based IAM) +5. **Cluster Autoscaler** - Automatic node scaling based on demand (IRSA-based IAM) 6. **Cert-Manager** - Automated certificate management -7. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -8. **OpenTelemetry Operator** - Distributed tracing and telemetry -9. **NVIDIA Device Plugin** - GPU support for AI workloads -10. **KubeRay Operator** - Ray cluster management for distributed AI -11. **Splunk Operator** - Splunk Enterprise management -12. **Splunk AI Platform Operator** - AI platform orchestration -13. **AI Platform CR** - Complete AI deployment with features +7. **Object storage** - AWS S3 or external S3-compatible only (MinIO, SeaweedFS, etc.; no in-cluster MinIO install) +8. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana +9. **OpenTelemetry Operator** - Distributed tracing and telemetry +10. **NVIDIA Device Plugin** - GPU support for AI workloads +11. **KubeRay Operator** - Ray cluster management for distributed AI +12. **Splunk Operator** - Splunk Enterprise management +13. **Splunk AI Platform Operator** - AI platform orchestration +14. 
**AI Platform CR** - Complete AI deployment with features ### AWS Integration Features @@ -389,13 +390,14 @@ You must configure these images in `cluster-config.yaml`: |-------|--------------|-------------| | Splunk AI Operator | `operator.image` | Main operator controller | | Splunk Enterprise | `splunk.image` | Splunk instance for observability | -| Splunk Operator | `splunk.operatorImage` | Splunk CRD controller (optional, has default) | +| Splunk Operator | `splunk.operatorImage` | Splunk CRD controller (optional, default: `docker.io/splunk/splunk-operator:3.0.0`) | | Ray Head | `ray.headImage` | Ray cluster head node | | Ray Worker | `ray.workerImage` | Ray worker nodes (GPU) | | Weaviate | `weaviate.image` | Vector database | | SAIA API | `saia.apiImage` | Splunk AI Assistant API | | SAIA Data Loader | `saia.dataLoaderImage` | SAIA initialization | -| Fluent Bit | `fluentBit.image` | Logging (optional, has default) | +| Fluent Bit | `fluentBit.image` | Logging (optional, default: `fluent/fluent-bit:1.9.6`) | +| OpenTelemetry Collector | `otelCollector.image` | Telemetry collection (optional, default: `otel/opentelemetry-collector-contrib:0.122.1`) | **No manual YAML editing required!** The script handles everything. @@ -518,7 +520,7 @@ vi my-cluster-config.yaml cluster: name: "my-ai-cluster" # ← CHANGE: Your unique cluster name (DNS-1123 compliant) region: "us-west-2" # ← CHANGE: Your AWS region - k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31) + k8sVersion: "1.31" # Kubernetes version (1.31, 1.32, 1.33, 1.34) # Option A: Leave subnets empty to create new VPC automatically # Option B: Provide existing subnet IDs (eksctl auto-detects VPC from subnets) @@ -539,10 +541,55 @@ storage: # (3-63 chars, lowercase, numbers, hyphens) ``` +**Generic object store (`storage.objectStore.type`)** +Only **AWS S3** or **external S3-compatible** storage is supported (no in-cluster MinIO install). 
Set `storage.objectStore.type` to `aws`, `s3compat`, `minio`, or `seaweedfs` (default is `aws` when unset). The script sets the AIPlatform `objectStorage.path` and creates a credentials secret for s3compat/minio/seaweedfs; you must provide `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). + +**External S3-compatible (MinIO, SeaweedFS, etc.)** +Set `storage.objectStore.type` to `minio`, `s3compat`, or `seaweedfs`, and set `storage.objectStore.endpoint` (e.g. `http://<minio-host>:9000` for MinIO) and credentials. You can run MinIO or SeaweedFS on EC2 or elsewhere; use `install_minio_ec2.sh` to install MinIO on an EC2 instance in the same VPC if desired. Pre-populate artifacts before cluster setup. The Splunk app (when using `splunkStandalone.localAppPath`) is not uploaded to external object storage automatically; upload it to your bucket at `apps/` via console or `mc`/`aws s3 --endpoint-url`. + +**S3-compatible / SeaweedFS (bring your own)** +- **Generic (`s3compat`):** Set `storage.objectStore.type: s3compat`, `storage.objectStore.endpoint`, `storage.objectStore.bucket`, and credentials. The script creates the credentials secret and sets the path to `s3compat://bucket`; it does not install any storage. Use for any S3-compatible backend (Ceph, custom gateway, etc.). +- **SeaweedFS:** Set `storage.objectStore.type: seaweedfs`, `storage.objectStore.endpoint` (e.g. `http://seaweedfs-s3:8333`), `storage.objectStore.bucket`, and credentials (env `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD` or `objectStore.auth`). The script does not install SeaweedFS; it only creates the credentials secret and sets the AIPlatform path to `seaweedfs://bucket`. Ensure your SeaweedFS S3 gateway is reachable from the cluster. + +**Ensuring SeaweedFS is used (not MinIO)** +To force the stack to use SeaweedFS instead of MinIO: + +1. 
**Config:** In `cluster-config.yaml` set `storage.objectStore.type: "seaweedfs"` and `storage.objectStore.endpoint` to your SeaweedFS S3 URL with **port 8333** (e.g. `http://3.144.157.201:8333`). MinIO uses port 9000; using 8333 avoids pointing at MinIO by mistake. +2. **Preflight:** When you run the install script, preflight prints `Object storage: external S3-compatible (seaweedfs)` and `SeaweedFS endpoint: ...`. If the endpoint shows `:9000`, the script warns you to use `:8333` for SeaweedFS. +3. **After install:** Confirm the AIPlatform CR uses SeaweedFS: + ```bash + kubectl -n ai-platform get aiplatform -o yaml | grep -A6 objectStorage + ``` + You should see `path: seaweedfs://<bucket>` and `endpoint: "http://...:8333"`. The secret name remains `minio-credentials` (used for any S3-compatible store). + +**Secure MinIO credentials (recommended)** +The script reads MinIO credentials in this order: **environment variables first**, then config file. Prefer not to store passwords in `cluster-config.yaml` (e.g. to avoid committing secrets to Git). + +| Approach | How | When to use | +|----------|-----|-------------| +| **Environment variables** | Export before running the script: `export MINIO_ROOT_USER=minioadmin` and `export MINIO_ROOT_PASSWORD='<your-password>'`. You can leave `storage.objectStore.auth.rootUser` / `rootPassword` empty or omit them in config; env takes precedence. | Local runs, CI/CD (set secrets in pipeline), one-off setups. | +| **Config file only** | Set `storage.objectStore.auth.rootUser` and `storage.objectStore.auth.rootPassword` in `cluster-config.yaml`. | Quick testing only; avoid if the file is in version control. | +| **Pre-created Kubernetes Secret** | Create the secret yourself (e.g. from Vault or AWS Secrets Manager) in the AI platform namespace as `minio-credentials` with keys `s3_access_key` and `s3_secret_key`. The script can still create the secret from env/config; for stricter control, use a separate flow that only references the existing secret. 
| GitOps, when you already have a secrets pipeline. | +| **External secret manager** | Store credentials in AWS Secrets Manager, HashiCorp Vault, or similar. Before running the script, fetch the secret and set `MINIO_ROOT_USER` and `MINIO_ROOT_PASSWORD` (e.g. via a wrapper or CI step). Do not put the password in config. | Production; keeps secrets out of config and Git. | + +Example (MinIO credentials from environment only; no secrets in config): + +```bash +export MINIO_ROOT_USER=minioadmin +export MINIO_ROOT_PASSWORD='your-secure-password' +CONFIG_FILE=./cluster-config.yaml ./eks_cluster_with_stack.sh install +``` + +**Idempotency and existing clusters** +- The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). You can safely re-run `install` to update images, fix issues, or add components. +- **Require existing cluster:** Set `cluster.useExisting: true` to skip cluster creation entirely. The script will fail with a clear error if the cluster is not found. This is useful when you created the cluster separately or want to guard against accidentally creating a new cluster. +- **Use an existing VPC:** Provide `cluster.subnets` (private and public subnet IDs and AZs). eksctl will use that VPC and will not create a new one. +- **Preserve VPC on delete:** Set `cluster.preserveVpcOnDelete: true` when using an existing VPC to prevent the `delete` command from removing it. Requires at least 2 private subnets to be specified. 
+ **Important Notes:** - **Cluster Name**: Must be DNS-1123 compliant (lowercase letters, numbers, hyphens; start/end with alphanumeric) -- **S3 Bucket**: Must be globally unique across all AWS accounts -- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist +- **S3 Bucket**: Must be globally unique across all AWS accounts (ignored when MinIO is enabled) +- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist; cluster uses this existing VPC - **Subnets**: Leave empty or comment out to let eksctl create a new VPC automatically **What each section configures:** @@ -551,13 +598,22 @@ storage: |---------|--------------|------------------| | `cluster.name` | EKS cluster name | ✅ **REQUIRED:** Change to your cluster name | | `cluster.region` | AWS region | ✅ **REQUIRED:** Change to your region | -| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs | -| `storage.s3Bucket` | S3 bucket for AI artifacts | ✅ **REQUIRED:** Choose unique name | +| `cluster.k8sVersion` | Kubernetes version (1.31-1.34) | ⚙️ Optional: default 1.31 | +| `cluster.useExisting` | Use existing cluster only (do not create) | ⚙️ Set `true` to skip cluster creation; script fails if cluster not found | +| `cluster.preserveVpcOnDelete` | Keep VPC when running `delete` | ⚙️ Set `true` when using an existing VPC you don't want deleted | +| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs to use existing VPC | +| `storage.s3Bucket` | S3 bucket for AI artifacts (used when `objectStore.type` is aws) | ✅ **REQUIRED** if not using MinIO/SeaweedFS | +| `storage.objectStore` | Object store: `type` (aws \| s3compat \| minio \| seaweedfs), `bucket`, `endpoint`, `auth`. Default type is `aws` when unset. External only (no in-cluster install). 
| ⚙️ Required for s3compat/minio/seaweedfs: set `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). | | `images.registry` | Container registry URL | ✅ **REQUIRED:** Your ECR/Docker registry | | `images.*` | All container images | ✅ **REQUIRED:** Configure all image paths | | `nodeGroups.cpu` | CPU node group settings | ⚙️ Optional: adjust size/type | -| `nodeGroups.gpu` | GPU node group settings | ⚙️ Optional: adjust size/type | +| `nodeGroups.gpu` | GPU node group settings | ⚙️ Optional: adjust size/type/AZ/capacity reservation | +| `nodeGroups.gpu.availabilityZones` | Lock GPU nodes to specific AZs | ⚙️ Optional: for capacity-constrained GPU types | +| `nodeGroups.gpu.capacityReservation` | H100 Capacity Block reservation | ⚙️ Optional: for H100 with Capacity Blocks | +| `operators.ray.modelVersion` | Model version for AI serving | ⚙️ Optional: default `v0.3.14-36-g1549f5a` | +| `operators.ray.rayVersion` | Ray runtime version | ⚙️ Optional: default `2.44.0` | | `aiPlatform` | AI Platform configuration | ⚙️ Optional: customize features | +| `aiPlatform.defaultAcceleratorType` | GPU type: `L40S`, `H100` | ⚙️ Optional: default `L40S` | ### 5. Configure Container Images ⚠️ CRITICAL @@ -590,6 +646,9 @@ images: fluentBit: image: "fluent/fluent-bit:1.9.6" # ← OPTIONAL (has default) + + otelCollector: + image: "otel/opentelemetry-collector-contrib:0.122.1" # ← OPTIONAL (has default) ``` **Tips:** @@ -601,6 +660,12 @@ images: **The script will validate ALL images exist before deployment!** +**Additional Version Configuration:** + +The script also configures these versions in `artifacts.yaml`: +- `operators.ray.modelVersion` - Sets `MODEL_VERSION` env var (default: `v0.3.14-36-g1549f5a`) +- `operators.ray.rayVersion` - Sets `RAY_VERSION` env var (default: `2.44.0`) + ### 6. 
Login to Container Registries **For AWS ECR:** @@ -674,67 +739,156 @@ CONFIG_FILE=./my-cluster-config.yaml ./eks_cluster_with_stack.sh install - ✓ Creates backups 2. **Preflight Checks** (1 min) - - ✓ Checks AWS credentials + - ✓ Checks AWS credentials and identity + - ✓ Validates cluster name (DNS-1123), S3 bucket name - ✓ Verifies subnets exist (if provided) - - ✓ Validates NAT Gateway & Internet Gateway - - ✓ Checks required tools + - ✓ Validates NAT Gateway, Internet Gateway, route tables + - ✓ Checks required tools (aws, eksctl, kubectl, helm, git, jq, yq) -3. **Create EKS Cluster** (10-15 min) - - ✓ Creates managed control plane +3. **Create EKS Cluster** (10-15 min) - skipped if cluster already exists + - ✓ Creates managed control plane with OIDC - ✓ Sets up node groups (CPU + GPU) + - ✓ Creates H100 node group via CloudFormation (if using Capacity Block) 4. **Install Infrastructure** (10-15 min) - - ✓ EBS CSI Driver (for persistent volumes) - - ✓ Cluster Autoscaler (for node scaling) - - ✓ VPC CNI (for pod networking) + - ✓ OIDC provider for IRSA + - ✓ EBS CSI Driver with IRSA role + - ✓ gp3 StorageClass (set as default) + - ✓ Cluster Autoscaler with IRSA role + - ✓ NVIDIA device plugin 5. **Install Platform Components** (15-20 min) + - ✓ Kube-Prometheus Stack (Prometheus + Grafana) - ✓ Cert Manager (certificates) - - ✓ Prometheus + Grafana (monitoring) - - ✓ OpenTelemetry (tracing) - - ✓ NVIDIA GPU Operator (GPU support) + - ✓ S3-compatible credentials secret (if external object store) + - ✓ OpenTelemetry Operator + collector - ✓ KubeRay Operator (Ray clusters) - ✓ Splunk Operator (Splunk management) + - ✓ Splunk AI Platform Operator (with your images!) 6. **Deploy AI Platform** (5-10 min) - - ✓ Creates S3 bucket - - ✓ Sets up IAM roles (IRSA) - - ✓ Installs Splunk AI Operator (with your images!) 
- - ✓ Creates AIPlatform CR - - ✓ Deploys AI services + - ✓ Creates S3 bucket and prefixes (artifacts/, apps/, tasks/) + - ✓ Uploads Splunk app to S3 (if localAppPath configured) + - ✓ Sets up IRSA roles for Ray head, Ray worker, SAIA service + - ✓ Adds ECR permissions to IRSA roles + - ✓ Creates Splunk Standalone instance + - ✓ Creates AIPlatform CR and monitors until Ready + - ✓ Waits for Splunk AI Assistant app installation on Standalone **What Happens During Installation:** -1. ✓ Creates EKS cluster with control plane (5-10 minutes) -2. ✓ Creates managed node groups (CPU and GPU) (5-10 minutes) -3. ✓ Installs AWS Load Balancer Controller -4. ✓ Installs EBS CSI driver -5. ✓ Installs Cluster Autoscaler -6. ✓ Installs cert-manager -7. ✓ Installs monitoring stack (Prometheus, Grafana) -8. ✓ Installs OpenTelemetry -9. ✓ Installs NVIDIA GPU support -10. ✓ Installs Ray operator -11. ✓ Installs Splunk operator -12. ✓ Creates Splunk Standalone instance -13. ✓ Installs Splunk AI Platform operator -14. ✓ Creates S3 bucket and IAM roles -15. ✓ Creates ECR image pull secrets -16. ✓ Deploys AIPlatform CR +1. ✓ Validates configuration and container images (fails fast if images missing) +2. ✓ Runs preflight checks (AWS credentials, subnets, VPC networking, tools) +3. ✓ Creates EKS cluster with control plane (or skips if already exists) +4. ✓ Creates managed node groups (CPU and GPU) +5. ✓ Creates H100 GPU node group via CloudFormation (if using Capacity Block) +6. ✓ Ensures OIDC provider for IRSA +7. ✓ Installs EBS CSI driver (with IRSA role) +8. ✓ Creates gp3 StorageClass (set as default) +9. ✓ Installs Cluster Autoscaler (with IRSA role) +10. ✓ Installs NVIDIA device plugin +11. ✓ Installs kube-prometheus-stack (monitoring) +12. ✓ Installs cert-manager +13. ✓ Creates S3-compatible credentials secret (if using external object store) +14. ✓ Installs OpenTelemetry Operator + collector +15. ✓ Installs KubeRay Operator +16. ✓ Installs Splunk Operator +17. 
✓ Installs Splunk AI Platform Operator +18. ✓ Creates S3 bucket and IAM roles (IRSA for Ray head/worker/SAIA) +19. ✓ Adds ECR permissions to IRSA roles +20. ✓ Creates Splunk Standalone instance +21. ✓ Deploys AIPlatform CR +22. ✓ Monitors AIPlatform status until Ready +23. ✓ Waits for Splunk AI Assistant app to be installed on Standalone ### 4. Verify Installation +After running `eks_cluster_with_stack.sh install` (or upgrade) with the latest operator image, use the commands below to verify the setup. Default namespace and AIPlatform name come from `cluster-config.yaml` (`aiPlatform.namespace` and `aiPlatform.name`); if you use a custom config, set `AI_NS` and `AI_PLATFORM_NAME` accordingly. + ```bash # Set kubeconfig (done automatically by script) export KUBECONFIG=~/.kube/config -# Check cluster +# ----- Optional: load namespace/name from your config ----- +# CONFIG_FILE="${CONFIG_FILE:-./cluster-config.yaml}" +# AI_NS="$(yq eval '.aiPlatform.namespace' "$CONFIG_FILE")" +# AI_PLATFORM_NAME="$(yq eval '.aiPlatform.name' "$CONFIG_FILE")" +# Or use defaults: +export AI_NS="${AI_NS:-ai-platform}" +export AI_PLATFORM_NAME="${AI_PLATFORM_NAME:-splunk-ai-stack}" +export SPLUNK_AI_NS="${SPLUNK_AI_NS:-splunk-ai-operator-system}" +``` + +**1. Cluster and nodes** + +```bash kubectl get nodes +kubectl get nodes -o wide +``` -# Check AI Platform -kubectl get aiplatform -n ai-platform +**2. Splunk AI Operator (confirm it is running the image you deployed)** -# Check all pods -kubectl get pods --all-namespaces +```bash +kubectl get deploy -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator -o wide +kubectl get pods -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator +# Show operator image (replace deployment name if different) +kubectl get deploy -n "$SPLUNK_AI_NS" -o jsonpath='{.items[0].spec.template.spec.containers[0].image}'; echo +``` + +**3. 
AIPlatform CR and status** + +```bash +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[*].type}{"\n"}{.status.conditions[*].status}'; echo +# Detailed readiness (expect Ready=True when healthy) +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")]}' | jq . +``` + +**4. Object storage secret (MinIO/S3 credentials for serve config)** + +```bash +# Secret name comes from AIPlatform spec.objectStorage.secretRef +SECRET_NAME="$(kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.objectStorage.secretRef}')" +echo "SecretRef: ${SECRET_NAME:-}" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" 2>/dev/null && echo "✓ Secret exists" || echo "✗ Secret missing" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" -o jsonpath='{.data}' 2>/dev/null | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' && echo "✓ Required keys present" || echo "✗ Check s3_access_key / s3_secret_key" +``` + +**5. RayService and serve config (object store credentials in apps)** + +```bash +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +# Count S3COMPAT_OBJECT_STORE_ACCESS_KEY in serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.serveConfigV2}' | grep -o 'S3COMPAT_OBJECT_STORE_ACCESS_KEY' | wc -l +``` + +**6. Ray and application pods** + +```bash +kubectl get pods -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get pods -n "$AI_NS" -l ai.splunk.com/platform="$AI_PLATFORM_NAME" +``` + +**7. Services (Ray Serve, Weaviate)** + +```bash +kubectl get svc -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get svc -n "$AI_NS" | grep -E "ray|weaviate" +``` + +**8. 
Events (recent issues)** + +```bash +kubectl get events -n "$AI_NS" --sort-by='.lastTimestamp' | tail -30 +kubectl describe aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" | tail -40 +``` + +**Quick one-liner summary** + +```bash +echo "--- Operator ---"; kubectl get deploy -n "$SPLUNK_AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.readyReplicas,IMAGE:.spec.template.spec.containers[0].image' +echo "--- AIPlatform ---"; kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status' +echo "--- RayService ---"; kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +echo "--- Pods ---"; kubectl get pods -n "$AI_NS" --no-headers | wc -l; kubectl get pods -n "$AI_NS" | head -20 ``` --- @@ -753,7 +907,9 @@ The script uses a YAML configuration file (`cluster-config.yaml`) for all settin cluster: name: "my-ai-cluster" # EKS cluster name (DNS-1123 compliant) region: "us-west-2" # AWS region - k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31) + k8sVersion: "1.31" # Kubernetes version (1.31, 1.32, 1.33, 1.34) + useExisting: false # Set true to require existing cluster (fails if not found) + preserveVpcOnDelete: false # Set true to keep VPC when running delete (existing VPC only) subnets: # Optional - leave empty for auto VPC creation private: # Private subnets (at least 2, different AZs) @@ -785,19 +941,51 @@ nodeGroups: maxSize: 4 # Maximum nodes volumeSize: 1000 # EBS volume size in GB volumeType: "gp3" # EBS volume type + availabilityZones: [] # Optional: lock GPU nodes to specific AZs + capacityReservation: # Optional: for H100 Capacity Blocks (CloudFormation-based) + id: "" # EC2 Capacity Reservation ID + az: "" # AZ of the reservation storage: - s3Bucket: "my-ai-platform-bucket" # S3 bucket for artifacts/apps/tasks + s3Bucket: "my-ai-platform-bucket" # S3 bucket for artifacts/apps/tasks (used when objectStore.type is aws) storageClass: "gp3" # Default storage class for PVCs vectorDbSize: 
"50Gi" # VectorDB PVC size + objectStore: # External S3-compatible storage (optional) + type: "aws" # aws | s3compat | minio | seaweedfs (default: aws) + bucket: "" # Bucket name (defaults to s3Bucket) + endpoint: "" # S3-compatible endpoint (required for s3compat/minio/seaweedfs) + namespace: "minio" # Namespace hint (for credential secret placement) + auth: + rootUser: "minioadmin" # S3-compatible access key (env MINIO_ROOT_USER takes precedence) + rootPassword: "" # S3-compatible secret key (env MINIO_ROOT_PASSWORD takes precedence) -operators: +images: + registry: "" # Container registry URL (prepended to relative image paths) + operator: + image: "" # Splunk AI Operator image splunk: - image: "splunk/splunk:10.2.0-dev1" # Splunk Enterprise image + image: "" # Splunk Enterprise image + operatorImage: "" # Splunk Operator image (default: docker.io/splunk/splunk-operator:3.0.0) ray: - version: "v1.2.2" # Ray operator version + headImage: "" # Ray head node image + workerImage: "" # Ray worker node image + weaviate: + image: "" # Weaviate vector database image + saia: + apiImage: "" # SAIA API image + dataLoaderImage: "" # SAIA data loader / post-install hook image + fluentBit: + image: "" # Fluent Bit image (default: fluent/fluent-bit:1.9.6) + otelCollector: + image: "" # OpenTelemetry Collector image (default: otel/opentelemetry-collector-contrib:0.122.1) + +operators: + ray: + version: "v1.2.2" # KubeRay operator version + modelVersion: "" # Model version (default: v0.3.14-36-g1549f5a) + rayVersion: "" # Ray runtime version (default: 2.44.0) nvidia: - devicePluginVersion: "v0.17.3" # NVIDIA device plugin version + devicePluginVersion: "v0.17.3" # NVIDIA device plugin version aiPlatform: namespace: "ai-platform" # Kubernetes namespace @@ -806,7 +994,7 @@ aiPlatform: rayHead: "ray-head-sa" rayWorker: "ray-worker-sa" saiaService: "saia-service-sa" - defaultAcceleratorType: "L40S" # Default GPU type + defaultAcceleratorType: "L40S" # Default GPU type (L40S, H100) 
workerGroupConfig: serviceAccountName: "ray-worker-sa" imageRegistry: "" # Leave empty for default @@ -815,11 +1003,13 @@ aiPlatform: className: "nginx" host: "ai.example.com" tlsSecretName: "ai-platform-tls" + certificate: + issuerName: "platform-issuer" # Cert-manager issuer name splunkStandalone: name: "splunk-standalone" # Splunk Standalone CR name serviceAccount: "saia-service-sa" # Service account for S3 access - localAppPath: "" # Optional: local path to Splunk app to upload + localAppPath: "" # Optional: local path to Splunk app to upload to S3 files: splunkOperatorManifest: "./splunk-operator-cluster.yaml" @@ -869,8 +1059,6 @@ storage: vectorDbSize: "20Gi" # Smaller vector DB operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -893,6 +1081,7 @@ cluster: name: "prod-ai-platform" region: "us-west-2" k8sVersion: "1.31" + preserveVpcOnDelete: true # Don't delete VPC on cleanup subnets: private: # 3 AZs for high availability - id: "subnet-private-2a" @@ -934,8 +1123,6 @@ storage: vectorDbSize: "200Gi" # Large vector DB operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -986,8 +1173,6 @@ storage: vectorDbSize: "100Gi" operators: - splunk: - image: "splunk/splunk:10.2.0-dev1" ray: version: "v1.2.2" @@ -997,6 +1182,53 @@ aiPlatform: defaultAcceleratorType: "L40S" ``` +#### Example 4: H100 GPU Cluster with Capacity Block + +```yaml +# h100-cluster-config.yaml - H100 instances with EC2 Capacity Blocks + +cluster: + name: "h100-ai-cluster" + region: "us-east-2" + k8sVersion: "1.31" + +nodeGroups: + cpu: + enabled: true + instanceType: "m5.xlarge" + desiredCapacity: 3 + minSize: 2 + maxSize: 6 + volumeSize: 300 + volumeType: "gp3" + + gpu: + enabled: true + instanceType: "p5.48xlarge" # 8x H100 GPUs + desiredCapacity: 2 + minSize: 2 + maxSize: 2 + volumeSize: 2000 + volumeType: "gp3" + capacityReservation: # H100 Capacity Block + id: "cr-0abcdef1234567890" # Your Capacity Reservation ID + az: 
"us-east-2b" # AZ of the reservation + +storage: + s3Bucket: "h100-ai-platform-data" + storageClass: "gp3" + vectorDbSize: "100Gi" + +operators: + ray: + version: "v1.2.2" + +aiPlatform: + namespace: "ai-platform" + name: "splunk-ai-stack" + defaultAcceleratorType: "H100" # Must be H100 for capacity block +``` + ### Instance Type Selection Guide #### CPU Instance Types (For Ray head, Weaviate, general workloads) @@ -1015,11 +1247,13 @@ aiPlatform: | Instance Type | GPUs | GPU Memory | vCPU | Memory | Use Case | Approx Cost/hr | |---------------|------|------------|------|--------|----------|----------------| | g5.xlarge | 1x A10G | 24 GB | 4 | 16 GB | Dev/Small Models | $1.01 | -| g5.2xlarge | 1x A10G | 24 GB | 8 | 32 GB | **Recommended** | $1.21 | +| g5.2xlarge | 1x A10G | 24 GB | 8 | 32 GB | Small Production | $1.21 | | g5.4xlarge | 1x A10G | 24 GB | 16 | 64 GB | Large Single-GPU | $1.62 | | g5.12xlarge | 4x A10G | 96 GB | 48 | 192 GB | Multi-GPU Training | $5.67 | +| g6e.12xlarge | 4x L40S | 192 GB | 48 | 384 GB | **Recommended (L40S)** | $7.77 | | p3.2xlarge | 1x V100 | 16 GB | 8 | 61 GB | ML Training | $3.06 | | p4d.24xlarge | 8x A100 | 320 GB | 96 | 1152 GB | Large-Scale Training | $32.77 | +| p5.48xlarge | 8x H100 | 640 GB | 192 | 2048 GB | H100 (Capacity Block) | $98.32 | **Note:** Prices are approximate for US East/West regions and may vary. Check [AWS Pricing](https://aws.amazon.com/ec2/pricing/on-demand/) for current rates. 
@@ -1030,19 +1264,53 @@ aiPlatform: ### Basic Commands ```bash -# Install EKS cluster and AI Platform +# Install EKS cluster and AI Platform (idempotent - safe to re-run) ./eks_cluster_with_stack.sh install -# Delete entire cluster and all AWS resources +# Delete cluster and ALL AWS resources/roles/policies created by this script ./eks_cluster_with_stack.sh delete -# Full cleanup (including S3 buckets, IAM roles) +# Full cleanup: uninstall CRs/operators then run comprehensive AWS cleanup ./eks_cluster_with_stack.sh delete-full +``` + +#### What `delete` Does (10-Step Cleanup) -# Check AIPlatform status -./eks_cluster_with_stack.sh status +The `delete` command performs a comprehensive, ordered cleanup of all AWS resources created by the script: + +| Step | Action | Details | +|------|--------|---------| +| 1 | Delete IRSA Service Accounts | Removes SA CloudFormation stacks for Cluster Autoscaler, Ray head/worker, SAIA, EBS CSI | +| 2 | Delete IAM Roles | Removes IRSA roles for all service accounts | +| 3 | Clean up EBS CSI addon roles | Finds and deletes any `eksctl-<cluster-name>-addon-aws-ebs-csi-driver-*` roles | +| 4 | Delete EKS Addons | Removes `aws-ebs-csi-driver` addon | +| 5 | Delete EKS Cluster | Runs `eksctl delete cluster --wait` and waits for CloudFormation stack deletion | +| 6 | Clean up CloudFormation stacks | Deletes lingering nodegroup, IAMServiceAccount, and addon stacks | +| 7 | Delete IAM Policies | Removes S3 bucket policy (or ECR-only policy if using external object store) | +| 8 | Purge IRSA roles by OIDC | Finds and removes any remaining roles associated with the cluster's OIDC provider | +| 9 | Delete OIDC Provider | Removes the IAM OIDC identity provider | +| 10 | Delete EBS Volumes | Removes all EBS volumes tagged with the cluster name | + +**VPC Preservation:** If `cluster.preserveVpcOnDelete: true` is set, the VPC and subnets are preserved; only EKS and related resources are deleted.
+ +**Verification after delete:** +```bash +# Check for remaining IAM roles +aws iam list-roles --query "Roles[?contains(RoleName, '${CLUSTER_NAME}')].RoleName" + +# Check for remaining CloudFormation stacks +aws cloudformation list-stacks --query "StackSummaries[?contains(StackName, 'eksctl-${CLUSTER_NAME}')].StackName" + +# Check for remaining EBS volumes +aws ec2 describe-volumes --region ${REGION} \ + --filters "Name=tag:kubernetes.io/cluster/${CLUSTER_NAME},Values=owned" \ + --query 'Volumes[].VolumeId' ``` +#### What `delete-full` Does + +The `delete-full` command runs a full teardown: it first uninstalls all Kubernetes CRs and operators (AIPlatform, Splunk Standalone, Splunk Operator, OpenTelemetry, Cluster Autoscaler, KubeRay, kube-prometheus-stack, cert-manager, gp3 StorageClass), then runs the same 10-step `delete` cleanup above. + ### Post-Installation Tasks #### 1. Access the Cluster @@ -1161,7 +1429,7 @@ aws eks update-nodegroup-config \ aws eks describe-cluster --name ${CLUSTER_NAME} --query cluster.version # Update control plane -aws eks update-cluster-version --name ${CLUSTER_NAME} --kubernetes-version 1.29 +aws eks update-cluster-version --name ${CLUSTER_NAME} --kubernetes-version 1.32 # Wait for update to complete (check status) aws eks describe-update --name ${CLUSTER_NAME} --update-id @@ -1881,6 +2149,43 @@ aws ecr describe-images --repository-name ray --region us-west-2 ## Advanced Topics +### H100 GPU Nodes with Capacity Blocks + +For H100 instances, the script supports AWS EC2 Capacity Blocks, which guarantee GPU capacity for a reserved time period. When `defaultAcceleratorType: "H100"` and a `capacityReservation.id` is set, GPU nodes are created separately via CloudFormation instead of eksctl managed node groups. + +**How It Works:** +1. CPU node group is created first via eksctl (standard managed node group) +2. The script then creates a CloudFormation stack with a Launch Template that references the Capacity Block reservation +3. 
Nodes auto-join the cluster with `nvidia.com/gpu=true` label and taint +4. The CloudFormation stack is idempotent (skipped if already healthy) + +**Configuration:** +```yaml +nodeGroups: + gpu: + enabled: true + instanceType: "p5.48xlarge" # H100 instance type + desiredCapacity: 2 + volumeSize: 2000 + volumeType: "gp3" + capacityReservation: + id: "cr-0abcdef1234567890" # Your Capacity Reservation ID + az: "us-east-2b" # AZ of the reservation + +aiPlatform: + defaultAcceleratorType: "H100" # Must be H100 for capacity block path +``` + +**Requirements:** +- You must have a valid EC2 Capacity Reservation (Capacity Block) purchased in your region +- The AZ of the reservation must match a subnet in your VPC +- `defaultAcceleratorType` must be set to `H100` + +**Cleanup:** +The `delete` and `delete-full` commands automatically clean up the CloudFormation stack (`<cluster-name>-gpu-capacity-block`). + +--- + ### Auto Scaling #### Cluster Autoscaler @@ -2136,6 +2441,40 @@ EOF ## Troubleshooting +### Ray / AI model deployment: "Invalid repository ID or local directory" + +If a Ray Serve replica (e.g. `Llama31Instruct:LLMDeploymentL40S`) fails with: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file (e.g. config.json). +``` + +the model is loaded from object storage (S3/MinIO) into that path inside the pod. The path is missing or incomplete because the download from object storage failed or the model was never uploaded. + +**Checklist:** + +1.
**Model is in MinIO/S3** + Upload the model so the bucket has the prefix `model_artifacts/llama31-8b-instruct/` with at least `config.json` and the model weights (see [artifacts README](../artifacts_download_upload_scripts/README.md)): + - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh` + - Upload: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials to match your `cluster-config.yaml`; `MINIO_*` env vars are also accepted). + +2. **External MinIO reachable from EKS** + If using external MinIO (e.g. EC2), ensure: + - `storage.objectStore.endpoint` in `cluster-config.yaml` is correct (e.g. `http://<minio-host>:9000`). + - The EC2 security group allows **inbound TCP 9000** from your EKS node security group or VPC CIDR (see `install_minio_ec2.sh` output). + - From a Ray worker pod: + `kubectl exec -it <ray-worker-pod> -n <namespace> -- curl -s -o /dev/null -w "%{http_code}" http://<minio-endpoint>/minio/health/live` + +3. **Credentials secret** + AIPlatform must have `objectStorage.secretRef` set (e.g. `minio-credentials`). The secret must contain `s3_access_key` and `s3_secret_key` matching the MinIO user that can read the bucket: + - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'` + +4. **Full troubleshooting steps** + See [Troubleshooting: Invalid repository ID or local directory](../../docs/troubleshooting.md) in the main docs for verification commands and details. + ### Script Execution Issues #### Issue: Script Exits Silently Without Error Message diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md index 18668d5..3d116f9 100644 --- a/tools/cluster_setup/K0S_README.md +++ b/tools/cluster_setup/K0S_README.md @@ -5,7 +5,6 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters.
## Table of Contents - [Overview](#overview) -- [Pure On-Premises Deployments](#pure-on-premises-deployments-no-aws) - [Features](#features) - [Prerequisites](#prerequisites) - [Quick Start](#quick-start) @@ -16,6 +15,7 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters. - [Advanced Topics](#advanced-topics) - [Troubleshooting](#troubleshooting) - [Security](#security) +- [Internet Dependencies](#internet-dependencies) - [Migration Guide](#migration-guide) --- @@ -24,10 +24,13 @@ Complete guide for deploying Splunk AI Platform on k0s Kubernetes clusters. The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform on k0s Kubernetes, supporting: -- **On-premises deployments** with existing hardware -- **Bare metal servers** with customer-managed infrastructure -- **AWS EC2 instances** for testing and simulation -- **Air-gapped environments** with MinIO object storage +- **Bare metal / on-premises deployments** with existing hardware and SSH access +- **External S3-compatible object storage** (SeaweedFS, MinIO, or any S3-compatible endpoint) — customer-managed +- **Air-gapped environments** with private registries +- **Session logging** — all output captured to timestamped log files +- **Safety gates** — refuses to wipe a live cluster with Ready nodes + +> **Important:** This script requires pre-provisioned nodes with `existingIPs` in the config YAML. It does **not** auto-create cloud instances. Object storage must be external and customer-managed (no in-cluster MinIO is deployed). ### What is k0s? @@ -39,389 +42,41 @@ The `k0s_cluster_with_stack.sh` script deploys the complete Splunk AI Platform o --- -## Pure On-Premises Deployments (No AWS) - -### Does this work for customers in their own data centers? - -**Yes!** The k0s deployment is specifically designed for on-premises deployments where customers have zero AWS presence. 
Here's what you need to know: - -### What Works Without AWS - -✅ **Complete AI Platform Stack** - All features work in pure on-prem environments -✅ **MinIO Object Storage** - Replaces AWS S3, runs entirely in your cluster -✅ **No Cloud Dependencies** - No AWS services required -✅ **Air-Gapped Support** - Can run completely disconnected from the internet -✅ **Private Registries** - Use your own container registry instead of ECR - -### What You Need to Provide (On-Premises) - -**1. Physical/Virtual Infrastructure:** -- Physical servers or VMs with Ubuntu 22.04 LTS (or similar) -- Minimum 3 nodes (1 controller + 2 workers), recommended 5+ nodes -- Direct SSH access to all nodes -- Root/sudo privileges on all nodes - -**2. Network Infrastructure:** -- **Internal Network**: All nodes must be on the same network segment -- **IP Addressing**: Static IPs or DHCP reservations for all nodes -- **DNS (Optional but recommended)**: Internal DNS for node resolution -- **Internet Access (Initial Setup)**: For downloading k0s binary and container images - - Can be removed after installation for air-gapped operation - -**3. Network Ports (Between Nodes):** - -| Port | Protocol | Source | Destination | Purpose | -|------|----------|--------|-------------|---------| -| 22 | TCP | Admin workstation | All nodes | SSH management | -| 6443 | TCP | All nodes | Controller | Kubernetes API | -| 2380 | TCP | Controllers | Controllers | etcd peer communication | -| 10250 | TCP | All nodes | All nodes | Kubelet API | -| 8132 | TCP | Worker nodes | Controller | Konnectivity agent | -| 179 | TCP | All nodes | All nodes | Calico BGP (if using BGP) | -| 4789 | UDP | All nodes | All nodes | Calico VXLAN overlay | -| 30000-32767 | TCP | User networks | Worker nodes | NodePort services (optional) | - -**4. 
Storage:** -- Local disk space on each node: - - Controller: 100GB minimum - - CPU Worker: 200GB minimum (for MinIO and workloads) - - GPU Worker: 500GB+ recommended (for models and datasets) - -**5. For Private Container Registry:** -- Your own Docker registry (Harbor, Artifactory, etc.) -- Pre-pull and push all required images to your registry -- Configure imagePullSecrets for the registry - -### Network Architecture (Pure On-Premises) - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Your Data Center Network │ -│ (e.g., 10.0.0.0/16) │ -└─────────────────────────────────────────────────────────────┘ - │ - ┌───────────────────┼───────────────────┐ - │ │ │ -┌───────▼──────────┐ ┌──────▼───────────┐ ┌───▼──────────────┐ -│ Controller Node │ │ CPU Worker 1 │ │ GPU Worker 1 │ -│ 10.0.1.10 │ │ 10.0.1.20 │ │ 10.0.1.30 │ -│ :6443 (API) │ │ │ │ │ -│ :8132 (Konnect) │ │ • MinIO │ │ • Ray GPU Pods │ -└──────────────────┘ └──────────────────┘ └──────────────────┘ - │ │ │ - └───────────────────┼───────────────────┘ - │ - ┌─────────▼──────────┐ - │ Calico VXLAN │ - │ Pod Network │ - │ 10.244.0.0/16 │ - └────────────────────┘ -``` - -**Key Points:** -- **Host Network (10.0.0.0/16)**: Your physical data center network -- **Pod Network (10.244.0.0/16)**: Calico VXLAN overlay network -- **Service Network (10.96.0.0/16)**: Kubernetes ClusterIP services -- All pod-to-pod communication happens over VXLAN (no cloud networking) -- MinIO storage is local to the cluster (no S3) - -### Configuration Example (Pure On-Premises) - -```yaml -cluster: - name: onprem-ai-cluster - region: us-west-2 # Ignored for on-prem, but required in config - sshUser: ubuntu - sshKeyPath: ~/.ssh/onprem-key - -nodes: - controllers: 1 - cpuWorkers: 0 # Not used with existingIPs - gpuWorkers: 0 # Not used with existingIPs - - existingIPs: - controllers: - - 10.0.1.10 # Your controller server IP - workers: - - 10.0.1.20 # CPU worker 1 - - 10.0.1.21 # CPU worker 2 - - 10.0.1.30 # GPU 
worker 1 - - 10.0.1.31 # GPU worker 2 - -minio: - accessKey: minio-admin - secretKey: SuperSecurePassword123! - bucket: ai-platform-data - -kubernetes: - namespace: ai-platform - -imagePullSecrets: - secrets: - - private-registry-secret # Your private registry - autoCreateECR: false # No AWS ECR - -aiplatform: - vectordb: - storageSize: "100Gi" - workers: - cpu: - maxReplicas: 4 - gpu: - maxReplicas: 2 -``` - -### Installation Steps (Pure On-Premises) - -**1. Prepare Your Nodes:** -```bash -# On each node, ensure: -# - Ubuntu 22.04 LTS installed -# - SSH access configured -# - Passwordless sudo enabled -# - Python 3.8+ installed - -# Example setup on each node: -ssh ubuntu@10.0.1.10 -sudo apt-get update -sudo apt-get install -y python3 curl -``` - -**2. Configure SSH Access:** -```bash -# From your admin workstation -# Test SSH access to all nodes -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.10 "hostname" -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.20 "hostname" -ssh -i ~/.ssh/onprem-key ubuntu@10.0.1.21 "hostname" -``` - -**3. Create Configuration File:** -```bash -# Copy template and edit -cp k0s-cluster-config.yaml onprem-config.yaml -vi onprem-config.yaml -# - Set existingIPs to your node IPs -# - Set autoCreateECR: false -# - Configure MinIO credentials -``` - -**4. Run Installation:** -```bash -# From your admin workstation (must have internet access for initial download) -CONFIG_FILE=./onprem-config.yaml ./k0s_cluster_with_stack.sh install -``` - -**5. Access Your Cluster:** -```bash -# Kubeconfig is saved to ~/.kube/k0s- -export KUBECONFIG=~/.kube/k0s-onprem-ai-cluster - -# Verify -kubectl get nodes -kubectl get pods -A -``` - -### Private Container Registry Setup - -If using a private registry instead of public Docker Hub: - -**1. Set up your registry** (Harbor, Artifactory, JFrog, etc.) - -**2. 
Pre-pull and push images:** -```bash -# Pull from public registries -docker pull rayproject/ray:2.9.0 -docker pull semitechnologies/weaviate:1.28.0 -docker pull minio/minio:latest - -# Tag for your registry -docker tag rayproject/ray:2.9.0 registry.yourcompany.com/ray:2.9.0 -docker tag semitechnologies/weaviate:1.28.0 registry.yourcompany.com/weaviate:1.28.0 -docker tag minio/minio:latest registry.yourcompany.com/minio:latest - -# Push to your registry -docker push registry.yourcompany.com/ray:2.9.0 -docker push registry.yourcompany.com/weaviate:1.28.0 -docker push registry.yourcompany.com/minio:latest -``` - -**3. Create registry secret:** -```bash -kubectl create secret docker-registry private-registry-secret \ - --docker-server=registry.yourcompany.com \ - --docker-username=admin \ - --docker-password=secretpassword \ - --namespace=ai-platform -``` - -**4. Configure in k0s-cluster-config.yaml:** -```yaml -imagePullSecrets: - secrets: - - private-registry-secret - autoCreateECR: false - -aiplatform: - ray: - image: "registry.yourcompany.com/ray:2.9.0" - vectordb: - image: "registry.yourcompany.com/weaviate:1.28.0" -``` - -### Air-Gapped Deployment - -For completely disconnected environments: - -**1. Pre-stage on a connected system:** -- Download k0s binary -- Pull all required container images -- Download Helm charts - -**2. Transfer to air-gapped environment:** -- Copy k0s binary to all nodes -- Load images into local registry -- Copy Helm charts and manifests - -**3. Configure to use local resources:** -```yaml -imagePullSecrets: - secrets: - - airgap-registry - autoCreateECR: false -``` - -**4. 
Run installation pointing to local registry** - -### Common On-Premises Scenarios - -#### Scenario 1: Corporate Data Center with Proxy - -```yaml -# Configure nodes to use corporate proxy -# On each node: -export HTTP_PROXY=http://proxy.corp.com:8080 -export HTTPS_PROXY=http://proxy.corp.com:8080 -export NO_PROXY=localhost,127.0.0.1,10.0.0.0/8,.cluster.local - -# Then run installation -``` - -#### Scenario 2: Multiple Data Centers (Multi-Site) - -For multi-site deployments: -- Deploy separate k0s cluster per data center -- Use federation or multi-cluster management (not covered in this script) -- Consider network latency between sites (<10ms recommended for etcd) - -#### Scenario 3: Existing Kubernetes Cluster - -If you already have a Kubernetes cluster: -```yaml -cluster: - useExisting: force # Use existing cluster instead of creating new one -``` - -Then install just the AI Platform stack on your existing cluster. - -### Networking Deep Dive - -#### Required Connectivity Matrix - -| From | To | Ports | Purpose | -|------|-----|-------|---------| -| Admin Workstation | All nodes | 22/TCP | SSH management | -| All nodes | Controller | 6443/TCP | Kubernetes API | -| All nodes | Controller | 8132/TCP | Konnectivity | -| All nodes | All nodes | 10250/TCP | Kubelet | -| All nodes | All nodes | 4789/UDP | VXLAN overlay | -| Controllers | Controllers | 2380/TCP | etcd (HA only) | -| User clients | Worker nodes | 30000-32767/TCP | NodePort (optional) | - -#### Firewall Configuration Example (iptables) - -```bash -# On controller node -sudo iptables -A INPUT -p tcp --dport 6443 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 8132 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 2380 -s 10.0.0.0/16 -j ACCEPT - -# On all nodes -sudo iptables -A INPUT -p tcp --dport 10250 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p udp --dport 4789 -s 10.0.0.0/16 -j ACCEPT -sudo iptables -A INPUT -p tcp --dport 179 -s 10.0.0.0/16 -j ACCEPT -``` - -#### 
DNS Requirements - -**Optional but Recommended:** -- Internal DNS server resolving node hostnames -- Or: Configure /etc/hosts on all nodes with all node IPs - -```bash -# Example /etc/hosts on each node -10.0.1.10 controller1.corp.local controller1 -10.0.1.20 worker1.corp.local worker1 -10.0.1.21 worker2.corp.local worker2 -``` - -### What About AWS Features? - -| AWS Feature | On-Prem Alternative | -|-------------|---------------------| -| S3 Storage | MinIO (S3-compatible) ✅ | -| ECR Registry | Harbor, Artifactory, JFrog ✅ | -| EBS Volumes | Local storage (local-path) ✅ | -| IAM Roles | Kubernetes ServiceAccounts ✅ | -| ELB/ALB | NodePort or MetalLB ✅ | -| VPC Networking | Calico VXLAN ✅ | -| Route53 DNS | Internal DNS server ✅ | -| CloudWatch | Prometheus + Grafana ✅ | - -**Everything works on-premises with alternative solutions!** - ---- - ## Features ### Complete AI Platform Stack The script installs everything needed for the AI Platform: -1. **k0s Kubernetes Cluster** (v1.30+) - CNCF certified Kubernetes -2. **Calico CNI** - High-performance networking with VXLAN -3. **MinIO** - S3-compatible object storage (replaces AWS S3) -4. **Cert-Manager** - Automated certificate management -5. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -6. **OpenTelemetry Operator** - Distributed tracing and telemetry -7. **NVIDIA GPU Operator** - GPU support for AI workloads (optional) -8. **KubeRay Operator** - Ray cluster management for distributed AI -9. **Splunk Operator** - Splunk Enterprise management -10. **Splunk AI Platform Operator** - AI platform orchestration -11. 
**AI Platform CR** - Complete AI deployment with features - -### Two Deployment Modes - -#### Mode 1: On-Premises/Baremetal ✅ -- Provide existing IP addresses -- Passwordless SSH with sudo access required -- Production-ready for on-prem deployments -- Air-gapped support with MinIO - -#### Mode 2: AWS EC2 (Testing) 🧪 -- Automatically creates EC2 instances -- Simulates on-prem environment -- Quick setup for testing/validation -- Uses AWS networking - -### Image Pull Secrets Support 🔐 +1. **k0s Kubernetes Cluster** — CNCF certified, single-binary Kubernetes +2. **Calico CNI** — High-performance networking with VXLAN +3. **local-path Storage Provisioner** — Default StorageClass for PVCs +4. **Cert-Manager v1.13.0** — Automated certificate management +5. **Kube-Prometheus Stack** — Monitoring with Prometheus + Grafana +6. **OpenTelemetry Operator** — Distributed tracing and telemetry +7. **NVIDIA Host Drivers + Device Plugin** — GPU support (RHEL 9/10, AL2023, Debian/Ubuntu) +8. **KubeRay Operator v1.2.2** — Ray cluster management for distributed AI +9. **Splunk Operator** — Splunk Enterprise management +10. **Splunk AI Platform Operator** — AI platform orchestration (SAIA feature) +11. 
**AIPlatform CR** — Complete AI deployment with features, scheduling, and secrets + +### Operational Features + +- **Two-phase parallel installation** — Independent components install concurrently for faster deployments +- **Helm retry with exponential backoff** — Automatic retries on transient errors (timeouts, TLS handshake failures) +- **Preflight validation** — Checks tools, config, SSH connectivity, and disk space before starting +- **Safety gate** — Refuses to wipe a cluster that has Ready nodes (prevents accidental data loss) +- **Session logging** — All stdout/stderr captured to `tools/cluster_setup/logs/k0s-install-YYYY-MM-DD_HH-MM-SS.log` +- **Existing cluster detection** — `useExisting` flag (auto/force/never) to skip k0s install and deploy stack only + +### Image Pull Secrets Support Automatically creates and configures secrets for private container registries: -- **AWS ECR** - Elastic Container Registry (auto-token refresh) -- **Docker Hub** - Docker Hub private repositories -- **GCR** - Google Container Registry -- **ACR** - Azure Container Registry -- **Custom** - Any Docker registry +- **AWS ECR** — Elastic Container Registry (auto-token refresh) +- **Docker Hub** — Docker Hub private repositories +- **GCR** — Google Container Registry +- **ACR** — Azure Container Registry +- **Custom** — Any Docker registry Secrets are automatically propagated through the platform: ``` @@ -432,11 +87,11 @@ AIPlatform CR → AIService → Job/RayCluster → Pods ## Prerequisites -### Required Tools +### Required Tools (on Admin Workstation) ```bash # Install required tools on macOS -brew install kubectl helm git jq yq aws-cli +brew install kubectl helm git jq yq # Install required tools on Ubuntu/Debian sudo apt-get update @@ -452,58 +107,42 @@ jq --version yq --version ``` -### For On-Prem Deployments +### Hardware Requirements + +| Node Type | CPU | RAM | Disk | Notes | +|-----------|-----|-----|------|-------| +| Controller | 4+ | 8GB+ | 100GB+ | Runs API server, etcd, 
scheduler | +| CPU Worker | 8+ | 32GB+ | 200GB+ | Runs Weaviate, Ray head, Splunk | +| GPU Worker | 8+ | 32GB+ | 500GB+ | NVIDIA GPU required for AI inference | -**Hardware Requirements:** -- **Controller Node**: 4 CPU, 8GB RAM, 50GB disk (minimum) -- **CPU Worker**: 8 CPU, 32GB RAM, 100GB disk (recommended for AI) -- **GPU Worker**: 8 CPU, 32GB RAM, 100GB disk + NVIDIA GPU +### Software Requirements (on All Nodes) -**Software Requirements:** -- Ubuntu 22.04 LTS (or similar Linux distribution) -- Passwordless SSH access to all nodes +- RHEL 9/10, Amazon Linux 2023, or Debian/Ubuntu +- Passwordless SSH access from admin workstation - Sudo privileges without password -- Python 3.8+ installed on all nodes +- Python 3.8+ installed + +### Network Requirements -**Network Requirements:** Open the following ports between nodes: | Port | Protocol | Purpose | |------|----------|---------| +| 22 | TCP | SSH management | | 6443 | TCP | Kubernetes API server | -| 2380 | TCP | etcd client | +| 2380 | TCP | etcd peer communication | | 10250 | TCP | Kubelet API | | 8132 | TCP | Konnectivity agent | | 179 | TCP | Calico BGP | | 4789 | UDP | Calico VXLAN | -| 30000-32767 | TCP | NodePort services | - -### For AWS EC2 Deployments - -**AWS Requirements:** -- AWS CLI configured with credentials -- IAM permissions: EC2, VPC, Security Groups -- Existing VPC with internet gateway -- SSH key pair in AWS region -- Sufficient EC2 quotas: - - t3.xlarge (controllers): 1+ instances - - m5.4xlarge (CPU workers): 2+ instances - - g5.2xlarge (GPU workers): 1+ instances +| 30000-32767 | TCP | NodePort services (optional) | -**Verify AWS Access:** -```bash -# Check AWS credentials -aws sts get-caller-identity - -# Check available regions -aws ec2 describe-regions --output table +### External Object Storage -# Check EC2 quotas -aws service-quotas get-service-quota \ - --service-code ec2 \ - --quota-code L-1216C47A \ - --region us-west-2 -``` +You must provide an external S3-compatible object storage 
endpoint: +- **SeaweedFS**, **MinIO**, or any S3-compatible service +- Must be reachable from all cluster nodes +- The script does **not** deploy object storage in-cluster --- @@ -529,17 +168,13 @@ vi my-cluster.yaml ### 3. Deploy the Cluster ```bash -# For on-prem deployment -CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh install - -# For EC2 testing CONFIG_FILE=./my-cluster.yaml ./k0s_cluster_with_stack.sh install ``` ### 4. Verify Installation ```bash -# Set kubeconfig +# Set kubeconfig (saved automatically during install) export KUBECONFIG=~/.kube/k0s-my-cluster # Check nodes @@ -561,355 +196,355 @@ kubectl get pods --all-namespaces The `k0s-cluster-config.yaml` file controls all aspects of the deployment: ```yaml -cluster: # Cluster-wide settings -nodes: # Node configuration -ec2: # AWS EC2 settings (if using EC2 mode) -instanceTypes: # EC2 instance types -minio: # MinIO object storage -kubernetes: # Kubernetes settings -splunk: # Splunk configuration -ecr: # ECR configuration -imagePullSecrets: # Private registry secrets -aiplatform: # AI Platform settings +cluster: # Cluster name, useExisting, SSH user/key +nodes: # Controller/worker counts and existingIPs +storage: # storageClass, vectorDbSize, objectStore, minimumDiskSpace +images: # registry prefix, operator, splunk, ray, weaviate, saia, nginx, fluentBit, otelCollector +operators: # ray (version/modelVersion/rayVersion), certManager, nvidia devicePluginVersion +kubernetes: # namespace +files: # splunkOperator, aiPlatform manifest paths +splunk: # standaloneName +aiPlatform: # defaultAcceleratorType, workerGroupConfig, features, scheduling, serviceTemplate +imagePullSecrets: # secrets list, autoCreateECR, dockerHub, gcr, acr, custom +ecr: # account, region ``` -### Configuration Examples - -#### Example 1: On-Premises Production Cluster - -**Use Case:** Production deployment on existing hardware +### Configuration Example ```yaml cluster: name: prod-ai-platform + useExisting: auto # auto | force 
| never sshUser: ubuntu sshKeyPath: ~/.ssh/prod-key.pem nodes: controllers: 1 - cpuWorkers: 0 # Ignored when using existingIPs - gpuWorkers: 0 # Ignored when using existingIPs - + cpuWorkers: 2 # First 2 workers treated as CPU + gpuWorkers: 2 # Remaining 2 workers treated as GPU existingIPs: controllers: - - 10.0.1.10 # Physical server 1 + - 10.0.1.10 workers: - - 10.0.1.20 # Physical server 2 (CPU) - - 10.0.1.21 # Physical server 3 (CPU) - - 10.0.1.22 # Physical server 4 (GPU) - - 10.0.1.23 # Physical server 5 (GPU) - -minio: - accessKey: admin - secretKey: Change-This-Strong-Password-123! - bucket: ai-platform-production + - 10.0.1.20 # CPU (worker index 0) + - 10.0.1.21 # CPU (worker index 1) + - 10.0.1.22 # GPU (worker index 2) + - 10.0.1.23 # GPU (worker index 3) + +storage: + storageClass: "local-path" + vectorDbSize: "200Gi" + minimumDiskSpace: # Preflight disk checks (GB) + controller: 100 + cpuWorker: 200 + gpuWorker: 500 + objectStore: + type: "seaweedfs" # aws | s3compat | minio | seaweedfs + bucket: "ai-platform-data" + endpoint: "http://10.0.1.50:8333" # REQUIRED for s3compat/minio/seaweedfs + auth: + rootUser: "admin" + rootPassword: "Change-This-Strong-Password!" 
+ +images: + registry: "registry.corp.com" + operator: + image: "registry.corp.com/splunk/splunk-ai-operator:v0.1.5" + splunk: + image: "registry.corp.com/splunk/splunk:latest" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + ray: + headImage: "registry.corp.com/ray/ray-head:build-v1alpha1" + workerImage: "registry.corp.com/ray/ray-worker-gpu:build-v1alpha1" + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28" + saia: + apiImage: "registry.corp.com/saia/saia-api:build-v1alpha1" + apiV2Image: "registry.corp.com/saia/saia-api-v2:build-v1alpha1" + dataLoaderImage: "registry.corp.com/saia/saia-data-loader:build-v1alpha1" + nginx: + image: "docker.io/library/nginx:1.27-alpine" + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + +operators: + ray: + version: "v1.2.2" + modelVersion: "v0.3.14-36-g1549f5a" + rayVersion: "2.44.0" + certManager: + installCRDs: true + nvidia: + devicePluginVersion: "v0.17.3" kubernetes: namespace: ai-platform splunk: standaloneName: splunk-prod - index: ai-platform - -imagePullSecrets: - secrets: - - ecr-registry-secret - autoCreateECR: false # Manually create in air-gapped - -aiplatform: - vectordb: - storageSize: "200Gi" # Large storage for production - workers: - cpu: - maxReplicas: 8 - gpu: - maxReplicas: 4 -``` - -#### Example 2: AWS EC2 Testing Cluster -**Use Case:** Quick testing/validation before on-prem deployment - -```yaml -cluster: - name: test-ai-platform - region: us-west-2 - useExisting: auto - sshUser: ubuntu - sshKeyPath: ~/.ssh/test-key.pem - -nodes: - controllers: 1 - cpuWorkers: 2 - gpuWorkers: 1 - - existingIPs: - controllers: [] # Empty = auto-create EC2 - workers: [] # Empty = auto-create EC2 - -ec2: - vpcId: vpc-0123456789abcdef0 - subnetId: "" # Auto-select first available - keyName: test-key - -instanceTypes: - controller: t3.xlarge - cpuWorker: m5.2xlarge - gpuWorker: g5.xlarge - -ecr: - account: 
"123456789012" # Your AWS account ID - -imagePullSecrets: - secrets: [] # Auto-added when autoCreateECR=true - autoCreateECR: true # Automatically create ECR secret - -minio: - accessKey: minioadmin - secretKey: minioadmin123 - bucket: ai-platform-test - -kubernetes: - namespace: ai-platform -``` - -#### Example 3: Hybrid Cluster (Some Existing, Some New) - -**Use Case:** Mix existing on-prem nodes with cloud nodes - -```yaml -cluster: - name: hybrid-cluster - region: us-east-1 - sshUser: ubuntu - sshKeyPath: ~/.ssh/hybrid-key.pem - -nodes: - controllers: 1 - cpuWorkers: 2 # Will create 2 new EC2 CPU workers - gpuWorkers: 0 # No new GPU workers - - existingIPs: - controllers: - - 192.168.1.10 # Existing on-prem controller - workers: - - 192.168.1.20 # Existing GPU worker 1 - - 192.168.1.21 # Existing GPU worker 2 - # + 2 CPU workers will be created in EC2 - -ec2: - vpcId: vpc-0123456789abcdef0 - keyName: hybrid-key - -instanceTypes: - cpuWorker: m5.2xlarge # For new EC2 workers +aiPlatform: + name: "prod-ai-stack" + defaultAcceleratorType: "L40S" # GPU tier: L40S, H100, or "" + workerGroupConfig: + imageRegistry: "" # Override registry for Ray worker images + features: + - name: "saia" + version: "1.1.0" + serviceAccountName: "" + cpuScheduling: + nodeSelector: + splunk.ai/workload-type: cpu + tolerations: [] + gpuScheduling: + nodeSelector: + splunk.ai/workload-type: gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + serviceTemplate: # Optional: expose SAIA externally + type: "NodePort" # NodePort | LoadBalancer + nodePort: 30080 # Port for NodePort type imagePullSecrets: + secrets: [] autoCreateECR: true -``` - -#### Example 4: Air-Gapped On-Prem Cluster - -**Use Case:** Secure environment with no internet access + dockerHub: + enabled: false + username: "" + password: "" + email: "" + gcr: + enabled: false + jsonKey: "" + acr: + enabled: false + registry: "" + username: "" + password: "" + custom: + enabled: 
false + name: "custom-registry-secret" + server: "" + username: "" + password: "" + email: "" -```yaml -cluster: - name: airgap-cluster - sshUser: admin - sshKeyPath: ~/.ssh/secure-key.pem - -nodes: - controllers: 3 # HA setup - cpuWorkers: 0 - gpuWorkers: 0 - - existingIPs: - controllers: - - 172.16.0.10 - - 172.16.0.11 - - 172.16.0.12 - workers: - - 172.16.0.20 - - 172.16.0.21 - - 172.16.0.22 - -minio: - accessKey: secure-admin - secretKey: Very-Long-Secure-Password-456! - bucket: airgap-storage - -imagePullSecrets: - secrets: - - private-registry-secret # Pre-created manually - autoCreateECR: false - -# Note: Pre-pull all images to local registry before installation +ecr: + account: "123456789012" + region: us-east-2 ``` ### Configuration Reference #### Cluster Section -```yaml -cluster: - # Cluster name (used for tagging, kubeconfig, etc.) - name: my-cluster - - # Use existing cluster instead of creating new one - # Options: auto (detect), force (fail if not found), never (always create) - useExisting: auto - - # AWS region (required for EC2 mode) - region: us-west-2 - - # SSH configuration - sshUser: ubuntu # SSH username - sshKeyPath: ~/.ssh/my-key.pem # Path to private key -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `cluster.name` | Yes | — | Cluster identifier (used for kubeconfig, labels) | +| `cluster.useExisting` | No | `never` | `auto` = detect existing cluster, `force` = fail if not found, `never` = always create new | +| `cluster.sshUser` | Yes | `ubuntu` | SSH username for all nodes | +| `cluster.sshKeyPath` | Yes | — | Path to SSH private key | #### Nodes Section -```yaml -nodes: - # Number of controller nodes (1 or 3 for HA) - controllers: 1 - - # Number of CPU worker nodes (only for EC2 mode) - cpuWorkers: 2 - - # Number of GPU worker nodes (only for EC2 mode) - gpuWorkers: 1 - - # Existing IP addresses (on-prem mode) - existingIPs: - controllers: [] # Leave empty for EC2 auto-creation - 
workers: [] # Leave empty for EC2 auto-creation -``` +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `nodes.controllers` | No | `1` | Number of controller nodes (1 or 3 for HA) | +| `nodes.cpuWorkers` | No | `2` | First N workers in the list are labeled as CPU | +| `nodes.gpuWorkers` | No | `1` | Remaining workers after cpuWorkers are labeled as GPU | +| `nodes.existingIPs.controllers` | **Yes** | — | List of controller node IPs | +| `nodes.existingIPs.workers` | **Yes** | — | List of worker node IPs | + +#### Storage Section + +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `storage.storageClass` | No | `local-path` | Kubernetes StorageClass for PVCs | +| `storage.vectorDbSize` | No | `50Gi` | Weaviate PersistentVolume size | +| `storage.minimumDiskSpace.controller` | No | `100` | Minimum disk (GB) for controller preflight check | +| `storage.minimumDiskSpace.cpuWorker` | No | `200` | Minimum disk (GB) for CPU worker preflight check | +| `storage.minimumDiskSpace.gpuWorker` | No | `500` | Minimum disk (GB) for GPU worker preflight check | +| `storage.objectStore.type` | No | `minio` | `aws`, `s3compat`, `minio`, or `seaweedfs` | +| `storage.objectStore.bucket` | No | `ai-platform-data` | S3 bucket name | +| `storage.objectStore.endpoint` | **Yes*** | — | S3-compatible endpoint URL (*required for s3compat/minio/seaweedfs) | +| `storage.objectStore.auth.rootUser` | Yes | — | Access key / root user | +| `storage.objectStore.auth.rootPassword` | Yes | — | Secret key / root password | + +#### Images Section + +Short image paths (without a FQDN) are automatically prefixed with `images.registry`. 
+ +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `images.registry` | No | `""` | Registry prefix for short image paths | +| `images.operator.image` | **Yes** | — | Splunk AI Operator image | +| `images.splunk.image` | **Yes** | — | Splunk Enterprise image | +| `images.splunk.operatorImage` | No | `docker.io/splunk/splunk-operator:3.0.0` | Splunk Operator image | +| `images.ray.headImage` | **Yes** | — | Ray head node image | +| `images.ray.workerImage` | **Yes** | — | Ray GPU worker image | +| `images.weaviate.image` | **Yes** | — | Weaviate vector DB image | +| `images.saia.apiImage` | **Yes** | — | SAIA API v1 image | +| `images.saia.apiV2Image` | **Yes** | — | SAIA API v2 image | +| `images.saia.dataLoaderImage` | **Yes** | — | SAIA data loader / post-install hook image | +| `images.nginx.image` | No | `docker.io/library/nginx:1.27-alpine` | Nginx reverse proxy for SAIA v1/v2 routing | +| `images.fluentBit.image` | No | `fluent/fluent-bit:1.9.6` | Fluent Bit log forwarder | +| `images.otelCollector.image` | No | `otel/opentelemetry-collector-contrib:0.122.1` | OpenTelemetry Collector | + +**Image patching chain:** The script reads these config values, resolves them via `build_image_url()` (prepends registry if needed), then uses `sed` to patch the corresponding `RELATED_IMAGE_*` env vars in manifest files: + +| Config field | Env var patched | Target file | +|---|---|---| +| `images.operator.image` | Container `image:` field | `artifacts.yaml` | +| `images.splunk.image` | `RELATED_IMAGE_SPLUNK_ENTERPRISE` | `splunk-operator-cluster.yaml` | +| `images.splunk.operatorImage` | Container `image:` field | `splunk-operator-cluster.yaml` | +| `images.ray.headImage` | `RELATED_IMAGE_RAY_HEAD` | `artifacts.yaml` | +| `images.ray.workerImage` | `RELATED_IMAGE_RAY_WORKER` | `artifacts.yaml` | +| `images.weaviate.image` | `RELATED_IMAGE_WEAVIATE` | `artifacts.yaml` | +| `images.saia.apiImage` | `RELATED_IMAGE_SAIA_API` | 
`artifacts.yaml` | +| `images.saia.apiV2Image` | `RELATED_IMAGE_SAIA_API_V2` | `artifacts.yaml` | +| `images.saia.dataLoaderImage` | `RELATED_IMAGE_POST_INSTALL_HOOK` | `artifacts.yaml` | +| `images.nginx.image` | `RELATED_IMAGE_NGINX` | `artifacts.yaml` | +| `images.fluentBit.image` | `RELATED_IMAGE_FLUENT_BIT` | `artifacts.yaml` | +| `images.otelCollector.image` | `RELATED_IMAGE_OTEL_COLLECTOR` | `artifacts.yaml` | +| `operators.ray.modelVersion` | `MODEL_VERSION` | `artifacts.yaml` | +| `operators.ray.rayVersion` | `RAY_VERSION` | `artifacts.yaml` | + +#### AI Platform Section + +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `aiPlatform.name` | No | `${CLUSTER_NAME}-ai-platform` | Base name for the AIPlatform CR | +| `aiPlatform.defaultAcceleratorType` | No | `""` | GPU tier label: `L40S`, `H100`, or empty | +| `aiPlatform.workerGroupConfig.imageRegistry` | No | `""` | Override registry for Ray worker images | +| `aiPlatform.features` | Yes | — | Array of features to deploy (read dynamically from config) | +| `aiPlatform.features[].name` | Yes | — | Feature name (e.g., `saia`) | +| `aiPlatform.features[].version` | Yes | — | Feature version | +| `aiPlatform.features[].serviceAccountName` | No | `""` | Service account override | +| `aiPlatform.cpuScheduling.nodeSelector` | No | auto-generated | Node selector for CPU workloads | +| `aiPlatform.cpuScheduling.tolerations` | No | `[]` | Tolerations for CPU workloads | +| `aiPlatform.gpuScheduling.nodeSelector` | No | auto-generated | Node selector for GPU workloads | +| `aiPlatform.gpuScheduling.tolerations` | No | GPU toleration | Tolerations for GPU workloads | +| `aiPlatform.serviceTemplate.type` | No | — | Service type for SAIA exposure: `NodePort` or `LoadBalancer` | +| `aiPlatform.serviceTemplate.nodePort` | No | — | Node port number (only when type=NodePort) | #### Image Pull Secrets Section +The `secrets` list is **not consumed** by the script. 
Instead, the script auto-detects which secrets exist in the namespace by checking for hardcoded names: `ecr-registry-secret`, `docker-hub-secret`, `gcr-secret`, `acr-secret`, `custom-registry-secret`. + ```yaml imagePullSecrets: - # List of secret names to use - secrets: - - ecr-registry-secret - - docker-hub-secret - - # Auto-create ECR secret - autoCreateECR: true # Requires AWS credentials + secrets: [] # NOT consumed; script auto-detects in namespace + autoCreateECR: true # Consumed → creates ECR secret from AWS creds + + dockerHub: + enabled: false + username: "" + password: "" + email: "" + + gcr: + enabled: false + jsonKey: "" + + acr: + enabled: false + registry: "" + username: "" + password: "" + + custom: + enabled: false + name: "custom-registry-secret" + server: "" + username: "" + password: "" + email: "" ``` --- ## Usage -### Basic Commands +### Commands ```bash -# Install cluster with custom config +# Install cluster and full AI Platform stack CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install -# Delete entire cluster +# Delete entire cluster (stop k0s, remove services) CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh delete -# Health check -CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh health - -# Get cluster info -CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh info -``` - -### Advanced Commands - -```bash -# Install without confirmation prompts -AUTO_APPROVE=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install +# Clean all k0s state from bare-metal nodes (stop/reset/remove) +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh clean-all -# Skip specific components -SKIP_MINIO=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install -SKIP_GPU_OPERATOR=true CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install - -# Use existing cluster (skip k0s installation) -USE_EXISTING=force CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install - -# Join additional 
workers +# Join additional workers to an existing cluster (or rejoin failed workers) CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` -### Post-Installation Tasks +### Environment Variables -#### 1. Access the Cluster +| Variable | Default | Description | +|----------|---------|-------------| +| `CONFIG_FILE` | `./k0s-cluster-config.yaml` | Path to configuration file | +| `AUTO_APPROVE` | `false` | Skip confirmation prompts | +| `USE_EXISTING` | (from config) | Override `cluster.useExisting` (`auto`/`force`/`never`) | +| `LOG_DIR` | `./logs` | Directory for session log files | -```bash -# Set kubeconfig environment variable -export KUBECONFIG=~/.kube/k0s-my-cluster +### Session Logging -# Or copy to default location -cp ~/.kube/k0s-my-cluster ~/.kube/config +All script output (stdout and stderr) is automatically captured to a timestamped log file: -# Verify cluster access -kubectl cluster-info -kubectl get nodes ``` - -#### 2. Check Installation Status - -```bash -# Check all namespaces -kubectl get pods --all-namespaces - -# Check AI Platform specifically -kubectl get aiplatform -n ai-platform -o wide - -# Check AIServices -kubectl get aiservice -n ai-platform - -# Check RayCluster -kubectl get rayservice -n ai-platform +tools/cluster_setup/logs/k0s-install-2026-04-29_14-30-00.log ``` -#### 3. Access MinIO Console - +Override the log directory: ```bash -# Port forward MinIO console -kubectl port-forward -n minio-system svc/minio 9001:9001 - -# Open in browser: http://localhost:9001 -# Login with credentials from config file +LOG_DIR=/var/log/k0s CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install ``` -#### 4. Access Splunk +### Install Flow -```bash -# Get Splunk admin password -SPLUNK_PASSWORD=$(kubectl get secret \ - splunk--standalone-secret-v1 \ - -n ai-platform \ - -o jsonpath='{.data.password}' | base64 -d) +The `install` command executes these steps in order: -echo "Splunk password: $SPLUNK_PASSWORD" +1. 
**Load config** — Parse YAML, validate existingIPs +2. **Validate images** — Ensure all required image fields are set +3. **Configure images** — Patch `RELATED_IMAGE_*` env vars in manifest files +4. **Preflight checks** — Validate tools, SSH connectivity, disk space, config +5. **Install k0s cluster** — Safety gate check → clean state → install controller → join workers → label nodes +6. **Install AI Platform stack** (two-phase parallel): + - Phase 1 (parallel): cert-manager, kube-prometheus, NVIDIA host drivers + - Between phases: Ensure S3 credentials secret + - Phase 2 (parallel): OTel operator, Ray operator, Splunk operator, NVIDIA device plugin + - Sequential: Image pull secrets → Splunk standalone → AI operator → AIPlatform CR +7. **Health checks** — Verify all components are running +8. **Access info** — Display kubeconfig path and service endpoints -# Port forward Splunk web UI -kubectl port-forward -n ai-platform \ - svc/splunk--standalone-service 8000:8000 +### join-workers Command -# Access at http://localhost:8000 -# Username: admin -# Password: (from above command) -``` +The `join-workers` command is used to: +- Add new worker nodes to an existing cluster +- Rejoin workers that were disconnected or failed -#### 5. Access Prometheus/Grafana +It: +1. Loads config and identifies which workers are not yet joined +2. Generates a fresh worker token from the controller +3. Installs k0s worker on each missing node +4. Waits for nodes to become Ready +5. Labels nodes with `splunk.ai/*` labels based on CPU/GPU role -```bash -# Prometheus -kubectl port-forward -n monitoring svc/prometheus-operated 9090:9090 -# Access at http://localhost:9090 - -# Grafana -kubectl port-forward -n monitoring svc/grafana 3000:80 -# Access at http://localhost:3000 -# Default credentials: admin/admin -``` +### useExisting Flag + +| Value | Behavior | +|-------|----------| +| `never` | Always creates a new k0s cluster (default). Fails if nodes have a live cluster (safety gate). 
| +| `auto` | Checks if a running k0s cluster exists on the controller. If yes, skips cluster creation and deploys stack only. If no, creates new cluster. | +| `force` | Assumes an existing cluster. Fails if no running cluster is found on the controller. | --- @@ -938,9 +573,9 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 ┌─▼───────▼──────┐ ┌─────────▼────────┐ ┌───────▼─────────┐ │ CPU Worker 1 │ │ CPU Worker 2 │ │ GPU Worker │ │ │ │ │ │ │ -│ • MinIO │ │ • Weaviate │ │ • Ray GPU Pods │ -│ • Ray Head │ │ • Ray CPU Pods │ │ • AI Training │ -│ • Monitoring │ │ • AI Inference │ │ │ +│ • Ray Head │ │ • Weaviate │ │ • Ray GPU Pods │ +│ • Splunk │ │ • Ray CPU Pods │ │ • AI Inference │ +│ • Monitoring │ │ • AI Services │ │ │ └────────────────┘ └──────────────────┘ └─────────────────┘ ``` @@ -949,7 +584,6 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 **Pod Network (Calico VXLAN):** - CIDR: `10.244.0.0/16` - Overlay network across all nodes -- Isolated from host network **Service Network:** - CIDR: `10.96.0.0/16` @@ -957,55 +591,44 @@ kubectl port-forward -n monitoring svc/grafana 3000:80 - NodePort range: `30000-32767` **Host Network:** -- Controller API: `<controller-ip>:6443` -- Konnectivity: `<controller-ip>:8132` -- SSH: `<node-ip>:22` +- Controller API: `<controller-ip>:6443` +- Konnectivity: `<controller-ip>:8132` +- SSH: `<node-ip>:22` ### Storage Architecture ``` ┌──────────────────────────────────────────────────────────┐ -│ MinIO Object Storage │ -│ (S3-Compatible, Running in Kubernetes) │ +│ External S3-Compatible Object Storage │ +│ (Customer-Managed: SeaweedFS / MinIO / S3) │ │ │ -│ Endpoint: http://minio.minio-system.svc.cluster.local │ -│ Port: 9000 (API), 9001 (Console) │ +│ Endpoint: http://<host>:<port> │ │ │ │ Buckets: │ -│ ├─ ai-platform-bucket/ │ -│ │ ├─ artifacts/ (Build artifacts) │ -│ │ ├─ models/ (ML models) │ -│ │ ├─ datasets/ (Training data) │ -│ │ └─ tasks/ (Task outputs) │ -│ │ │ -│ └─ splunk-index/ (Splunk SmartStore indexes) │ +│ └─ ai-platform-data/ │ +│ ├─ artifacts/ (Build artifacts) │ +│ ├─ models/ (ML
models) │ +│ ├─ datasets/ (Training data) │ +│ └─ tasks/ (Task outputs) │ │ │ -│ Persistence: │ -│ └─ PVC: minio-storage (local-path) │ -│ Size: 100Gi (configurable) │ +│ Credentials stored in-cluster as: │ +│ └─ Secret: s3-secret (namespace: ai-platform) │ +│ Keys: s3_access_key, s3_secret_key │ └──────────────────────────────────────────────────────────┘ ``` **Access Patterns:** ```yaml -# From pods in cluster -endpoint: http://minio.minio-system.svc.cluster.local:9000 - -# From outside cluster (via port-forward) -endpoint: http://localhost:9000 - # AIPlatform CR reference objectStorage: - path: s3://ai-platform-bucket/artifacts - endpoint: http://minio.minio-system.svc.cluster.local:9000 - region: us-east-1 # Ignored by MinIO, but required + path: s3:///artifacts + endpoint: http://: + region: us-east-1 secretRef: s3-secret ``` ### Component Architecture -#### Operator and Resource Hierarchy - ```mermaid graph TB subgraph "Control Plane Operators" @@ -1018,7 +641,7 @@ graph TB subgraph "AI Platform Namespace" AIPLATFORM[AIPlatform CR
Custom Resource] - AISERVICE[AIService CRs<br/>saia, dspy, etc.] + AISERVICE[AIService CRs<br/>saia] RAYSERVICE[RayService<br/>Ray Serve + Cluster] RAYCLUSTER[RayCluster<br/>Head + Workers] WEAVIATE[Weaviate
Vector Database] @@ -1027,7 +650,7 @@ graph TB end subgraph "Infrastructure" - MINIO[MinIO
Object Storage] + OBJSTORE[External Object Storage<br/>S3-Compatible] PROMETHEUS[Prometheus<br/>Metrics] GRAFANA[Grafana<br/>Dashboards] STORAGE[Persistent Volumes
local-path] @@ -1043,14 +666,14 @@ graph TB RAYCLUSTER -->|provisions| RAYWORKER[Ray Worker Pods
CPU + GPU] SPLOP -->|watches & reconciles| SPLUNK - SPLUNK -->|stores logs| MINIO + SPLUNK -->|stores logs| OBJSTORE CERTMGR -->|issues certs| RAYSERVICE OTELOP -->|watches & creates| OTELCOL OTELCOL -->|sends traces| SPLUNK - AIPLATFORM -->|references| MINIO + AIPLATFORM -->|references| OBJSTORE AIPLATFORM -->|references| SPLUNK WEAVIATE -->|stores vectors| STORAGE @@ -1066,233 +689,10 @@ graph TB style OTELOP fill:#e1f5ff style AIPLATFORM fill:#fff3e0 style AISERVICE fill:#fff3e0 - style MINIO fill:#f3e5f5 + style OBJSTORE fill:#f3e5f5 style STORAGE fill:#f3e5f5 ``` -#### Data Flow and Interactions - -```mermaid -graph LR - subgraph "User Interface" - USER[User] - SPLUNKUI[Splunk UI
Search Head] - SAIAAPP[SAIA App<br/>Splunk Application] - end - - subgraph "AI Platform Services" - SAIASERVICE[SAIA Service<br/>AI Service CR] - RAYHEAD[Ray Head<br/>Ray Serve API] - RAYWORKER_CPU[Ray Workers<br/>CPU Nodes] - RAYWORKER_GPU[Ray Workers<br/>GPU Nodes] - WEAVIATE[Weaviate<br/>Vector DB] - end - - subgraph "Storage Layer" - MINIO[MinIO<br/>S3-Compatible<br/>Models & Artifacts] - PV[Persistent Volumes<br/>Vector Data] - end - - subgraph "Observability" - SPLUNK[Splunk Enterprise<br/>Logs & Events] - OTEL[OpenTelemetry<br/>Traces] - PROM[Prometheus
Metrics] - end - - USER -->|uses| SPLUNKUI - SPLUNKUI -->|runs| SAIAAPP - SAIAAPP -->|sends prompts| SAIASERVICE - SAIASERVICE -->|connects to| RAYHEAD - RAYHEAD -->|distributes tasks| RAYWORKER_CPU - RAYHEAD -->|distributes tasks| RAYWORKER_GPU - RAYHEAD -->|vector search| WEAVIATE - - WEAVIATE -->|returns results| RAYHEAD - RAYHEAD -->|inference results| SAIASERVICE - SAIASERVICE -->|prompt results| SAIAAPP - SAIAAPP -->|displays to| USER - - RAYWORKER_CPU -->|load models| MINIO - RAYWORKER_GPU -->|load models| MINIO - RAYHEAD -->|store results| MINIO - - WEAVIATE -->|persist vectors| PV - - RAYHEAD -->|send logs| SPLUNK - RAYWORKER_CPU -->|send logs| SPLUNK - RAYWORKER_GPU -->|send logs| SPLUNK - WEAVIATE -->|send logs| SPLUNK - SAIASERVICE -->|send logs| SPLUNK - - RAYHEAD -->|send traces| OTEL - RAYWORKER_CPU -->|send traces| OTEL - SAIASERVICE -->|send traces| OTEL - OTEL -->|forward| SPLUNK - - RAYHEAD -->|expose metrics| PROM - RAYWORKER_CPU -->|expose metrics| PROM - RAYWORKER_GPU -->|expose metrics| PROM - WEAVIATE -->|expose metrics| PROM - SAIASERVICE -->|expose metrics| PROM - - style USER fill:#e8f5e9 - style SPLUNKUI fill:#fff9c4 - style SAIAAPP fill:#fff3e0 - style SAIASERVICE fill:#e1f5ff - style RAYHEAD fill:#e1f5ff - style RAYWORKER_CPU fill:#e1f5ff - style RAYWORKER_GPU fill:#e1f5ff - style WEAVIATE fill:#f3e5f5 - style MINIO fill:#fce4ec - style PV fill:#fce4ec - style SPLUNK fill:#fff9c4 - style OTEL fill:#fff9c4 - style PROM fill:#fff9c4 -``` - -#### Complete Platform Deployment - -```mermaid -graph TB - subgraph "Kubernetes Cluster - k0s" - subgraph "kube-system Namespace" - K8S_API[Kubernetes API Server] - CALICO[Calico CNI
VXLAN Networking] - end - - subgraph "cert-manager Namespace" - CERTMGR[Cert Manager<br/>Certificate Controller] - ISSUER[Issuers & Certificates] - end - - subgraph "monitoring Namespace" - PROM[Prometheus<br/>Metrics Collection] - GRAFANA[Grafana<br/>Visualization] - ALERTMGR[Alert Manager<br/>Alerting] - end - - subgraph "opentelemetry-operator-system" - OTELOP[OpenTelemetry Operator] - end - - subgraph "ray-system Namespace" - RAYOP[KubeRay Operator<br/>Ray Management] - end - - subgraph "splunk-operator Namespace" - SPLOP[Splunk Operator<br/>Splunk Management] - end - - subgraph "splunk-ai-operator-system" - AIOP[Splunk AI Operator<br/>AI Platform Controller] - WEBHOOK[Admission Webhooks<br/>Validation] - end - - subgraph "minio-system Namespace" - MINIO[MinIO Deployment<br/>Object Storage] - MINIOPVC[MinIO PVC<br/>200Gi] - end - - subgraph "ai-platform Namespace" - AIPLATFORM[AIPlatform CR<br/>Main Resource] - - subgraph "AI Services" - SAIA[AIService: saia<br/>Splunk AI Assistant] - end - - subgraph "Ray Infrastructure" - RAYSERVICE[RayService<br/>Ray Serve] - RAYCLUSTER[RayCluster<br/>Distributed Cluster] - RAYHEAD[Ray Head Pod<br/>8 CPU, 32GB RAM] - RAYWORKER1[Ray Worker Pod<br/>16 CPU, 64GB RAM] - RAYWORKER2[Ray Worker GPU Pod<br/>8 CPU, 32GB, 1x GPU] - end - - subgraph "Data Services" - WEAVIATE[Weaviate StatefulSet<br/>Vector Database] - WEAVIATEPVC[Weaviate PVC<br/>50Gi] - end - - subgraph "Splunk Services" - SPLUNK[Splunk Standalone<br/>Enterprise] - SPLUNKETC[Splunk etc PVC] - SPLUNKVAR[Splunk var PVC] - end - - subgraph "Observability" - OTELCOL[OpenTelemetry Collector<br/>Traces] - end - - subgraph "Networking" - RAYSVC[Ray Head Service<br/>ClusterIP] - WEAVIATESVC[Weaviate Service<br/>ClusterIP] - SPLUNKSVC[Splunk Service
ClusterIP] - end - end - - subgraph "gpu-operator Namespace" - GPUOP[NVIDIA GPU Operator] - GPUPLUGIN[NVIDIA Device Plugin] - end - end - - K8S_API -->|manages| AIOP - K8S_API -->|manages| SPLOP - K8S_API -->|manages| RAYOP - - AIOP -->|reconciles| AIPLATFORM - AIPLATFORM -->|creates| SAIA - SAIA -->|creates| RAYSERVICE - RAYOP -->|reconciles| RAYSERVICE - RAYSERVICE -->|creates| RAYCLUSTER - RAYCLUSTER -->|provisions| RAYHEAD - RAYCLUSTER -->|provisions| RAYWORKER1 - RAYCLUSTER -->|provisions| RAYWORKER2 - - AIPLATFORM -->|creates| WEAVIATE - WEAVIATE -->|claims| WEAVIATEPVC - - SPLOP -->|reconciles| SPLUNK - SPLUNK -->|claims| SPLUNKETC - SPLUNK -->|claims| SPLUNKVAR - - CERTMGR -->|provisions certs| RAYSERVICE - - OTELOP -->|creates| OTELCOL - - RAYHEAD -->|exposes| RAYSVC - WEAVIATE -->|exposes| WEAVIATESVC - SPLUNK -->|exposes| SPLUNKSVC - - RAYHEAD -->|reads/writes| MINIO - RAYWORKER1 -->|reads/writes| MINIO - RAYWORKER2 -->|reads/writes| MINIO - SPLUNK -->|reads apps| MINIO - - MINIO -->|stores on| MINIOPVC - - PROM -->|scrapes| RAYHEAD - PROM -->|scrapes| RAYWORKER1 - PROM -->|scrapes| RAYWORKER2 - PROM -->|scrapes| WEAVIATE - GRAFANA -->|queries| PROM - - RAYHEAD -->|sends traces| OTELCOL - RAYWORKER1 -->|sends traces| OTELCOL - OTELCOL -->|forwards to| SPLUNK - - GPUOP -->|installs| GPUPLUGIN - GPUPLUGIN -->|provides GPUs to| RAYWORKER2 - - style AIOP fill:#e1f5ff,stroke:#01579b,stroke-width:3px - style AIPLATFORM fill:#fff3e0,stroke:#e65100,stroke-width:3px - style RAYSERVICE fill:#f3e5f5,stroke:#4a148c,stroke-width:2px - style RAYCLUSTER fill:#f3e5f5,stroke:#4a148c,stroke-width:2px - style MINIO fill:#fce4ec,stroke:#880e4f,stroke-width:2px - style SPLUNK fill:#fff9c4,stroke:#f57f17,stroke-width:2px - style WEAVIATE fill:#e0f2f1,stroke:#004d40,stroke-width:2px -``` - --- ## Image Pull Secrets @@ -1309,15 +709,13 @@ The platform supports automatic creation and propagation of image pull secrets f ### Automatic ECR Configuration -The easiest way to use 
private ECR images: - ```yaml -# In k0s-cluster-config.yaml ecr: - account: "123456789012" # Your AWS account ID + account: "123456789012" + region: us-east-2 imagePullSecrets: - autoCreateECR: true # Enable automatic ECR secret creation + autoCreateECR: true ``` **What happens automatically:** @@ -1334,14 +732,12 @@ imagePullSecrets: ### Manual Secret Creation -For air-gapped or custom registries: - ```bash # ECR secret kubectl create secret docker-registry ecr-registry-secret \ - --docker-server=123456789012.dkr.ecr.us-west-2.amazonaws.com \ + --docker-server=123456789012.dkr.ecr.us-east-2.amazonaws.com \ --docker-username=AWS \ - --docker-password=$(aws ecr get-login-password --region us-west-2) \ + --docker-password=$(aws ecr get-login-password --region us-east-2) \ --namespace=ai-platform # Docker Hub secret @@ -1352,24 +748,13 @@ kubectl create secret docker-registry docker-hub-secret \ --namespace=ai-platform # Private registry secret -kubectl create secret docker-registry private-registry \ +kubectl create secret docker-registry custom-registry-secret \ --docker-server=registry.example.com \ --docker-username=admin \ --docker-password=secret123 \ --namespace=ai-platform ``` -Then reference in config: - -```yaml -imagePullSecrets: - secrets: - - ecr-registry-secret - - docker-hub-secret - - private-registry - autoCreateECR: false -``` - ### Image Pull Secret Propagation Secrets are automatically propagated through the platform: @@ -1398,20 +783,6 @@ Pods (Ray head, Ray workers, Weaviate, etc.) 
- name: ecr-registry-secret ``` -### Using Private Images - -Once secrets are configured, specify private images in your config: - -```yaml -# In k0s-cluster-config.yaml or AIPlatform CR -aiplatform: - ray: - image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/ray:2.9.0" - - vectordb: - image: "123456789012.dkr.ecr.us-west-2.amazonaws.com/weaviate:1.28.0" -``` - ### Troubleshooting Image Pull Issues ```bash @@ -1422,11 +793,7 @@ kubectl get secret ecr-registry-secret -n ai-platform kubectl get secret ecr-registry-secret -n ai-platform -o jsonpath='{.type}' # Should output: kubernetes.io/dockerconfigjson -# Check secret content -kubectl get secret ecr-registry-secret -n ai-platform \ - -o jsonpath='{.data.\.dockerconfigjson}' | base64 -d | jq - -# Check pod events +# Check pod events for pull errors kubectl describe pod -n ai-platform | grep -A10 Events # Common errors: @@ -1470,7 +837,7 @@ nvidia.com/gpu: "true" nvidia.com/gpu.count: "1" # Auto-detected ``` -#### Taints +#### GPU Taints GPU nodes are automatically tainted to prevent non-GPU workloads: ```yaml @@ -1483,62 +850,31 @@ taints: #### Viewing Labels ```bash -# Show all labels -kubectl get nodes --show-labels - # Show specific labels kubectl get nodes -L splunk.ai/workload-type,splunk.ai/node-role -# Filter by label +# Filter by type kubectl get nodes -l splunk.ai/workload-type=gpu kubectl get nodes -l splunk.ai/workload-type=cpu - -# Count by type -echo "GPU nodes: $(kubectl get nodes -l splunk.ai/workload-type=gpu --no-headers | wc -l)" -echo "CPU nodes: $(kubectl get nodes -l splunk.ai/workload-type=cpu --no-headers | wc -l)" ``` -#### Custom Scheduling in AIPlatform CR +### NVIDIA GPU Support -```yaml -apiVersion: ai.splunk.com/v1 -kind: AIPlatform -metadata: - name: my-platform -spec: - # CPU workloads (Weaviate, Ray head, etc.) 
- cpuSchedulingSpec: - nodeSelector: - splunk.ai/workload-type: cpu - tolerations: [] - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: splunk.ai/workload-type - operator: In - values: - - cpu - - # GPU workloads (Ray GPU workers) - gpuSchedulingSpec: - nodeSelector: - splunk.ai/workload-type: gpu - nvidia.com/gpu: "true" - tolerations: - - key: nvidia.com/gpu - operator: Equal - value: "true" - effect: NoSchedule - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.count - operator: Exists -``` +The script installs NVIDIA host drivers directly on GPU nodes (not the GPU Operator). + +**Supported distributions:** +- RHEL 9 +- RHEL 10 +- Amazon Linux 2023 +- Debian/Ubuntu + +**What happens on GPU nodes:** +1. Kernel headers installed +2. NVIDIA CUDA repository configured +3. `cuda-drivers` package installed (falls back to `nvidia-driver-550` on Debian) +4. NVIDIA Container Toolkit installed and configured +5. `nvidia-smi` verification run +6. 
NVIDIA device plugin DaemonSet applied cluster-wide with RuntimeClass ### High Availability Setup @@ -1546,7 +882,7 @@ For production deployments, use 3 controller nodes: ```yaml nodes: - controllers: 3 # HA etcd cluster + controllers: 3 existingIPs: controllers: - 10.0.1.10 @@ -1559,76 +895,52 @@ nodes: - etcd quorum maintained - Zero downtime for API server -**Requirements:** -- Odd number of controllers (1, 3, 5) -- Same datacenter/region for low latency -- Reliable network between controllers +### Service Template (SAIA Public Exposure) -### Custom CA Certificates +To expose the SAIA v2 chat UI externally: -For air-gapped or secure environments: - -```bash -# Create custom CA secret -kubectl create secret generic custom-ca \ - --from-file=ca.crt=/path/to/ca.crt \ - -n cert-manager - -# Update cert-manager to use custom CA -kubectl patch deployment cert-manager -n cert-manager \ - --patch '{"spec":{"template":{"spec":{"volumes":[{"name":"custom-ca","secret":{"secretName":"custom-ca"}}],"containers":[{"name":"cert-manager","volumeMounts":[{"name":"custom-ca","mountPath":"/etc/ssl/certs/custom-ca.crt","subPath":"ca.crt"}]}]}}}}' +```yaml +aiPlatform: + serviceTemplate: + type: "NodePort" # or "LoadBalancer" + nodePort: 30080 # only for NodePort ``` -### Resource Quotas - -Set resource limits per namespace: +This generates a Kubernetes Service exposing port 8080 on the specified NodePort across all worker nodes. 
-```bash -kubectl apply -f - < backup/minio-secret.yaml -``` +### Backup and Restore #### Backup etcd ```bash -# On controller node ssh ubuntu@controller-ip sudo k0s etcd snapshot save /tmp/etcd-backup.db @@ -1639,13 +951,9 @@ scp ubuntu@controller-ip:/tmp/etcd-backup.db ./backup/ #### Restore from Backup ```bash -# Restore etcd scp ./backup/etcd-backup.db ubuntu@controller-ip:/tmp/ ssh ubuntu@controller-ip sudo k0s etcd snapshot restore /tmp/etcd-backup.db - -# Restore MinIO data -mc mirror ./backup/minio-data k0s-minio/ai-platform-bucket ``` --- @@ -1669,10 +977,7 @@ eval $(ssh-agent) ssh-add ~/.ssh/my-key.pem # 3. Firewall blocking port 22 -# Open port 22 on node firewall - -# 4. Wrong username -# Try: ubuntu, ec2-user, admin, root +# 4. Wrong username (try: ubuntu, ec2-user, admin, root) ``` #### k0s Installation Failures @@ -1687,11 +992,6 @@ sudo journalctl -u k0scontroller -f # Check k0s config sudo cat /etc/k0s/k0s.yaml - -# Reset k0s and retry -sudo k0s stop -sudo k0s reset -# Re-run installation script ``` #### Worker Join Failures @@ -1704,85 +1004,34 @@ sudo k0s status # View worker logs sudo journalctl -u k0sworker -f -# Regenerate token and retry -ssh ubuntu@controller-ip -sudo k0s token create --role=worker - -# Manually join worker -ssh ubuntu@worker-ip -sudo k0s install worker --token-file=<(echo 'NEW_TOKEN_HERE') -sudo k0s start -``` - -### Networking Issues - -#### Pods Cannot Communicate - -```bash -# Check Calico status -kubectl get pods -n kube-system | grep calico - -# View Calico logs -kubectl logs -n kube-system daemonset/calico-node - -# Check VXLAN interface -kubectl exec -n kube-system calico-node-xxx -- ip link show vxlan.calico - -# Verify routes -kubectl exec -n kube-system calico-node-xxx -- ip route +# Use join-workers command to retry +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh join-workers ``` -#### Konnectivity Issues - -```bash -# Check konnectivity-agent pods -kubectl get pods -n kube-system | grep 
konnectivity-agent - -# All should be 1/1 Running -# If 0/1 or CrashLoopBackOff: +#### Safety Gate Blocking Install -# Check agent logs -kubectl logs -n kube-system konnectivity-agent-xxx - -# Common issue: Port 8132 not open -# Verify security group allows TCP 8132 from 0.0.0.0/0 - -# Test connectivity from worker -ssh ubuntu@worker-ip -nc -zv <controller-ip> 8132 -``` - -#### DNS Resolution Failures +If install fails with "k0s cluster has Ready nodes — refusing to wipe": ```bash -# Test DNS from a pod -kubectl run -it --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default +# Option 1: Use existing cluster (deploy stack only) +# Set useExisting: auto in config, then re-run install -# If fails, check CoreDNS -kubectl get pods -n kube-system | grep coredns -kubectl logs -n kube-system deployment/coredns +# Option 2: Tear down first +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh delete +CONFIG_FILE=./my-config.yaml ./k0s_cluster_with_stack.sh install ``` ### Storage Issues -#### MinIO Not Starting +#### Object Storage Connectivity ```bash -# Check MinIO pods -kubectl get pods -n minio-system - -# View MinIO logs -kubectl logs -n minio-system deployment/minio - -# Common issues: -# 1. PVC not bound -kubectl get pvc -n minio-system - -# 2. Storage class not available -kubectl get sc +# Test endpoint from a node +ssh ubuntu@worker-ip +curl -s http://<host>:<port>/minio/health/live -# 3. 
Insufficient disk space -kubectl describe node | grep -A5 "Allocated resources" +# Verify S3 secret exists +kubectl get secret s3-secret -n ai-platform -o yaml ``` #### PVC Stuck in Pending @@ -1791,9 +1040,6 @@ kubectl describe node | grep -A5 "Allocated resources" # Check PVC status kubectl get pvc -n ai-platform -# Describe PVC for events -kubectl describe pvc -n ai-platform - # Check storage class kubectl get sc @@ -1807,12 +1053,8 @@ kubectl logs -n local-path-storage deployment/local-path-provisioner #### GPU Not Detected ```bash -# Check GPU operator pods -kubectl get pods -n gpu-operator - -# All pods should be Running -# If not, check logs: -kubectl logs -n gpu-operator deployment/gpu-operator +# Check NVIDIA device plugin pods +kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds # Check node GPU resources kubectl get nodes -o json | jq '.items[].status.capacity | select(.["nvidia.com/gpu"] != null)' @@ -1828,14 +1070,8 @@ nvidia-smi # Check if GPU nodes are tainted kubectl describe node | grep Taints -# Should have: -# nvidia.com/gpu=true:NoSchedule - # Check if pods have tolerations kubectl get pod -n ai-platform -o yaml | grep -A5 tolerations - -# Manually label GPU node if needed -kubectl label nodes nvidia.com/gpu=true --overwrite ``` ### Application Issues @@ -1852,86 +1088,18 @@ kubectl describe aiplatform -n ai-platform # Check operator logs kubectl logs -n splunk-ai-operator-system \ deployment/splunk-ai-operator-controller-manager - -# Common issues: -# 1. Missing dependencies (MinIO, Splunk) -kubectl get all -n minio-system -kubectl get standalone -n ai-platform - -# 2. Invalid configuration -kubectl get aiplatform -n ai-platform -o yaml -``` - -#### RayCluster Pods ImagePullBackOff - -```bash -# Check pod events -kubectl describe pod -n ai-platform | grep -A10 Events - -# Common causes: -# 1. Image doesn't exist -# Verify image exists in registry - -# 2. 
Missing imagePullSecrets -kubectl get pod -n ai-platform -o yaml | grep -A5 imagePullSecrets - -# 3. Invalid ECR token -kubectl get secret ecr-registry-secret -n ai-platform - -# Recreate ECR secret if expired (tokens expire after 12 hours) -kubectl delete secret ecr-registry-secret -n ai-platform -# Re-run installation or create manually -``` - -#### Weaviate Pod Stuck Pending - -```bash -# Check pod status -kubectl describe pod -n ai-platform - -# Common issue: No CPU nodes labeled -kubectl get nodes -l splunk.ai/workload-type=cpu - -# If no nodes found, label manually: -kubectl label nodes splunk.ai/workload-type=cpu - -# Or remove CPU nodeSelector from AIPlatform: -kubectl patch aiplatform -n ai-platform --type=json \ - -p='[{"op": "remove", "path": "/spec/cpuScheduler/nodeSelector"}]' ``` -### Performance Issues +### Session Logs -#### Slow Pod Startup +All install output is captured in timestamped log files: ```bash -# Check image pull time -kubectl describe pod -n ai-platform | grep -A20 Events - -# If pulling large images (GB+): -# 1. Pre-pull images to nodes -# 2. Use local registry mirror -# 3. 
Enable image pull parallelization +# View the latest log +ls -lt tools/cluster_setup/logs/ | head -5 -# Check node resources -kubectl top nodes -kubectl describe node | grep -A10 "Allocated resources" -``` - -#### High Memory Usage - -```bash -# Check memory usage per node -kubectl top nodes - -# Check memory usage per pod -kubectl top pods -n ai-platform - -# Check pod limits -kubectl get pods -n ai-platform -o json | \ - jq '.items[] | {name: .metadata.name, limits: .spec.containers[].resources.limits}' - -# If needed, adjust resource limits in AIPlatform CR +# Tail a running install +tail -f tools/cluster_setup/logs/k0s-install-*.log ``` ### Debugging Commands @@ -1952,15 +1120,6 @@ kubectl exec -it -n ai-platform -- /bin/bash # Check pod logs (all containers) kubectl logs -n ai-platform --all-containers=true --tail=100 - -# Check previous container logs (if crashed) -kubectl logs -n ai-platform --previous - -# Port forward for testing -kubectl port-forward -n ai-platform svc/ 8080:80 - -# Create debug pod -kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash ``` --- @@ -1969,7 +1128,7 @@ kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash ### Production Security Checklist -- [ ] Change default MinIO credentials +- [ ] Use strong object storage credentials (not defaults) - [ ] Enable TLS for all services - [ ] Configure network policies - [ ] Use unique SSH keys per environment @@ -1979,104 +1138,13 @@ kubectl run -it --rm debug --image=nicolaka/netshoot --restart=Never -- bash - [ ] Configure secrets encryption at rest - [ ] Set up backup and disaster recovery - [ ] Enable monitoring and alerting -- [ ] Harden SSH configuration -- [ ] Disable root SSH access +- [ ] Harden SSH configuration (disable root login) - [ ] Enable firewall on all nodes - [ ] Regular security updates -### Changing MinIO Credentials - -```bash -# 1. 
Create new secret -kubectl create secret generic minio-creds-new \ - --from-literal=accesskey='new-strong-access-key' \ - --from-literal=secretkey='new-strong-secret-key-123!' \ - --namespace=minio-system \ - --dry-run=client -o yaml | kubectl apply -f - - -# 2. Update MinIO deployment -kubectl patch deployment minio -n minio-system \ - --patch '{"spec":{"template":{"spec":{"containers":[{"name":"minio","env":[{"name":"MINIO_ROOT_USER","valueFrom":{"secretKeyRef":{"name":"minio-creds-new","key":"accesskey"}}},{"name":"MINIO_ROOT_PASSWORD","valueFrom":{"secretKeyRef":{"name":"minio-creds-new","key":"secretkey"}}}]}]}}}}' - -# 3. Update s3-secret in ai-platform namespace -kubectl create secret generic s3-secret \ - --from-literal=s3_access_key='new-strong-access-key' \ - --from-literal=s3_secret_key='new-strong-secret-key-123!' \ - --namespace=ai-platform \ - --dry-run=client -o yaml | kubectl apply -f - - -# 4. Restart affected pods -kubectl rollout restart deployment -n minio-system -kubectl delete pods -n ai-platform -l app=splunk -``` - -### Enabling TLS with Cert-Manager - -```bash -# 1. 
Create ClusterIssuer for Let's Encrypt -kubectl apply -f - </deployments/static/nvidia-device-plugin.yml` | +| local-path-provisioner | `https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.24/deploy/local-path-storage.yaml` | +| Prometheus Helm repo | `https://prometheus-community.github.io/helm-charts` | +| kube-prometheus-stack chart | `prometheus-community/kube-prometheus-stack` (via `helm install`) | +| OpenTelemetry Helm repo | `https://open-telemetry.github.io/opentelemetry-helm-charts` | +| OpenTelemetry Operator chart | `open-telemetry/opentelemetry-operator` (via `helm install`) | +| KubeRay Helm repo | `https://ray-project.github.io/kuberay-helm/` | +| KubeRay Operator chart | `kuberay/kuberay-operator` version `1.2.2` (via `helm install`) | + +### Downloads on All Nodes via SSH + +| What | URL / Source | +|------|-------------| +| iptables-nft | `dnf install -y iptables-nft` (RHEL/Fedora, if missing) | +| python3-pyyaml | `dnf install -y python3-pyyaml` or `apt-get install -y python3-yaml` or `pip3 install pyyaml` | +| k0s binary | `curl -sSLf https://get.k0s.sh | sudo sh` (if not already installed) | + +### Downloads on GPU Worker Nodes via SSH + +| What | URL / Source | +|------|-------------| +| Kernel headers | `dnf/yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)` or `apt-get install linux-headers-$(uname -r)` | +| NVIDIA GPU driver (AL2023) | Repo: `https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo` | +| NVIDIA GPU driver (RHEL 9/10) | Repo: `https://developer.download.nvidia.com/compute/cuda/repos/rhel{9,10}/x86_64/...` | +| NVIDIA GPU driver (Ubuntu) | `https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb` + `nvidia-driver-550` | +| EPEL for dkms (RHEL 10) | `https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm` | +| NVIDIA Container Toolkit | Repo: 
`https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo`, GPG: `https://nvidia.github.io/libnvidia-container/gpgkey` | + +### Container Images Pulled by Kubernetes at Runtime + +These images are pulled from registries when pods are scheduled. Pre-pull for air-gapped environments. + +| Image | Default Source | +|-------|---------------| +| Splunk AI Operator | ECR or configured registry | +| Ray Head / Ray Worker GPU | ECR or configured registry | +| Weaviate | `docker.io/semitechnologies/weaviate:...` | +| SAIA API v1 / v2 / Data Loader | ECR or configured registry | +| Nginx | `docker.io/library/nginx:1.27-alpine` | +| Fluent Bit | `docker.io/fluent/fluent-bit:1.9.6` | +| OpenTelemetry Collector | `docker.io/otel/opentelemetry-collector-contrib:0.122.1` | +| Splunk Enterprise | ECR or configured registry | +| Splunk Operator | `docker.io/splunk/splunk-operator:3.0.0` | +| Prometheus, Grafana, Alertmanager | Pulled by kube-prometheus-stack Helm chart | +| KubeRay Operator | `quay.io/kuberay/operator:v1.2.2` | +| OpenTelemetry Operator | Pulled by opentelemetry-operator Helm chart | +| cert-manager (controller, webhook, cainjector) | Pulled by cert-manager manifest | +| NVIDIA device plugin | Pulled by DaemonSet manifest | +| local-path-provisioner | Pulled by provisioner manifest | + +--- + ## Migration Guide ### From EKS to k0s @@ -2117,10 +1247,7 @@ If you're migrating from an existing EKS deployment: # Export AIPlatform CR kubectl get aiplatform -n ai-platform -o yaml > aiplatform-backup.yaml -# Export Splunk Standalone -kubectl get standalone -n ai-platform -o yaml > splunk-backup.yaml - -# Backup MinIO/S3 data +# Backup S3 data aws s3 sync s3://my-ai-bucket ./s3-backup/ ``` @@ -2129,23 +1256,13 @@ aws s3 sync s3://my-ai-bucket ./s3-backup/ CONFIG_FILE=./k0s-config.yaml ./k0s_cluster_with_stack.sh install ``` -**3. Restore Data to MinIO** +**3. 
Restore Data to Object Storage** ```bash -# Copy data to MinIO -mc mirror ./s3-backup/ k0s-minio/ai-platform-bucket/ -``` - -**4. Update AIPlatform CR** -```yaml -# Change objectStorage from S3 to MinIO -objectStorage: - path: s3://ai-platform-bucket/artifacts - endpoint: http://minio.minio-system.svc.cluster.local:9000 - region: us-east-1 - secretRef: s3-secret +# Copy data to your S3-compatible endpoint +mc mirror ./s3-backup/ my-storage/ai-platform-bucket/ ``` -**5. Apply Resources** +**4. Apply Resources** ```bash kubectl apply -f aiplatform-backup.yaml ``` @@ -2169,52 +1286,12 @@ sudo k0s start --- -## Comparison with EKS - -| Feature | EKS | k0s | -|---------|-----|-----| -| **Infrastructure** | -| Control Plane | AWS Managed | Self-managed | -| Worker Nodes | EC2 Auto Scaling Groups | Manual or EC2 | -| High Availability | Multi-AZ | Multi-node etcd | -| **Storage** | -| Object Storage | S3 (managed) | MinIO (self-hosted) | -| Block Storage | EBS CSI | local-path/Longhorn | -| Storage Costs | Pay per GB | Included in nodes | -| **Networking** | -| CNI | AWS VPC CNI | Calico VXLAN | -| Load Balancer | AWS ELB/ALB | NodePort/MetalLB | -| Ingress | AWS ALB Controller | NGINX Ingress | -| **Security** | -| IAM Integration | IRSA for pods | Service accounts only | -| Encryption | KMS | Manual cert-manager | -| Network Isolation | VPC Security Groups | Calico policies | -| **Operations** | -| Upgrades | Automated | Manual | -| Monitoring | CloudWatch | Self-hosted Prometheus | -| Logging | CloudWatch Logs | Self-hosted Loki | -| Backup | AWS Backup | Manual scripts | -| **Cost** | -| Control Plane | $0.10/hour | Included | -| Worker Nodes | EC2 pricing | EC2 or free (on-prem) | -| Storage | S3 pricing | Included in nodes | -| Networking | Data transfer fees | Free (on-prem) | -| **Use Cases** | -| Production Cloud | ✅ Excellent | ⚠️ Possible | -| On-Premises | ❌ Not possible | ✅ Excellent | -| Air-Gapped | ❌ Not possible | ✅ Excellent | -| Cost Optimization | ⚠️ 
Can be expensive | ✅ Lower cost | -| Quick Testing | ✅ Fast setup | ✅ Fast setup | - ---- - ## Support and Resources ### Documentation - k0s Official Docs: https://docs.k0sproject.io/ - Splunk AI Operator: https://github.com/splunk/splunk-ai-operator -- MinIO Docs: https://min.io/docs/ - KubeRay: https://docs.ray.io/en/latest/cluster/kubernetes/ ### Getting Help @@ -2223,144 +1300,8 @@ sudo k0s start - **Splunk Community**: https://community.splunk.com/ - **k0s Slack**: https://k8slens.slack.com -### Contributing - -Contributions are welcome! Please: -1. Fork the repository -2. Create a feature branch -3. Submit a pull request - -### License - -See the main repository LICENSE file. - ---- - -## Appendix - -### Complete Config File Reference - -```yaml -# Full k0s-cluster-config.yaml with all options -cluster: - name: my-cluster # Cluster identifier - useExisting: auto # auto|force|never - region: us-west-2 # AWS region (EC2 mode) - sshUser: ubuntu # SSH username - sshKeyPath: ~/.ssh/key.pem # SSH private key - -nodes: - controllers: 1 # 1 or 3 for HA - cpuWorkers: 2 # For EC2 mode - gpuWorkers: 1 # For EC2 mode - existingIPs: - controllers: [] # Empty = create EC2 - workers: [] # Or list of IPs - -ec2: - vpcId: vpc-xxx # Required for EC2 - subnetId: subnet-xxx # Optional - keyName: my-key # AWS key pair name - -instanceTypes: - controller: t3.xlarge # 4 CPU, 16GB RAM - cpuWorker: m5.4xlarge # 16 CPU, 64GB RAM - gpuWorker: g5.2xlarge # 8 CPU, 24GB RAM, A10G GPU - -minio: - accessKey: admin # MinIO admin user - secretKey: password123 # MinIO admin password - bucket: ai-platform-data # Default bucket - -kubernetes: - namespace: ai-platform # AI Platform namespace - -splunk: - standaloneName: splunk-standalone # Splunk instance name - hecEndpoint: "" # Optional external HEC - hecToken: "" # Optional HEC token - index: ai-platform # Splunk index name - -ecr: - account: "123456789012" # AWS account ID - -imagePullSecrets: - secrets: [] # Manual secret names - 
autoCreateECR: true # Auto-create ECR secret - -aiplatform: - ray: - version: "2.9.0" - image: "rayproject/ray:2.9.0" - vectordb: - image: "semitechnologies/weaviate:1.28.0" - storageSize: "50Gi" - workers: - cpu: - minReplicas: 1 - maxReplicas: 5 - resourcesPerWorker: - cpu: "4" - memory: "16Gi" - gpu: - minReplicas: 0 - maxReplicas: 2 - resourcesPerWorker: - cpu: "8" - memory: "32Gi" - nvidia.com/gpu: "1" -``` - -### Environment Variables - -```bash -# Override config file location -CONFIG_FILE=./my-config.yaml - -# Skip confirmation prompts -AUTO_APPROVE=true - -# Use existing cluster -USE_EXISTING=force - -# Skip components -SKIP_MINIO=true -SKIP_GPU_OPERATOR=true -SKIP_PROMETHEUS=true -SKIP_OTEL=true - -# Debug mode -DEBUG=true -``` - -### Common Recipes - -**Minimal Test Cluster:** -```bash -# Single CPU node, no GPU -CONFIG_FILE=minimal.yaml ./k0s_cluster_with_stack.sh install -``` - -**Production Cluster:** -```bash -# 3 controllers (HA), 5 workers, GPU support -CONFIG_FILE=production.yaml ./k0s_cluster_with_stack.sh install -``` - -**Air-Gapped Cluster:** -```bash -# Pre-pull all images, no internet access -# See air-gapped setup guide -``` - -**Development Cluster:** -```bash -# Quick setup for testing -CONFIG_FILE=dev.yaml AUTO_APPROVE=true ./k0s_cluster_with_stack.sh install -``` - --- -**Version:** 1.0 -**Last Updated:** 2024 +**Version:** 3.0 +**Last Updated:** April 2026 **Maintainer:** Splunk AI Platform Team diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 39d6c5d..c6953e7 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -2237,8 +2237,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: 
^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -4866,15 +4866,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -4882,7 +4894,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -4930,6 +4943,152 @@ spec: type: string type: object type: array + v2: + description: |- + V2 configures the SAIA v2 deployment. v2 is always deployed alongside v1 behind nginx. + Users toggle Agent Mode (v1 vs v2) from the Splunk Settings UI. 
+ properties: + image: + description: Image is the container image for the v2 API pod + type: string + replicas: + default: 1 + description: Replicas is the number of v2 API replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the v2 + API pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + v2Worker: + description: V2Worker configures the v2 SAIA worker deployment (same + v2 image, command=run-worker.sh). + properties: + replicas: + default: 1 + description: Replicas is the number of worker replicas + format: int32 + minimum: 0 + type: integer + resources: + description: Resources defines the compute resources for the worker + pods + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object vectorDbUrl: description: VectorDbUrl specifies the URL or service name for the vector database @@ -5523,26 +5682,30 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: splunk/ai/ray/ray-head:build-17 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 - name: RELATED_IMAGE_RAY_WORKER - value: splunk/ai/ray/ray-worker-gpu:build-17 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: splunk/ai/saia/saia-api:build-1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 + - name: RELATED_IMAGE_SAIA_API_V2 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: splunk/ai/saia/saia-data-loader:build-1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 - name: SPLUNK_METRICS_INDEX_NAME value: _metrics - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR value: docker.io/otel/opentelemetry-collector-contrib:0.122.1 + - name: RELATED_IMAGE_NGINX + value: docker.io/library/nginx:1.27-alpine - name: MODEL_VERSION value: v0.3.14-36-g1549f5a - name: RAY_VERSION - value: 2.44.0 - image: splunk/ai/splunk-ai-operator:build-v1alpha1 + value: 2.53.0 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index d3738db..c0ed83a 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ 
b/tools/cluster_setup/cluster-config.yaml @@ -13,29 +13,50 @@ # ---------- Cluster Configuration ---------- cluster: - useExisting: false - name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) - region: "us-west-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) + useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) + name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) + region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) + # When true: require subnets (existing VPC). On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC). + preserveVpcOnDelete: false # Set true to keep VPC on delete and redeploy without recreating VPC - # If you donot provide any subnet information, eksctl will create a new VPC with public and private subnets automatically. + # To use an EXISTING VPC: provide subnets below; eksctl will not create a new VPC. Idempotent: cluster is only created if it does not exist. + # If you do not provide subnets, eksctl creates a new VPC and subnets automatically. 
# VPC Subnets - CHANGE ALL OF THESE to your actual subnet IDs # Find your subnets: aws ec2 describe-subnets --filters "Name=vpc-id,Values=vpc-xxxxx" --region us-west-2 - #subnets: - # private: # Private subnets (at least 2 in different AZs) - # - id: "subnet-1a2b3c4d5e6f7g8h" # CHANGE THIS: Your private subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-9h8g7f6e5d4c3b2a" # CHANGE THIS: Your private subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # public: # Public subnets (at least 2 in different AZs) - # - id: "subnet-a1b2c3d4e5f6g7h8" # CHANGE THIS: Your public subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-h8g7f6e5d4c3b2a1" # CHANGE THIS: Your public subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # - id: "subnet-1h2g3f4e5d6c7b8a" # OPTIONAL: Additional public subnet for HA - # az: "us-west-2c" # OPTIONAL: Third availability zone +# subnets: +# private: # Private subnets (at least 2 in different AZs) +# - id: "subnet-02734905b10e7ad5a" # CHANGE THIS: Your private subnet 1 +# az: "us-east-2b" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-0c1d7dc49788d11dc" # CHANGE THIS: Your private subnet 2 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-0f8f94998d65dfcd2" # CHANGE THIS: Your private subnet 3 +# az: "us-east-2a" +# public: # Public subnets (at least 2 in different AZs) +# - id: "subnet-0f0ea3b190a618540" # CHANGE THIS: Your public subnet 1 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-02b736130e7c2a787" # CHANGE THIS: Your public subnet 2 +# az: "us-east-2a" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-02c35a8cd0b5d90a5" # OPTIONAL: Additional public subnet for HA +# az: "us-east-2b" # OPTIONAL: Third availability zone # ---------- Node Groups ---------- +# +# GPU TYPE QUICK REFERENCE — set instanceType and 
defaultAcceleratorType (under aiPlatform) together: +# +# L40S (default): +# instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) +# defaultAcceleratorType: L40S +# capacityReservation: not required +# availabilityZones: not required +# +# H100: +# instanceType: p5.4xlarge (1x H100 GPU, 80 GB VRAM; capacity reservation required) +# defaultAcceleratorType: H100 +# capacityReservation: required — uncomment block below and set id + az +# availabilityZones: required — must match capacityReservation.az +# maxSize: must equal desiredCapacity (capacity reservations are fixed-size) +# nodeGroups: cpu: enabled: true # Set to false to skip CPU node group @@ -48,19 +69,46 @@ nodeGroups: gpu: enabled: true # Set to false to skip GPU nodes (saves cost) - instanceType: "g6e.12xlarge" # GPU instance type (g6e.12xlarge=4xL40S GPUs, g5.xlarge=1xA10G) + instanceType: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above desiredCapacity: 2 # Initial number of GPU nodes minSize: 2 # Minimum GPU nodes - maxSize: 4 # Maximum GPU nodes + maxSize: 4 # Maximum GPU nodes (set equal to desiredCapacity for H100) volumeSize: 1000 # EBS volume size per GPU node (GB) - larger for model storage volumeType: "gp3" # EBS volume type + # ── H100 ONLY ────────────────────────────────────────────────────────────── + # Capacity Reservation: required for P5/H100 instances (scarce capacity). + # Uncomment and fill in when defaultAcceleratorType is H100. + # capacityReservation: + # id: "cr-xxxxxxxxxxxxxxxxx" # CHANGE THIS: your capacity reservation ID + # az: "us-east-2c" # CHANGE THIS: AZ where the reservation exists + + # Availability Zones: lock GPU nodes to the AZ matching the capacity reservation. + # Uncomment and fill in when defaultAcceleratorType is H100. 
+ # availabilityZones: + # - "us-east-2c" # CHANGE THIS: must match capacityReservation.az + # ─────────────────────────────────────────────────────────────────────────── + # ---------- Storage Configuration ---------- +# Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: - s3Bucket: "my-company-ai-platform-bucket" # CHANGE THIS: Globally unique S3 bucket name + s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size + # Object store: aws (S3) or external S3-compatible (s3compat, minio, seaweedfs). No in-cluster install. + # - s3compat: generic S3 API (MinIO :9000, SeaweedFS S3 :8333, etc.) — AIPlatform path uses s3compat://bucket + # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://) + # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme) + objectStore: + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO API (9000) or SeaweedFS S3 gateway (8333) + auth: + rootUser: "" # CHANGE THIS: S3-compatible access key (or MinIO root user) + rootPassword: "" # CHANGE THIS: S3-compatible secret key (or MinIO root password) + # ---------- Container Images Configuration ---------- images: # ================================================================================== @@ -82,7 +130,7 @@ images: # # REQUIRED: Specify your private registry URL for custom images # Leave empty to use Docker Hub defaults for all images - registry: "1234567890.dkr.ecr.us-west-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + registry: 
"658391232643.dkr.ecr.us-east-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry # ================================================================================== # CONTAINER IMAGES - Specify paths (registry prefix auto-applied if needed) @@ -97,18 +145,17 @@ images: # Option 2: Full path (ignores registry prefix) # image: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" - image: "docker.io/splunk/splunk-ai-operator:0.1.0" + # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config) + #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8" + image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31" # Splunk Enterprise Images splunk: - # Option 1: Relative path (uses registry prefix) - # image: "splunk/splunk:10.2.0" - # Result: "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" - # - # Option 2: Full path (ignores registry prefix) - # image: "docker.io/myorg/splunk:10.2.0" - # Result: "docker.io/myorg/splunk:10.2.0" - image: "splunk/splunk:10-2-ai-custom" + # Splunk Enterprise image + # Default behavior: If no registry in path, uses Docker Hub + # "splunk/splunk:10.2.0" → Docker Hub + # "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" → ECR + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" # Splunk Operator image (optional - has default) # Default: "docker.io/splunk/splunk-operator:3.0.0" @@ -123,8 +170,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-17" - workerImage: "ml-platform/ray/ray-worker-gpu:build-17" + headImage: "ml-platform/ray/ray-head:build-008" + workerImage: "ml-platform/ray/ray-worker-gpu:build-008" # Weaviate Vector Database weaviate: @@ -136,8 +183,8 @@ images: # SAIA (Splunk AI Assistant) 
Images saia: # Relative paths - registry prefix auto-applied - apiImage: "ml-platform/saia/saia-api:build-1" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-1" + apiImage: "ml-platform/saia/saia-api:build-005" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" # Supporting Images fluentBit: @@ -146,6 +193,11 @@ images: # image: "fluent-bit:1.9.6" → uses registry prefix image: "docker.io/fluent/fluent-bit:1.9.6" + # OpenTelemetry Collector (use full URL so it is not rewritten to ECR) + otelCollector: + # Public image - full path so registry prefix is NOT applied; validation checks this URL + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # ---------- Operator Versions ---------- operators: ray: @@ -170,8 +222,11 @@ aiPlatform: rayWorker: "ray-worker-sa" # no change saiaService: "saia-service-sa" # no change - # Default accelerator type - defaultAcceleratorType: "L40S" + # Default accelerator type — must match a top-level key in instance.yaml. + # Must be changed in sync with nodeGroups.gpu.instanceType (see GPU TYPE QUICK REFERENCE above). 
+ # L40S → instanceType: g6e.12xlarge + # H100 → instanceType: p5.4xlarge (also uncomment capacityReservation + availabilityZones) + defaultAcceleratorType: "L40S" # Features to enable features: # no change diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 62e64ee..93df2bd 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -29,6 +29,7 @@ load_config() { CLUSTER_NAME="$(yq eval '.cluster.name' "$cfg")" REGION="$(yq eval '.cluster.region' "$cfg")" K8S_VERSION="$(yq eval '.cluster.k8sVersion' "$cfg")" + USE_EXISTING_CLUSTER="$(yq eval '.cluster.useExisting // false' "$cfg")" # Node groups ENABLE_CPU="$(yq eval '.nodeGroups.cpu.enabled' "$cfg")" @@ -47,10 +48,40 @@ load_config() { GPU_VOLUME_SIZE="$(yq eval '.nodeGroups.gpu.volumeSize' "$cfg")" GPU_VOLUME_TYPE="$(yq eval '.nodeGroups.gpu.volumeType' "$cfg")" + # GPU Availability Zones (optional - for capacity-constrained instance types like P5/H100) + GPU_AVAILABILITY_ZONES=() + while IFS= read -r az; do + [[ -n "$az" ]] && GPU_AVAILABILITY_ZONES+=("$az") + done < <(yq eval '.nodeGroups.gpu.availabilityZones[]' "$cfg" 2>/dev/null) + + # Capacity Reservation (optional - for H100/P5 instances) + GPU_CAPACITY_RESERVATION_ID="$(yq eval '.nodeGroups.gpu.capacityReservation.id' "$cfg" 2>/dev/null)" + GPU_CAPACITY_RESERVATION_AZ="$(yq eval '.nodeGroups.gpu.capacityReservation.az' "$cfg" 2>/dev/null)" + [[ "$GPU_CAPACITY_RESERVATION_ID" == "null" ]] && GPU_CAPACITY_RESERVATION_ID="" + [[ "$GPU_CAPACITY_RESERVATION_AZ" == "null" ]] && GPU_CAPACITY_RESERVATION_AZ="" + + # Cluster options + PRESERVE_VPC_ON_DELETE="$(yq eval '.cluster.preserveVpcOnDelete // false' "$cfg")" + # Storage S3_BUCKET="$(yq eval '.storage.s3Bucket' "$cfg")" STORAGE_CLASS="$(yq eval '.storage.storageClass' "$cfg")" VECTORDB_SIZE="$(yq eval '.storage.vectorDbSize' "$cfg")" + # Object storage: objectStore.type (aws | s3compat | minio 
| seaweedfs); default aws when unset + OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "aws"' "$cfg")" + OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // .storage.s3Bucket // "ai-platform"' "$cfg")" + OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$cfg")" + OBJ_STORE_NS="$(yq eval '.storage.objectStore.namespace // "minio"' "$cfg")" + _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$cfg")" + _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$cfg")" + # External S3-compatible only (no in-cluster MinIO install). True when type is s3compat, minio, or seaweedfs. + USE_EXTERNAL_OBJ_STORE="false" + case "${OBJ_STORE_TYPE}" in s3compat|minio|seaweedfs) USE_EXTERNAL_OBJ_STORE="true"; esac + MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" + MINIO_NS="${OBJ_STORE_NS}" + MINIO_BUCKET="${OBJ_STORE_BUCKET}" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" # AI Platform AI_NS="$(yq eval '.aiPlatform.namespace' "$cfg")" @@ -93,32 +124,44 @@ load_config() { FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")" - # Subnets - read as arrays (Bash 3.2 compatible) + # Subnets - read as arrays (support both cluster.subnets and top-level subnets) PRIVATE_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PRIVATE_SUBNETS+=("$subnet") - done < <(yq eval '.cluster.subnets.private[].id' "$cfg") + done < <(yq eval '.cluster.subnets.private[].id // .subnets.private[].id' "$cfg") PRIVATE_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PRIVATE_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.private[].az' "$cfg") + done < <(yq eval '.cluster.subnets.private[].az // .subnets.private[].az' "$cfg") PUBLIC_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PUBLIC_SUBNETS+=("$subnet") - done < <(yq eval '.cluster.subnets.public[].id' "$cfg") + 
done < <(yq eval '.cluster.subnets.public[].id // .subnets.public[].id' "$cfg") PUBLIC_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PUBLIC_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.public[].az' "$cfg") + done < <(yq eval '.cluster.subnets.public[].az // .subnets.public[].az' "$cfg") else # Fallback: simple grep-based parsing (less robust but works without yq) CLUSTER_NAME="$(grep 'name:' "$cfg" | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" REGION="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" K8S_VERSION="$(grep 'k8sVersion:' "$cfg" | sed 's/.*k8sVersion: *"\(.*\)".*/\1/')" + USE_EXISTING_CLUSTER="false" + PRESERVE_VPC_ON_DELETE="false" S3_BUCKET="$(grep 's3Bucket:' "$cfg" | sed 's/.*s3Bucket: *"\(.*\)".*/\1/')" + OBJ_STORE_TYPE="" + OBJ_STORE_BUCKET="${S3_BUCKET}" + OBJ_STORE_ENDPOINT="" + OBJ_STORE_NS="minio" + USE_EXTERNAL_OBJ_STORE="false" + MINIO_ENDPOINT="" + MINIO_NS="minio" + MINIO_BUCKET="ai-platform" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" AI_NS="$(grep 'namespace:' "$cfg" | grep -A2 'aiPlatform:' | tail -1 | sed 's/.*namespace: *"\(.*\)".*/\1/')" AI_PLATFORM_NAME="splunk-ai-stack" AI_STANDALONE_NAME="splunk-standalone" @@ -152,6 +195,9 @@ load_config() { GPU_MAX=4 GPU_VOLUME_SIZE=1000 GPU_VOLUME_TYPE="gp3" + GPU_AVAILABILITY_ZONES=() + GPU_CAPACITY_RESERVATION_ID="" + GPU_CAPACITY_RESERVATION_AZ="" SPLUNK_APP_LOCAL_PATH="" # Hardcoded subnets for fallback @@ -163,6 +209,7 @@ load_config() { ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" S3_PREFIXES=("artifacts/" "apps/" "tasks/") AI_BUCKET_POLICY_NAME="S3Access-${CLUSTER_NAME}-ai-platform" + AI_ECR_ONLY_POLICY_NAME="ECRAccess-${CLUSTER_NAME}-ai-platform" # IRSA for EBS CSI EBS_IRSA_ROLE_NAME="EBSCSIDriverRole-${CLUSTER_NAME}" @@ -762,7 +809,11 @@ generate_node_groups() { k8s.io/cluster-autoscaler/enabled: \"true\" k8s.io/cluster-autoscaler/${CLUSTER_NAME}: owned" fi 
- if [[ "$ENABLE_GPU" == "true" ]]; then + # H100 with capacity reservation: node group created separately via CloudFormation + # All other GPU types (L40S): standard eksctl managed node group + if [[ "$ENABLE_GPU" == "true" && "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + log "GPU nodes will be created separately with capacity reservation ${GPU_CAPACITY_RESERVATION_ID}" + elif [[ "$ENABLE_GPU" == "true" ]]; then nodes+=" - name: gpu-nodes instanceType: ${GPU_INSTANCE_TYPE} @@ -770,7 +821,17 @@ generate_node_groups() { minSize: ${GPU_MIN} maxSize: ${GPU_MAX} volumeSize: ${GPU_VOLUME_SIZE} - volumeType: ${GPU_VOLUME_TYPE} + volumeType: ${GPU_VOLUME_TYPE}" + # Lock to specific AZ when availabilityZones are specified + if [[ ${#GPU_AVAILABILITY_ZONES[@]} -gt 0 ]]; then + nodes+=" + availabilityZones:" + for az in "${GPU_AVAILABILITY_ZONES[@]}"; do + nodes+=" + - ${az}" + done + fi + nodes+=" tags: Name: ${CLUSTER_NAME}-gpu Environment: prod @@ -853,6 +914,174 @@ EOF create_cluster() { log "Creating EKS cluster..."; eksctl create cluster -f eks-cluster-config.yaml; ensure_kubeconfig; } +# Create GPU node group with Capacity Block using CloudFormation. +# Only called when DEFAULT_ACCELERATOR=H100 and GPU_CAPACITY_RESERVATION_ID is set. +create_gpu_nodegroup_with_capacity_block() { + if [[ "$DEFAULT_ACCELERATOR" != "H100" || -z "$GPU_CAPACITY_RESERVATION_ID" ]]; then + return 0 + fi + + log "Creating GPU node group with Capacity Block (H100)..." 
+ log " Reservation: ${GPU_CAPACITY_RESERVATION_ID} in ${GPU_CAPACITY_RESERVATION_AZ}" + + local stack_name="${CLUSTER_NAME}-gpu-capacity-block" + local cfn_template_file="/tmp/${stack_name}-template.yaml" + + # Get cluster info + local cluster_info vpc_id cluster_sg + cluster_info=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster') + vpc_id=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.vpcId') + cluster_sg=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.clusterSecurityGroupId') + log " VPC: ${vpc_id}, Security Group: ${cluster_sg}" + + # Get EKS GPU AMI + local ami_id + ami_id=$(aws ssm get-parameter \ + --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id" \ + --region "${REGION}" --query 'Parameter.Value' --output text) + log " AMI: ${ami_id}" + + # Get node IAM role created by eksctl for the CPU node group + local node_role_arn + node_role_arn=$(aws iam list-roles \ + --query "Roles[?contains(RoleName, '${CLUSTER_NAME}') && contains(RoleName, 'NodeInstanceRole')].Arn" \ + --output text | head -1) + log " Node Role: ${node_role_arn}" + + if [[ -z "$node_role_arn" || "$node_role_arn" == "None" ]]; then + err "Node role not found — ensure CPU node group was created first." 
+ fi + + # Find subnet in the capacity reservation AZ + local subnet_id + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + "Name=tag:Name,Values=*eksctl-${CLUSTER_NAME}*Private*" \ + --query 'Subnets[0].SubnetId' --output text) + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + --query 'Subnets[0].SubnetId' --output text) + fi + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + err "Subnet not found in ${GPU_CAPACITY_RESERVATION_AZ} for VPC ${vpc_id}" + fi + log " Subnet: ${subnet_id}" + + # Generate CloudFormation template + cat > "${cfn_template_file}" </dev/null || echo "NOT_EXISTS") + + if [[ "$stack_status" == "CREATE_COMPLETE" || "$stack_status" == "UPDATE_COMPLETE" ]]; then + log "GPU node group already exists and is healthy — skipping." + rm -f "${cfn_template_file}"; return 0 + elif [[ "$stack_status" != "NOT_EXISTS" ]]; then + log "Deleting ${stack_status} stack before retry..." 
+ aws cloudformation delete-stack --stack-name "${stack_name}" --region "${REGION}" + aws cloudformation wait stack-delete-complete --stack-name "${stack_name}" --region "${REGION}" || true + fi + + aws cloudformation deploy \ + --template-file "${cfn_template_file}" \ + --stack-name "${stack_name}" \ + --region "${REGION}" \ + --parameter-overrides \ + ClusterName="${CLUSTER_NAME}" \ + ReservationId="${GPU_CAPACITY_RESERVATION_ID}" \ + SubnetId="${subnet_id}" \ + NodeRoleArn="${node_role_arn}" \ + SecurityGroupId="${cluster_sg}" \ + AmiId="${ami_id}" \ + InstanceType="${GPU_INSTANCE_TYPE}" \ + VolumeSize="${GPU_VOLUME_SIZE}" \ + DesiredCapacity="${GPU_DESIRED}" \ + --capabilities CAPABILITY_IAM \ + --no-fail-on-empty-changeset + + rm -f "${cfn_template_file}" + + local final_status + final_status=$(aws cloudformation describe-stacks --stack-name "${stack_name}" --region "${REGION}" \ + --query 'Stacks[0].StackStatus' --output text) + if [[ "$final_status" != "CREATE_COMPLETE" && "$final_status" != "UPDATE_COMPLETE" ]]; then + err "CloudFormation stack failed: ${final_status}. Check: aws cloudformation describe-stack-events --stack-name ${stack_name} --region ${REGION}" + fi + + log "GPU node group with Capacity Block created successfully." + log "Waiting for nodes to join cluster..." + sleep 30 + kubectl get nodes -l nvidia.com/gpu=true 2>/dev/null || log "(Nodes may still be joining...)" +} + ensure_oidc() { log "Ensuring IAM OIDC provider is associated..." @@ -1002,6 +1231,7 @@ ensure_ebs_irsa_role() { # Create IRSA for EBS CSI using eksctl (handles role creation, trust policy, and SA annotation) eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${EBS_NS}" \ --name "${EBS_SA}" \ --role-name "${EBS_IRSA_ROLE_NAME}" \ @@ -1086,6 +1316,7 @@ install_cluster_autoscaler() { log "Installing Cluster Autoscaler with IRSA..." 
eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --name "${AUTOSCALER_SA}" \ --namespace "${AUTOSCALER_NS}" \ --role-name "${AUTOSCALER_ROLE_NAME}" \ @@ -1134,6 +1365,35 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } +# ---------- External S3-compatible object storage (credentials only; no in-cluster install) ---------- +ensure_s3compat_credentials() { + # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs). + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then + return 0 + fi + + log "Object store type is ${OBJ_STORE_TYPE}; creating credentials secret for external S3-compatible storage." + if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 + fi + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "External S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + return 1 + fi + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ External S3-compatible credentials secret ${AI_NS}/${secret_name} ready" +} + # ---------- OTEL Operator + contrib collector (idempotent) ---------- install_otel_operator_and_contrib_collector() { log "Installing OpenTelemetry Operator (Helm)..." 
@@ -1328,6 +1588,62 @@ EOF printf "%s" "$arn" } +# ECR-only policy for IRSA when using MinIO (no S3) - allows pulling images from ECR +ensure_ecr_only_policy() { + local name="${AI_ECR_ONLY_POLICY_NAME}" + local expected_arn="arn:aws:iam::${ACCOUNT_ID}:policy/${name}" + if aws iam get-policy --policy-arn "$expected_arn" >/dev/null 2>&1; then + printf "%s" "$expected_arn" + return 0 + fi + local arn + arn="$(get_policy_arn_by_name "$name")" + if [[ -z "$arn" ]]; then + log "Creating IAM policy ${name} (ECR read-only, for MinIO-only mode)" + local pd; pd="$(mktemp)"; TMP_FILES+=("$pd") + cat > "$pd" <<'ECRPOL' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ECRAuth", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*" + }, + { + "Sid": "ECRPull", + "Effect": "Allow", + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage" + ], + "Resource": "arn:aws:ecr:*:*:repository/*" + } + ] +} +ECRPOL + local create_out rc + set +e + create_out="$(aws iam create-policy --policy-name "${name}" --policy-document "file://${pd}" --query 'Policy.Arn' --output text 2>&1)" + rc=$? + set -e + if (( rc == 0 )); then + arn="$(normalize_arn "$create_out")" + else + if grep -qi 'EntityAlreadyExists' <<<"$create_out"; then + arn="$(get_policy_arn_by_name "$name")" + else + err "Failed to create IAM policy ${name}: $create_out" + fi + fi + fi + arn="$(normalize_arn "$arn")" + [[ -z "$arn" ]] && err "Failed to resolve ARN for policy ${name}" + printf "%s" "$arn" +} + # ------- IRSA helpers: ensure & validate ------- generate_irsa_trust_policy() { local ns="$1" sa="$2" @@ -1386,6 +1702,18 @@ ensure_irsa_for_sa() { local sa="$1" ns="$2" policy_arn_raw="${3:-}" local role="IRSA-${CLUSTER_NAME}-${sa}" + # Fail fast if kubectl cannot reach the cluster (e.g. 
wrong KUBECONFIG or context) + local kerr + kerr="$(kubectl get ns "${ns}" 2>&1)" || true + if echo "${kerr}" | grep -q "connection refused\|localhost:8080\|dial tcp.*8080"; then + err "kubectl cannot reach the cluster (API server connection refused). \ +Fix: run 'aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}' and ensure KUBECONFIG (if set) points to that file. \ +Then re-run this script." + fi + if ! kubectl get ns "${ns}" >/dev/null 2>&1; then + err "Cannot access namespace ${ns} (kubectl get ns failed). Ensure the cluster is reachable and the namespace exists." + fi + # Resolve/repair policy ARN if invalid local policy_arn; policy_arn="$(normalize_arn "$policy_arn_raw")" if [[ -z "$policy_arn" || $policy_arn != arn:aws:iam::* ]]; then @@ -1401,6 +1729,7 @@ ensure_irsa_for_sa() { log "Ensuring IRSA (role ${role}) for ${ns}/${sa} with policy ${policy_arn}" eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${ns}" \ --name "${sa}" \ --role-name "${role}" \ @@ -1454,28 +1783,34 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create IRSA for Splunk Standalone (recommended approach) + # IRSA for Splunk Standalone: S3 bucket policy when using AWS S3, ECR-only when using external S3-compatible log "Setting up IRSA for Splunk Standalone service account..." 
- local policy_arn; policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + local policy_arn + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + policy_arn="$(ensure_ecr_only_policy)" + else + policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + fi ensure_irsa_for_sa "${STANDALONE_SA}" "${AI_NS}" "${policy_arn}" - # DEPRECATED: Create s3-secret using AWS credentials - # This is legacy approach - IRSA above is preferred, but Splunk Operator may still require the secret - log "Creating s3-secret for Splunk Standalone (fallback if IRSA not fully supported)..." - if resolve_aws_creds_for_secret 2>/dev/null; then - local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" - if [[ -n "$ak" && -n "$sk" ]]; then - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${ak}" \ - --from-literal=s3_secret_key="${sk}" \ - $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ - --dry-run=client -o yaml | kubectl apply -f - - log "✓ Created s3-secret with explicit credentials" + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then + # Create s3-secret for Standalone when using S3 (fallback if IRSA not fully supported) + log "Creating s3-secret for Splunk Standalone (S3 mode)..." + if resolve_aws_creds_for_secret 2>/dev/null; then + local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" + if [[ -n "$ak" && -n "$sk" ]]; then + kubectl -n "${AI_NS}" create secret generic s3-secret \ + --from-literal=s3_access_key="${ak}" \ + --from-literal=s3_secret_key="${sk}" \ + $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ + --dry-run=client -o yaml | kubectl apply -f - + log "✓ Created s3-secret with explicit credentials" + else + warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." 
+ fi else - warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." + warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." fi - else - warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." fi cat <<'YAML' | kubectl -n "${AI_NS}" apply -f - @@ -1497,7 +1832,47 @@ data: sslPassword: password YAML - cat </dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi done @@ -2445,6 +2870,13 @@ preflight_env() { pf_header "Subnets exist" # Check if subnets are provided (arrays may be empty) local subnet_count=$((${#PRIVATE_SUBNETS[@]} + ${#PUBLIC_SUBNETS[@]})) + if [[ "${PRESERVE_VPC_ON_DELETE}" == "true" ]]; then + if [[ ${#PRIVATE_SUBNETS[@]} -lt 2 ]]; then + pf_fail "cluster.preserveVpcOnDelete is true: you must specify at least 2 private subnets under cluster.subnets.private so the cluster uses an existing VPC (VPC will not be deleted on 'delete')." + else + pf_ok "Preserve VPC on delete: using existing VPC (subnets specified); VPC will not be deleted when you run delete." + fi + fi if [[ $subnet_count -eq 0 ]]; then pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically" else @@ -2668,11 +3100,20 @@ add_ecr_permissions_to_role() { # ---------- Orchestrator for AI Platform setup ---------- install_ai_platform_stack() { log "=== Setting up Splunk AI Platform stack ===" - ensure_s3_bucket_and_prefixes - ensure_s3_upload_splunk_app + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA." 
+ else + ensure_s3_bucket_and_prefixes + ensure_s3_upload_splunk_app + fi ensure_namespace "${AI_NS}" - local policy_arn; policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + local policy_arn + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + policy_arn="$(ensure_ecr_only_policy)" + else + policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + fi ensure_irsa_for_sa "${RAY_HEAD_SA}" "${AI_NS}" "${policy_arn}" ensure_irsa_for_sa "${RAY_WORKER_SA}" "${AI_NS}" "${policy_arn}" @@ -2698,7 +3139,14 @@ install_ai_platform_stack() { } # ---------- CREATE / RECONCILE / DELETE FLOWS ---------- -create_cluster_flow() { create_cluster_config; create_cluster; } +create_cluster_flow() { + create_cluster_config + create_cluster + # H100 with capacity reservation: eksctl cannot manage these nodes — create via CloudFormation + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + create_gpu_nodegroup_with_capacity_block + fi +} reconcile_flow() { ensure_oidc @@ -2709,8 +3157,19 @@ reconcile_flow() { install_cluster_autoscaler install_nvidia_device_plugin uncordon_ready_nodes + # H100 with capacity reservation: create GPU node group if not already present + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + local gpu_node_count + gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu=true --no-headers 2>/dev/null | wc -l | tr -d ' ') + if [[ "$gpu_node_count" -lt 1 ]]; then + create_gpu_nodegroup_with_capacity_block + else + log "Found ${gpu_node_count} H100 GPU node(s) — skipping capacity block creation." 
+ fi + fi install_kube_prometheus install_cert_manager + ensure_s3compat_credentials install_otel_operator_and_contrib_collector install_ray_operator install_splunk_operator @@ -2722,11 +3181,16 @@ reconcile_flow() { # ---------- MAIN ---------- main_install() { - for t in aws eksctl kubectl helm git jq; do need "$t"; done + for t in aws eksctl kubectl helm git jq yq; do need "$t"; done # Load configuration from YAML file load_config + # Force region for all AWS CLI and eksctl commands + export AWS_DEFAULT_REGION="${REGION}" + export AWS_REGION="${REGION}" + log "Using AWS Region: ${REGION}" + # Validate and configure container images validate_image_config configure_images @@ -2750,9 +3214,16 @@ main_install() { pf_summary fi + # Idempotent: create cluster only if it does not exist. When cluster.useExisting is true, fail if cluster is missing. if ! cluster_exists; then + if [[ "${USE_EXISTING_CLUSTER}" == "true" ]]; then + err "cluster.useExisting is true but cluster '${CLUSTER_NAME}' was not found in ${REGION}. Create the cluster first or set useExisting: false." + exit 1 + fi create_cluster_flow ensure_kubeconfig + else + log "Cluster ${CLUSTER_NAME} already exists; skipping cluster creation (idempotent)." fi preflight_api_connectivity diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml new file mode 100644 index 0000000..9faa669 --- /dev/null +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -0,0 +1,186 @@ +# =================================================================== +# k0s Cluster Configuration for Splunk AI Platform +# =================================================================== +# Mirrors cluster-config.yaml (EKS) but adapted for k0s on bare-metal / EC2. +# +# Quick Start: +# 1. Copy: cp k0s-cluster-config.yaml my-k0s-config.yaml +# 2. Edit: vi my-k0s-config.yaml +# 3. Replace all values marked with "CHANGE THIS" +# 4. 
Run: CONFIG_FILE=./my-k0s-config.yaml ./k0s_cluster_with_stack.sh install +# =================================================================== + +# ---------- Cluster Configuration ---------- +cluster: + name: airgap-cluster + # region: us-east-2 # Ignored for on-prem, but required in config + sshUser: ec2-user # CHANGE THIS: SSH user for remote nodes + sshKeyPath: ~/.ssh/id_rsa # CHANGE THIS: Path to SSH private key + +# ---------- Node Configuration ---------- +nodes: + controllers: 1 + cpuWorkers: 1 # Not used with existingIPs + gpuWorkers: 2 # Not used with existingIPs + + existingIPs: + controllers: + - 10.0.0.1 # CHANGE THIS: Your controller server IP + workers: + - 10.0.0.2 # CHANGE THIS: CPU worker 1 + - 10.0.0.3 # CHANGE THIS: GPU worker 1 + - 10.0.0.4 # CHANGE THIS: GPU worker 2 + +# ---------- Storage Configuration ---------- +# Prerequisites (must be provisioned BEFORE running the installer): +# - /var/lib/k0s must have at least 500 GB free on GPU workers +# - /var/lib/k0s must have at least 200 GB free on CPU workers +# - /var/lib/k0s must have at least 100 GB free on controllers +# If using a dedicated disk, mount it at /var/lib/k0s before running this script. +# +# Object storage: AWS S3 or external S3-compatible (no in-cluster MinIO install for external). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). +storage: + storageClass: "local-path" # Storage class for Kubernetes PVCs (k0s default: local-path; gp3/gp2/io1/io2 are AWS EBS classes from the EKS config) + vectorDbSize: "50Gi" # VectorDB persistent volume size + + # Minimum available disk space (GB) on /var/lib/k0s per node role. + # The installer checks these thresholds at preflight and fails if not met. + # Override to lower values only if you know your workload footprint is smaller.
+ # minimumDiskSpace: + # controller: 100 # k0s control plane, kine/etcd, container images + # cpuWorker: 200 # weaviate, saia-api, data-loader, fluent-bit + # gpuWorker: 500 # model weights (60-240 GB each), ray-worker-gpu image (~30 GB) + + objectStore: + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + # endpoint: "http://3.144.157.201:8333" # SeaweedFS (deprecated — see comment above) + endpoint: "http://10.0.0.5:9000" # CHANGE THIS: MinIO/SeaweedFS S3 API endpoint + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" + +# ---------- Container Images Configuration ---------- +images: + # Registry prefix - applied to images without a full registry path + registry: "" # CHANGE THIS: Your ECR/Docker/Harbor registry (e.g. 123456789012.dkr.ecr.us-east-2.amazonaws.com) + + operator: + image: "splunk-ai-operator:latest" # CHANGE THIS: Your operator image + + splunk: + image: "splunk/splunk:10.2.0" # CHANGE THIS: Your Splunk Enterprise image + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + + ray: + headImage: "ml-platform/ray/ray-head:build-v2-010" + workerImage: "ml-platform/ray/ray-worker-gpu:build-v2-010" + + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" + + saia: + apiImage: "ml-platform/saia/saia-api:build-v2-012" + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-012" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-012" + + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + + # Reverse proxy used by the SAIA reconciler to route v1 / v2 requests by + # path. Consumed via RELATED_IMAGE_NGINX. Point this at an internal mirror + # for airgapped clusters. 
+ nginx: + image: "docker.io/library/nginx:1.27-alpine" + +# ---------- Operator Versions ---------- +operators: + ray: + version: "v1.2.2" + modelVersion: "v0.3.14-36-g1549f5a" + rayVersion: "2.53.0" + + certManager: + installCRDs: true + + nvidia: + devicePluginVersion: "v0.17.3" + +# ---------- Kubernetes ---------- +kubernetes: + namespace: ai-platform + +# ---------- File Paths ---------- +files: + splunkOperator: "./splunk-operator-cluster.yaml" # CHANGE THIS: Path to Splunk Operator manifest + aiPlatform: "./artifacts.yaml" # CHANGE THIS: Path to AI Platform artifacts + +# ---------- Splunk Configuration ---------- +splunk: + standaloneName: splunk-standalone + +# ---------- AI Platform Configuration ---------- +aiPlatform: + name: "splunk-ai-stack" + defaultAcceleratorType: "L40S" + # defaultAcceleratorType: "H100" + + workerGroupConfig: + imageRegistry: "" + + # ---------- SAIA public exposure (OPTIONAL) ---------- + # The SAIA "public" Service (nginx reverse proxy in front of v1+v2 API pods) + # defaults to ClusterIP, meaning it is only reachable from inside the cluster. + # + # Two call patterns hit this Service: + # (A) Splunk Enterprise pod → saia-service (works with ClusterIP) + # (B) End user's browser → saia-service (needs external exposure) + # + # Pattern B is used by the v2 chat UI (/query streaming, conversations, + # feedback, admin endpoints). Without external exposure the v2 chat UI + # breaks for users, even though v1 one-shot SPL features still work. + # + # To DISABLE external exposure (use ClusterIP only), either: + # * Delete / comment-out the entire `serviceTemplate:` block below, OR + # * Set `type: ClusterIP` explicitly. + # Either is treated identically — the installer skips emitting serviceTemplate + # into the AIPlatform CR and the operator falls through to the ClusterIP + # default in reconcileSAIAService(). 
+ # + # To ENABLE external exposure for on-prem / airgap customers, NodePort is the + # recommended default: any k8s node IP + the configured nodePort yields a + # reachable endpoint from VPN-connected users. No cloud LB / cert-manager + # needed. Use LoadBalancer only if the customer runs MetalLB or a cloud LB. + serviceTemplate: + type: NodePort # ClusterIP | NodePort | LoadBalancer (omit block = ClusterIP) + nodePort: 30080 # Fixed NodePort (30000-32767). Required for stable DNS. + + features: + - name: "saia" + version: "1.1.0" + + cpuScheduling: + nodeSelector: {} + tolerations: [] + + gpuScheduling: + nodeSelector: {} + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + +# ---------- Image Pull Secrets ---------- +imagePullSecrets: + secrets: + - ecr-registry-secret + autoCreateECR: true + +ecr: + account: "" # CHANGE THIS: Your AWS account ID (e.g. 123456789012) + region: us-east-2 # CHANGE THIS: Your AWS region diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 1e65fd1..2adcffe 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4,14 +4,15 @@ set -euo pipefail # ============================================================================= # k0s Cluster Setup Script for Splunk AI Platform # ============================================================================= -# Mirrors eks_cluster_with_stack.sh functionality but for k0s clusters -# Supports: -# 1. On-prem/baremetal: Use customer-provided IP addresses -# 2. AWS EC2: Automatically create EC2 instances for testing +# Deploys a k0s cluster on customer-provided (on-prem / baremetal) nodes. +# Requires existingIPs in the config YAML (controller + worker IPs). 
# ============================================================================= -# --- Unset conflicting AWS credentials --- -unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_PROFILE 2>/dev/null || true +# --- AWS credentials handling --- +# Don't unset AWS credentials - they may be needed for ECR access in on-prem/air-gapped scenarios +# The original unset was to prevent conflicts, but it breaks SSO/assumed-role credentials +# If you need to clear credentials, do it explicitly before running the script +# unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_PROFILE 2>/dev/null || true # --- Non-interactive setup --- export AWS_PAGER="" @@ -26,6 +27,13 @@ export LANG=C LC_ALL=C # ====== CONFIG FILE LOCATION ====== CONFIG_FILE="${CONFIG_FILE:-$(dirname "$0")/k0s-cluster-config.yaml}" +# ====== SESSION LOG ====== +LOG_DIR="${LOG_DIR:-$(dirname "$0")/logs}" +mkdir -p "${LOG_DIR}" +LOG_FILE="${LOG_DIR}/k0s-install-$(date '+%Y-%m-%d_%H-%M-%S').log" +exec > >(tee -a "${LOG_FILE}") 2>&1 +echo "[LOG] Session log: ${LOG_FILE}" + # ====== COLORS & LOGGING ====== log() { echo -e "\033[1;36m[INFO]\033[0m $*" >&2; } warn() { echo -e "\033[1;33m[WARN]\033[0m $*" >&2; } @@ -45,7 +53,7 @@ helm_retry() { set -e if (( rc == 0 )); then printf "%s\n" "$out"; return 0; fi # Check for transient errors that should be retried - if grep -qiE 'timed out|operation timed out|i/o timeout|connection reset|TLS handshake timeout|could not get information about the resource' <<<"$out"; then + if grep -qiE 'timed out|operation timed out|i/o timeout|connection reset|TLS handshake timeout|could not get information about the resource|context deadline exceeded|not ready' <<<"$out"; then warn "Helm transient error (attempt $i/$tries). 
Retrying in ${backoff}s…" warn "$out" sleep "$backoff"; backoff=$(( backoff*2 )); (( i++ )) @@ -107,23 +115,39 @@ load_config() { SSH_USER=$(yq eval '.cluster.sshUser' "${CONFIG_FILE}" 2>/dev/null || echo "ubuntu") SSH_KEY_PATH=$(yq eval '.cluster.sshKeyPath' "${CONFIG_FILE}" 2>/dev/null || echo "") - # EC2 configuration (if creating instances) - VPC_ID=$(yq eval '.ec2.vpcId' "${CONFIG_FILE}" 2>/dev/null || echo "") - SUBNET_ID=$(yq eval '.ec2.subnetId' "${CONFIG_FILE}" 2>/dev/null || echo "") - KEY_NAME=$(yq eval '.ec2.keyName' "${CONFIG_FILE}" 2>/dev/null || echo "") + # Validate existingIPs are provided (mandatory for on-prem) + if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then + err "nodes.existingIPs.controllers must be set in config YAML — this script requires pre-provisioned nodes" + fi CONTROLLER_COUNT=$(yq eval '.nodes.controllers' "${CONFIG_FILE}" 2>/dev/null || echo "1") CPU_WORKER_COUNT=$(yq eval '.nodes.cpuWorkers' "${CONFIG_FILE}" 2>/dev/null || echo "2") GPU_WORKER_COUNT=$(yq eval '.nodes.gpuWorkers' "${CONFIG_FILE}" 2>/dev/null || echo "1") - CONTROLLER_INSTANCE_TYPE=$(yq eval '.instanceTypes.controller' "${CONFIG_FILE}" 2>/dev/null || echo "t3.xlarge") - CPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.cpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "m5.4xlarge") - GPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.gpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "g5.2xlarge") - - # MinIO configuration - MINIO_ACCESS_KEY=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin") - MINIO_SECRET_KEY=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") - MINIO_BUCKET=$(yq eval '.minio.bucket' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform-data") + # Storage configuration + STORAGE_CLASS=$(yq eval '.storage.storageClass // "local-path"' "${CONFIG_FILE}" 2>/dev/null || echo "local-path") + VECTORDB_SIZE=$(yq eval '.storage.vectorDbSize // "50Gi"' "${CONFIG_FILE}" 2>/dev/null || echo "50Gi") + + # 
Minimum disk space thresholds (GB) for preflight validation. + # Customers must ensure /var/lib/k0s has at least this much space before install. + MIN_DISK_CONTROLLER=$(yq eval '.storage.minimumDiskSpace.controller // "100"' "${CONFIG_FILE}" 2>/dev/null || echo "100") + MIN_DISK_CPU_WORKER=$(yq eval '.storage.minimumDiskSpace.cpuWorker // "200"' "${CONFIG_FILE}" 2>/dev/null || echo "200") + MIN_DISK_GPU_WORKER=$(yq eval '.storage.minimumDiskSpace.gpuWorker // "500"' "${CONFIG_FILE}" 2>/dev/null || echo "500") + # Strip non-numeric suffixes (e.g. "30Gi" -> "30") so arithmetic comparisons work + MIN_DISK_CONTROLLER="${MIN_DISK_CONTROLLER//[!0-9]/}" + MIN_DISK_CPU_WORKER="${MIN_DISK_CPU_WORKER//[!0-9]/}" + MIN_DISK_GPU_WORKER="${MIN_DISK_GPU_WORKER//[!0-9]/}" + + # Object storage: objectStore.type (aws | s3compat | minio | seaweedfs); default minio when unset + OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "minio"' "$CONFIG_FILE" 2>/dev/null || echo "minio")" + OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // "ai-platform-data"' "$CONFIG_FILE" 2>/dev/null || echo "ai-platform-data")" + OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$CONFIG_FILE" 2>/dev/null || echo "minioadmin")" + _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" + MINIO_BUCKET="${OBJ_STORE_BUCKET}" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" # Kubernetes namespace AI_NS=$(yq eval '.kubernetes.namespace' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform") @@ -131,13 +155,35 @@ load_config() { # Splunk configuration AI_STANDALONE_NAME=$(yq eval '.splunk.standaloneName' "${CONFIG_FILE}" 2>/dev/null || echo "splunk-standalone") + # Container images + IMAGE_REGISTRY="$(yq eval 
'.images.registry // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + OPERATOR_IMAGE="$(yq eval '.images.operator.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SPLUNK_IMAGE="$(yq eval '.images.splunk.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SPLUNK_OPERATOR_IMAGE="$(yq eval '.images.splunk.operatorImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_HEAD_IMAGE="$(yq eval '.images.ray.headImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_WORKER_IMAGE="$(yq eval '.images.ray.workerImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + WEAVIATE_IMAGE="$(yq eval '.images.weaviate.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_API_IMAGE="$(yq eval '.images.saia.apiImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_API_V2_IMAGE="$(yq eval '.images.saia.apiV2Image' "$CONFIG_FILE" 2>/dev/null || echo "")" + SAIA_DATALOADER_IMAGE="$(yq eval '.images.saia.dataLoaderImage' "$CONFIG_FILE" 2>/dev/null || echo "")" + FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + NGINX_IMAGE="$(yq eval '.images.nginx.image' "$CONFIG_FILE" 2>/dev/null || echo "")" + + # Operator versions + MODEL_VERSION="$(yq eval '.operators.ray.modelVersion // ""' "$CONFIG_FILE" 2>/dev/null || echo "")" + RAY_RUNTIME_VERSION="$(yq eval '.operators.ray.rayVersion // "2.44.0"' "$CONFIG_FILE" 2>/dev/null || echo "2.44.0")" + + # AI Platform CR configuration + DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + WORKER_IMAGE_REGISTRY=$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + + # NVIDIA device plugin version + NVIDIA_VERSION=$(yq eval '.operators.nvidia.devicePluginVersion // "v0.17.3"' "${CONFIG_FILE}" 2>/dev/null || echo "v0.17.3") + # ECR configuration (for private image repositories) ECR_ACCOUNT=$(yq eval '.ecr.account' 
"${CONFIG_FILE}" 2>/dev/null || echo "") - - # Get AWS account if using EC2 - if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then - ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "") - fi + ECR_REGION=$(yq eval '.ecr.region // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") # Auto-detect ECR account from AWS if not specified if [[ -z "${ECR_ACCOUNT}" ]] && aws sts get-caller-identity &>/dev/null; then @@ -156,6 +202,7 @@ load_config() { SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}" + log "Object storage: ${OBJ_STORE_TYPE}, endpoint=${OBJ_STORE_ENDPOINT:-not set}, bucket=${OBJ_STORE_BUCKET}" if [[ -n "${ECR_ACCOUNT}" ]]; then log "ECR Account: ${ECR_ACCOUNT}" fi @@ -173,6 +220,174 @@ load_config() { fi } +# ====== IMAGE HELPERS ====== +build_image_url() { + local registry="$1" + local image_path="$2" + if [[ "$image_path" =~ ^([a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(:[0-9]+)?)/.*:.+ ]]; then + echo "$image_path" + return 0 + fi + if [[ -n "$registry" && "$registry" != "null" ]]; then + echo "${registry}/${image_path}" + else + echo "$image_path" + fi +} + +validate_image_config() { + log "Validating image configuration..." 
+ + if [[ -z "$OPERATOR_IMAGE" || "$OPERATOR_IMAGE" == "null" ]]; then + err "REQUIRED: images.operator.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SPLUNK_IMAGE" || "$SPLUNK_IMAGE" == "null" ]]; then + err "REQUIRED: images.splunk.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$RAY_HEAD_IMAGE" || "$RAY_HEAD_IMAGE" == "null" ]]; then + err "REQUIRED: images.ray.headImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$RAY_WORKER_IMAGE" || "$RAY_WORKER_IMAGE" == "null" ]]; then + err "REQUIRED: images.ray.workerImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$WEAVIATE_IMAGE" || "$WEAVIATE_IMAGE" == "null" ]]; then + err "REQUIRED: images.weaviate.image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SAIA_API_IMAGE" || "$SAIA_API_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.apiImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SAIA_API_V2_IMAGE" || "$SAIA_API_V2_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.apiV2Image must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SAIA_DATALOADER_IMAGE" || "$SAIA_DATALOADER_IMAGE" == "null" ]]; then + err "REQUIRED: images.saia.dataLoaderImage must be specified in k0s-cluster-config.yaml" + fi + if [[ -z "$SPLUNK_OPERATOR_IMAGE" || "$SPLUNK_OPERATOR_IMAGE" == "null" ]]; then + SPLUNK_OPERATOR_IMAGE="docker.io/splunk/splunk-operator:3.0.0" + log "Using default Splunk Operator image: $SPLUNK_OPERATOR_IMAGE" + fi + if [[ -z "$FLUENT_BIT_IMAGE" || "$FLUENT_BIT_IMAGE" == "null" ]]; then + FLUENT_BIT_IMAGE="fluent/fluent-bit:1.9.6" + log "Using default Fluent Bit image: $FLUENT_BIT_IMAGE" + fi + if [[ -z "$OTEL_COLLECTOR_IMAGE" || "$OTEL_COLLECTOR_IMAGE" == "null" ]]; then + OTEL_COLLECTOR_IMAGE="otel/opentelemetry-collector-contrib:0.122.1" + log "Using default OpenTelemetry Collector image: $OTEL_COLLECTOR_IMAGE" + fi + if [[ -z "$NGINX_IMAGE" || "$NGINX_IMAGE" == "null" ]]; then + 
# Rewrite image references and version pins in the two operator manifests.
#
# Globals read:
#   SPLUNK_AI_FILE, SPLUNK_OPERATOR_FILE  - manifest paths, edited in place
#   IMAGE_REGISTRY and the *_IMAGE vars   - passed to build_image_url
#   MODEL_VERSION, RAY_RUNTIME_VERSION    - written as env values
#   OSTYPE                                - selects the BSD vs GNU `sed -i` form
# Side effects:
#   Creates "<manifest>.original" the first time it is missing, then restores
#   each manifest from that snapshot on every run before substituting, so the
#   result never depends on what an earlier run wrote.
# Depends on log, err and build_image_url, defined elsewhere in this script.
configure_images() {
    log "Configuring container images in manifest files..."

    local manifest
    for manifest in "$SPLUNK_AI_FILE" "$SPLUNK_OPERATOR_FILE"; do
        if [[ -f "${manifest}.original" ]]; then
            continue
        fi
        # Without either the manifest or its snapshot there is nothing to edit;
        # stop with a clear message instead of a bare `cp` failure.
        [[ -f "$manifest" ]] || err "Manifest not found and no pristine snapshot exists: ${manifest}"
        log "Creating backup: ${manifest}.original"
        cp "$manifest" "${manifest}.original"
    done

    log "Restoring from clean originals to ensure idempotent updates..."
    cp "${SPLUNK_AI_FILE}.original" "$SPLUNK_AI_FILE"
    cp "${SPLUNK_OPERATOR_FILE}.original" "$SPLUNK_OPERATOR_FILE"

    # BSD (macOS) sed requires an explicit backup-suffix argument after -i;
    # GNU (Linux) sed accepts -i on its own. An array keeps the empty string
    # as a distinct argv entry on macOS.
    local -a SED_INPLACE
    if [[ "$OSTYPE" == "darwin"* ]]; then
        SED_INPLACE=(sed -i "")
    else
        SED_INPLACE=(sed -i)
    fi

    # Escape what is special on the replacement side of the `s|...|...|`
    # commands below: backslash, `&` (whole match) and `|` (the delimiter).
    _ci_escape() {
        printf '%s' "$1" | sed 's/[\\&|]/\\&/g'
    }

    # _ci_set_env FILE NAME_REGEX VALUE
    # From the line matching "name: NAME_REGEX" through the next "value:" line,
    # replace the value with VALUE.
    _ci_set_env() {
        local file="$1" name_regex="$2" value
        value=$(_ci_escape "$3")
        "${SED_INPLACE[@]}" "/name: ${name_regex}/,/value:/ s|value:.*|value: ${value}|" "$file"
    }

    # _ci_set_image FILE LINE_REGEX IMAGE
    # Replace every "image: <LINE_REGEX>" line (case-insensitive) with IMAGE.
    # NOTE(review): the `I` flag is not part of POSIX sed; confirm the macOS
    # sed in use accepts it.
    _ci_set_image() {
        local file="$1" line_regex="$2" image
        image=$(_ci_escape "$3")
        "${SED_INPLACE[@]}" "s|image: ${line_regex}|image: ${image}|I" "$file"
    }

    log "Updating $SPLUNK_AI_FILE..."

    # Declared separately from the assignments so a failing build_image_url
    # is not hidden behind the exit status of `local`.
    local operator_full ray_head_full ray_worker_full weaviate_full
    local saia_api_full saia_api_v2_full saia_dataloader_full
    local fluent_bit_full otel_collector_full nginx_full
    operator_full=$(build_image_url "$IMAGE_REGISTRY" "$OPERATOR_IMAGE")
    ray_head_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_HEAD_IMAGE")
    ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE")
    weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE")
    saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE")
    saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE")
    saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE")
    fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE")
    otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE")
    # Nginx goes through the same call as every other image; per the original
    # author's note, build_image_url keeps a fully-qualified path such as
    # docker.io/library/nginx:... intact and only prefixes bare names.
    nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE")

    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_RAY_HEAD' "$ray_head_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_RAY_WORKER' "$ray_worker_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_WEAVIATE' "$weaviate_full"
    # The `$` anchor keeps this from also matching RELATED_IMAGE_SAIA_API_V2.
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_SAIA_API$' "$saia_api_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_SAIA_API_V2' "$saia_api_v2_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_POST_INSTALL_HOOK' "$saia_dataloader_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_FLUENT_BIT' "$fluent_bit_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_OTEL_COLLECTOR' "$otel_collector_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'RELATED_IMAGE_NGINX' "$nginx_full"
    _ci_set_env "$SPLUNK_AI_FILE" 'MODEL_VERSION' "$MODEL_VERSION"
    _ci_set_env "$SPLUNK_AI_FILE" 'RAY_VERSION' "$RAY_RUNTIME_VERSION"
    _ci_set_image "$SPLUNK_AI_FILE" '.*splunk.*ai.*operator.*' "$operator_full"

    log " ✓ Updated RELATED_IMAGE_RAY_HEAD: $ray_head_full"
    log " ✓ Updated RELATED_IMAGE_RAY_WORKER: $ray_worker_full"
    log " ✓ Updated RELATED_IMAGE_WEAVIATE: $weaviate_full"
    log " ✓ Updated RELATED_IMAGE_SAIA_API: $saia_api_full"
    log " ✓ Updated RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full"
    log " ✓ Updated RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full"
    log " ✓ Updated RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full"
    log " ✓ Updated RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full"
    log " ✓ Updated RELATED_IMAGE_NGINX: $nginx_full"
    log " ✓ Updated operator image: $operator_full"
    log " ✓ Updated MODEL_VERSION: $MODEL_VERSION"
    log " ✓ Updated RAY_VERSION: $RAY_RUNTIME_VERSION"

    log "Updating $SPLUNK_OPERATOR_FILE..."

    local splunk_full splunk_operator_full
    splunk_full=$(build_image_url "$IMAGE_REGISTRY" "$SPLUNK_IMAGE")
    splunk_operator_full=$(build_image_url "$IMAGE_REGISTRY" "$SPLUNK_OPERATOR_IMAGE")

    _ci_set_env "$SPLUNK_OPERATOR_FILE" 'RELATED_IMAGE_SPLUNK_ENTERPRISE' "$splunk_full"
    _ci_set_image "$SPLUNK_OPERATOR_FILE" '.*splunk.*operator.*' "$splunk_operator_full"

    log " ✓ Updated Splunk Enterprise image: $splunk_full"
    log " ✓ Updated Splunk Operator image: $splunk_operator_full"
    log "✓ All images configured successfully"
}
"${SSH_KEY_PATH}" ]] && pf_ok "SSH key: ${SSH_KEY_PATH}" || pf_fail "SSH key not found: ${SSH_KEY_PATH}" - else - pf_ok "Creating EC2 instances" - if command -v aws >/dev/null 2>&1; then - pf_ok "AWS CLI found" - [[ -n "${ACCOUNT_ID}" ]] && pf_ok "AWS Account: ${ACCOUNT_ID}" || pf_fail "Cannot get AWS account ID" - [[ -n "${VPC_ID}" ]] && pf_ok "VPC ID: ${VPC_ID}" || pf_fail "VPC ID not set" - [[ -n "${KEY_NAME}" ]] && pf_ok "EC2 Key name: ${KEY_NAME}" || pf_fail "EC2 key name not set" + pf_header "Object storage (customer-managed)" + pf_ok "Object storage type: ${OBJ_STORE_TYPE} (bucket=${OBJ_STORE_BUCKET})" + if [[ "${OBJ_STORE_TYPE}" == "seaweedfs" ]]; then + if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then + pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS." else - pf_fail "AWS CLI not found - required for EC2 instance creation" + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi + else + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" fi + [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)" + + pf_header "Infrastructure mode" + pf_ok "Using existing infrastructure (on-prem/baremetal)" + pf_ok "Controller IPs: ${EXISTING_CONTROLLER_IPS}" + pf_ok "Worker IPs: ${EXISTING_WORKER_IPS}" + [[ -n "${SSH_KEY_PATH}" && -f "${SSH_KEY_PATH}" ]] && pf_ok "SSH key: ${SSH_KEY_PATH}" || pf_fail "SSH key not found: ${SSH_KEY_PATH}" + + # Validate disk space on every node (requires SSH access) + preflight_check_node_storage pf_summary } @@ -242,307 +461,120 @@ scp_file() { fi } -# ====== EC2 INSTANCE CREATION ====== -create_security_group() { - log "Creating security group for k0s cluster..." 
# ====== PREPARE NODES (RHEL/Fedora compatibility + k0s binary) ======
# Run a preparation script on every node over SSH.
#
# Arguments: the IP address (or hostname) of each node to prepare.
# Globals read: SSH_USER, SSH_KEY_PATH (the -i option is only passed when
#   SSH_KEY_PATH is non-empty).
# Per node, the remote script:
#   - stops and disables firewalld when it is active,
#   - loads br_netfilter / overlay / nf_conntrack and persists them,
#   - tries to make python3 + PyYAML importable,
#   - installs k0s when `command -v k0s` finds nothing,
#   - symlinks /usr/local/bin/k0s to /usr/bin/k0s when only the former exists,
#   - exits non-zero when no k0s binary can be found afterwards.
# A failing node only produces a warning; the loop continues with the next
# node and the function itself does not propagate the failure.
prepare_nodes_for_k0s() {
    local node_ips=("$@")
    local node_ip
    log "Preparing ${#node_ips[@]} node(s) for k0s (OS compatibility + binary)..."
    for node_ip in "${node_ips[@]}"; do
        log " Preparing node ${node_ip}..."
        # The heredoc delimiter is quoted, so every ${...} below is expanded on
        # the remote host, not locally.
        ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            ${SSH_KEY_PATH:+-i "${SSH_KEY_PATH}"} "${SSH_USER}@${node_ip}" \
            bash -s <<'REMOTE_SCRIPT' || warn " Preparation had issues on ${node_ip}"
    # Disable firewalld if active (blocks k0s ports: 6443, 10250, 8472, etc.)
    if systemctl is-active firewalld >/dev/null 2>&1; then
        echo 'Disabling firewalld...'
        sudo systemctl stop firewalld
        sudo systemctl disable firewalld
    fi

    # Load kernel modules required by Calico and kube-proxy
    for mod in br_netfilter overlay nf_conntrack; do
        if ! lsmod | grep -q "^${mod} "; then
            sudo modprobe "${mod}" 2>/dev/null || echo "WARN: could not load kernel module ${mod}"
        fi
    done
    # Persist across reboots
    sudo mkdir -p /etc/modules-load.d
    printf 'br_netfilter\noverlay\nnf_conntrack\n' | sudo tee /etc/modules-load.d/k0s.conf >/dev/null

    # Ensure python3 + PyYAML are available (used for k0s config generation)
    if ! python3 -c 'import yaml' 2>/dev/null; then
        if command -v dnf >/dev/null 2>&1; then
            sudo dnf install -y python3-pyyaml 2>/dev/null || sudo pip3 install pyyaml 2>/dev/null || true
        elif command -v apt-get >/dev/null 2>&1; then
            sudo apt-get install -y python3-yaml 2>/dev/null || true
        fi
    fi

    # Install k0s binary if not present
    if ! command -v k0s >/dev/null 2>&1; then
        echo 'Installing k0s binary...'
        curl -sSLf https://get.k0s.sh | sudo sh
    fi

    # Ensure k0s is in sudo secure_path
    if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then
        sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s
    fi

    # The pipeline above reports the status of `sudo sh`, not of curl, and the
    # symlink step ends with status 0 when its condition is false. Check for
    # the binary explicitly so a failed install makes this script exit
    # non-zero and the caller's warning is actually emitted.
    if ! command -v k0s >/dev/null 2>&1 && [ ! -x /usr/local/bin/k0s ] && [ ! -x /usr/bin/k0s ]; then
        echo 'ERROR: k0s binary not found after installation attempt' >&2
        exit 1
    fi
REMOTE_SCRIPT
    done
}
log "Using AMI: ${ami_id}" - - # User data for k0s installation - write to temp file - local user_data_file="/tmp/k0s-userdata-$$.sh" - cat > "${user_data_file}" <<'EOF' -#!/bin/bash -set -ex -apt-get update -apt-get install -y curl wget jq -curl -sSLf https://get.k0s.sh | sh -EOF - TMP_FILES+=("${user_data_file}") - - # Create instances (arrays already declared globally at top of script) - CONTROLLER_IPS=() - CONTROLLER_PRIVATE_IPS=() - CONTROLLER_PUBLIC_IPS=() - WORKER_IPS=() - WORKER_PRIVATE_IPS=() - ALL_INSTANCE_IDS=() - - # Add existing instances to tracking arrays - if [[ -n "${existing_controllers}" ]]; then - for id in ${existing_controllers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - if [[ -n "${existing_cpu_workers}" ]]; then - for id in ${existing_cpu_workers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - if [[ -n "${existing_gpu_workers}" ]]; then - for id in ${existing_gpu_workers}; do - ALL_INSTANCE_IDS+=("${id}") - done - fi - - # Controllers - only create if needed - local controllers_to_create=$((CONTROLLER_COUNT - existing_controller_count)) - if [[ ${controllers_to_create} -gt 0 ]]; then - log "Creating ${controllers_to_create} additional controller(s)..." 
- for ((i=existing_controller_count; i/dev/null | tail -1 | tr -d ' ') + if [ -z \"\${avail_kb}\" ] || [ \"\${avail_kb}\" = \"Avail\" ]; then + avail_kb=\$(df --output=avail / 2>/dev/null | tail -1 | tr -d ' ') + fi + echo \$(( \${avail_kb:-0} / 1048576 )) + " 2>/dev/null || echo "0" + } + + # Check controller nodes + for ip in "${_ctrl_ips[@]}"; do + local avail + avail=$(_get_avail_gb "${ip}") + avail=$(echo "${avail}" | tr -d '[:space:]') + if [[ "${avail}" -ge "${MIN_DISK_CONTROLLER}" ]]; then + pf_ok "Controller ${ip}: ${avail} GB available (minimum: ${MIN_DISK_CONTROLLER} GB)" else - log "All ${GPU_WORKER_COUNT} GPU worker(s) already exist, skipping creation" + pf_fail "Controller ${ip}: ${avail} GB available — need at least ${MIN_DISK_CONTROLLER} GB on /var/lib/k0s" fi - fi - - log "Waiting for instances to be running..." - aws ec2 wait instance-running --region "${REGION}" --instance-ids "${ALL_INSTANCE_IDS[@]}" - - log "Waiting for instance status checks (this may take 3-5 minutes)..." - aws ec2 wait instance-status-ok --region "${REGION}" --instance-ids "${ALL_INSTANCE_IDS[@]}" || true - - log "Waiting additional time for SSH to be fully ready..." 
- sleep 60 - - # Get IPs - collect BOTH public and private IPs - # Use public IPs for SSH from local machine, private IPs for k0s internal communication - for id in "${ALL_INSTANCE_IDS[@]}"; do - local role - role=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ - --query 'Reservations[0].Instances[0].Tags[?Key==`Role`].Value' --output text) + done - # Get public IP for SSH access from local machine - local public_ip - public_ip=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ - --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) + # Check worker nodes (distinguish CPU vs GPU by index) + local widx=0 + for ip in "${_worker_ips[@]}"; do + local avail role min_required + avail=$(_get_avail_gb "${ip}") + avail=$(echo "${avail}" | tr -d '[:space:]') - # Get private IP for k0s internal communication - local private_ip - private_ip=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${id}" \ - --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text) + if [[ ${widx} -lt ${CPU_WORKER_COUNT} ]]; then + role="CPU worker" + min_required="${MIN_DISK_CPU_WORKER}" + else + role="GPU worker" + min_required="${MIN_DISK_GPU_WORKER}" + fi - # Use public IP for SSH, but store private IP for k0s config - if [[ "${role}" == "controller" ]]; then - CONTROLLER_IPS+=("${public_ip}") # For SSH from local machine - CONTROLLER_PRIVATE_IPS+=("${private_ip}") # For k0s internal communication - CONTROLLER_PUBLIC_IPS+=("${public_ip}") # For kubectl access and certificates - log "Controller - Public IP: ${public_ip}, Private IP: ${private_ip}" + if [[ "${avail}" -ge "${min_required}" ]]; then + pf_ok "${role} ${ip}: ${avail} GB available (minimum: ${min_required} GB)" else - WORKER_IPS+=("${public_ip}") # For SSH from local machine - WORKER_PRIVATE_IPS+=("${private_ip}") # For k0s internal communication - log "Worker - Public IP: ${public_ip}, Private IP: ${private_ip} (${role})" + pf_fail "${role} 
${ip}: ${avail} GB available — need at least ${min_required} GB on /var/lib/k0s" fi + widx=$((widx + 1)) done - - # Set SSH key path from EC2 key - SSH_KEY_PATH="${HOME}/.ssh/${KEY_NAME}.pem" } # ====== K0S CLUSTER INSTALLATION ====== @@ -553,20 +585,26 @@ install_k0s_cluster() { if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + log "Using existing infrastructure - IPs from config" fi - local controller_ip="${CONTROLLER_IPS[0]}" # Public IP for SSH - local controller_private_ip="${CONTROLLER_PRIVATE_IPS[0]}" # Private IP for k0s - local controller_public_ip="${CONTROLLER_PUBLIC_IPS[0]}" # Public IP for kubectl access + local controller_ip="${CONTROLLER_IPS[0]}" - log "Primary controller - Public IP: ${controller_public_ip}, Private IP: ${controller_private_ip}" + log "Primary controller IP: ${controller_ip}" + + # Prepare all nodes (firewalld, iptables, python3) + local all_ips=("${CONTROLLER_IPS[@]}") + if [[ ${#WORKER_IPS[@]} -gt 0 ]]; then + all_ips+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_ips[@]}" # Generate k0s config log "Generating k0s configuration..." ssh_exec "${controller_ip}" "k0s config create > /tmp/k0s.yaml" - # Configure k0s to use private IP for internal communication, add public IP to SANs for external access - log "Configuring k0s: Private IP ${controller_private_ip} for internal, Public IP ${controller_public_ip} for external access..." + # Configure k0s API with the controller IP for SANs and externalAddress + log "Configuring k0s with controller IP ${controller_ip}..." 
ssh_exec "${controller_ip}" "cat > /tmp/k0s-config-update.py <<'PYSCRIPT' import yaml @@ -574,7 +612,7 @@ import yaml with open('/tmp/k0s.yaml', 'r') as f: config = yaml.safe_load(f) -# Add SANs to API section - include BOTH private and public IPs +# Add the controller IP to SANs (for kubectl access and cluster communication) if 'spec' not in config: config['spec'] = {} if 'api' not in config['spec']: @@ -582,14 +620,10 @@ if 'api' not in config['spec']: if 'sans' not in config['spec']['api']: config['spec']['api']['sans'] = [] -# Add private IP (for internal cluster communication) -config['spec']['api']['sans'].append('${controller_private_ip}') -# Add public IP (for kubectl access from outside) -config['spec']['api']['sans'].append('${controller_public_ip}') +config['spec']['api']['sans'].append('${controller_ip}') -# CRITICAL: Use public IP for externalAddress so konnectivity-agents can connect -# konnectivity-agents run in pods and need to reach API server via routable address -config['spec']['api']['externalAddress'] = '${controller_public_ip}' +# Use the same IP for externalAddress so konnectivity-agents can connect +config['spec']['api']['externalAddress'] = '${controller_ip}' # Set Calico as network provider if 'network' not in config['spec']: @@ -611,16 +645,50 @@ PYSCRIPT" ssh_exec "${controller_ip}" "python3 /tmp/k0s-config-update.py" - log "Verifying k0s configuration includes public IP..." + log "Verifying k0s configuration includes controller IP..." ssh_exec "${controller_ip}" "grep -A3 'api:' /tmp/k0s.yaml | head -5" + # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${controller_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true + + # Safety gate: refuse to wipe if a live cluster with Ready nodes exists. 
+ # This prevents accidental data loss when the existing-cluster detection + # (useExisting) flakes due to an SSH timeout or transient k0s status error. + if ssh_exec "${controller_ip}" "sudo k0s kubectl get nodes --no-headers 2>/dev/null" 2>/dev/null | grep -q ' Ready'; then + err "k0s cluster on ${controller_ip} has Ready nodes — refusing to wipe. + Use 'delete' or 'clean-all' to tear down first, or set useExisting=auto in config." + fi + + # Clean stale k0s state from any previous run + ssh_exec "${controller_ip}" " + sudo systemctl stop k0scontroller 2>/dev/null || true + sudo systemctl reset-failed k0scontroller 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0scontroller.service 2>/dev/null || true + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service 2>/dev/null || true + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s 2>/dev/null || true + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + # Install k0s controller log "Installing k0s controller on ${controller_ip}..." ssh_exec "${controller_ip}" "sudo k0s install controller --config /tmp/k0s.yaml --enable-worker" ssh_exec "${controller_ip}" "sudo k0s start" - log "Waiting for controller to be ready (60s)..." - sleep 60 + log "Waiting for controller API server to be ready..." + local ctrl_retries=0 + while (( ctrl_retries < 60 )); do + if ssh_exec "${controller_ip}" "sudo k0s kubectl get --raw /healthz 2>/dev/null" &>/dev/null; then + log " ✓ Controller API server is ready (${ctrl_retries}s)" + break + fi + sleep 5 + ctrl_retries=$((ctrl_retries + 5)) + log " Waiting... ${ctrl_retries}/300s" + done # Generate worker token log "Generating worker join token..." @@ -633,8 +701,19 @@ PYSCRIPT" for worker_ip in "${WORKER_IPS[@]}"; do log " Installing k0s worker on ${worker_ip}..." 
+ # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${worker_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true + + # Clean stale k0sworker state from any previous run (service file, data dirs, systemd failed state) + ssh_exec "${worker_ip}" " + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service 2>/dev/null || true + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s /tmp/k0s-token 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + # Write token to temp file first (stdin pipe doesn't work reliably over SSH) - # Note: Token file must remain until worker bootstraps, so we don't delete it here if ssh_exec "${worker_ip}" "echo '${worker_token}' | sudo tee /tmp/k0s-token >/dev/null && sudo k0s install worker --token-file=/tmp/k0s-token"; then log " ✓ k0s installed on ${worker_ip}" else @@ -673,8 +752,21 @@ PYSCRIPT" warn "Some workers failed to install/start: ${failed_workers[*]}" fi - log "Waiting for workers to join (60s)..." - sleep 60 + log "Waiting for workers to join the cluster..." + local expected_join=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) + local join_retries=0 + while (( join_retries < 120 )); do + local current_nodes + current_nodes=$(ssh_exec "${controller_ip}" "sudo k0s kubectl get nodes --no-headers 2>/dev/null | wc -l" 2>/dev/null || echo "0") + current_nodes=$(echo "${current_nodes}" | tr -d '[:space:]') + if [[ "${current_nodes}" -ge "${expected_join}" ]]; then + log " ✓ All ${current_nodes} node(s) joined (${join_retries}s)" + break + fi + sleep 10 + join_retries=$((join_retries + 10)) + log " Waiting... ${current_nodes}/${expected_join} nodes joined (${join_retries}/120s)" + done # Verify workers actually joined log "Verifying worker nodes joined the cluster..." 
# ====== RESOLVE NODE NAME ======
# Map a configured node IP to the name that node is expected to carry in
# Kubernetes, by asking the node for its hostname over SSH.
#
# Usage: node_name=$(resolve_node_name "1.2.3.4")
# Output: one line on stdout; empty when the SSH call yields nothing.
#
# Two candidates are collected in a single SSH round-trip: the FQDN
# (`hostname -f`) and the plain `hostname`. Both are stripped of whitespace
# and lower-cased, because the kubelet registers a lower-cased hostname
# unless it is overridden. When kubectl can already see one of the candidates
# as a Node object, that one is returned. Otherwise the FQDN is returned when
# available, else the plain hostname, which is the order the previous
# implementation used.
# NOTE(review): assumes kubectl, when configured, points at this cluster.
resolve_node_name() {
    local ip="$1"
    local raw fqdn short candidate

    # Line 1: FQDN (blank if `hostname -f` fails). Line 2: plain hostname.
    raw=$(ssh_exec "${ip}" "hostname -f 2>/dev/null || echo; hostname 2>/dev/null || echo" 2>/dev/null || echo "")

    fqdn=$(printf '%s\n' "${raw}" | sed -n '1p' | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')
    short=$(printf '%s\n' "${raw}" | sed -n '2p' | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')

    # Prefer whichever candidate the API server actually knows about.
    for candidate in "${fqdn}" "${short}"; do
        if [[ -z "${candidate}" ]]; then
            continue
        fi
        if kubectl get node "${candidate}" >/dev/null 2>&1; then
            echo "${candidate}"
            return 0
        fi
    done

    # Neither is visible yet (or kubectl is not usable): fall back to the
    # historical preference so callers can still wait for the node to appear.
    echo "${fqdn:-${short}}"
}
local node_count=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) - log "Waiting for ${node_count} nodes to be ready..." + log "Waiting for ${node_count} node(s) to be Ready..." local timeout=300 local elapsed=0 - while [[ $(kubectl get nodes --no-headers | grep -c "Ready") -lt ${node_count} ]]; do + local ready_count + while :; do + ready_count=$(kubectl get nodes -o json 2>/dev/null \ + | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length' 2>/dev/null \ + || echo 0) + if [[ "${ready_count}" -ge "${node_count}" ]]; then + log " ✓ All ${ready_count}/${node_count} nodes Ready" + break + fi sleep 5 elapsed=$((elapsed + 5)) if [[ ${elapsed} -ge ${timeout} ]]; then - warn "Timeout waiting for all nodes to be ready, proceeding anyway..." + warn "Timeout (${timeout}s) waiting for all nodes to be Ready (have ${ready_count}/${node_count}); proceeding anyway..." break fi + if (( elapsed % 30 == 0 )); then + log " ${ready_count}/${node_count} nodes Ready (${elapsed}/${timeout}s)" + fi done - # Get all nodes - local all_nodes - all_nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}') + # Helper: wait up to 60s for a given node name to appear in the API server. + # This guards against the race where a worker joined the cluster just after + # the top-of-function readiness check returned but its Node object is still + # propagating to the API server we're talking to. + _wait_for_node_visible() { + local node_name="$1" + local ip="$2" + local tries=0 + local max_tries=12 # 12 * 5s = 60s + while (( tries < max_tries )); do + if kubectl get node "${node_name}" &>/dev/null; then + return 0 + fi + sleep 5 + tries=$((tries + 1)) + done + warn " Node '${node_name}' (from ${ip}) did not become visible in API server after 60s" + return 1 + } + + # Track labeling outcomes so we can fail loud if any node ends up unlabeled. 
+ local labeling_failures=() # Label controller nodes for controller_ip in "${CONTROLLER_IPS[@]}"; do - # Find node by IP local node_name - node_name=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? | select(.type==\"InternalIP\" and .address==\"${controller_ip}\")) | .metadata.name" | head -1) + node_name=$(resolve_node_name "${controller_ip}") - if [[ -n "${node_name}" ]]; then - log "Labeling controller node: ${node_name}" + if [[ -z "${node_name}" ]]; then + warn " Could not resolve hostname for controller ${controller_ip}, skipping..." + labeling_failures+=("${controller_ip} (hostname unresolved)") + continue + fi + + if ! _wait_for_node_visible "${node_name}" "${controller_ip}"; then + labeling_failures+=("${controller_ip} / ${node_name} (never visible)") + continue + fi + + log "Labeling controller node: ${node_name} (${controller_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=controller \ + splunk.ai/workload-type=control-plane \ + node.kubernetes.io/role=controller \ + --overwrite + + # For single-node clusters (controller with --enable-worker), also add CPU workload labels + if [[ ${#WORKER_IPS[@]} -eq 0 ]]; then + log " → Single-node cluster detected, adding CPU workload labels to controller..." kubectl label nodes "${node_name}" \ - splunk.ai/node-role=controller \ - splunk.ai/workload-type=control-plane \ - node.kubernetes.io/role=controller \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ --overwrite - - # For single-node clusters (controller with --enable-worker), also add CPU workload labels - if [[ ${#WORKER_IPS[@]} -eq 0 ]]; then - log " → Single-node cluster detected, adding CPU workload labels to controller..." 
- kubectl label nodes "${node_name}" \ - splunk.ai/workload-type=cpu \ - node.kubernetes.io/workload=ai-cpu \ - splunk.ai/instance-type=cpu-worker \ - --overwrite - log " ✓ CPU workload labels added to controller node" - fi + log " ✓ CPU workload labels added to controller node" fi done # Label worker nodes based on their configuration local worker_index=0 for worker_ip in "${WORKER_IPS[@]}"; do - # Find node by IP local node_name - node_name=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? | select(.type==\"InternalIP\" and .address==\"${worker_ip}\")) | .metadata.name" | head -1) - - if [[ -n "${node_name}" ]]; then - # Determine if this is a GPU or CPU worker based on index - # First CPU_WORKER_COUNT workers are CPU, rest are GPU - if [[ ${worker_index} -lt ${CPU_WORKER_COUNT} ]]; then - log "Labeling CPU worker node: ${node_name}" - kubectl label nodes "${node_name}" \ - splunk.ai/node-role=worker \ - splunk.ai/workload-type=cpu \ - node.kubernetes.io/workload=ai-cpu \ - splunk.ai/instance-type=cpu-worker \ - --overwrite - else - log "Labeling GPU worker node: ${node_name}" - kubectl label nodes "${node_name}" \ - splunk.ai/node-role=worker \ - splunk.ai/workload-type=gpu \ - node.kubernetes.io/workload=ai-gpu \ - splunk.ai/instance-type=gpu-worker \ - nvidia.com/gpu=true \ - --overwrite - fi + node_name=$(resolve_node_name "${worker_ip}") + + if [[ -z "${node_name}" ]]; then + warn " Could not resolve hostname for worker ${worker_ip}, skipping..." + labeling_failures+=("${worker_ip} (hostname unresolved)") worker_index=$((worker_index + 1)) + continue + fi + + if ! 
_wait_for_node_visible "${node_name}" "${worker_ip}"; then + labeling_failures+=("${worker_ip} / ${node_name} (never visible)") + worker_index=$((worker_index + 1)) + continue + fi + + # Determine if this is a GPU or CPU worker based on index + # First CPU_WORKER_COUNT workers are CPU, rest are GPU + if [[ ${worker_index} -lt ${CPU_WORKER_COUNT} ]]; then + log "Labeling CPU worker node: ${node_name} (${worker_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ + --overwrite + else + log "Labeling GPU worker node: ${node_name} (${worker_ip})" + kubectl label nodes "${node_name}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=gpu \ + node.kubernetes.io/workload=ai-gpu \ + splunk.ai/instance-type=gpu-worker \ + nvidia.com/gpu=true \ + --overwrite fi + worker_index=$((worker_index + 1)) done # Add taints to GPU nodes to prevent non-GPU workloads from scheduling there @@ -837,6 +997,91 @@ label_nodes() { kubectl taint nodes "${node#node/}" nvidia.com/gpu=true:NoSchedule --overwrite || true done + # --- Final verification: every node must have splunk.ai/workload-type set --- + # Without this, downstream scheduling silently breaks: weaviate / ray-head / + # many operator-created workloads use nodeSelector: splunk.ai/workload-type=cpu + # and will sit in Pending forever on a node that only has default labels. + log "Verifying every node has splunk.ai/workload-type set..." + local unlabeled + unlabeled=$(kubectl get nodes -o json 2>/dev/null \ + | jq -r '.items[] | select(.metadata.labels["splunk.ai/workload-type"] == null) | .metadata.name' 2>/dev/null \ + || echo "") + if [[ -n "${unlabeled}" ]]; then + # Last-chance recovery: re-iterate config IPs and label whichever matches. + # This catches the case where resolve_node_name raced earlier in the run. 
+ warn "Found unlabeled node(s), attempting recovery:" + echo "${unlabeled}" | while IFS= read -r nn; do + warn " - ${nn}" + done + for ip in "${CONTROLLER_IPS[@]}" "${WORKER_IPS[@]}"; do + local nn + nn=$(resolve_node_name "${ip}") + [[ -z "${nn}" ]] && continue + if echo "${unlabeled}" | grep -qx "${nn}"; then + # Best-effort: apply CPU labels to the controller, CPU labels to + # any worker whose index is < CPU_WORKER_COUNT, else GPU labels. + # This duplicates a small amount of logic but keeps the recovery + # path fully self-contained. + local is_controller=false + for cip in "${CONTROLLER_IPS[@]}"; do + [[ "${cip}" == "${ip}" ]] && is_controller=true && break + done + if ${is_controller}; then + log " Recovery: labeling controller ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=controller \ + splunk.ai/workload-type=control-plane \ + node.kubernetes.io/role=controller \ + --overwrite || true + else + local wi=0 + for wip in "${WORKER_IPS[@]}"; do + [[ "${wip}" == "${ip}" ]] && break + wi=$((wi + 1)) + done + if [[ ${wi} -lt ${CPU_WORKER_COUNT} ]]; then + log " Recovery: labeling CPU worker ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=cpu \ + node.kubernetes.io/workload=ai-cpu \ + splunk.ai/instance-type=cpu-worker \ + --overwrite || true + else + log " Recovery: labeling GPU worker ${nn} (${ip})" + kubectl label nodes "${nn}" \ + splunk.ai/node-role=worker \ + splunk.ai/workload-type=gpu \ + node.kubernetes.io/workload=ai-gpu \ + splunk.ai/instance-type=gpu-worker \ + nvidia.com/gpu=true \ + --overwrite || true + fi + fi + fi + done + + # Re-check after recovery attempt. 
+ unlabeled=$(kubectl get nodes -o json 2>/dev/null \ + | jq -r '.items[] | select(.metadata.labels["splunk.ai/workload-type"] == null) | .metadata.name' 2>/dev/null \ + || echo "") + if [[ -n "${unlabeled}" ]]; then + err "Nodes still unlabeled after recovery pass: +$(echo "${unlabeled}" | sed 's/^/ /') + +Workloads that select splunk.ai/workload-type=cpu (weaviate, ray-head, +most operator-managed pods) will stay Pending. Aborting." + fi + log " ✓ Recovery successful — all nodes now have workload-type set" + else + log " ✓ All nodes have splunk.ai/workload-type set" + fi + + if [[ ${#labeling_failures[@]} -gt 0 ]]; then + warn "label_nodes encountered ${#labeling_failures[@]} non-fatal issue(s):" + for f in "${labeling_failures[@]}"; do warn " - ${f}"; done + fi + log "Node labeling complete!" log "Nodes with labels:" kubectl get nodes --show-labels @@ -868,228 +1113,30 @@ ensure_namespace() { fi } -# ====== INSTALL MINIO ====== -install_minio() { - log "Installing MinIO..." - - ensure_namespace "minio-system" - - # Create MinIO secret - kubectl create secret generic minio-creds \ - --namespace=minio-system \ - --from-literal=accesskey="${MINIO_ACCESS_KEY}" \ - --from-literal=secretkey="${MINIO_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Deploy MinIO - cat </dev/null || true - sleep 2 - - cat </dev/null 2>&1; then - echo "✓ Bucket '${MINIO_BUCKET}' already exists" - else - echo "Creating bucket: ${MINIO_BUCKET}" - mc mb myminio/${MINIO_BUCKET} - echo "Setting anonymous read policy for bucket..." - mc anonymous set download myminio/${MINIO_BUCKET} || true - fi - - echo "" - echo "Verifying required directories..." 
- DIRS_TO_CREATE="" - - # Check each directory - for dir in apps artifacts model_artifacts tasks; do - if mc ls myminio/${MINIO_BUCKET}/\$dir/ >/dev/null 2>&1; then - echo " ✓ \$dir/ exists" - else - echo " → \$dir/ missing, will create" - DIRS_TO_CREATE="\$DIRS_TO_CREATE \$dir" - fi - done - - # Create missing directories only - if [ -n "\$DIRS_TO_CREATE" ]; then - echo "" - echo "Creating missing directories..." - for dir in \$DIRS_TO_CREATE; do - case \$dir in - apps) - echo " - apps/ (for Splunk apps and add-ons)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/apps/.keep - ;; - artifacts) - echo " - artifacts/ (for AI Platform artifacts)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/artifacts/.keep - ;; - model_artifacts) - echo " - model_artifacts/ (for AI model artifacts)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/model_artifacts/.keep - ;; - tasks) - echo " - tasks/ (for AI Platform tasks)" - echo "placeholder" | mc pipe myminio/${MINIO_BUCKET}/tasks/.keep - ;; - esac - done - else - echo "" - echo "✓ All directories already exist, nothing to create" - fi - - echo "" - echo "Final verification:" - ALL_OK=true - for dir in apps artifacts model_artifacts tasks; do - if mc ls myminio/${MINIO_BUCKET}/\$dir/ >/dev/null 2>&1; then - echo " ✓ \$dir/ verified" - else - echo " ✗ \$dir/ missing" - ALL_OK=false - fi - done - - if [ "\$ALL_OK" = "true" ]; then - echo "" - echo "✓ Bucket structure ready!" - echo "" - echo "Bucket contents:" - mc ls myminio/${MINIO_BUCKET}/ - else - echo "" - echo "✗ Some directories are missing" - exit 1 - fi -EOF - - log "Waiting for bucket verification job to complete..." 
- if kubectl wait --for=condition=complete job/minio-create-bucket -n minio-system --timeout=120s; then - log "✓ MinIO bucket structure verified" - - # Show job logs for verification - kubectl logs -n minio-system job/minio-create-bucket --tail=20 2>/dev/null || true - else - warn "Bucket verification job did not complete in time, checking status..." - kubectl describe job/minio-create-bucket -n minio-system || true - kubectl logs -n minio-system job/minio-create-bucket --tail=50 || true +# ====== S3-COMPATIBLE OBJECT STORAGE CREDENTIALS ====== +# Object storage is always customer-managed (external). This function creates +# the Kubernetes credentials secret so the operator and workloads can auth. +ensure_s3compat_credentials() { + log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..." + if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 fi + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + return 1 + fi + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ S3-compatible credentials secret ${AI_NS}/${secret_name} ready" } # ====== INSTALL CERT-MANAGER ====== @@ -1125,9 +1172,9 @@ install_cert_manager() { warn "cert-manager webhook endpoint not found after ${max_retries} retries" fi - # Give webhooks extra time to 
stabilize and register with API server - log "Waiting for webhooks to stabilize (30s)..." - sleep 30 + # Brief pause for webhook registration with API server + log "Waiting for webhooks to stabilize (10s)..." + sleep 10 # Test webhook by creating a test Certificate resource log "Testing cert-manager webhook functionality..." @@ -1144,28 +1191,696 @@ EOF # Clean up test issuer kubectl delete issuer test-selfsigned -n cert-manager --ignore-not-found=true 2>/dev/null || true - log "cert-manager installed successfully" + log "cert-manager installed successfully" +} + +# ====== INSTALL NVIDIA DRIVERS ON GPU NODES (bare-metal / EC2) ====== +# Per-node NVIDIA driver + container toolkit install (called in parallel). +# +# Error handling philosophy: +# - `set -euo pipefail` inside every remote block so the first real failure +# aborts the node install immediately. +# - NO blanket `|| true` / `2>/dev/null` on installer commands — failures +# are loud and caught. +# - After install, strict verification gates hard-fail if the artifacts +# aren't where they should be (nvidia-smi works, libnvidia-ml.so exists, +# nvidia-ctk present, CDI spec populated). +# - RHEL 9 and RHEL 10 paths are deliberately symmetric: both install EPEL, +# both install DKMS, both clean stale cross-major CUDA repos. +# +# Returns 0 on fully-successful install, non-zero on any verification failure. +_install_nvidia_on_node() { + local gpu_ip="$1" + + # ---- Phase A: detect if driver is already installed --------------------- + local driver_ver="" + if ssh_exec "${gpu_ip}" "command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi --query-gpu=driver_version --format=csv,noheader" 2>/dev/null; then + driver_ver=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1") || driver_ver="" + fi + + if [[ -n "${driver_ver}" ]]; then + echo "✓ NVIDIA driver already installed on ${gpu_ip} (version: ${driver_ver})" + else + echo "Installing NVIDIA driver on ${gpu_ip}..." 
+ + # ---- Phase B: install driver + supporting packages -------------------- + # `set -euo pipefail` means ANY failure aborts the block. Each step below + # must either succeed or have an explicit fallback branch that succeeds. + if ! ssh_exec "${gpu_ip}" " + set -euo pipefail + + # --- OS detection (RHEL 9, RHEL 10, Amazon Linux 2023, Debian/Ubuntu) --- + # OS_VERSION holds the numeric major we use to build the CUDA+EPEL URLs. + # For RHEL we read %{rhel}; for Amazon Linux 2023 we hardcode 9 because + # AL2023 is binary-compatible with RHEL/Fedora 9's nvidia-driver RPMs + # and the Fedora EPEL9 repo is the standard 3rd-party source. + echo '--- OS detection ---' + OS_FAMILY= + OS_VERSION= + if grep -qiE '^ID=\"?amzn\"?' /etc/os-release 2>/dev/null; then + OS_FAMILY=amzn + OS_VERSION=\$(. /etc/os-release; echo \"\${VERSION_ID%%.*}\") + elif [ -f /etc/redhat-release ]; then + OS_FAMILY=rhel + OS_VERSION=\$(rpm -E %{rhel}) + elif [ -f /etc/debian_version ]; then + OS_FAMILY=debian + fi + if [ -z \"\${OS_FAMILY}\" ]; then + echo 'ERROR: unsupported OS (not amzn/rhel/debian)' >&2 + cat /etc/os-release >&2 || true + exit 1 + fi + echo \"OS_FAMILY=\${OS_FAMILY} OS_VERSION=\${OS_VERSION:-n/a}\" + + # --- Step 1: kernel headers (required for DKMS to build nvidia kmod) --- + KREL=\$(uname -r) + echo \"--- Installing kernel headers for kernel \${KREL} ---\" + if [ \"\${OS_FAMILY}\" = 'debian' ]; then + sudo apt-get update -qq + sudo apt-get install -y \"linux-headers-\${KREL}\" + else + # Exact-match: every historical kernel-devel is usually in RHUI for + # RHEL 9/10. Fall back to the latest only when absent (rare). + if ! 
sudo dnf install -y \"kernel-devel-\${KREL}\" \"kernel-headers-\${KREL}\"; then + echo \"WARN: Exact kernel-devel-\${KREL} not found; installing latest kernel-devel/headers.\" + echo \" DKMS will build against the latest headers — if they don't match the running kernel,\" + echo \" modprobe will fail below and you'll need to reboot into the updated kernel.\" + sudo dnf install -y kernel-devel kernel-headers + fi + fi + + # --- Step 2: EPEL + DKMS + build toolchain ---------------------------- + # DKMS builds the nvidia kernel module from source on every kernel + # update. It needs: dkms (from EPEL), gcc, make, elfutils-libelf-devel. + # On a BARE RHEL minimal install, NONE of these are pre-installed. + # On AWS AMIs they may be partially pre-installed but we should not + # rely on that — be explicit. + if [ \"\${OS_FAMILY}\" = 'rhel' ] || [ \"\${OS_FAMILY}\" = 'amzn' ]; then + # EPEL: AL2023 = EPEL9 (binary-compat). RHEL: matching major. + if [ \"\${OS_FAMILY}\" = 'amzn' ]; then + EPEL_MAJOR=9 + else + EPEL_MAJOR=\${OS_VERSION} + fi + + # dnf-plugins-core provides 'dnf config-manager'. Pre-installed on + # most AMIs; install explicitly for minimal images. + sudo dnf install -y dnf-plugins-core + + # EPEL: provides DKMS on RHEL (RHEL's own repos don't ship DKMS). + if ! rpm -q epel-release >/dev/null 2>&1; then + echo \"--- Installing EPEL for DKMS (major \${EPEL_MAJOR}) ---\" + sudo dnf install -y \"https://dl.fedoraproject.org/pub/epel/epel-release-latest-\${EPEL_MAJOR}.noarch.rpm\" + fi + # CRB (formerly PowerTools on RHEL 8) hosts a few EPEL build deps on + # RHEL. AL2023 doesn't have a CRB repo (its core packages are in + # 'amazonlinux' directly), so this whole chain is best-effort — the + # trailing '|| true' only runs when ALL three names fail to match + # any known repo, which is the expected state on AL2023. 
+ sudo dnf config-manager --set-enabled crb 2>/dev/null \\ + || sudo dnf config-manager --set-enabled PowerTools 2>/dev/null \\ + || sudo dnf config-manager --set-enabled powertools 2>/dev/null \\ + || true + + # DKMS + the build toolchain. Being explicit means a minimal / bare + # RHEL install works out-of-the-box and future driver versions + # with different weak-deps don't silently miss a needed package. + echo '--- Installing DKMS + build toolchain (gcc, make, elfutils-libelf-devel) ---' + sudo dnf install -y dkms gcc make elfutils-libelf-devel + fi + + # --- Step 3: CUDA repo for the right OS family + version -------------- + # Clean cross-major repos so dnf doesn't try to install from the wrong + # CUDA metadata (common failure mode on in-place RHEL 9 → 10 upgrades, + # and on re-runs of this script where the target OS may have changed). + if [ \"\${OS_FAMILY}\" = 'amzn' ]; then + sudo rm -f /etc/yum.repos.d/cuda-amzn*.repo + sudo dnf config-manager --add-repo \\ + \"https://developer.download.nvidia.com/compute/cuda/repos/amzn\${OS_VERSION:-2023}/x86_64/cuda-amzn\${OS_VERSION:-2023}.repo\" + elif [ \"\${OS_FAMILY}\" = 'rhel' ]; then + sudo rm -f /etc/yum.repos.d/cuda-rhel*.repo + sudo dnf config-manager --add-repo \\ + \"https://developer.download.nvidia.com/compute/cuda/repos/rhel\${OS_VERSION}/x86_64/cuda-rhel\${OS_VERSION}.repo\" + elif [ \"\${OS_FAMILY}\" = 'debian' ]; then + curl -fsSL \\ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \\ + -o /tmp/cuda-keyring.deb + sudo dpkg -i /tmp/cuda-keyring.deb + sudo apt-get update -qq + fi + + # --- Step 4: install the driver ------------------------------------- + # Follows NVIDIA's official RHEL install guidance: + # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ + # + # Package names in the CUDA repo (rhel9/rhel10/amzn2023): + # - cuda-drivers -> meta-pkg for proprietary driver + # (pulls nvidia-driver, kmod-nvidia-latest-dkms, + # 
nvidia-driver-cuda, nvidia-driver-libs, + # libnvidia-ml, etc.) + # - nvidia-open -> meta-pkg for open-source kernel driver + # - nvidia-driver:latest-dkms -> RHEL 9 only (dnf modular stream). + # Removed in RHEL 10 (modularity deprecated). + # + # There is NO package called 'nvidia-driver-dkms' in either repo — + # previous attempts at it failed on every fresh install. + # + # Strategy: single meta-package install. RHEL 10 requires --allowerasing + # because it has to remove conflicting nouveau packages. The flag is + # a no-op on RHEL 9/AL2023 where there's nothing to erase. + echo '--- Installing NVIDIA driver (meta package: cuda-drivers) ---' + if [ \"\${OS_FAMILY}\" = 'debian' ]; then + sudo apt-get install -y nvidia-driver-550 + else + # Blacklist nouveau so the new nvidia driver can load without fighting it. + # Harmless if nouveau isn't loaded (grep returns nothing). + if lsmod | grep -q '^nouveau'; then + echo '--- Blacklisting nouveau + unloading ---' + echo -e 'blacklist nouveau\\noptions nouveau modeset=0' \\ + | sudo tee /etc/modprobe.d/blacklist-nouveau.conf >/dev/null + sudo rmmod nouveau 2>/dev/null || true + # Regenerate initramfs so nouveau doesn't come back on reboot. + sudo dracut --force 2>/dev/null || true + fi + + # Primary strategy: cuda-drivers meta-package (works on RHEL 9, RHEL 10, + # AL2023 — the CUDA repo ships the same package name everywhere). + if sudo dnf install -y --allowerasing cuda-drivers; then + echo '✓ Installed cuda-drivers meta-package' + elif [ \"\${OS_VERSION}\" = '9' ] && sudo dnf module install -y nvidia-driver:latest-dkms; then + # RHEL 9 fallback: classic dnf module stream (RHEL 10 dropped modularity). + # Kept as a safety net — cuda-drivers should always work above. + echo '✓ Installed nvidia-driver:latest-dkms via dnf module (RHEL 9 legacy path)' + elif sudo dnf install -y --allowerasing nvidia-open; then + # Last-resort fallback: open-source kernel driver. 
+ echo '✓ Installed nvidia-open (open-kernel fallback)' + else + echo 'ERROR: all NVIDIA driver install strategies failed' >&2 + echo ' Tried: cuda-drivers, nvidia-driver:latest-dkms (module), nvidia-open' >&2 + echo ' Possible causes:' >&2 + echo ' - CUDA repo URL incorrect for OS version \${OS_VERSION}' >&2 + echo ' - EPEL/DKMS not available' >&2 + echo ' - Network blocked to developer.download.nvidia.com' >&2 + exit 1 + fi + fi + + # --- Step 5: verify DKMS built + load kmod --------------------------- + # Before modprobe: check dkms status so we catch kernel-mismatch cases + # early with a clear error instead of the cryptic 'Module not found'. + echo '--- Verifying DKMS status + loading nvidia kmod ---' + if [ \"\${OS_FAMILY}\" != 'debian' ]; then + DKMS_OUT=\$(sudo dkms status 2>&1 | grep nvidia || true) + if [ -z \"\${DKMS_OUT}\" ]; then + echo 'ERROR: dkms status shows no nvidia entry — driver install did not register with DKMS' >&2 + exit 1 + fi + echo \"DKMS: \${DKMS_OUT}\" + if ! echo \"\${DKMS_OUT}\" | grep -qE 'installed|built'; then + echo 'ERROR: nvidia DKMS module is not installed/built. See: sudo dkms status; dmesg | grep nvidia' >&2 + exit 1 + fi + # Check the built-for kernel matches the running kernel. If not, + # a reboot into the newer installed kernel is required — DO NOT pretend + # modprobe will work. This is exactly what prevents false-positive + # 'install succeeded' on nodes that had a pending kernel update. + if ! echo \"\${DKMS_OUT}\" | grep -qF \"\${KREL}\"; then + echo \"ERROR: DKMS built nvidia module for a different kernel than \${KREL}.\" >&2 + echo \" 'sudo dkms status' shows: \${DKMS_OUT}\" >&2 + echo \" Action: reboot the node into the kernel DKMS built for, then re-run.\" >&2 + exit 1 + fi + fi + sudo modprobe nvidia || { + echo 'ERROR: modprobe nvidia failed after DKMS build succeeded.' 
>&2 + echo 'Diagnose with: sudo dmesg | grep -i nvidia | tail -30' >&2 + exit 1 + } + "; then + echo "❌ NVIDIA driver install failed on ${gpu_ip}" >&2 + return 1 + fi + + # ---- Phase C: hard-verify driver actually works ----------------------- + local ver_check + ver_check=$(ssh_exec "${gpu_ip}" "nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>&1 | head -1" || echo "") + if [[ -z "${ver_check}" ]] || ! [[ "${ver_check}" =~ ^[0-9]+\.[0-9]+ ]]; then + echo "❌ nvidia-smi verification failed on ${gpu_ip} (got: '${ver_check}')" >&2 + return 1 + fi + echo "✓ NVIDIA driver v${ver_check} running on ${gpu_ip}" + fi + + # ---- Phase D: NVIDIA Container Toolkit install ------------------------ + echo "Installing NVIDIA Container Toolkit on ${gpu_ip}..." + if ! ssh_exec "${gpu_ip}" " + set -euo pipefail + if command -v nvidia-ctk >/dev/null 2>&1; then + echo '✓ nvidia-ctk already installed (version: '\"\$(nvidia-ctk --version 2>/dev/null | head -1)\"')' + else + echo '--- Adding NVIDIA container-toolkit repo ---' + if [ -f /etc/debian_version ]; then + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null + sudo apt-get update -qq + sudo apt-get install -y nvidia-container-toolkit + else + # RHEL 9 and 10 both use the same libnvidia-container stable RPM repo. 
+ curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo >/dev/null + sudo dnf install -y nvidia-container-toolkit + fi + fi + + # --- Configure k0s containerd (k0s uses /run/k0s/containerd.sock) ---- + # Strategy (compatible with nvidia-ctk >= 1.14, validated against 1.19): + # + # 1. Run \`nvidia-ctk runtime configure --runtime=containerd + # --nvidia-set-as-default\` with NO --config= flag. This makes + # nvidia-ctk emit a complete, correct drop-in at its known-good + # default path /etc/containerd/conf.d/99-nvidia.toml, containing: + # + # - version = 2 + # - plugins.\"io.containerd.grpc.v1.cri\".containerd.runtimes.nvidia + # - default_runtime_name = \"nvidia\" + # + # 2. On k0s nodes, we cannot leave that file at its default path + # because k0s's managed /etc/k0s/containerd.toml only imports + # /etc/k0s/containerd.d/*.toml — anything under + # /etc/containerd/conf.d/ is ignored. So we move it. + # + # We deliberately avoid passing --config= pointing at the k0s drop-in, + # because nvidia-ctk 1.19 treats the --config target as a \"main\" + # containerd config and only writes a two-line stub (imports + version) + # into it, emitting the actual runtime config to /etc/containerd/conf.d/ + # regardless. That silent behavior caused the 'containerd-nvidia-runtime: + # FAIL' verification error that reaching here used to surface. + echo '--- Configuring containerd runtime for nvidia ---' + if [ -d /etc/k0s/containerd.d ]; then + sudo mkdir -p /etc/k0s/containerd.d + + # Preserve any existing drop-in so idempotent re-runs don't lose + # hand-tuned configuration. + if [ -s /etc/k0s/containerd.d/nvidia.toml ]; then + sudo cp -a /etc/k0s/containerd.d/nvidia.toml /etc/k0s/containerd.d/nvidia.toml.bak + fi + + # Wipe any previous output so we can tell whether this invocation + # actually produced a file. 
+ sudo rm -f /etc/containerd/conf.d/99-nvidia.toml + + # Generate the canonical drop-in at nvidia-ctk's default path. We + # rely on --nvidia-set-as-default to inject default_runtime_name. + sudo nvidia-ctk runtime configure \\ + --runtime=containerd \\ + --nvidia-set-as-default + + # Hard-fail if the file is missing or empty. + if [ ! -s /etc/containerd/conf.d/99-nvidia.toml ]; then + echo 'ERROR: nvidia-ctk did not produce /etc/containerd/conf.d/99-nvidia.toml' >&2 + echo 'nvidia-ctk --version:' >&2 + nvidia-ctk --version 2>&1 | head -3 >&2 + exit 1 + fi + + # Verify the generated drop-in actually names nvidia as the default + # runtime. Earlier nvidia-ctk versions (< 1.14) ignored + # --nvidia-set-as-default silently. + if ! sudo grep -q 'default_runtime_name = \"nvidia\"' /etc/containerd/conf.d/99-nvidia.toml; then + echo 'ERROR: nvidia-ctk drop-in does not set default_runtime_name = \"nvidia\".' >&2 + echo 'nvidia-ctk --version:' >&2 + nvidia-ctk --version 2>&1 | head -3 >&2 + echo '--- generated drop-in (first 30 lines) ---' >&2 + sudo head -30 /etc/containerd/conf.d/99-nvidia.toml >&2 + exit 1 + fi + + # Relocate the drop-in from nvidia-ctk's default path to the path + # that k0s's managed containerd.toml imports. We strip keys that + # would duplicate declarations already made by k0s's top-level + # config (version / imports / disabled_plugins / required_plugins); + # leaving them in place causes containerd to refuse to start with + # duplicate top-level-key errors. + sudo mv /etc/containerd/conf.d/99-nvidia.toml /etc/k0s/containerd.d/nvidia.toml + sudo sed -i '/^version/d; /^imports/d; /^disabled_plugins/d; /^required_plugins/d' \\ + /etc/k0s/containerd.d/nvidia.toml + + # Final sanity: the k0s drop-in must still carry default_runtime_name + # after the key-strip above (it lives under a nested table, not at + # top level, so the sed above never touches it — but verify anyway + # so failure is loud instead of silently broken). + if ! 
sudo grep -q 'default_runtime_name = \"nvidia\"' /etc/k0s/containerd.d/nvidia.toml; then + echo 'ERROR: /etc/k0s/containerd.d/nvidia.toml lost default_runtime_name after relocation.' >&2 + echo '--- file contents ---' >&2 + sudo cat /etc/k0s/containerd.d/nvidia.toml >&2 + exit 1 + fi + elif [ -f /etc/containerd/config.toml ]; then + # Non-k0s containerd (standalone) — safe to let nvidia-ctk edit in place. + sudo nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default + else + echo 'ERROR: no containerd config dir found at /etc/k0s/containerd.d or /etc/containerd/config.toml' >&2 + exit 1 + fi + + # --- Generate the CDI spec so k8s device plugin can find the GPUs --- + echo '--- Generating CDI spec ---' + sudo mkdir -p /etc/cdi + sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml + if [ ! -s /etc/cdi/nvidia.yaml ]; then + echo 'ERROR: /etc/cdi/nvidia.yaml empty after generation' >&2 + exit 1 + fi + # Sanity-check that NVML could enumerate at least one device (without + # this, the spec contains no devices and the device plugin crash-loops). + if ! grep -q 'name: ' /etc/cdi/nvidia.yaml; then + echo 'ERROR: /etc/cdi/nvidia.yaml contains no device entries' >&2 + cat /etc/cdi/nvidia.yaml | head -40 >&2 + exit 1 + fi + + # --- Restart k0sworker to pick up new runtime + CDI spec ----------- + echo '--- Restarting k0sworker to pick up runtime changes ---' + sudo systemctl stop k0sworker || true + sleep 3 + sudo pkill -9 containerd-shim || true + sudo rm -f /run/k0s/containerd.sock || true + sudo systemctl start k0sworker + + # Quick sanity: confirm nvidia-ctk + libnvidia-ml.so exist where expected. + # Search all known paths (distributions differ): RHEL/Fedora use + # /usr/lib64, Debian/Ubuntu use /usr/lib/x86_64-linux-gnu, and + # some distros also expose it via ldconfig. 
+ echo '--- Post-install sanity ---' + nvidia-ctk --version | head -1 + LIBNVML_PATH=\$(ldconfig -p 2>/dev/null | awk '/libnvidia-ml\\.so\\.1/ {print \$NF; exit}') + if [ -z \"\${LIBNVML_PATH}\" ]; then + for so in /usr/lib64/libnvidia-ml.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \\ + /usr/lib/libnvidia-ml.so.1; do + if [ -e \"\${so}\" ]; then LIBNVML_PATH=\"\${so}\"; break; fi + done + fi + if [ -n \"\${LIBNVML_PATH}\" ]; then + echo \"✓ libnvidia-ml.so.1 found: \${LIBNVML_PATH}\" + else + echo 'ERROR: libnvidia-ml.so.1 not found on any standard path.' >&2 + exit 1 + fi + "; then + echo "❌ Container toolkit setup failed on ${gpu_ip}" >&2 + return 1 + fi + + # ---- Phase E: post-install strict verification ----------------------- + # These checks are what the device plugin will actually need at runtime. + local checks_out + checks_out=$(ssh_exec "${gpu_ip}" " + set +e + echo -n 'nvidia-smi: ' + nvidia-smi --query-gpu=name --format=csv,noheader >/dev/null 2>&1 && echo OK || echo FAIL + echo -n 'libnvidia-ml.so: ' + # Check ldconfig cache first (most reliable), then fall back to the + # common per-distribution install paths. 
+ if ldconfig -p 2>/dev/null | grep -q 'libnvidia-ml\.so\.1'; then + echo OK + elif ls /usr/lib64/libnvidia-ml.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \\ + /usr/lib/libnvidia-ml.so.1 2>/dev/null | head -1 | grep -q .; then + echo OK + else + echo FAIL + fi + echo -n 'nvidia-ctk: ' + command -v nvidia-ctk >/dev/null 2>&1 && echo OK || echo FAIL + echo -n 'cdi-spec: ' + [ -s /etc/cdi/nvidia.yaml ] && grep -q 'name: ' /etc/cdi/nvidia.yaml && echo OK || echo FAIL + echo -n 'nvidia-kmod: ' + lsmod | grep -q '^nvidia ' && echo OK || echo FAIL + echo -n 'containerd-nvidia-runtime: ' + grep -q 'default_runtime_name = \"nvidia\"' /etc/k0s/containerd.d/nvidia.toml 2>/dev/null && echo OK || echo FAIL + ") + + echo "Strict verification on ${gpu_ip}:" + echo "${checks_out}" | sed 's/^/ /' + if echo "${checks_out}" | grep -q FAIL; then + echo "❌ Strict verification failed on ${gpu_ip} — device plugin will crash-loop with ERROR_LIBRARY_NOT_FOUND" >&2 + return 1 + fi + echo "✓ Strict verification passed on ${gpu_ip}" + return 0 +} + +# EKS GPU AMIs ship with NVIDIA drivers pre-installed. +# For k0s on generic AMIs (e.g. Amazon Linux 2023), we must install them +# on the host before the Kubernetes device-plugin can expose GPUs. +install_nvidia_host_drivers() { + if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then + log "Skipping NVIDIA host driver install (no GPU workers)" + return 0 + fi + + log "Installing NVIDIA drivers & container toolkit on GPU worker nodes..." 
+ + # Ensure WORKER_IPS is populated (it may not be if install_k0s_cluster was skipped) + if [[ -z "${WORKER_IPS+x}" || ${#WORKER_IPS[@]} -eq 0 ]]; then + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + log " Loaded ${#WORKER_IPS[@]} worker IP(s) from config: ${WORKER_IPS[*]}" + else + warn "No worker IPs available; skipping host driver install" + return 0 + fi + fi + + # Identify GPU worker IPs (workers after the first CPU_WORKER_COUNT) + local gpu_ips=() + local idx=0 + for ip in "${WORKER_IPS[@]}"; do + if [[ ${idx} -ge ${CPU_WORKER_COUNT} ]]; then + gpu_ips+=("${ip}") + fi + idx=$((idx + 1)) + done + + if [[ ${#gpu_ips[@]} -eq 0 ]]; then + warn "No GPU worker IPs found; skipping host driver install" + return 0 + fi + + # Run driver + toolkit install on all GPU nodes in parallel + log "Installing NVIDIA drivers on ${#gpu_ips[@]} GPU node(s) in parallel..." + local pids=() + local logdir + logdir=$(mktemp -d) + + for gpu_ip in "${gpu_ips[@]}"; do + ( + _install_nvidia_on_node "${gpu_ip}" > "${logdir}/${gpu_ip}.log" 2>&1 + echo $? > "${logdir}/${gpu_ip}.rc" + ) & + pids+=($!) + log " Started NVIDIA install on ${gpu_ip} (pid $!)" + done + + # Wait for all background installs to finish + local failed=0 + for i in "${!pids[@]}"; do + local pid=${pids[$i]} + local gpu_ip=${gpu_ips[$i]} + if wait "${pid}"; then + log " ✓ NVIDIA setup completed on ${gpu_ip}" + else + warn " NVIDIA setup on ${gpu_ip} had issues" + failed=$((failed + 1)) + fi + # Stream the per-node log so output is visible + while IFS= read -r line; do + log " [${gpu_ip}] ${line}" + done < "${logdir}/${gpu_ip}.log" + done + + rm -rf "${logdir}" + + if [[ ${failed} -gt 0 ]]; then + err "${failed}/${#gpu_ips[@]} GPU node(s) had NVIDIA install failures. Aborting install. 
+ + What to check on a failing node: + ssh 'dkms status | grep nvidia' # must show 'installed' + ssh 'lsmod | grep nvidia' # must list nvidia kmod + ssh 'ls /usr/lib64/libnvidia-ml.so.1' # must exist + ssh 'nvidia-ctk --version' # must work + ssh 'cat /etc/cdi/nvidia.yaml | head -40' # must list GPU devices + ssh 'sudo dmesg | grep -i nvidia | tail -30'# kernel-level errors + + Common causes: + - kernel-devel for running kernel not available (exact match too new); + reboot to match a released kernel, then re-run + - EPEL/DKMS didn't install (check 'rpm -q epel-release dkms') + - Stale /etc/yum.repos.d/cuda-rhel*.repo from a prior OS upgrade" + else + log "NVIDIA drivers installed successfully on all ${#gpu_ips[@]} GPU node(s)" + fi + + # Wait for GPU workers to rejoin and verify they are Ready + log "Waiting for GPU worker nodes to rejoin cluster and become Ready..." + local gpu_wait_timeout=180 + local gpu_wait_elapsed=0 + local all_gpu_ready=false + + while [[ ${gpu_wait_elapsed} -lt ${gpu_wait_timeout} ]]; do + all_gpu_ready=true + for gpu_ip in "${gpu_ips[@]}"; do + # Resolve GPU node name via SSH hostname lookup + local gpu_node + gpu_node=$(resolve_node_name "${gpu_ip}") + + if [[ -z "${gpu_node}" ]] || ! kubectl get node "${gpu_node}" &>/dev/null; then + all_gpu_ready=false + break + fi + + local ready_status + ready_status=$(kubectl get node "${gpu_node}" -o json 2>/dev/null | \ + jq -r '.status.conditions[] | select(.type=="Ready") | .status' 2>/dev/null || echo "") + if [[ "${ready_status}" != "True" ]]; then + all_gpu_ready=false + break + fi + done + + if [[ "${all_gpu_ready}" == "true" ]]; then + log "✓ All GPU worker nodes are Ready" + break + fi + + sleep 10 + gpu_wait_elapsed=$((gpu_wait_elapsed + 10)) + log " Waiting for GPU nodes to be Ready... ${gpu_wait_elapsed}/${gpu_wait_timeout}s" + done + + if [[ "${all_gpu_ready}" != "true" ]]; then + err "Some GPU nodes did not become Ready within ${gpu_wait_timeout}s. 
Check: kubectl get nodes" + fi + + # Verify GPUs are visible to Kubernetes. If the device-plugin DaemonSet + # isn't installed yet (expected during the initial install — it's created + # by install_nvidia_device_plugin() in Phase 2), short-circuit immediately + # instead of waiting a fruitless 120s. For idempotent re-runs where the + # DS is already present we poll up to 120s. + if ! kubectl -n kube-system get ds nvidia-device-plugin-daemonset &>/dev/null; then + log " (device plugin DaemonSet not yet installed; capacity will appear after install_nvidia_device_plugin runs)" + log "NVIDIA host driver installation complete" + return 0 + fi + + log "Checking if GPUs are visible to Kubernetes..." + local gpu_capacity="0" + local cap_wait=0 + local cap_timeout=120 + while [[ ${cap_wait} -lt ${cap_timeout} ]]; do + gpu_capacity=$(kubectl get nodes -l splunk.ai/workload-type=gpu -o json 2>/dev/null | \ + jq '[.items[].status.capacity["nvidia.com/gpu"] // "0" | tonumber] | add' 2>/dev/null || echo "0") + if [[ "${gpu_capacity}" -gt 0 ]]; then + log "✓ Total GPUs visible to Kubernetes: ${gpu_capacity}" + break + fi + sleep 10 + cap_wait=$((cap_wait + 10)) + log " Waiting for GPU capacity to be reported... ${cap_wait}/${cap_timeout}s" + done + + if [[ "${gpu_capacity}" -le 0 ]]; then + err "Device plugin DaemonSet is installed but no GPUs are visible after ${cap_timeout}s. + Investigate with: + kubectl -n kube-system logs ds/nvidia-device-plugin-daemonset --tail 40 + kubectl -n kube-system describe pod -l name=nvidia-device-plugin-ds" + fi + + log "NVIDIA host driver installation complete" } -# ====== INSTALL NVIDIA GPU OPERATOR ====== +# ====== INSTALL NVIDIA DEVICE PLUGIN (matches EKS approach) ====== +# Ref: eks_cluster_with_stack.sh — uses the simple DaemonSet, NOT the GPU Operator. +# The GPU Operator's driver container images don't exist for Amazon Linux 2023. 
install_nvidia_device_plugin() { if [[ ${GPU_WORKER_COUNT} -eq 0 ]]; then - log "Skipping NVIDIA GPU operator (no GPU workers)" + log "Skipping NVIDIA device plugin (no GPU workers)" return 0 fi - log "Installing NVIDIA GPU Operator..." + local ver="${NVIDIA_VERSION:-v0.17.3}" + log "Installing NVIDIA device plugin DaemonSet (${ver})..." - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia || true - helm repo update + # Create the nvidia RuntimeClass FIRST. The device-plugin DaemonSet we + # apply below references this RuntimeClass via runtimeClassName=nvidia, so + # it must exist before any DS pod is scheduled — otherwise kubelet will + # reject the pod with 'RuntimeClass "nvidia" not found'. + log " Creating nvidia RuntimeClass..." + cat <<'RTEOF' | kubectl apply -f - +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +RTEOF + + # Fetch the upstream manifest into a temp file, inject our required + # pod-spec fields (nodeSelector + runtimeClassName) BEFORE applying. + # Doing this in one shot — instead of apply-then-patch — avoids the + # race where the initial DS pods start under the default runtime + # (runc), hit 'ERROR_LIBRARY_NOT_FOUND' because they have no access to + # libnvidia-ml.so or /dev/nvidia*, and land in CrashLoopBackOff before + # the patch ever reaches them. + local manifest + manifest=$(mktemp) + if ! curl -fsSL \ + "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${ver}/deployments/static/nvidia-device-plugin.yml" \ + -o "${manifest}"; then + rm -f "${manifest}" + err "Failed to fetch NVIDIA device-plugin manifest from GitHub (version ${ver}). + Check network connectivity and that version ${ver} exists upstream." + fi + + log " Patching manifest in place: GPU nodeSelector + nvidia runtimeClassName..." + # Use yq when available (cleanest, structure-aware); fall back to kubectl + # patch --local on stdout — both produce the same patched manifest on + # stdout which we then `apply -f -`. 
+ local patched + patched=$(mktemp) + if command -v yq >/dev/null 2>&1; then + yq eval ' + (select(.kind == "DaemonSet") | .spec.template.spec.nodeSelector."splunk.ai/workload-type") = "gpu" + | (select(.kind == "DaemonSet") | .spec.template.spec.runtimeClassName) = "nvidia" + ' "${manifest}" > "${patched}" + else + # Fallback: use kubectl patch --local. This requires reading from the + # manifest and piping through patch; multi-document files complicate + # things, but this upstream manifest is a single DaemonSet. + kubectl patch -f "${manifest}" --local -o yaml \ + --type='json' \ + -p='[ + {"op": "add", "path": "/spec/template/spec/nodeSelector", "value": {"splunk.ai/workload-type": "gpu"}}, + {"op": "add", "path": "/spec/template/spec/runtimeClassName", "value": "nvidia"} + ]' > "${patched}" + fi - helm_retry 3 upgrade --install gpu-operator nvidia/gpu-operator \ - --namespace gpu-operator --create-namespace \ - --set driver.enabled=true \ - --set toolkit.enabled=true \ - --wait --timeout=10m + if ! kubectl apply -n kube-system -f "${patched}"; then + rm -f "${manifest}" "${patched}" + err "Failed to apply patched NVIDIA device-plugin manifest. Check kubectl connectivity." + fi + rm -f "${manifest}" "${patched}" - log "NVIDIA GPU Operator installed successfully" + # Wait for the DS to roll out so the caller observes GPU capacity as + # soon as possible. Non-fatal: we verify capacity explicitly upstream + # via the strict-verification loop. + kubectl -n kube-system rollout status ds/nvidia-device-plugin-daemonset --timeout=3m || true + + log "NVIDIA device plugin installed successfully" } # ====== INSTALL PROMETHEUS OPERATOR ====== @@ -1173,7 +1888,7 @@ install_kube_prometheus() { log "Installing kube-prometheus-stack..." 
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true - helm repo update + helm repo update prometheus-community # Only update the specific repo we need helm_retry 3 upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ --namespace monitoring --create-namespace \ @@ -1188,8 +1903,11 @@ install_kube_prometheus() { install_otel_operator_and_contrib_collector() { log "Installing OpenTelemetry Operator..." + # OTEL operator uses cert-manager for webhook certs — ensure webhook is ready + wait_for_cert_manager_webhook 30 10 + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts || true - helm repo update + helm repo update open-telemetry # Only update the specific repo we need # Use cert-manager for webhook certificates (now that konnectivity is fixed) helm_retry 3 upgrade --install opentelemetry-operator open-telemetry/opentelemetry-operator \ @@ -1208,11 +1926,13 @@ install_ray_operator() { log "Installing KubeRay Operator..." helm repo add kuberay https://ray-project.github.io/kuberay-helm/ || true - helm repo update + helm repo update kuberay # Only update the specific repo we need helm_retry 3 upgrade --install kuberay-operator kuberay/kuberay-operator \ --namespace ray-system --create-namespace \ - --version 1.0.0 \ + --version 1.2.2 \ + --set image.repository=quay.io/kuberay/operator \ + --set image.tag=v1.2.2 \ --wait --timeout=10m wait_for_crd rayservices.ray.io 300 @@ -1230,6 +1950,14 @@ install_splunk_operator() { return 0 fi + # Determine the namespace from the YAML file or use default + local splunk_operator_ns="splunk-operator" + ensure_namespace "${splunk_operator_ns}" + + # Create image pull secrets in splunk-operator namespace BEFORE applying manifests + log "Creating image pull secrets in ${splunk_operator_ns} namespace..." 
+ create_image_pull_secrets "${splunk_operator_ns}" >/dev/null 2>&1 || true + # Use kubectl replace --force for CRDs to avoid annotation size limits # This deletes and recreates the resource, avoiding the annotation issue log "Installing/updating Splunk Operator CRDs and resources..." @@ -1243,11 +1971,111 @@ install_splunk_operator() { kubectl replace --force -f "${SPLUNK_OPERATOR_FILE}" 2>&1 | grep -v "Warning: --force is deprecated" || true fi + # Patch splunk-operator deployment with imagePullSecrets if any exist + log "Checking for imagePullSecrets to add to Splunk Operator deployment..." + local secrets_patch="" + for secret_name in ecr-registry-secret docker-hub-secret gcr-secret acr-secret custom-registry-secret; do + if kubectl get secret "${secret_name}" -n "${splunk_operator_ns}" &>/dev/null 2>&1; then + secrets_patch+='{"name":"'"${secret_name}"'"},' + log " Found secret: ${secret_name}" + fi + done + + if [[ -n "${secrets_patch}" ]]; then + secrets_patch="${secrets_patch%,}" + local dep_name + dep_name=$(kubectl -n "${splunk_operator_ns}" get deploy -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [[ -n "${dep_name}" ]]; then + log "Patching Splunk Operator deployment (${dep_name}) with imagePullSecrets..." + kubectl -n "${splunk_operator_ns}" patch deployment "${dep_name}" \ + --type='json' \ + -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":['"${secrets_patch}"']}]' \ + 2>/dev/null || log " imagePullSecrets may already exist" + + # Restart to apply changes + kubectl rollout restart deployment "${dep_name}" -n "${splunk_operator_ns}" 2>/dev/null || true + fi + fi + wait_for_crd standalones.enterprise.splunk.com 300 log "Splunk Operator installed successfully" } +# ====== WAIT FOR CERT-MANAGER WEBHOOK ====== +# Ensures cert-manager webhook is responsive before applying resources that +# contain Certificate/Issuer CRs (e.g. artifacts.yaml). 
+wait_for_cert_manager_webhook() { + local max_attempts="${1:-30}" + local sleep_interval="${2:-10}" + + log "Verifying cert-manager webhook is responsive..." + + # 1. Ensure webhook pod is running + if ! kubectl get namespace cert-manager &>/dev/null; then + warn "cert-manager namespace not found, skipping webhook check" + return 0 + fi + + kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/component=webhook \ + -n cert-manager --timeout=120s 2>/dev/null \ + || warn "cert-manager webhook pod may not be fully ready" + + # 2. Ensure webhook endpoint has addresses + local attempt=0 + while (( attempt < max_attempts )); do + local webhook_ip + webhook_ip=$(kubectl -n cert-manager get endpoints cert-manager-webhook \ + -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || echo "") + + if [[ -n "${webhook_ip}" ]]; then + log "cert-manager webhook endpoint: ${webhook_ip}" + break + fi + + log " Waiting for cert-manager webhook endpoint... (${attempt}/${max_attempts})" + sleep "${sleep_interval}" + attempt=$((attempt + 1)) + done + + if (( attempt >= max_attempts )); then + warn "cert-manager webhook endpoint not found after ${max_attempts} attempts" + return 1 + fi + + # 3. Functional test: create and delete a test Issuer + local test_ok=false + for i in $(seq 1 "${max_attempts}"); do + if kubectl apply -f - <<'TESTEOF' 2>/dev/null +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: cert-manager-webhook-test + namespace: cert-manager +spec: + selfSigned: {} +TESTEOF + then + kubectl delete issuer cert-manager-webhook-test -n cert-manager \ + --ignore-not-found=true 2>/dev/null || true + test_ok=true + log "✓ cert-manager webhook is responsive" + break + fi + log " cert-manager webhook not yet accepting requests... 
(${i}/${max_attempts})" + sleep "${sleep_interval}" + done + + if [[ "${test_ok}" != "true" ]]; then + warn "cert-manager webhook did not become responsive after ${max_attempts} attempts" + return 1 + fi + + return 0 +} + # ====== INSTALL SPLUNK AI OPERATOR ====== install_splunk_ai_operator() { log "Installing Splunk AI Operator from ${SPLUNK_AI_FILE}..." @@ -1262,13 +2090,60 @@ install_splunk_ai_operator() { local ai_operator_ns="splunk-ai-operator-system" ensure_namespace "${ai_operator_ns}" + # Create image pull secrets in operator namespace BEFORE applying manifests + log "Creating image pull secrets in ${ai_operator_ns} namespace..." + create_image_pull_secrets "${ai_operator_ns}" >/dev/null 2>&1 || true + + # Ensure cert-manager webhook is ready before applying (artifacts.yaml contains + # Certificate and Issuer resources that require the webhook to be responsive) + wait_for_cert_manager_webhook 30 10 + # Apply the artifacts.yaml file (contains CRDs and operator deployment) log "Applying Splunk AI Operator manifests..." - # First try to apply normally - if kubectl apply -f "${SPLUNK_AI_FILE}" 2>&1 | grep -q "field is immutable\|too long"; then - log "Standard apply failed, using server-side apply with force..." - kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" + # Use server-side apply with force to ensure all fields are updated including images + log "Using server-side apply to ensure image URLs are updated..." + local apply_output + apply_output=$(kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1) || true + echo "${apply_output}" + + # Check if any cert-manager resources (Certificate/Issuer) failed due to webhook errors + if echo "${apply_output}" | grep -qi "webhook.*cert-manager\|failed calling webhook.*cert-manager\|i/o timeout"; then + warn "Some cert-manager resources failed on first attempt, retrying..." 
+ + # Wait for webhook to stabilize and retry + sleep 15 + wait_for_cert_manager_webhook 15 10 + + log "Retrying full apply for cert-manager resources..." + kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 | \ + grep -iE "certificate|issuer|error|warning" || true + fi + + # Verify that the critical Certificate and Issuer resources exist + log "Verifying cert-manager resources were created..." + local cm_retries=0 + local cm_max=12 + while (( cm_retries < cm_max )); do + local serving_cert + serving_cert=$(kubectl get certificate splunk-ai-operator-serving-cert \ + -n "${ai_operator_ns}" -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") + + if [[ -n "${serving_cert}" ]]; then + log "✓ Certificate 'splunk-ai-operator-serving-cert' exists" + break + fi + + log " Waiting for cert-manager resources to be created... (${cm_retries}/${cm_max})" + sleep 10 + # Re-apply on each retry to ensure cert-manager resources are processed + kubectl apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 | \ + grep -iE "certificate|issuer" || true + cm_retries=$((cm_retries + 1)) + done + + if (( cm_retries >= cm_max )); then + warn "Certificate resources may not have been created — the AI operator webhook may not work" fi # Specifically ensure ClusterRole is updated (common RBAC update issue) @@ -1291,6 +2166,31 @@ install_splunk_ai_operator() { # Remove 'deployment.apps/' prefix if present dep="${dep#deployment.apps/}" log "Found deployment: ${dep}" + + # Patch deployment with imagePullSecrets if any exist + log "Checking for imagePullSecrets to add to operator deployment..." 
+ local secrets_patch="" + for secret_name in ecr-registry-secret docker-hub-secret gcr-secret acr-secret custom-registry-secret; do + if kubectl get secret "${secret_name}" -n "${ai_operator_ns}" &>/dev/null 2>&1; then + secrets_patch+='{"name":"'"${secret_name}"'"},' + log " Found secret: ${secret_name}" + fi + done + + if [[ -n "${secrets_patch}" ]]; then + # Remove trailing comma + secrets_patch="${secrets_patch%,}" + log "Patching operator deployment with imagePullSecrets..." + kubectl -n "${ai_operator_ns}" patch deployment "${dep}" \ + --type='json' \ + -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":['"${secrets_patch}"']}]' \ + 2>/dev/null || log " imagePullSecrets may already exist or path differs" + fi + + # Force restart the deployment to pick up new environment variables (image URLs) + log "Restarting operator deployment to apply updated image configuration..." + kubectl rollout restart deployment "${dep}" -n "${ai_operator_ns}" + wait_rollout "${ai_operator_ns}" deploy "${dep}" else warn "Could not find operator deployment, will wait for CRDs instead" @@ -1313,8 +2213,8 @@ create_minio_secret() { kubectl create secret generic minio-credentials \ --namespace="${ns}" \ - --from-literal=accessKey="${MINIO_ACCESS_KEY}" \ - --from-literal=secretKey="${MINIO_SECRET_KEY}" \ + --from-literal=accessKey="${MINIO_ROOT_USER}" \ + --from-literal=secretKey="${MINIO_ROOT_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - log "MinIO credentials secret created" @@ -1324,8 +2224,10 @@ create_minio_secret() { # ====== SETUP ECR REPOSITORY PERMISSIONS ====== setup_ecr_permissions() { local repo_prefix="${1:-ml-platform}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" - log "Checking ECR repository permissions for: ${repo_prefix}..." + log "Checking ECR repository permissions for: ${repo_prefix} in region ${ecr_region}..." 
# Check if AWS credentials are available if ! aws sts get-caller-identity &>/dev/null; then @@ -1339,8 +2241,8 @@ setup_ecr_permissions() { # List repositories matching prefix local repos - repos=$(aws ecr describe-repositories --region "${REGION}" 2>/dev/null | \ - jq -r ".repositories[] | select(.repositoryName | startswith(\"${repo_prefix}\")) | .repositoryName" || echo "") + repos=$(aws ecr describe-repositories --region "${ecr_region}" 2>/dev/null | \ + jq -r --arg prefix "${repo_prefix}" '.repositories[] | select(.repositoryName | startswith($prefix)) | .repositoryName' || echo "") if [[ -z "${repos}" ]]; then warn "No ECR repositories found with prefix: ${repo_prefix}" @@ -1360,7 +2262,7 @@ setup_ecr_permissions() { # Get current policy local policy - policy=$(aws ecr get-repository-policy --repository-name "${repo}" --region "${REGION}" 2>/dev/null | jq -r '.policyText' || echo "") + policy=$(aws ecr get-repository-policy --repository-name "${repo}" --region "${ecr_region}" 2>/dev/null | jq -r '.policyText' || echo "") if [[ -z "${policy}" ]]; then log " No policy found, creating one to allow pull access..." @@ -1388,7 +2290,7 @@ EOF if aws ecr set-repository-policy \ --repository-name "${repo}" \ - --region "${REGION}" \ + --region "${ecr_region}" \ --policy-text "file:///tmp/ecr-policy-${repo//\//-}.json" &>/dev/null; then log " ✓ Pull permissions granted for repository: ${repo}" else @@ -1418,8 +2320,10 @@ create_image_pull_secrets() { # 1. Create ECR secret if enabled if [[ "${IMAGE_PULL_SECRETS_ECR_ENABLED}" == "true" ]]; then log "Creating ECR secret..." - local ecr_region="${REGION:-us-west-2}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local ecr_region="${ECR_REGION:-${REGION:-us-east-2}}" local ecr_account="${ECR_ACCOUNT:-}" + log " ECR Region: ${ecr_region}, ECR Account: ${ecr_account}" # Check if AWS credentials are available if ! 
aws sts get-caller-identity &>/dev/null; then @@ -1555,7 +2459,8 @@ create_image_pull_secrets() { # ====== CREATE ECR IMAGE PULL SECRET (Legacy - kept for compatibility) ====== create_ecr_secret() { local ns="$1" - local region="${REGION:-us-west-2}" + # Use ECR_REGION from config, fallback to REGION, then us-east-2 + local region="${ECR_REGION:-${REGION:-us-east-2}}" local ecr_account="${ECR_ACCOUNT:-}" ensure_namespace "${ns}" @@ -1606,7 +2511,6 @@ create_ecr_secret() { log "✓ ECR secret created: ecr-registry-secret" log "✓ Secret will be referenced in AIPlatform CR spec.imagePullSecrets" - log "Note: ECR tokens expire after 12 hours. Re-run installation to refresh." } # ====== INSTALL SPLUNK STANDALONE ====== @@ -1616,12 +2520,18 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create MinIO secret for Splunk (S3-compatible credentials) - log "Creating S3-compatible secret for Splunk App Framework..." - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${MINIO_ACCESS_KEY}" \ - --from-literal=s3_secret_key="${MINIO_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - + # Ensure credentials secret exists for Splunk App Framework + if ! kubectl get secret minio-credentials -n "${AI_NS}" &>/dev/null; then + log "Creating minio-credentials secret in ${AI_NS}..." 
+ kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + fi # Create splunk-defaults ConfigMap (optional but recommended) cat <<'YAML' | kubectl -n "${AI_NS}" apply -f - @@ -1643,7 +2553,16 @@ data: sslPassword: password YAML - # Create Splunk Standalone with App Framework (not SmartStore) + # Ensure default ServiceAccount has imagePullSecrets for ECR + if kubectl get secret ecr-registry-secret -n "${AI_NS}" &>/dev/null; then + log "Patching default ServiceAccount with ecr-registry-secret..." + kubectl patch serviceaccount default -n "${AI_NS}" \ + -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || \ + warn "Could not patch default ServiceAccount" + fi + + # Standalone app repo: uses customer-managed S3-compatible object storage + local minio_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" cat </dev/null || true + kubectl delete pods -n "${AI_NS}" --field-selector status.phase=Failed --wait=false 2>/dev/null || true + # Delete pods stuck in ImagePullBackOff or ErrImagePull (use jq to avoid bash 3.x jsonpath parsing issues) + kubectl get pods -n "${AI_NS}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.status.containerStatuses[]? | .state.waiting?.reason? == "ImagePullBackOff") | .metadata.name' 2>/dev/null | \ + xargs -r -I {} kubectl delete pod {} -n "${AI_NS}" --wait=false --grace-period=0 --force 2>/dev/null || true + kubectl get pods -n "${AI_NS}" -o json 2>/dev/null | \ + jq -r '.items[] | select(.status.containerStatuses[]? | .state.waiting?.reason? 
== "ErrImagePull") | .metadata.name' 2>/dev/null | \ + xargs -r -I {} kubectl delete pod {} -n "${AI_NS}" --wait=false --grace-period=0 --force 2>/dev/null || true + log "✓ Cleanup complete" + # Get Splunk secret name (for HEC endpoint) local splunk_secret="splunk-${AI_STANDALONE_NAME}-standalone-secret-v1" log "Using Splunk secret: ${splunk_secret}" - # Ensure s3-secret exists in AI namespace (for MinIO credentials) - log "Creating/updating MinIO credentials secret (s3-secret) in ${AI_NS}..." - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${MINIO_ACCESS_KEY}" \ - --from-literal=s3_secret_key="${MINIO_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - - log "✓ MinIO credentials secret ready" + # Ensure object storage credentials secret exists in AI namespace + log "Creating/updating S3-compatible credentials secret (minio-credentials) in ${AI_NS}..." + kubectl -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ Object storage credentials secret ready" # Build imagePullSecrets YAML from created secrets local image_pull_secrets="" @@ -1734,6 +2669,76 @@ EOF log "No imagePullSecrets found, using public images only" fi + # objectStorage: path/endpoint/secret by object store type (aws | s3compat | minio | seaweedfs) + local obj_path obj_endpoint obj_secret + obj_secret="minio-credentials" + case "${OBJ_STORE_TYPE}" in + s3compat) + obj_path="s3compat://${OBJ_STORE_BUCKET}" + obj_endpoint="${OBJ_STORE_ENDPOINT}" + ;; + minio) + obj_path="minio://${MINIO_BUCKET}" + 
obj_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" + ;; + seaweedfs) + obj_path="seaweedfs://${OBJ_STORE_BUCKET}" + obj_endpoint="${OBJ_STORE_ENDPOINT}" + ;; + aws) + obj_path="s3://${OBJ_STORE_BUCKET}" + obj_endpoint="${OBJ_STORE_ENDPOINT}" + ;; + *) + err "Unsupported objectStore.type: ${OBJ_STORE_TYPE}. Supported: aws, s3compat, minio, seaweedfs" + ;; + esac + + # Build SAIA public-Service exposure block. + # The AIPlatform reconciler copies AIPlatform.spec.serviceTemplate down to + # each AIService; the SAIA feature reconciler uses it as the spec for the + # public saia-service. For on-prem / airgap customers, NodePort is the + # recommended default (no cloud LB, no cert-manager, browser on VPN can + # reach any node IP for Pattern-B v2 APIs like /query streaming). + local svc_template_yaml="" + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + if [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]]; then + local svc_node_port + svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + svc_template_yaml=" serviceTemplate:"$'\n'" spec:"$'\n'" type: ${svc_type}"$'\n' + if [[ -n "${svc_node_port}" && "${svc_node_port}" != "null" && "${svc_type}" == "NodePort" ]]; then + svc_template_yaml+=" ports:"$'\n'" - name: http"$'\n'" port: 8080"$'\n'" targetPort: 8080"$'\n'" nodePort: ${svc_node_port}"$'\n' + fi + log "SAIA public exposure: ${svc_type}${svc_node_port:+ (nodePort=${svc_node_port})}" + fi + + # Build features YAML from config file (reads aiPlatform.features[] array) + local features_yaml="" + local feature_count + feature_count=$(yq eval '.aiPlatform.features | length' "${CONFIG_FILE}" 2>/dev/null || echo "0") + + if [[ "${feature_count}" -gt 0 ]]; then + log "Reading ${feature_count} feature(s) from config..." 
+ local i=0 + while [[ $i -lt $feature_count ]]; do + local fname fver fsa + fname=$(yq eval ".aiPlatform.features[$i].name" "${CONFIG_FILE}" 2>/dev/null || echo "") + fver=$(yq eval ".aiPlatform.features[$i].version // \"1.0.0\"" "${CONFIG_FILE}" 2>/dev/null || echo "1.0.0") + fsa=$(yq eval ".aiPlatform.features[$i].serviceAccountName // \"\"" "${CONFIG_FILE}" 2>/dev/null || echo "") + if [[ -n "$fname" && "$fname" != "null" ]]; then + features_yaml+=" - name: ${fname}"$'\n' + features_yaml+=" version: \"${fver}\""$'\n' + [[ -n "$fsa" && "$fsa" != "null" ]] && features_yaml+=" serviceAccountName: ${fsa}"$'\n' + log " Feature: ${fname} v${fver}" + fi + i=$((i + 1)) + done + else + log "No features in config — defaulting to saia" + features_yaml=" - name: saia"$'\n'" version: \"1.1.0\""$'\n' + fi + # Apply AIPlatform CR (matching EKS script pattern) log "Applying AIPlatform CR: ${CLUSTER_NAME}-ai-platform" cat < "${phase1_logdir}/cert-manager.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("cert-manager") - # Install AI Platform operator - install_splunk_ai_operator + install_kube_prometheus > "${phase1_logdir}/kube-prometheus.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("kube-prometheus") + + install_nvidia_host_drivers > "${phase1_logdir}/nvidia-drivers.log" 2>&1 & + phase1_pids+=($!); phase1_names+=("nvidia-drivers") + + # Track which phase-1 tasks failed. nvidia-drivers failures are fatal: + # without them the device-plugin crash-loops and the whole GPU stack + # silently fails. Every other phase-1 task is merely warned on failure. 
+ local phase1_fatal_failures=0 + for i in "${!phase1_pids[@]}"; do + if wait "${phase1_pids[$i]}"; then + log " ✓ ${phase1_names[$i]} completed" + else + warn " ✗ ${phase1_names[$i]} had issues" + if [[ "${phase1_names[$i]}" == "nvidia-drivers" ]]; then + phase1_fatal_failures=$((phase1_fatal_failures + 1)) + fi + fi + while IFS= read -r line; do + log " [${phase1_names[$i]}] ${line}" + done < "${phase1_logdir}/${phase1_names[$i]}.log" + done + rm -rf "${phase1_logdir}" + + if [[ ${phase1_fatal_failures} -gt 0 ]]; then + err "NVIDIA driver install failed on at least one GPU node; aborting install. + Device-plugin pods would otherwise crash-loop with NVML: ERROR_LIBRARY_NOT_FOUND + and model pods would stay Pending forever. Fix the errors above and re-run." + fi + + ensure_s3compat_credentials + + # --- Phase 2: cert-manager-dependent components (parallel) --- + log "Phase 2: Installing cert-manager-dependent components in parallel..." + local phase2_pids=() phase2_names=() phase2_logdir + phase2_logdir=$(mktemp -d) + + install_otel_operator_and_contrib_collector > "${phase2_logdir}/otel.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("otel-operator") - # Create image pull secrets from configuration + install_ray_operator > "${phase2_logdir}/ray.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("ray-operator") + + install_splunk_operator > "${phase2_logdir}/splunk-operator.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("splunk-operator") + + install_nvidia_device_plugin > "${phase2_logdir}/nvidia-plugin.log" 2>&1 & + phase2_pids+=($!); phase2_names+=("nvidia-device-plugin") + + for i in "${!phase2_pids[@]}"; do + if wait "${phase2_pids[$i]}"; then + log " ✓ ${phase2_names[$i]} completed" + else + warn " ✗ ${phase2_names[$i]} had issues" + fi + while IFS= read -r line; do + log " [${phase2_names[$i]}] ${line}" + done < "${phase2_logdir}/${phase2_names[$i]}.log" + done + rm -rf "${phase2_logdir}" + + # Create image pull secrets before Splunk Standalone (it uses the 
default SA which needs ECR creds) create_image_pull_secrets "${AI_NS}" - # Install AI Platform CR + # Apply Splunk Standalone CR (non-blocking — pod boots in background) + install_splunk_standalone + + # Install AI Platform operator and CR while Splunk Standalone boots + install_splunk_ai_operator install_ai_platform_cr + # Now wait for Splunk Standalone to be ready (likely already done by now) + wait_for_splunk_standalone + log "AI Platform stack installation complete!" } @@ -1876,13 +2944,12 @@ check_platform_health() { fi log "" - # Check 3: MinIO - log "Checking MinIO..." - if kubectl get pod -n minio-system -l app=minio 2>/dev/null | grep -q "Running"; then - log "✅ MinIO is running" + # Check 3: Object Storage + log "Checking object storage configuration..." + if [[ -n "${OBJ_STORE_ENDPOINT}" ]]; then + log "✅ Object storage configured: ${OBJ_STORE_TYPE} at ${OBJ_STORE_ENDPOINT} (customer-managed)" else - warn "MinIO pod not in Running state" - kubectl get pods -n minio-system + warn "Object storage endpoint not configured" ((health_issues++)) fi log "" @@ -2012,18 +3079,11 @@ show_platform_access_info() { kubectl get nodes -o wide 2>/dev/null || warn "Could not retrieve node information" log "" - # MinIO information - log "🗄️ MinIO (Object Storage):" - log " Console URL: http://localhost:9001" - log " API URL: http://localhost:9000" - log " " - log " 💡 Access MinIO Console:" - log " kubectl port-forward svc/minio -n minio-system 9001:9001" - log " Open: http://localhost:9001" - log " " - log " 🔑 Credentials:" - log " Username: ${MINIO_ACCESS_KEY}" - log " Password: ${MINIO_SECRET_KEY}" + # Object storage information + log "🗄️ Object Storage (customer-managed):" + log " Type: ${OBJ_STORE_TYPE}" + log " Endpoint: ${OBJ_STORE_ENDPOINT}" + log " Bucket: ${OBJ_STORE_BUCKET}" log "" # AI Platform information @@ -2110,6 +3170,10 @@ show_platform_access_info() { # ====== MAIN INSTALL FLOW ====== main_install() { load_config + + validate_image_config + 
configure_images + preflight_checks # Check if existing Kubernetes cluster should be used @@ -2174,6 +3238,25 @@ main_install() { log "" log "Skipping k0s installation, using existing cluster" use_existing_cluster=true + + # Prepare all nodes for OS compatibility (iptables, firewalld, etc.) + local all_node_ips=("${CONTROLLER_IPS[@]}") + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + all_node_ips+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_node_ips[@]}" + + # Ensure all expected workers are joined + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + local current_node_count + current_node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') + local expected_total=$(( ${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]} )) + if [[ "${current_node_count}" -lt "${expected_total}" ]]; then + log "Cluster has ${current_node_count} nodes but ${expected_total} expected — joining missing workers..." + join_workers + fi + fi elif [[ "${USE_EXISTING}" == "force" ]]; then err "useExisting=force but no k0s cluster found on provided nodes" fi @@ -2189,19 +3272,9 @@ main_install() { if [[ "${use_existing_cluster}" == "false" ]]; then log "No existing cluster found, starting k0s cluster installation..." - # Setup infrastructure - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - log "Using existing infrastructure..." - else - log "Creating EC2 instances..." - create_ec2_instances - fi - - # After getting IPs (from config or EC2), check if k0s is already installed - # Parse IPs if from config - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" - fi + # Parse IPs from config + log "Using existing infrastructure..." 
+ IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" # Check if k0s is already running on the controller node if [[ "${#CONTROLLER_IPS[@]}" -gt 0 ]]; then @@ -2210,7 +3283,7 @@ main_install() { if ssh_exec "${controller_ip}" "command -v k0s >/dev/null 2>&1 && sudo k0s status >/dev/null 2>&1"; then log "============================================" - log "✓ k0s cluster already running on EC2 instances!" + log "✓ k0s cluster already running on existing nodes!" log "============================================" log "Retrieving kubeconfig from existing k0s cluster..." mkdir -p "${HOME}/.kube" @@ -2222,6 +3295,25 @@ main_install() { log "" log "Skipping k0s installation, using existing cluster" use_existing_cluster=true + + # Prepare all nodes for OS compatibility (iptables, firewalld, etc.) + local all_node_ips2=("${CONTROLLER_IPS[@]}") + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + all_node_ips2+=("${WORKER_IPS[@]}") + fi + prepare_nodes_for_k0s "${all_node_ips2[@]}" + + # Ensure all expected workers are joined + if [[ -n "${EXISTING_WORKER_IPS}" ]]; then + local current_node_count + current_node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') + local expected_total=$(( ${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]} )) + if [[ "${current_node_count}" -lt "${expected_total}" ]]; then + log "Cluster has ${current_node_count} nodes but ${expected_total} expected — joining missing workers..." 
+ join_workers + fi + fi fi fi @@ -2257,208 +3349,38 @@ main_delete() { log "Starting cleanup of k0s cluster: ${CLUSTER_NAME}" log "============================================" - # For EC2 mode: Just delete AWS resources (instances, security groups) - # Kubernetes resources will be destroyed when instances are terminated - # This is much faster and avoids stuck namespace deletion issues - - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - # On-prem mode: Need to clean Kubernetes resources gracefully - log "On-prem mode detected - performing graceful Kubernetes cleanup..." - - export KUBECONFIG="${HOME}/.kube/k0s-${CLUSTER_NAME}" - - if [[ -f "${KUBECONFIG}" ]] && timeout 10 kubectl cluster-info &>/dev/null; then - log "Deleting Kubernetes resources..." - kubectl delete aiplatform --all -n "${AI_NS}" --timeout=60s || true - kubectl delete namespace "${AI_NS}" --timeout=120s || true - kubectl delete namespace splunk-ai-operator-system --timeout=60s || true - kubectl delete namespace monitoring --timeout=60s || true - fi - # On-prem: Stop k0s on existing infrastructure - IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" - IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" - - log "Stopping k0s on controller nodes..." - for ip in "${CONTROLLER_IPS[@]}"; do - log " Stopping k0s on controller: ${ip}..." - ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" - done - - log "Stopping k0s on worker nodes..." - for ip in "${WORKER_IPS[@]}"; do - log " Stopping k0s on worker: ${ip}..." - ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" - done - - log "k0s stopped on all on-prem nodes" - log "NOTE: Node machines are still running. 
To clean up completely:" - log " - Remove k0s binaries: sudo rm -f /usr/local/bin/k0s" - log " - Clean up data: sudo rm -rf /var/lib/k0s /etc/k0s" - - else - # EC2: Terminate instances - log "============================================" - log "Scanning for resources to delete..." - log "============================================" - - # First, preview what will be deleted - local instance_ids instance_count=0 - instance_ids=$(aws ec2 describe-instances \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - "Name=instance-state-name,Values=running,stopped,stopping" \ - --query 'Reservations[].Instances[].InstanceId' --output text) - - if [[ -n "${instance_ids}" ]]; then - instance_count=$(echo "${instance_ids}" | wc -w) - log "EC2 Instances to terminate: ${instance_count}" - # Show instance details - aws ec2 describe-instances --region "${REGION}" --instance-ids ${instance_ids} \ - --query 'Reservations[].Instances[].[InstanceId,Tags[?Key==`Name`].Value|[0],InstanceType,State.Name]' \ - --output table 2>/dev/null || echo " ${instance_ids}" - else - log "EC2 Instances: None found" - fi - - # Check other resources - local enis=$(aws ec2 describe-network-interfaces --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'NetworkInterfaces[?Status==`available`].NetworkInterfaceId' --output text 2>/dev/null || echo "") - local eni_count=$(echo "${enis}" | wc -w) - log "Network Interfaces: ${eni_count:-0}" - - local sg_id=$(aws ec2 describe-security-groups --region "${REGION}" \ - --filters "Name=group-name,Values=${CLUSTER_NAME}-k0s-sg" "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "") - if [[ -n "${sg_id}" && "${sg_id}" != "None" ]]; then - log "Security Groups: 1 (${sg_id})" - else - log "Security Groups: 0" - fi - - local volumes=$(aws ec2 
describe-volumes --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" "Name=tag:ManagedBy,Values=k0s-script" "Name=status,Values=available" \ - --query 'Volumes[].VolumeId' --output text 2>/dev/null || echo "") - local vol_count=$(echo "${volumes}" | wc -w) - log "EBS Volumes: ${vol_count:-0}" - - log "" - log "All resources are tagged with:" - log " - Cluster: ${CLUSTER_NAME}" - log " - ManagedBy: k0s-script" - log "" + # Graceful Kubernetes cleanup, then stop k0s on all nodes + log "Performing graceful Kubernetes cleanup..." - # Confirmation prompt (skip if AUTO_APPROVE is set) - if [[ "${AUTO_APPROVE:-false}" != "true" ]]; then - warn "This will permanently delete the above AWS resources!" - read -p "Type 'yes' to confirm deletion: " -r - if [[ ! $REPLY =~ ^[Yy]es$ ]]; then - log "Deletion cancelled by user" - exit 0 - fi - fi + export KUBECONFIG="${HOME}/.kube/k0s-${CLUSTER_NAME}" - log "" - log "============================================" - log "Starting resource deletion..." - log "============================================" - log "" + if [[ -f "${KUBECONFIG}" ]] && timeout 10 kubectl cluster-info &>/dev/null; then + log "Deleting Kubernetes resources..." + kubectl delete aiplatform --all -n "${AI_NS}" --timeout=60s || true + kubectl delete namespace "${AI_NS}" --timeout=120s || true + kubectl delete namespace splunk-ai-operator-system --timeout=60s || true + kubectl delete namespace monitoring --timeout=60s || true + fi - # Now proceed with deletion - if [[ -n "${instance_ids}" ]]; then - log "Terminating ${instance_count} EC2 instance(s)..." - aws ec2 terminate-instances --region "${REGION}" --instance-ids ${instance_ids} + IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" - log "Waiting for instances to terminate..." 
- aws ec2 wait instance-terminated --region "${REGION}" --instance-ids ${instance_ids} || warn "Timeout waiting for instances to terminate" + log "Stopping k0s on controller nodes..." + for ip in "${CONTROLLER_IPS[@]}"; do + log " Stopping k0s on controller: ${ip}..." + ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" + done - log "EC2 instances terminated successfully" - else - log "No EC2 instances to terminate" - fi - - # Clean up network interfaces that may be stuck - log "Checking for orphaned network interfaces..." - local enis eni_count=0 - enis=$(aws ec2 describe-network-interfaces \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'NetworkInterfaces[?Status==`available`].NetworkInterfaceId' --output text 2>/dev/null || echo "") - - if [[ -n "${enis}" ]]; then - eni_count=$(echo "${enis}" | wc -w) - log "Found ${eni_count} orphaned network interface(s), deleting..." - for eni in ${enis}; do - log " Deleting network interface: ${eni}" - aws ec2 delete-network-interface --region "${REGION}" --network-interface-id "${eni}" 2>/dev/null || warn "Could not delete ENI ${eni}" - done - else - log "No orphaned network interfaces found" - fi - - # Delete security group (with retries for ENI detachment) - log "Deleting security group..." - local sg_id sg_deleted=false - sg_id=$(aws ec2 describe-security-groups \ - --region "${REGION}" \ - --filters \ - "Name=group-name,Values=${CLUSTER_NAME}-k0s-sg" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "") - - if [[ -n "${sg_id}" && "${sg_id}" != "None" ]]; then - log "Found security group: ${sg_id}" - - # Try multiple times with increasing wait periods - for attempt in 1 2 3 4 5; do - log " Attempt ${attempt}/5 to delete security group..." 
- if aws ec2 delete-security-group --region "${REGION}" --group-id "${sg_id}" 2>/dev/null; then - log "Security group deleted successfully" - sg_deleted=true - break - else - if [[ ${attempt} -lt 5 ]]; then - local wait_time=$((attempt * 15)) - log " Security group still has dependencies, waiting ${wait_time}s for ENIs to detach..." - sleep ${wait_time} - fi - fi - done + log "Stopping k0s on worker nodes..." + for ip in "${WORKER_IPS[@]}"; do + log " Stopping k0s on worker: ${ip}..." + ssh_exec "${ip}" "sudo k0s stop || true; sudo k0s reset --force || true" || warn "Failed to stop k0s on ${ip}" + done - if [[ "${sg_deleted}" == "false" ]]; then - warn "Could not delete security group after 5 attempts (may have dependencies)" - warn "AWS will auto-clean it when dependencies are removed" - fi - else - log "Security group not found or already deleted" - fi - - # Delete any EBS volumes that were created - log "Checking for orphaned EBS volumes..." - local volumes vol_count=0 - volumes=$(aws ec2 describe-volumes \ - --region "${REGION}" \ - --filters \ - "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:ManagedBy,Values=k0s-script" \ - "Name=status,Values=available" \ - --query 'Volumes[].VolumeId' --output text) - - if [[ -n "${volumes}" ]]; then - vol_count=$(echo "${volumes}" | wc -w) - log "Found ${vol_count} orphaned EBS volume(s), deleting..." - for vol in ${volumes}; do - log " Deleting volume: ${vol}" - aws ec2 delete-volume --region "${REGION}" --volume-id "${vol}" && log " Volume ${vol} deleted" || warn " Could not delete volume ${vol}" - done - else - log "No orphaned EBS volumes found" - fi - fi + log "k0s stopped on all nodes" + log "NOTE: Node machines are still running. To clean up completely:" + log " - Remove k0s binaries: sudo rm -f /usr/local/bin/k0s" + log " - Clean up data: sudo rm -rf /var/lib/k0s /etc/k0s" # Clean up local files log "Cleaning up local files..." 
@@ -2475,17 +3397,9 @@ main_delete() { log "Cleanup Summary" log "============================================" - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - log "Infrastructure: On-premises" - log " - k0s stopped and reset on all nodes" - log " - NOTE: Nodes are still running, k0s binaries remain" - else - log "Infrastructure: AWS EC2" - log " - EC2 Instances: ${instance_count:-0} terminated" - log " - Network Interfaces: ${eni_count:-0} cleaned up" - log " - Security Groups: $([ "${sg_deleted}" == "true" ] && echo "1 deleted" || echo "pending cleanup")" - log " - EBS Volumes: ${vol_count:-0} deleted" - fi + log "Infrastructure: On-premises" + log " - k0s stopped and reset on all nodes" + log " - NOTE: Nodes are still running, k0s binaries remain" log "" log "Kubernetes Resources:" @@ -2505,21 +3419,11 @@ main_delete() { log "" log "Cluster '${CLUSTER_NAME}' has been deleted." - if [[ -n "${EXISTING_CONTROLLER_IPS}" ]]; then - log "" - log "On-prem nodes are still running with k0s stopped." - log "To fully clean up each node, run:" - log " sudo rm -f /usr/local/bin/k0s" - log " sudo rm -rf /var/lib/k0s /etc/k0s" - else - # Check if any resources failed to delete - if [[ "${sg_deleted}" == "false" ]]; then - log "" - warn "Some resources may require manual cleanup:" - warn " - Security group ${sg_id} may have lingering dependencies" - warn " - Check AWS console for any remaining resources tagged with Cluster=${CLUSTER_NAME}" - fi - fi + log "" + log "Nodes are still running with k0s stopped." 
+ log "To fully clean up each node, run:" + log " sudo rm -f /usr/local/bin/k0s" + log " sudo rm -rf /var/lib/k0s /etc/k0s" } # ====== CLEAN ALL (AGGRESSIVE CLEANUP) ====== @@ -2567,36 +3471,34 @@ usage() { cat <&1" || echo "not running") + + if echo "${k0s_status}" | grep -q "Kube-api probing successful: true"; then + log " ✓ k0s running and API reachable" + return 0 + elif echo "${k0s_status}" | grep -q "Role: worker"; then + # k0s is running but API not reachable yet + log " ⏳ k0s running but API not yet reachable" + return 1 + else + log " ✗ k0s not running" + return 2 + fi +} + +# ====== THOROUGH WORKER CLEANUP ====== +# Completely clean up k0s on a worker node (for fresh rejoin) +cleanup_worker_k0s() { + local worker_ip="$1" + + log " Performing thorough k0s cleanup on ${worker_ip}..." + + ssh_exec "${worker_ip}" " + sudo systemctl stop k0sworker 2>/dev/null || true + sudo systemctl disable k0sworker 2>/dev/null || true + sudo systemctl reset-failed k0sworker 2>/dev/null || true + sudo pkill -9 k0s 2>/dev/null || true + sudo pkill -9 kubelet 2>/dev/null || true + sudo pkill -9 containerd-shim 2>/dev/null || true + sudo rm -f /etc/systemd/system/k0sworker.service + sudo rm -rf /var/lib/k0s /run/k0s /etc/k0s /tmp/k0s-token + sudo rm -f /run/k0s/containerd.sock 2>/dev/null || true + sudo systemctl daemon-reload + " 2>/dev/null || true + + log " ✓ Cleanup complete" +} + # ====== JOIN WORKERS (Resume/Retry Worker Joins) ====== join_workers() { log "============================================" @@ -2638,78 +3583,83 @@ join_workers() { err "Kubeconfig not found at ${KUBECONFIG}. Please run 'install' first." fi - # Get controller IP from existing cluster + # Get IPs from config log "Detecting cluster configuration..." 
+ IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" + IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" - # Option 1: Get from EC2 instances - if [[ -z "${EXISTING_CONTROLLER_IPS}" ]]; then - log "Discovering EC2 instances for cluster: ${CLUSTER_NAME}..." - - # Get controller IPs - local controller_ips - controller_ips=$(aws ec2 describe-instances --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:Role,Values=controller" \ - "Name=instance-state-name,Values=running" \ - --query 'Reservations[*].Instances[*].PublicIpAddress' \ - --output text) - - if [[ -z "${controller_ips}" ]]; then - err "No running controller instances found for cluster ${CLUSTER_NAME}" - fi - - # Convert newlines and tabs to spaces, then split into array - controller_ips=$(echo "${controller_ips}" | tr '\n\t' ' ') - IFS=' ' read -ra CONTROLLER_IPS <<< "${controller_ips}" - - # Get worker IPs - local worker_ips - worker_ips=$(aws ec2 describe-instances --region "${REGION}" \ - --filters "Name=tag:Cluster,Values=${CLUSTER_NAME}" \ - "Name=tag:Role,Values=cpu-worker,gpu-worker" \ - "Name=instance-state-name,Values=running" \ - --query 'Reservations[*].Instances[*].PublicIpAddress' \ - --output text) - - if [[ -z "${worker_ips}" ]]; then - warn "No worker instances found for cluster ${CLUSTER_NAME}" - log "Nothing to join, exiting." - return 0 - fi - - # Convert newlines and tabs to spaces, then split into array - worker_ips=$(echo "${worker_ips}" | tr '\n\t' ' ') - IFS=' ' read -ra WORKER_IPS <<< "${worker_ips}" - SSH_KEY_PATH="${HOME}/.ssh/${KEY_NAME}.pem" - else - # Option 2: Use existing IPs from config - IFS=' ' read -ra CONTROLLER_IPS <<< "${EXISTING_CONTROLLER_IPS}" - IFS=' ' read -ra WORKER_IPS <<< "${EXISTING_WORKER_IPS}" + if [[ ${#WORKER_IPS[@]} -eq 0 ]]; then + warn "No worker IPs found in config" + log "Nothing to join, exiting." 
+ return 0 fi local controller_ip="${CONTROLLER_IPS[0]}" log "Controller IP: ${controller_ip}" log "Worker IPs: ${WORKER_IPS[*]}" - # Check which workers are already joined + # Check which workers are already joined AND healthy log "Checking current cluster nodes..." kubectl get nodes -o wide || true local already_joined_ips=() + local needs_rejoin_ips=() + + # Get all cluster nodes once for matching + local cluster_nodes_json + cluster_nodes_json=$(kubectl get nodes -o json 2>/dev/null || echo '{"items":[]}') + for worker_ip in "${WORKER_IPS[@]}"; do - # Check if node with this IP already exists in cluster - local node_exists - node_exists=$(kubectl get nodes -o json | jq -r ".items[] | select(.status.addresses[]? | select(.type==\"InternalIP\" and .address==\"${worker_ip}\")) | .metadata.name" 2>/dev/null || echo "") + # Resolve the Kubernetes node name by SSHing to the worker and getting its hostname + local node_exists="" + node_exists=$(resolve_node_name "${worker_ip}") + + # Verify this node actually exists in the cluster + if [[ -n "${node_exists}" ]]; then + local found_in_cluster + found_in_cluster=$(echo "${cluster_nodes_json}" | jq -r --arg name "${node_exists}" \ + '.items[] | select(.metadata.name==$name) | .metadata.name' 2>/dev/null | head -1 || echo "") + if [[ -z "${found_in_cluster}" ]]; then + node_exists="" + fi + fi if [[ -n "${node_exists}" ]]; then - log " ✓ Worker ${worker_ip} already joined as ${node_exists}" - already_joined_ips+=("${worker_ip}") + # Node exists in cluster, check if it's Ready + local node_ready + node_ready=$(echo "${cluster_nodes_json}" | jq -r --arg name "${node_exists}" \ + '.items[] | select(.metadata.name==$name) | .status.conditions[] | select(.type=="Ready") | .status' 2>/dev/null || echo "Unknown") + + if [[ "${node_ready}" == "True" ]]; then + log " ✓ Worker ${worker_ip} joined and Ready as ${node_exists}" + already_joined_ips+=("${worker_ip}") + else + log " ⚠ Worker ${worker_ip} exists as ${node_exists} but not 
Ready (${node_ready})" + needs_rejoin_ips+=("${worker_ip}") + fi else - log " ✗ Worker ${worker_ip} not joined yet" + # Node doesn't exist in cluster, check k0s status on worker + log " Checking k0s status on ${worker_ip}..." + if verify_worker_status "${worker_ip}" "${controller_ip}"; then + log " ⏳ Worker ${worker_ip} k0s running, waiting for cluster sync..." + # Give it more time to appear in cluster + else + log " ✗ Worker ${worker_ip} not properly connected" + needs_rejoin_ips+=("${worker_ip}") + fi fi done + # If all workers are joined, nothing to do + if [[ ${#already_joined_ips[@]} -eq ${#WORKER_IPS[@]} ]]; then + log "" + log "✓ All ${#WORKER_IPS[@]} workers are already joined and healthy!" + kubectl get nodes -o wide + return 0 + fi + # Generate worker token from controller + log "" log "Generating worker join token..." local worker_token worker_token=$(ssh_exec "${controller_ip}" "sudo k0s token create --role=worker" 2>/dev/null) @@ -2718,26 +3668,38 @@ join_workers() { err "Failed to generate worker token from controller" fi - log "Worker token generated successfully" + log "Worker token generated successfully (${#worker_token} chars)" - # Install and join workers that aren't already joined + # Join workers that need to be joined/rejoined local workers_joined=0 + local workers_to_process=() + + # Build list of workers to process (use ${arr[@]+...} to avoid unbound-variable on empty arrays) for worker_ip in "${WORKER_IPS[@]}"; do - # Skip if already joined - local skip_worker=false + local skip=false if [[ ${#already_joined_ips[@]} -gt 0 ]]; then for joined_ip in "${already_joined_ips[@]}"; do if [[ "${joined_ip}" == "${worker_ip}" ]]; then - skip_worker=true + skip=true break fi done fi - - if [[ "${skip_worker}" == "true" ]]; then - continue + if [[ "${skip}" == "false" ]]; then + workers_to_process+=("${worker_ip}") fi + done + + log "" + log "Workers to join/rejoin: ${workers_to_process[*]:-none}" + if [[ ${#workers_to_process[@]} -eq 0 ]]; then + 
log "No workers need joining" + return 0 + fi + + for worker_ip in "${workers_to_process[@]}"; do + log "" log "============================================" log "Joining worker: ${worker_ip}" log "============================================" @@ -2754,15 +3716,17 @@ join_workers() { log " ✓ k0s already installed" fi - # Stop k0s if it's running (to rejoin cleanly) - log " Stopping any existing k0s worker process..." - ssh_exec "${worker_ip}" "sudo k0s stop 2>/dev/null || true" - ssh_exec "${worker_ip}" "sudo k0s reset 2>/dev/null || true" + # Ensure k0s is in sudo's secure_path (some distros exclude /usr/local/bin) + ssh_exec "${worker_ip}" "if [ -f /usr/local/bin/k0s ] && [ ! -f /usr/bin/k0s ]; then sudo ln -sf /usr/local/bin/k0s /usr/bin/k0s; fi" || true - # Install worker + # Thorough cleanup before rejoining (handles stale configurations) + cleanup_worker_k0s "${worker_ip}" + + # RHEL/Fedora compatibility (firewalld, kernel modules, python3-pyyaml, k0s binary) + prepare_nodes_for_k0s "${worker_ip}" + + # Install worker with fresh token log " Installing k0s worker configuration..." - # Write token to temp file first (stdin pipe doesn't work reliably over SSH) - # Note: Token file must remain until worker bootstraps, so we don't delete it here if ssh_exec "${worker_ip}" "echo '${worker_token}' | sudo tee /tmp/k0s-token >/dev/null && sudo k0s install worker --token-file=/tmp/k0s-token"; then log " ✓ Worker configuration installed" else @@ -2770,22 +3734,36 @@ join_workers() { continue fi - # Start worker + # Start worker using systemctl (more reliable than k0s start) log " Starting k0s worker..." 
- if ssh_exec "${worker_ip}" "sudo k0s start"; then - log " ✓ Worker started successfully" - workers_joined=$((workers_joined + 1)) + if ssh_exec "${worker_ip}" "sudo systemctl start k0sworker"; then + log " ✓ Worker service started" else warn " Failed to start k0s worker on ${worker_ip}" - continue + # Try fallback + ssh_exec "${worker_ip}" "sudo k0s start" || continue + fi + + # Wait briefly and verify + log " Waiting for worker to initialize (15s)..." + sleep 15 + + # Verify worker status + if verify_worker_status "${worker_ip}" "${controller_ip}"; then + log " ✓ Worker ${worker_ip} connected successfully!" + workers_joined=$((workers_joined + 1)) + else + warn " Worker ${worker_ip} may still be connecting..." + workers_joined=$((workers_joined + 1)) # Count as attempted fi done if [[ ${workers_joined} -gt 0 ]]; then log "" - log "Waiting for workers to join cluster (60s)..." - sleep 60 + log "Waiting for workers to appear in cluster (45s)..." + sleep 45 + log "" log "Current cluster nodes:" kubectl get nodes -o wide @@ -2796,11 +3774,23 @@ join_workers() { log "" log "============================================" - log "✓ Successfully joined ${workers_joined} worker(s)" + log "✓ Processed ${workers_joined} worker(s)" log "============================================" + + # Final verification + local final_count + final_count=$(kubectl get nodes --no-headers | wc -l) + local expected_count=$((${#CONTROLLER_IPS[@]} + ${#WORKER_IPS[@]})) + + if [[ ${final_count} -ge ${expected_count} ]]; then + log "✓ All ${expected_count} nodes are now in the cluster!" + else + warn "Only ${final_count}/${expected_count} nodes in cluster. Some workers may need more time." + warn "Run '$0 join-workers' again if workers don't appear within a few minutes." 
+ fi else log "" - log "All workers already joined or no new workers to join" + log "No workers needed to be joined" fi } diff --git a/tools/cluster_setup/refresh_ecr_credentials.sh b/tools/cluster_setup/refresh_ecr_credentials.sh new file mode 100755 index 0000000..24abeef --- /dev/null +++ b/tools/cluster_setup/refresh_ecr_credentials.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# refresh_ecr_credentials.sh - Refresh ECR image pull secrets using an ECR token +# +# Usage (run on controller node): +# ./refresh_ecr_credentials.sh # auto-fetches token via aws cli +# ./refresh_ecr_credentials.sh "$(aws ecr get-login-password --region us-east-2)" # pass token +# ECR_TOKEN=xxxx ./refresh_ecr_credentials.sh # pass via env +set -euo pipefail + +ECR_ACCOUNT="${ECR_ACCOUNT:-658391232643}" +ECR_REGION="${ECR_REGION:-us-east-2}" +ECR_SERVER="${ECR_ACCOUNT}.dkr.ecr.${ECR_REGION}.amazonaws.com" +NAMESPACES="${TARGET_NAMESPACES:-ai-platform splunk-ai-operator-system}" +KUBECTL="${KUBECTL:-k0s kubectl}" + +info() { echo "[INFO] $*"; } +error() { echo "[ERROR] $*" >&2; } + +# --- Get ECR token: argument > env > auto-fetch --- +TOKEN="${1:-${ECR_TOKEN:-}}" + +if [[ -z "$TOKEN" ]]; then + info "No token provided, fetching via: aws ecr get-login-password --region ${ECR_REGION}" + TOKEN=$(aws ecr get-login-password --region "${ECR_REGION}" 2>/dev/null || true) +fi + +if [[ -z "$TOKEN" ]]; then + error "Failed to get ECR token." + error "Usage: $0 \"\$(aws ecr get-login-password --region ${ECR_REGION})\"" + exit 1 +fi +info "ECR token obtained (${#TOKEN} chars)" + +# --- Step 1: Update ecr-registry-secret in all namespaces --- +for ns in ${NAMESPACES}; do + info "Updating ecr-registry-secret in ${ns}..." 
+ $KUBECTL -n "${ns}" delete secret ecr-registry-secret 2>/dev/null || true + $KUBECTL -n "${ns}" create secret docker-registry ecr-registry-secret \ + --docker-server="${ECR_SERVER}" \ + --docker-username=AWS \ + --docker-password="${TOKEN}" && \ + info " ✓ ecr-registry-secret refreshed in ${ns}" || \ + error " Failed to create ecr-registry-secret in ${ns}" +done + +# --- Step 2: Delete pods stuck in ImagePullBackOff --- +info "Cleaning up ImagePullBackOff pods..." +for ns in ${NAMESPACES}; do + backoff_pods=$($KUBECTL -n "${ns}" get pods 2>/dev/null \ + | grep -i "ImagePullBackOff\|ErrImagePull" \ + | awk '{print $1}' || true) + + if [[ -n "$backoff_pods" ]]; then + while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + $KUBECTL -n "${ns}" delete pod "${pod}" --grace-period=0 --force 2>/dev/null && \ + info " Deleted: ${pod}" || true + done <<< "$backoff_pods" + else + info " No stuck pods in ${ns}" + fi +done + +# --- Step 3: Restart deployments that use ECR images --- +info "Restarting ECR-based deployments..." +for ns in ${NAMESPACES}; do + for dep in $($KUBECTL -n "${ns}" get deployments -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null); do + [[ -z "$dep" ]] && continue + images=$($KUBECTL -n "${ns}" get deployment "${dep}" -o jsonpath='{.spec.template.spec.containers[*].image}' 2>/dev/null || "") + if echo "$images" | grep -q "${ECR_ACCOUNT}" 2>/dev/null; then + $KUBECTL -n "${ns}" rollout restart deployment "${dep}" 2>/dev/null && \ + info " Restarted: ${dep}" || true + fi + done +done + +echo "" +info "==========================================" +info "ECR credentials refreshed!" +info " Server: ${ECR_SERVER}" +info " Namespaces: ${NAMESPACES}" +info "" +info " Token expires in ~12 hours. Re-run this script to refresh." 
+info "==========================================" diff --git a/tools/cluster_setup/splunk-operator-cluster.yaml b/tools/cluster_setup/splunk-operator-cluster.yaml index 06573be..467879e 100644 --- a/tools/cluster_setup/splunk-operator-cluster.yaml +++ b/tools/cluster_setup/splunk-operator-cluster.yaml @@ -55428,7 +55428,7 @@ spec: - name: WATCH_NAMESPACE value: "" - name: RELATED_IMAGE_SPLUNK_ENTERPRISE - value: docker.io/splunk/splunk:10.2.0-dev1 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom - name: OPERATOR_NAME value: splunk-operator - name: SPLUNK_GENERAL_TERMS