diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 23a5274..eb384ce 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -14,6 +14,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -86,6 +99,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). 
Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -170,6 +196,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. 
+ AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -230,6 +269,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -280,6 +332,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). 
Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -330,6 +395,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. 
+ AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -371,6 +449,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -427,6 +518,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). 
Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -453,6 +557,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. 
+ AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -504,6 +621,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -560,6 +690,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). 
Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -606,6 +749,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. 
+ AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -638,6 +794,19 @@ applications: S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index 75d9d0a..c018395 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -706,6 +706,49 @@ func buildSAIABaseEnv(ai *aiv1.AIService) []corev1.EnvVar { ) } + return appendSAIABoto3Env(ai, env) +} + +// appendSAIABoto3Env adds boto3-canonical AWS_* env vars for all SAIA pods (v1 and v2). 
+// SAIA v1 calls boto3 directly and does not read S3COMPAT_OBJECT_STORE_*; without +// AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY k0s deployments with static keys fail with +// NoCredentialsError at startup. +func appendSAIABoto3Env(ai *aiv1.AIService, env []corev1.EnvVar) []corev1.EnvVar { + if ai.Spec.TaskVolume.Endpoint != "" { + env = append(env, corev1.EnvVar{ + Name: "AWS_ENDPOINT_URL", + Value: ai.Spec.TaskVolume.Endpoint, + }) + } + if r := strings.TrimSpace(ai.Spec.TaskVolume.Region); r != "" { + env = append(env, + corev1.EnvVar{Name: "AWS_REGION", Value: r}, + corev1.EnvVar{Name: "AWS_DEFAULT_REGION", Value: r}, + ) + } + if ai.Spec.TaskVolume.SecretRef != "" { + sn := ai.Spec.TaskVolume.SecretRef + env = append(env, + corev1.EnvVar{ + Name: "AWS_ACCESS_KEY_ID", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_access_key", + }, + }, + }, + corev1.EnvVar{ + Name: "AWS_SECRET_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_secret_key", + }, + }, + }, + ) + } return env } @@ -796,41 +839,6 @@ func buildV2ExtraEnv(ai *aiv1.AIService) []corev1.EnvVar { corev1.EnvVar{Name: "CONVERSATION_S3_BUCKET", Value: bucketName}, ) } - // Only expose AWS_ENDPOINT_URL when the operator was configured with an - // explicit S3-compatible endpoint (SeaweedFS/MinIO). Omitting it lets the - // v2 adapter use the default AWS S3 endpoint when running in a real cloud - // deployment. - if ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, corev1.EnvVar{ - Name: "AWS_ENDPOINT_URL", - Value: ai.Spec.TaskVolume.Endpoint, - }) - } - // boto3-canonical credentials for the v2 S3StorageAdapter. 
Mirrors the - // S3COMPAT_OBJECT_STORE_ACCESS_KEY/_SECRET_KEY plumbing in buildSAIABaseEnv; - // see s3compat secret schema in raybuilder/builder.go and ai.Spec.TaskVolume.SecretRef. - if ai.Spec.TaskVolume.SecretRef != "" { - env = append(env, - corev1.EnvVar{ - Name: "AWS_ACCESS_KEY_ID", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, - Key: "s3_access_key", - }, - }, - }, - corev1.EnvVar{ - Name: "AWS_SECRET_ACCESS_KEY", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, - Key: "s3_secret_key", - }, - }, - }, - ) - } return env } @@ -940,49 +948,7 @@ func (r *SaiaReconciler) reconcileSAIADeployment( {Name: "config-volume", MountPath: "/etc/config"}, } - // Base env: keep ONLY dynamic values here. - weaviatePlatformURL := fmt.Sprintf("http://%s:80", ai.Spec.VectorDbUrl) - env := []corev1.EnvVar{ - // Dynamic or runtime-derived values: - {Name: "PLATFORM_URL", Value: ai.Spec.AIPlatformUrl}, - {Name: "WEAVIATE_PLATFORM_URL", Value: weaviatePlatformURL}, - {Name: "VECTOR_DB_URL", Value: ai.Spec.VectorDbUrl}, - // SAIA uses /tasks subdirectory within its feature path - // Extract just the bucket name from the full path (e.g., "s3://bucket-name" -> "bucket-name") - {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, - } - - // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL and S3COMPAT_OBJECT_STORE_BUCKET for custom endpoint (MinIO, SeaweedFS, etc.). 
- if ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, - corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}, - corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, - ) - } - - // S3-compatible object store credentials from secretRef (S3COMPAT_OBJECT_STORE_ACCESS_KEY, S3COMPAT_OBJECT_STORE_SECRET_KEY). - if ai.Spec.TaskVolume.SecretRef != "" { - env = append(env, - corev1.EnvVar{ - Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, - Key: "s3_access_key", - }, - }, - }, - corev1.EnvVar{ - Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, - Key: "s3_secret_key", - }, - }, - }, - ) - } + env := buildSAIABaseEnv(ai) // mTLS handling (dynamic) if ai.Spec.MTLS.Enabled && ai.Spec.MTLS.Termination == "operator" { diff --git a/pkg/ai/features/saia/impl_test.go b/pkg/ai/features/saia/impl_test.go index e368531..d1ce50b 100644 --- a/pkg/ai/features/saia/impl_test.go +++ b/pkg/ai/features/saia/impl_test.go @@ -789,19 +789,17 @@ func sanitize(s string) string { } func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { - // Explicit AIService with seaweedfs-style endpoint → AWS_ENDPOINT_URL is set. 
t.Run("with S3-compatible endpoint", func(t *testing.T) { ai := newTestAIService() // already sets TaskVolume.Endpoint = "http://seaweedfs:8333" envMap := envToMap(buildV2ExtraEnv(ai)) + baseMap := envToMap(buildSAIABaseEnv(ai)) assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) assert.Equal(t, "field-descriptions/global-field-descriptions.json", envMap["FIELD_DESCRIPTION_S3_KEY"]) - assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) + assert.Equal(t, "http://seaweedfs:8333", baseMap["AWS_ENDPOINT_URL"]) }) - // No explicit endpoint (= real AWS S3 deployment) → AWS_ENDPOINT_URL must - // be omitted so boto3 falls back to the default AWS regional endpoint. t.Run("without S3-compatible endpoint", func(t *testing.T) { ai := newTestAIService() ai.Spec.TaskVolume.Endpoint = "" @@ -810,54 +808,6 @@ func Test_buildV2ExtraEnv_FieldDescriptionBackend(t *testing.T) { assert.Equal(t, "s3", envMap["FIELD_DESCRIPTION_BACKEND"]) assert.Equal(t, "field-descriptions/global-field-descriptions.json", envMap["FIELD_DESCRIPTION_S3_KEY"]) - _, has := envMap["AWS_ENDPOINT_URL"] - assert.False(t, has, - "AWS_ENDPOINT_URL must be omitted when TaskVolume.Endpoint is empty (cloud S3 case)") - }) - - // SecretRef present → AWS_ACCESS_KEY_ID/SECRET sourced from same keys as - // the S3COMPAT_* envs in buildSAIABaseEnv. Required so that the v2 - // S3StorageAdapter (used by S3FieldDescriptionRepository) can authenticate - // to SeaweedFS / MinIO. 
- t.Run("AWS credentials sourced from SecretRef", func(t *testing.T) { - ai := newTestAIService() // already sets SecretRef = "s3-creds" - env := buildV2ExtraEnv(ai) - - var foundID, foundSecret bool - for _, e := range env { - if e.Name == "AWS_ACCESS_KEY_ID" { - foundID = true - if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { - assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) - assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) - } - } - if e.Name == "AWS_SECRET_ACCESS_KEY" { - foundSecret = true - if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { - assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) - assert.Equal(t, "s3_secret_key", e.ValueFrom.SecretKeyRef.Key) - } - } - } - assert.True(t, foundID, "AWS_ACCESS_KEY_ID must be present so boto3 can auth to S3-compat endpoint") - assert.True(t, foundSecret, "AWS_SECRET_ACCESS_KEY must be present so boto3 can auth to S3-compat endpoint") - }) - - // No SecretRef → AWS_* must be omitted (cloud deployments use IAM role, - // not env-var creds; setting empty values would otherwise mask the IAM - // chain inside boto3). 
- t.Run("AWS credentials omitted when SecretRef empty", func(t *testing.T) { - ai := newTestAIService() - ai.Spec.TaskVolume.SecretRef = "" - env := buildV2ExtraEnv(ai) - - for _, e := range env { - assert.NotEqual(t, "AWS_ACCESS_KEY_ID", e.Name, - "AWS_ACCESS_KEY_ID must be omitted in cloud (IAM-role) case") - assert.NotEqual(t, "AWS_SECRET_ACCESS_KEY", e.Name, - "AWS_SECRET_ACCESS_KEY must be omitted in cloud (IAM-role) case") - } }) } @@ -938,6 +888,7 @@ func Test_buildSAIABaseEnv(t *testing.T) { assert.Equal(t, "test-bucket", envMap["S3_BUCKET"]) assert.Equal(t, "http://seaweedfs:8333", envMap["S3COMPAT_OBJECT_STORE_ENDPOINT_URL"]) assert.Equal(t, "test-bucket", envMap["S3COMPAT_OBJECT_STORE_BUCKET"]) + assert.Equal(t, "http://seaweedfs:8333", envMap["AWS_ENDPOINT_URL"]) // S3 creds come from secretRef found := false @@ -949,6 +900,54 @@ func Test_buildSAIABaseEnv(t *testing.T) { } } assert.True(t, found, "S3COMPAT_OBJECT_STORE_ACCESS_KEY should be present") + + t.Run("AWS credentials sourced from SecretRef", func(t *testing.T) { + var foundID, foundSecret bool + for _, e := range env { + if e.Name == "AWS_ACCESS_KEY_ID" { + foundID = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_access_key", e.ValueFrom.SecretKeyRef.Key) + } + } + if e.Name == "AWS_SECRET_ACCESS_KEY" { + foundSecret = true + if assert.NotNil(t, e.ValueFrom) && assert.NotNil(t, e.ValueFrom.SecretKeyRef) { + assert.Equal(t, "s3-creds", e.ValueFrom.SecretKeyRef.Name) + assert.Equal(t, "s3_secret_key", e.ValueFrom.SecretKeyRef.Key) + } + } + } + assert.True(t, foundID, "AWS_ACCESS_KEY_ID must be present for boto3 (v1 and v2)") + assert.True(t, foundSecret, "AWS_SECRET_ACCESS_KEY must be present for boto3 (v1 and v2)") + }) + + t.Run("AWS region from TaskVolume.Region", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Region = "ap-southeast-2" + envMap := 
envToMap(buildSAIABaseEnv(ai)) + assert.Equal(t, "ap-southeast-2", envMap["AWS_REGION"]) + assert.Equal(t, "ap-southeast-2", envMap["AWS_DEFAULT_REGION"]) + }) + + t.Run("without S3-compatible endpoint", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.Endpoint = "" + envMap := envToMap(buildSAIABaseEnv(ai)) + _, has := envMap["AWS_ENDPOINT_URL"] + assert.False(t, has, + "AWS_ENDPOINT_URL must be omitted when TaskVolume.Endpoint is empty (cloud S3 case)") + }) + + t.Run("AWS credentials omitted when SecretRef empty", func(t *testing.T) { + ai := newTestAIService() + ai.Spec.TaskVolume.SecretRef = "" + for _, e := range buildSAIABaseEnv(ai) { + assert.NotEqual(t, "AWS_ACCESS_KEY_ID", e.Name) + assert.NotEqual(t, "AWS_SECRET_ACCESS_KEY", e.Name) + } + }) } func Test_extractBucketName(t *testing.T) { diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index a50b8b7..adf05ce 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -47,14 +47,103 @@ type ApplicationParams struct { ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` ArtifactsProvider string `yaml:"ARTIFACTS_PROVIDER"` CloudProvider string `yaml:"CLOUD_PROVIDER"` + Region string `yaml:"AWS_REGION"` S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` - Replicas map[string]int32 `yaml:"REPLICAS"` - ModelVersion string `yaml:"MODEL_VERSION"` + Replicas map[string]int32 `yaml:"REPLICAS"` + ModelVersion string `yaml:"MODEL_VERSION"` AcceleratorType string `yaml:"ACCELERATOR_TYPE"` } +// classifyObjectStorage maps an AIPlatform objectStorage URL scheme + endpoint +// pair to the (cloudProvider, artifactsProvider, needsS3CompatCreds) tuple +// expected by the SAIA / ML-platform SDK that runs inside Ray Serve replicas. 
+// +// SDK contract (see /home/ray/sdk/storage/factory.py in the ai-platform-models +// image): CLOUD_PROVIDER accepts the values emitted below, including +// "s3compat" for S3-compatible backends such as MinIO and SeaweedFS. When +// CLOUD_PROVIDER is "s3compat", the SDK uses the S3COMPAT_OBJECT_STORE_* +// env vars for endpoint and credentials while still speaking the S3 API with +// SigV4-compatible request signing. +// +// Decision table: +// +// scheme=s3, endpoint empty → ("aws", "s3", needsCreds=true) ← AWS S3 default URL +// scheme=s3, endpoint matches AWS host → ("aws", "s3", needsCreds=true) ← installer-set regional URL +// scheme=s3, endpoint set to non-AWS → ("s3compat", "s3", needsCreds=true) ← MinIO/SeaweedFS behind s3:// +// scheme=s3compat|minio|seaweedfs → ("s3compat", "s3", needsCreds=true) +// scheme=gs|gcs → ("gcp", "gcs", needsCreds=false) +// scheme=azure → ("azure", "azure", needsCreds=false) +// other / unknown → ("azure", "azure", needsCreds=false) +// +// `needsS3CompatCreds` is true whenever the resolved provider can use the +// S3COMPAT_*/AWS_* credential set in the ObjectStorage Secret. Callers gate +// secret loading on it AND on a non-empty SecretRef. +func classifyObjectStorage(scheme, endpoint string) (cloudProvider, artifactsProvider string, needsS3CompatCreds bool) { + switch scheme { + case "s3": + artifactsProvider = "s3" + ep := strings.TrimSpace(endpoint) + if ep == "" || isAWSRegionalEndpoint(ep) { + // Real AWS S3 — either no endpoint (boto3 derives from region) or an + // AWS regional URL (e.g. https://s3.us-east-2.amazonaws.com, which + // the k0s installer requires non-empty even for type=aws). + cloudProvider = "aws" + } else { + // s3:// against a non-AWS endpoint = S3-compatible store (MinIO, + // SeaweedFS, etc.). Keep the s3compat code path so the SDK reads + // S3COMPAT_* env vars. 
+ cloudProvider = "s3compat" + } + needsS3CompatCreds = true + case "s3compat", "minio", "seaweedfs": + cloudProvider = "s3compat" + artifactsProvider = "s3" + needsS3CompatCreds = true + case "gs", "gcs": + cloudProvider = "gcp" + artifactsProvider = "gcs" + case "azure": + cloudProvider = "azure" + artifactsProvider = "azure" + default: + // Unknown scheme: preserve the legacy default (azure) rather than + // failing — the operator hasn't validated this scheme until now and a + // hard error here would break running clusters during upgrade. + cloudProvider = "azure" + artifactsProvider = "azure" + } + return +} + +// isAWSRegionalEndpoint returns true for AWS S3 regional endpoints such as: +// +// https://s3.us-east-2.amazonaws.com +// https://s3-fips.us-east-1.amazonaws.com +// https://bucket-name.s3.us-east-2.amazonaws.com (virtual-hosted-style) +// https://s3.dualstack.us-east-1.amazonaws.com +// +// We need this because the k0s installer requires a non-empty +// objectStore.endpoint even for type=aws (see preflight in +// tools/cluster_setup/k0s_cluster_with_stack.sh:434), so an empty-endpoint +// check alone is not sufficient to identify real AWS S3. +// +// The match is intentionally narrow: host must end in `.amazonaws.com` AND +// contain `s3` somewhere in the host (case-insensitive). This catches the +// endpoint forms listed above but not `.amazonaws.com.cn` (China) hosts; it +// rejects AWS hosts without `s3` (e.g. `lambda.us-east-1.amazonaws.com`) and +// any host not ending in `.amazonaws.com`. Returns false on parse error or empty +// host (caller already handles the empty-endpoint case).
+func isAWSRegionalEndpoint(endpoint string) bool { + u, err := url.Parse(strings.TrimSpace(endpoint)) + if err != nil || u.Hostname() == "" { + return false + } + host := strings.ToLower(u.Hostname()) + return strings.HasSuffix(host, ".amazonaws.com") && strings.Contains(host, "s3") +} + type WorkerConfigs map[string][]InstanceDetail type InstanceDetail struct { @@ -103,31 +192,11 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl return err } - // Set CloudProvider and artifacts provider/bucket from URL scheme (for SDK model loaders). - // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio/seaweedfs/s3compat -> "s3", gs/gcs -> "gcs", azure -> "azure". - // S3 (AWS) uses cloudProvider "aws" when no custom endpoint; s3compat/minio/seaweedfs use "s3compat". - var cloudProvider, artifactsProvider string - switch u.Scheme { - case "s3": - if p.Spec.ObjectStorage.Endpoint != "" { - cloudProvider = "s3compat" - } else { - cloudProvider = "aws" - } - artifactsProvider = "s3" - case "s3compat", "minio", "seaweedfs": - cloudProvider = "s3compat" - artifactsProvider = "s3" - case "gs", "gcs": - cloudProvider = "gcp" - artifactsProvider = "gcs" - case "azure": - cloudProvider = "azure" - artifactsProvider = "azure" - default: - cloudProvider = "azure" - artifactsProvider = "azure" - } + // Classify object-storage URL into the (CLOUD_PROVIDER, ARTIFACTS_PROVIDER, + // needsCreds) tuple the SAIA / ai-platform SDK consumes via runtime_env + // env vars. See classifyObjectStorage doc-comment for the full decision + // table, including AWS regional-endpoint detection. 
+ cloudProvider, artifactsProvider, needsS3CompatCreds := classifyObjectStorage(u.Scheme, p.Spec.ObjectStorage.Endpoint) // Initialize the replicas map by iterating through features replicasMap := make(map[string]int32) @@ -164,19 +233,34 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } - // S3-compatible backends (s3compat, minio, seaweedfs) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. - s3CompatScheme := (u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs") + // S3-compatible endpoint is only meaningful when the classifier picked the + // s3compat code path. For real AWS (cloudProvider=aws) we leave it empty so + // boto3 falls through to the default regional URL derived from AWS_REGION. s3CompatObjectStoreEndpoint := "" - if s3CompatScheme && p.Spec.ObjectStorage.Endpoint != "" { + if cloudProvider == "s3compat" && p.Spec.ObjectStorage.Endpoint != "" { s3CompatObjectStoreEndpoint = p.Spec.ObjectStorage.Endpoint } + // Load S3 credentials from the operator-managed Secret whenever the chosen + // provider can use them (aws OR s3compat). The Secret is the single + // source of truth — `s3_access_key`/`s3_secret_key` populate both the + // boto3-standard AWS_* env vars (consumed by the AWS code path) and the + // S3COMPAT_* env vars (consumed by the s3compat shim). Templating both + // pairs is safe because each code path only reads its own set. + // + // Previously this block was gated behind s3CompatScheme, which silently + // skipped credential injection for real-AWS deployments and produced + // `botocore.exceptions.NoCredentialsError` inside every Serve replica + // when the cluster lacked IRSA / EC2 instance-profile credentials (true + // for k0s on bare-metal / non-EKS deployments). 
var s3CompatObjectStoreAccessKey, s3CompatObjectStoreSecretKey string - if p.Spec.ObjectStorage.SecretRef != "" && s3CompatScheme { + if p.Spec.ObjectStorage.SecretRef != "" && needsS3CompatCreds { var secret corev1.Secret secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} if err := b.Get(ctx, secretRef, &secret); err != nil { - logger.Error(err, "Failed to get object storage secret for S3-compatible credentials", "secret", p.Spec.ObjectStorage.SecretRef) + logger.Error(err, "Failed to get object storage credentials Secret", + "secret", p.Spec.ObjectStorage.SecretRef, + "cloudProvider", cloudProvider) return err } if raw, ok := secret.Data["s3_access_key"]; ok { @@ -191,6 +275,7 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl ArtifactBucketName: u.Host, ArtifactsProvider: artifactsProvider, CloudProvider: cloudProvider, + Region: p.Spec.ObjectStorage.Region, S3CompatObjectStoreEndpointUrl: s3CompatObjectStoreEndpoint, S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, diff --git a/pkg/ai/raybuilder/object_storage_test.go b/pkg/ai/raybuilder/object_storage_test.go new file mode 100644 index 0000000..3343eb5 --- /dev/null +++ b/pkg/ai/raybuilder/object_storage_test.go @@ -0,0 +1,227 @@ +package raybuilder + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestClassifyObjectStorage covers the URL-scheme + endpoint mapping that +// determines which storage code path the SAIA / ai-platform SDK takes at +// Serve-replica startup. See classifyObjectStorage doc-comment for full +// rationale; in particular the AWS-regional-endpoint detection covers the +// case where the k0s installer requires a non-empty endpoint even for +// type=aws. +// +// Regression for: "Unsupported CLOUD_PROVIDER: s3compat" panic when an AWS +// regional endpoint was passed through to a real AWS S3 bucket. 
+func TestClassifyObjectStorage(t *testing.T) { + tests := []struct { + name string + scheme string + endpoint string + wantCloudProvider string + wantArtifactsProvider string + wantNeedsS3CompatCreds bool + }{ + // --- AWS S3 ---------------------------------------------------------- + { + name: "s3 scheme with no endpoint → AWS", + scheme: "s3", + endpoint: "", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with AWS regional endpoint → AWS (regression: was s3compat)", + scheme: "s3", + endpoint: "https://s3.us-east-2.amazonaws.com", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with AWS dualstack endpoint → AWS", + scheme: "s3", + endpoint: "https://s3.dualstack.us-east-1.amazonaws.com", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with AWS FIPS endpoint → AWS", + scheme: "s3", + endpoint: "https://s3-fips.us-east-1.amazonaws.com", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with virtual-hosted-style AWS endpoint → AWS", + scheme: "s3", + endpoint: "https://my-bucket.s3.us-east-2.amazonaws.com", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with whitespace-padded AWS endpoint → AWS", + scheme: "s3", + endpoint: " https://s3.us-east-2.amazonaws.com ", + wantCloudProvider: "aws", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + + // --- S3-compatible behind s3:// scheme ------------------------------ + { + name: "s3 scheme with MinIO endpoint → s3compat", + scheme: "s3", + endpoint: "http://minio.minio-system.svc.cluster.local:9000", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with SeaweedFS endpoint → 
s3compat", + scheme: "s3", + endpoint: "http://seaweed.example.com:8333", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "s3 scheme with plain-IP endpoint → s3compat", + scheme: "s3", + endpoint: "http://10.0.0.5:9000", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + + // --- Explicit s3-compatible schemes --------------------------------- + { + name: "s3compat scheme → s3compat", + scheme: "s3compat", + endpoint: "https://example.com", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "minio scheme → s3compat", + scheme: "minio", + endpoint: "http://minio.example.com:9000", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + { + name: "seaweedfs scheme → s3compat", + scheme: "seaweedfs", + endpoint: "http://seaweed.example.com:8333", + wantCloudProvider: "s3compat", + wantArtifactsProvider: "s3", + wantNeedsS3CompatCreds: true, + }, + + // --- Non-S3 backends ------------------------------------------------ + { + name: "gs scheme → gcp", + scheme: "gs", + endpoint: "", + wantCloudProvider: "gcp", + wantArtifactsProvider: "gcs", + wantNeedsS3CompatCreds: false, + }, + { + name: "gcs scheme alias → gcp", + scheme: "gcs", + endpoint: "", + wantCloudProvider: "gcp", + wantArtifactsProvider: "gcs", + wantNeedsS3CompatCreds: false, + }, + { + name: "azure scheme → azure", + scheme: "azure", + endpoint: "", + wantCloudProvider: "azure", + wantArtifactsProvider: "azure", + wantNeedsS3CompatCreds: false, + }, + { + name: "unknown scheme → azure (legacy default; preserves prior behaviour)", + scheme: "wasb", + endpoint: "", + wantCloudProvider: "azure", + wantArtifactsProvider: "azure", + wantNeedsS3CompatCreds: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cp, ap, needsCreds := 
classifyObjectStorage(tt.scheme, tt.endpoint) + assert.Equal(t, tt.wantCloudProvider, cp, "CLOUD_PROVIDER mismatch") + assert.Equal(t, tt.wantArtifactsProvider, ap, "ARTIFACTS_PROVIDER mismatch") + assert.Equal(t, tt.wantNeedsS3CompatCreds, needsCreds, "needsS3CompatCreds mismatch") + + // SDK contract: CLOUD_PROVIDER must be one of aws/gcp/azure/s3compat. + // Anything else triggers `RuntimeError: Unsupported CLOUD_PROVIDER` + // inside the Serve replica. Catch any future drift. + assert.Contains(t, []string{"aws", "gcp", "azure", "s3compat"}, cp, + "classifier returned a CLOUD_PROVIDER value the SDK does not accept") + }) + } +} + +// TestIsAWSRegionalEndpoint covers the host-pattern recogniser used to tell +// real AWS S3 endpoints apart from MinIO / SeaweedFS / Wasabi etc. when the +// scheme is s3://. False negatives here would mis-classify real AWS as +// s3compat (the original bug); false positives would mis-classify an +// S3-compatible store as AWS (and silently strip the custom endpoint, leading +// to "NoSuchBucket" or wrong-region errors). 
+func TestIsAWSRegionalEndpoint(t *testing.T) { + tests := []struct { + name string + endpoint string + want bool + }{ + // Real AWS — should return true + {"path-style", "https://s3.us-east-2.amazonaws.com", true}, + {"path-style us-west-1", "https://s3.us-west-1.amazonaws.com", true}, + {"FIPS", "https://s3-fips.us-east-1.amazonaws.com", true}, + {"dualstack", "https://s3.dualstack.us-east-1.amazonaws.com", true}, + {"virtual-hosted bucket subdomain", "https://my-bucket.s3.us-east-2.amazonaws.com", true}, + {"case-insensitive host", "https://S3.US-EAST-2.AMAZONAWS.COM", true}, + {"china s3 (still amazonaws.com)", "https://s3.cn-north-1.amazonaws.com.cn", false}, // .cn TLD, intentionally not matched; users in China get s3compat path which still works + {"http (non-tls) AWS — rare but legal", "http://s3.us-east-2.amazonaws.com", true}, + + // Other AWS services — must return false + {"lambda endpoint", "https://lambda.us-east-1.amazonaws.com", false}, + {"ec2 endpoint", "https://ec2.us-east-1.amazonaws.com", false}, + {"sts endpoint", "https://sts.amazonaws.com", false}, + + // Third-party / S3-compatible — must return false + {"MinIO by IP", "http://10.0.0.5:9000", false}, + {"MinIO with cluster.local host", "http://minio.minio-system.svc.cluster.local:9000", false}, + {"SeaweedFS", "http://seaweed.example.com:8333", false}, + {"Wasabi", "https://s3.wasabisys.com", false}, + {"DigitalOcean Spaces", "https://nyc3.digitaloceanspaces.com", false}, + + // Edge cases — must return false (caller treats empty endpoint as AWS separately) + {"empty string", "", false}, + {"only scheme, no host", "https://", false}, + {"malformed url", "not a url", false}, + {"no scheme just host", "s3.us-east-2.amazonaws.com", false}, // url.Parse keeps this in Path, not Host + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isAWSRegionalEndpoint(tt.endpoint) + assert.Equal(t, tt.want, got, "isAWSRegionalEndpoint(%q)", tt.endpoint) + }) + } +} diff --git 
a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 17defca..bad29f3 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -104,6 +104,21 @@ load_config() { log "Loading configuration from: ${CONFIG_FILE}" [[ -f "${CONFIG_FILE}" ]] || err "Config file not found: ${CONFIG_FILE}" + # Validate the WHOLE file parses cleanly before pulling individual fields. + # Every yq lookup below uses `2>/dev/null` to fall through to a default, which + # silently swallows YAML syntax errors and makes them look like missing + # fields downstream (e.g. "nodes.existingIPs.controllers must be set" when + # the real problem is a corrupted comment 90 lines later in the file). + # Surface parse errors with their actual line number and content instead. + if command -v yq >/dev/null 2>&1; then + local yq_err + if ! yq_err=$(yq eval '.' "${CONFIG_FILE}" 2>&1 >/dev/null); then + err "Config file ${CONFIG_FILE} has YAML syntax errors: +${yq_err} +Run 'yq eval . ${CONFIG_FILE}' for details, then fix the line and retry." + fi + fi + # Parse YAML configuration CLUSTER_NAME=$(yq eval '.cluster.name' "${CONFIG_FILE}" 2>/dev/null || grep '^ name:' "${CONFIG_FILE}" | awk '{print $2}') USE_EXISTING=$(yq eval '.cluster.useExisting' "${CONFIG_FILE}" 2>/dev/null || echo "never") @@ -424,19 +439,47 @@ preflight_checks() { pf_header "Object storage (customer-managed)" pf_ok "Object storage type: ${OBJ_STORE_TYPE} (bucket=${OBJ_STORE_BUCKET})" - if [[ "${OBJ_STORE_TYPE}" == "seaweedfs" ]]; then - if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then - pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS." 
- else - [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" - fi - else - [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" - fi + case "${OBJ_STORE_TYPE}" in + seaweedfs) + if echo "${OBJ_STORE_ENDPOINT}" | grep -q ':9000'; then + pf_warn "SeaweedFS uses port 8333 (not 9000). Endpoint has :9000 (MinIO); use http://host:8333 for SeaweedFS." + else + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "SeaweedFS endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" + fi + ;; + s3compat|minio) + [[ -n "${OBJ_STORE_ENDPOINT}" ]] && pf_ok "Endpoint: ${OBJ_STORE_ENDPOINT}" || pf_fail "objectStore.endpoint is required" + ;; + aws) + # type=aws does NOT require endpoint — boto3 derives the regional URL + # from AWS_REGION. If a user does pass one (e.g. for VPC endpoint pinning + # or testing), warn that the installer will ignore it for the AIPlatform + # CR. Only AWS regional hosts are sane here; anything else means the + # user likely meant type=s3compat. + if [[ -n "${OBJ_STORE_ENDPOINT}" ]]; then + case "${OBJ_STORE_ENDPOINT}" in + *.amazonaws.com|*amazonaws.com*) pf_warn "type=aws: ignoring objectStore.endpoint='${OBJ_STORE_ENDPOINT}' (boto3 will derive the regional URL from AWS_REGION)." ;; + *) pf_warn "type=aws but endpoint '${OBJ_STORE_ENDPOINT}' is not an AWS host. If you meant to point at MinIO/SeaweedFS, change objectStore.type to s3compat. The endpoint will be dropped for type=aws." ;; + esac + else + pf_ok "Endpoint: (using default AWS S3 regional URL from AWS_REGION)" + fi + ;; + *) + pf_fail "Unsupported objectStore.type: ${OBJ_STORE_TYPE}. 
Supported: aws, s3compat, minio, seaweedfs" + ;; + esac [[ -n "${MINIO_ROOT_PASSWORD}" ]] && pf_ok "Credentials configured" || pf_fail "Object store credentials required (objectStore.auth.rootPassword)" if object_store_auth_looks_like_placeholder; then pf_fail "objectStore.auth still contains template placeholders (e.g. <...> or CHANGEME). Replace with a real access key and secret in your config (keep secrets in a Git-ignored file such as tools/cluster_setup/k0s-config.local.yaml)." fi + # Reject STS temporary credentials early — the minio-credentials Secret schema + # has no AWS_SESSION_TOKEN field, so ASIA* keys silently fail at SAIA startup + # with InvalidToken. Permanent IAM keys (AKIA*) are required. See + # codeguard-1-hardcoded-credentials for IAM user setup guidance. + case "${MINIO_ROOT_USER}" in + ASIA*) pf_fail "objectStore.auth.rootUser '${MINIO_ROOT_USER}' is an STS temporary key (ASIA…). The k0s installer does not propagate AWS_SESSION_TOKEN; use a permanent IAM access key (AKIA…) instead. To mint one: aws iam create-access-key --user-name ." ;; + esac pf_header "Infrastructure mode" pf_ok "Using existing infrastructure (on-prem/baremetal)" @@ -544,36 +587,98 @@ preflight_check_node_storage() { # Helper: SSH to a node and return available GB on the filesystem backing # /var/lib/k0s (falls back to / if k0s hasn't been installed yet). + # + # Resilience notes: + # - Uses POSIX `df -Pk` instead of `df --output=avail` so it works on + # BusyBox / non-GNU coreutils. The 4th awk column (`$4`) is the avail + # count in 1024-byte blocks across POSIX-compliant df implementations. + # - Distinguishes SSH failure (rc=255) from "df returned no data" so the + # caller can show a helpful error instead of a misleading "0 GB" that + # looks like an actual disk-pressure problem. + # - 10s SSH timeout so a bad host doesn't stall the whole preflight. 
_get_avail_gb() { - local ip="$1" - ssh_exec "${ip}" " - avail_kb=\$(df --output=avail /var/lib/k0s 2>/dev/null | tail -1 | tr -d ' ') - if [ -z \"\${avail_kb}\" ] || [ \"\${avail_kb}\" = \"Avail\" ]; then - avail_kb=\$(df --output=avail / 2>/dev/null | tail -1 | tr -d ' ') - fi - echo \$(( \${avail_kb:-0} / 1048576 )) - " 2>/dev/null || echo "0" + local ip="$1" out rc + local -a ssh_cmd=( + ssh + -o StrictHostKeyChecking=no + -o UserKnownHostsFile=/dev/null + -o ConnectTimeout=10 + -o BatchMode=yes + ) + if [ -n "${SSH_KEY_PATH:-}" ]; then + ssh_cmd+=(-i "$SSH_KEY_PATH") + fi + out=$( + "${ssh_cmd[@]}" "${SSH_USER}@${ip}" \ + "avail_kb=\$(df -Pk /var/lib/k0s 2>/dev/null | awk 'NR==2 {print \$4}') + [ -z \"\$avail_kb\" ] && avail_kb=\$(df -Pk / 2>/dev/null | awk 'NR==2 {print \$4}') + echo \"\${avail_kb:-0}\"" 2>/dev/null + ) + rc=$? + if [ $rc -ne 0 ]; then + # SSH itself failed (wrong user, host unreachable, key rejected, etc.). + # Return a sentinel value the caller can recognise — preserve old "0" + # behaviour for back-compat but stamp the SSH error so pf_fail messages + # are actionable. + echo "SSH_ERROR_RC=${rc}" >&2 + echo "0" + return + fi + out=$(echo "${out}" | tr -d '[:space:]') + # KB → GB (integer truncation; close enough for a preflight threshold) + echo "$(( ${out:-0} / 1048576 ))" } - # Check controller nodes - for ip in "${_ctrl_ips[@]}"; do - local avail - avail=$(_get_avail_gb "${ip}") - avail=$(echo "${avail}" | tr -d '[:space:]') - if [[ "${avail}" -ge "${MIN_DISK_CONTROLLER}" ]]; then - pf_ok "Controller ${ip}: ${avail} GB available (minimum: ${MIN_DISK_CONTROLLER} GB)" + # Helper that runs _get_avail_gb and turns its sentinel stderr (SSH_ERROR_RC=...) + # into a human-readable failure message. SSH errors look very different from + # genuine disk-pressure problems and should not be reported as "0 GB available". 
+ _check_node_disk() { + local ip="$1" role="$2" min_required="$3" + local stdout stderr_file stderr avail ssh_err + # Capture stdout and stderr separately via a temp file (avoids the fd-3 + # redirection trick that leaked stdout "0" lines to the terminal). + stderr_file=$(mktemp) + stdout=$(_get_avail_gb "${ip}" 2>"${stderr_file}") + stderr=$(cat "${stderr_file}"); rm -f "${stderr_file}" + + if printf '%s' "${stderr}" | grep -q 'SSH_ERROR_RC='; then + ssh_err=$(printf '%s' "${stderr}" | sed -n 's/.*SSH_ERROR_RC=\([0-9]*\).*/\1/p') + local hint + case "${ssh_err}" in + 255) + # Most common rc=255 cause on a fresh Mac+EC2 setup is a too-permissive + # key file; SSH then silently refuses to use it. Probe perms first so + # users don't waste time on SG/user rotations. + if [[ -f "${SSH_KEY_PATH}" ]]; then + local perms + perms=$(stat -f '%Lp' "${SSH_KEY_PATH}" 2>/dev/null || stat -c '%a' "${SSH_KEY_PATH}" 2>/dev/null) + if [[ "${perms}" != "400" && "${perms}" != "600" ]]; then + hint=" — SSH key ${SSH_KEY_PATH} has permissions ${perms} (must be 400 or 600). Run: chmod 400 ${SSH_KEY_PATH}" + fi + fi + ;; + esac + pf_fail "${role} ${ip}: SSH failed (rc=${ssh_err:-?})${hint:-}. Verify cluster.sshUser='${SSH_USER}' matches the AMI default (ec2-user/ubuntu/rocky/admin), the security group allows port 22 from your IP, and the SSH key at ${SSH_KEY_PATH:-default} is authorised on the node." 
+ return + fi + + avail=$(printf '%s' "${stdout}" | tr -d '[:space:]') + if [[ "${avail:-0}" -ge "${min_required}" ]]; then + pf_ok "${role} ${ip}: ${avail} GB available (minimum: ${min_required} GB)" else - pf_fail "Controller ${ip}: ${avail} GB available — need at least ${MIN_DISK_CONTROLLER} GB on /var/lib/k0s" + pf_fail "${role} ${ip}: ${avail:-0} GB available — need at least ${min_required} GB on /var/lib/k0s" fi + } + + # Check controller nodes + for ip in "${_ctrl_ips[@]}"; do + _check_node_disk "${ip}" "Controller" "${MIN_DISK_CONTROLLER}" done # Check worker nodes (distinguish CPU vs GPU by index) local widx=0 for ip in "${_worker_ips[@]}"; do - local avail role min_required - avail=$(_get_avail_gb "${ip}") - avail=$(echo "${avail}" | tr -d '[:space:]') - + local role min_required if [[ ${widx} -lt ${CPU_WORKER_COUNT} ]]; then role="CPU worker" min_required="${MIN_DISK_CPU_WORKER}" @@ -581,12 +686,7 @@ preflight_check_node_storage() { role="GPU worker" min_required="${MIN_DISK_GPU_WORKER}" fi - - if [[ "${avail}" -ge "${min_required}" ]]; then - pf_ok "${role} ${ip}: ${avail} GB available (minimum: ${min_required} GB)" - else - pf_fail "${role} ${ip}: ${avail} GB available — need at least ${min_required} GB on /var/lib/k0s" - fi + _check_node_disk "${ip}" "${role}" "${min_required}" widx=$((widx + 1)) done } @@ -1131,17 +1231,27 @@ ensure_namespace() { # Object storage is always customer-managed (external). This function creates # the Kubernetes credentials secret so the operator and workloads can auth. ensure_s3compat_credentials() { - log "Creating credentials secret for S3-compatible object storage (${OBJ_STORE_TYPE})..." + log "Creating credentials secret for object storage (type=${OBJ_STORE_TYPE})..." 
if object_store_auth_looks_like_placeholder; then err "Refusing to create minio-credentials: objectStore.auth contains template placeholders; fix ${CONFIG_FILE}" return 1 fi - if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then - err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" - return 1 - fi + # Endpoint is only required for S3-compatible backends (MinIO/SeaweedFS/ + # generic s3compat). For type=aws boto3 derives the regional URL from + # AWS_REGION on the consuming pods, and the installer intentionally renders + # the AIPlatform CR without an endpoint field (see setup_ai_platform case + # "aws" — endpoint dropped to mirror the EKS installer's behaviour and to + # match the operator's classifyObjectStorage() helper). + case "${OBJ_STORE_TYPE}" in + s3compat|minio|seaweedfs) + if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 + fi + ;; + esac if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then - err "S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + err "Object storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" return 1 fi ensure_namespace "${AI_NS}" @@ -2579,8 +2689,39 @@ YAML warn "Could not patch default ServiceAccount" fi - # Standalone app repo: uses customer-managed S3-compatible object storage + # Standalone app repo: uses customer-managed S3-compatible object storage. + # + # IMPORTANT — unlike the AIPlatform CR (which lets boto3 derive the AWS + # regional URL from AWS_REGION when endpoint is empty), the Splunk Operator's + # validateStandaloneSpec hard-requires `endpoint` on every appRepo volume. 
+ # An empty/missing value yields: + # Error validateStandaloneSpec validate standalone spec failed + # volume Endpoint URI is missing + # ...and the Standalone goes into PHASE=Error indefinitely (the operator's + # secret never gets created, breaking the downstream AIPlatform reconcile). + # + # For type=aws we therefore synthesise https://s3..amazonaws.com + # from cluster.region (which boto3 inside SAIA would have computed anyway). + # For s3compat/minio/seaweedfs we use the user-provided endpoint as-is — + # preflight already enforces it's non-empty for those types. + # + # NOTE on `provider: aws` vs `storageType: s3`: + # These are the Splunk Operator CRD field names; both apply for any + # S3-compatible store (MinIO/SeaweedFS/CVFS/real AWS S3) — `aws` is the + # provider taxonomy in the Splunk Operator's bucket abstraction, not the + # cloud provider. Do not change this even when objectStore.type != aws. local minio_endpoint="${MINIO_ENDPOINT:-${OBJ_STORE_ENDPOINT}}" + if [[ -z "${minio_endpoint}" && "${OBJ_STORE_TYPE}" == "aws" ]]; then + local aws_region="${REGION:-${ECR_REGION:-us-east-1}}" + minio_endpoint="https://s3.${aws_region}.amazonaws.com" + log "type=aws: synthesised Splunk Standalone S3 endpoint = ${minio_endpoint}" + fi + if [[ -z "${minio_endpoint}" ]]; then + err "Splunk Standalone needs a non-empty S3 endpoint; check storage.objectStore.endpoint (or storage.objectStore.type)." + return 1 + fi + local endpoint_line=" endpoint: ${minio_endpoint}" + cat <.amazonaws.com) from AWS_REGION. Passing + # the endpoint through into the AIPlatform CR would (a) duplicate the + # default and risk region drift, (b) trigger the operator's legacy + # "endpoint non-empty ⇒ s3compat" classification on older operator + # builds, and (c) prevent later migration to IRSA / EC2 instance-profile + # credentials (which fail when an explicit endpoint is set without + # matching AWS_ENDPOINT_URL plumbing). 
Matches the EKS installer + # behaviour in eks_cluster_with_stack.sh:2715-2719. + obj_endpoint="" ;; *) err "Unsupported objectStore.type: ${OBJ_STORE_TYPE}. Supported: aws, s3compat, minio, seaweedfs" @@ -2775,7 +2925,7 @@ metadata: spec: objectStorage: path: ${obj_path} - region: us-east-1 + region: ${REGION:-${ECR_REGION:-us-east-1}} $( [[ -n "$obj_endpoint" ]] && echo "endpoint: \"${obj_endpoint}\"" ) $( [[ -n "$obj_secret" ]] && echo "secretRef: ${obj_secret}" )