From da6bdf34a620478ca182df40dd1df2258aa53bf5 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Fri, 8 May 2026 00:31:15 +0400 Subject: [PATCH] fix(k8s): isolate custom-stack namespaces and retain shared ns on destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-env client stacks (parentEnv: production, stackEnv: gl-pay/payhey/...) were deriving their k8s namespace name from stackName via deployment.go:67: namespace := lo.If(args.Namespace == "", stackName).Else(args.Namespace) Every sibling under the same stackName ended up pointing at the same physical namespace (e.g. `pay-space-wallet`). Each Pulumi stack independently created a Namespace resource for that metadata.Name, with a unique URN suffix `-ns`. When *any* sibling stack was destroyed, Pulumi ran the delete operation for its tracked Namespace resource — which calls k8s to delete the namespace by metadata.Name. Kubernetes obliged and cascade-deleted *every* resource in that namespace, including everything owned by the other live sibling stacks. Real outage: a destroy of a throwaway `caddy-test` sub-env stack wiped the production wallet/gl-pay/payhey/rulex/smart-gate Deployments and Services. Recovery required redeploying all five plus rolling Caddy. Two-layer fix in this PR: 1. Proper isolation — each custom stack gets its own physical namespace. `GenerateNamespaceName(baseNS, stackEnv, parentEnv)` in naming.go suffixes the namespace with `-stackEnv` for custom stacks (parentEnv != stackEnv), mirroring the per-stackEnv suffix every other resource type (Deployment/Service/Secret/ConfigMap/HPA/VPA/ImagePullSecret) already gets via generateResourceName. Standard stacks (parentEnv unset, or parentEnv == stackEnv) keep their existing stackName-based namespace, so the parent stack itself is untouched. After this change, sibling sub-envs no longer share a namespace and `pulumi destroy` cleanly removes only that stack's resources. 2. 
RetainOnDelete safety net — both `corev1.NewNamespace` call sites (the client-stack one in simple_container.go and the helm-operator one in helpers.go) now pass `sdk.RetainOnDelete(true)`. Pulumi keeps the namespace resource in state but skips the k8s delete API call on destroy. This is critical during the per-stack migration: when a custom stack first runs `pulumi up` with this version, Pulumi sees the namespace metadata.Name change (`pay-space-wallet` → `pay-space-wallet-gl-pay`), schedules a Replace, creates the new namespace, and *would* delete the shared parent namespace if not for RetainOnDelete. After migration, RetainOnDelete continues to defend against accidental destroy of any namespace that ends up holding more than one stack's resources (e.g. shared helm-operator namespaces). Migration semantics: any deploy that already uses parentEnv != stackEnv will Replace its namespace-scoped resources on the next `pulumi up` — Pulumi creates them in the new namespace and deletes the old ones. The parent stack is unaffected because its resources sit in a different Pulumi stack with different URNs. Caddy auto-discovers services across all namespaces (kubectl get services --all-namespaces) and the Caddyfile upstream URL encodes namespace via the existing `\${namespace}` placeholder in simple_container.go, so routing follows the new namespace automatically. The empty parent namespace lingers only if the *last* sibling under one stackName is destroyed; it must be cleaned up manually. That's the right default — silent destructive cascade across stacks is far worse than a leaked empty namespace. Tests: - TestGenerateNamespaceName covers all parentEnv/stackEnv combinations including the regression cases. - TestGenerateNamespaceName_SiblingsAreUnique enumerates the pay_space_wallet outage scenario (production parent + 5 sub-envs + caddy-test) and asserts each resolves to a distinct namespace. 
Signed-off-by: Dmitrii Creed --- pkg/clouds/pulumi/gcp/compute_proc.go | 21 +++- .../pulumi/kubernetes/compute_proc_mongodb.go | 10 +- .../kubernetes/compute_proc_postgres.go | 10 +- pkg/clouds/pulumi/kubernetes/deployment.go | 9 +- pkg/clouds/pulumi/kubernetes/helpers.go | 5 +- pkg/clouds/pulumi/kubernetes/naming.go | 34 ++++- pkg/clouds/pulumi/kubernetes/naming_test.go | 117 +++++++++++++++++- .../pulumi/kubernetes/simple_container.go | 22 +++- 8 files changed, 215 insertions(+), 13 deletions(-) diff --git a/pkg/clouds/pulumi/gcp/compute_proc.go b/pkg/clouds/pulumi/gcp/compute_proc.go index e42ee8fb..109c85c3 100644 --- a/pkg/clouds/pulumi/gcp/compute_proc.go +++ b/pkg/clouds/pulumi/gcp/compute_proc.go @@ -251,8 +251,16 @@ func createCloudsqlProxy(ctx *sdk.Context, params appendParams) (*CloudSQLProxy, // This ensures service accounts are unique per environment (e.g., telegram-bot--on-sidecarcsql--production vs telegram-bot--on-sidecarcsql--staging) baseProxyName := fmt.Sprintf("%s-%s-sidecarcsql", params.stack.Name, params.postgresName) cloudsqlProxyName := kubernetes.SanitizeK8sName(params.input.ToResName(baseProxyName)) - // Sanitize namespace name as well - sanitizedNamespace := kubernetes.SanitizeK8sName(params.input.StackParams.StackName) + // The CloudSQL proxy emits a Kubernetes Secret (proxy credentials) that must live in + // the same namespace as the consuming pod for it to be mountable. For custom stacks + // (parentEnv != stackEnv) the pod namespace is `<stackName>-<stackEnv>` per + // kubernetes.GenerateNamespaceName, so derive that here. 
+ parentEnv := "" + if params.provisionParams.ParentStack != nil { + parentEnv = params.provisionParams.ParentStack.ParentEnv + } + derivedNamespace := kubernetes.GenerateNamespaceName(params.input.StackParams.StackName, params.input.StackParams.Environment, parentEnv) + sanitizedNamespace := kubernetes.SanitizeK8sName(derivedNamespace) cloudsqlProxy, err := NewCloudsqlProxy(ctx, CloudSQLProxyArgs{ Name: cloudsqlProxyName, DBInstance: PostgresDBInstanceArgs{ @@ -345,7 +353,14 @@ func createUserForDatabase(ctx *sdk.Context, userName, dbName string, params app } // Sanitize names to comply with Kubernetes RFC 1123 requirements (no underscores) cloudsqlProxyName := kubernetes.SanitizeK8sName(util.TrimStringMiddle(fmt.Sprintf("%s-%s-initcsql", userName, params.postgresName), 60, "-")) - namespace := kubernetes.SanitizeK8sName(params.input.StackParams.StackName) + // Init job + ad-hoc CloudSQL proxy must live in the same namespace as the consuming + // pod so the proxy's credential Secret is mountable. For custom stacks the pod is + // in `<stackName>-<stackEnv>` per kubernetes.GenerateNamespaceName. 
+ parentEnv := "" + if params.provisionParams.ParentStack != nil { + parentEnv = params.provisionParams.ParentStack.ParentEnv + } + namespace := kubernetes.SanitizeK8sName(kubernetes.GenerateNamespaceName(params.input.StackParams.StackName, params.input.StackParams.Environment, parentEnv)) cloudsqlProxy, err := NewCloudsqlProxy(ctx, CloudSQLProxyArgs{ Name: cloudsqlProxyName, DBInstance: dbInstanceArgs, diff --git a/pkg/clouds/pulumi/kubernetes/compute_proc_mongodb.go b/pkg/clouds/pulumi/kubernetes/compute_proc_mongodb.go index ef5fb06e..ff4003dd 100644 --- a/pkg/clouds/pulumi/kubernetes/compute_proc_mongodb.go +++ b/pkg/clouds/pulumi/kubernetes/compute_proc_mongodb.go @@ -129,7 +129,15 @@ func createMongodbUserForDatabase(ctx *sdk.Context, userName, dbName string, par } ctx.Export(passwordName, password.Result) - namespace := params.input.StackParams.StackName + // The init job must run in the same namespace as the consuming pod so it can be + // observed and cleaned up alongside the workload. For custom stacks (parentEnv != stackEnv) + // the pod lives in `<stackName>-<stackEnv>` per GenerateNamespaceName, so derive the same + // namespace here. Standard stacks keep the existing `<stackName>` namespace. 
+ parentEnv := "" + if params.provisionParams.ParentStack != nil { + parentEnv = params.provisionParams.ParentStack.ParentEnv + } + namespace := GenerateNamespaceName(params.input.StackParams.StackName, params.input.StackParams.Environment, parentEnv) params.collector.AddPreProcessor(&SimpleContainerArgs{}, func(c any) error { _, err = NewMongodbInitDbUserJob(ctx, userName, InitDbUserJobArgs{ diff --git a/pkg/clouds/pulumi/kubernetes/compute_proc_postgres.go b/pkg/clouds/pulumi/kubernetes/compute_proc_postgres.go index bda9a6b3..05228965 100644 --- a/pkg/clouds/pulumi/kubernetes/compute_proc_postgres.go +++ b/pkg/clouds/pulumi/kubernetes/compute_proc_postgres.go @@ -210,7 +210,15 @@ func createPostgresUserForDatabase(ctx *sdk.Context, userName, dbName string, pa return nil, errors.Wrapf(err, "failed to parse postgres url for database %q", dbName) } - namespace := params.input.StackParams.StackName + // The init job must run in the same namespace as the consuming pod so it can be + // observed and cleaned up alongside the workload. For custom stacks (parentEnv != stackEnv) + // the pod lives in `<stackName>-<stackEnv>` per GenerateNamespaceName, so derive the same + // namespace here. Standard stacks keep the existing `<stackName>` namespace. 
+ parentEnv := "" + if params.provisionParams.ParentStack != nil { + parentEnv = params.provisionParams.ParentStack.ParentEnv + } + namespace := GenerateNamespaceName(params.input.StackParams.StackName, params.input.StackParams.Environment, parentEnv) params.collector.AddPreProcessor(&SimpleContainerArgs{}, func(c any) error { _, err = NewPostgresInitDbUserJob(ctx, userName, InitDbUserJobArgs{ diff --git a/pkg/clouds/pulumi/kubernetes/deployment.go b/pkg/clouds/pulumi/kubernetes/deployment.go index 76b763ee..6d1d0487 100644 --- a/pkg/clouds/pulumi/kubernetes/deployment.go +++ b/pkg/clouds/pulumi/kubernetes/deployment.go @@ -63,8 +63,13 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti parentEnv = args.Params.ParentStack.ParentEnv } - // Determine namespace - always use stack name as namespace (service name) - namespace := lo.If(args.Namespace == "", stackName).Else(args.Namespace) + // Derive namespace via GenerateNamespaceName: standard stacks keep the stackName-based + // namespace; custom stacks (parentEnv != stackEnv) get a stackEnv-suffixed namespace so + // sibling sub-envs no longer share a physical namespace. See the helper's docstring for + // migration semantics. The shared parent namespace stays intact through the per-stack + // migration thanks to RetainOnDelete on the namespace resource. 
+ baseNamespace := lo.If(args.Namespace == "", stackName).Else(args.Namespace) + namespace := GenerateNamespaceName(baseNamespace, stackEnv, parentEnv) // Generate deployment name with environment suffix for custom stacks baseDeploymentName := lo.If(args.DeploymentName == "", stackName).Else(args.DeploymentName) diff --git a/pkg/clouds/pulumi/kubernetes/helpers.go b/pkg/clouds/pulumi/kubernetes/helpers.go index fd0f272a..0bef2a7b 100644 --- a/pkg/clouds/pulumi/kubernetes/helpers.go +++ b/pkg/clouds/pulumi/kubernetes/helpers.go @@ -45,7 +45,10 @@ func sanitizeK8sName(name string) string { } func ensureNamespace(ctx *sdk.Context, input api.ResourceInput, params pApi.ProvisionParams, namespace string) (*corev1.Namespace, error) { - opts := []sdk.ResourceOption{sdk.Provider(params.Provider)} + // RetainOnDelete: see the rationale at simple_container.go's NewNamespace call — + // helm operator stacks share namespaces across sibling stacks the same way client + // stacks do, so the destroy-cascade hazard is identical here. + opts := []sdk.ResourceOption{sdk.Provider(params.Provider), sdk.RetainOnDelete(true)} sanitizedNamespace := sanitizeK8sName(namespace) return corev1.NewNamespace(ctx, fmt.Sprintf("create-ns-%s-%s", sanitizedNamespace, input.ToResName(input.Descriptor.Name)), &corev1.NamespaceArgs{ Metadata: &metav1.ObjectMetaArgs{ diff --git a/pkg/clouds/pulumi/kubernetes/naming.go b/pkg/clouds/pulumi/kubernetes/naming.go index c57bead1..161e5fb7 100644 --- a/pkg/clouds/pulumi/kubernetes/naming.go +++ b/pkg/clouds/pulumi/kubernetes/naming.go @@ -71,6 +71,36 @@ func isCustomStack(stackEnv, parentEnv string) bool { return parentEnv != "" && parentEnv != stackEnv } +// GenerateNamespaceName derives the physical k8s namespace name for a stack. +// Standard stacks (parentEnv unset, or parentEnv == stackEnv) keep baseNamespace +// (typically the stackName). Custom stacks (parentEnv set and differing from +// stackEnv, e.g. 
parentEnv=production with stackEnv=tenant-a/tenant-b/...) get +// baseNamespace suffixed with stackEnv, mirroring the per-stackEnv suffix every +// other resource type (Deployment, Service, Secret, ConfigMap, HPA, VPA, +// ImagePullSecret) already gets via generateResourceName. +// +// The result is sanitized to comply with Kubernetes RFC 1123 (lowercase +// alphanumeric and `-`, ≤63 chars with FNV-1a truncation hash) so callers can +// pass it directly into `metadata.namespace` without an extra sanitization +// step. Sanitization is idempotent — callers that pre-sanitize their inputs +// see no behavioural change. +// +// Without this isolation, sibling sub-env stacks share one physical namespace, +// and any `pulumi destroy` on a sub-env cascade-deletes every sibling's resources +// via the k8s namespace delete API. Migrating an existing custom stack to its +// dedicated namespace is automatic on the next `pulumi up`: Pulumi sees the +// namespace metadata.Name change and Replaces the namespace plus its +// namespace-scoped resources. The namespace is created with RetainOnDelete (see +// NewSimpleContainer), so the parent's shared namespace is left in place — the +// parent stack's resources continue running through the migration. +func GenerateNamespaceName(baseNamespace, stackEnv, parentEnv string) string { + name := baseNamespace + if isCustomStack(stackEnv, parentEnv) { + name = fmt.Sprintf("%s-%s", baseNamespace, stackEnv) + } + return SanitizeK8sName(name) +} + // GenerateCaddyDeploymentName creates the Caddy deployment name with environment suffix // Caddy deployments always include the environment suffix for backwards compatibility // This is exported so it can be used by both kubernetes and gcp packages for consistency @@ -87,8 +117,8 @@ func GenerateCaddyDeploymentName(stackEnv string) string { // // Caddy is provisioned by the parent infra stack, so its deployment name is keyed on // parentEnv (e.g. caddy-production). 
For sub-env client stacks where parentEnv differs -// from stackEnv (e.g. parentEnv=production, stackEnv=gl-pay), passing stackEnv would -// produce caddy-gl-pay — which doesn't exist — and the patch would fail silently. +// from stackEnv (e.g. parentEnv=production, stackEnv=tenant-a), passing stackEnv would +// produce caddy-tenant-a — which doesn't exist — and the patch would fail silently. // For single-env stacks (parentEnv empty or equal to stackEnv) this falls back to stackEnv. // // Note: this is the call site asymmetric to GenerateCaddyDeploymentName, which is used diff --git a/pkg/clouds/pulumi/kubernetes/naming_test.go b/pkg/clouds/pulumi/kubernetes/naming_test.go index 108a984b..77438c4a 100644 --- a/pkg/clouds/pulumi/kubernetes/naming_test.go +++ b/pkg/clouds/pulumi/kubernetes/naming_test.go @@ -455,7 +455,7 @@ func TestCaddyDeploymentNameForChild(t *testing.T) { }, { name: "sub-env stack targets parent's caddy", - stackEnv: "gl-pay", + stackEnv: "tenant-a", parentEnv: "production", expected: "caddy-production", }, @@ -533,6 +533,121 @@ func TestIsCustomStack(t *testing.T) { } } +// TestGenerateNamespaceName covers the namespace derivation that gives custom stacks +// (parentEnv != stackEnv) their own physical k8s namespace, while leaving standard +// stacks on the stackName-based namespace. Includes regression cases for the +// shared-namespace outage class where production parentEnv plus several sibling +// sub-env stacks (tenant-a/tenant-b/tenant-c/preview-test/...) all collided in one +// namespace, so destroying any sibling cascade-deleted the rest. 
+func TestGenerateNamespaceName(t *testing.T) { + tests := []struct { + name string + baseNamespace string + stackEnv string + parentEnv string + expected string + }{ + { + name: "standard stack - no parentEnv", + baseNamespace: "myapp", + stackEnv: "staging", + parentEnv: "", + expected: "myapp", + }, + { + name: "self-reference - parentEnv equals stackEnv", + baseNamespace: "myapp", + stackEnv: "production", + parentEnv: "production", + expected: "myapp", + }, + { + name: "custom stack - sub-env under production", + baseNamespace: "myapp", + stackEnv: "tenant-a", + parentEnv: "production", + expected: "myapp-tenant-a", + }, + { + // Realistic stackNames often contain underscores; the namespace must be + // sanitized to RFC 1123 so callers can pass the result directly into + // metadata.namespace without their own sanitization step. + name: "underscores in stackName get normalized", + baseNamespace: "my_app", + stackEnv: "tenant-a", + parentEnv: "production", + expected: "my-app-tenant-a", + }, + { + name: "uppercase gets lowercased", + baseNamespace: "MyApp", + stackEnv: "TenantA", + parentEnv: "production", + expected: "myapp-tenanta", + }, + { + name: "custom stack - PR preview under staging", + baseNamespace: "api", + stackEnv: "staging-pr-123", + parentEnv: "staging", + expected: "api-staging-pr-123", + }, + { + name: "custom stack - throwaway test sibling", + baseNamespace: "myapp", + stackEnv: "preview-test", + parentEnv: "production", + expected: "myapp-preview-test", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := GenerateNamespaceName(tt.baseNamespace, tt.stackEnv, tt.parentEnv) + if got != tt.expected { + t.Errorf("GenerateNamespaceName(%q, %q, %q) = %q, expected %q", + tt.baseNamespace, tt.stackEnv, tt.parentEnv, got, tt.expected) + } + }) + } +} + +// TestGenerateNamespaceName_SiblingsAreUnique is the direct regression test for the +// shared-namespace outage: every sub-env stack under one stackName must resolve 
to +// a distinct namespace, while the parent (parentEnv == stackEnv) keeps its existing +// namespace. This mirrors the production scenario where one parent env hosted +// multiple tenant sub-envs plus a throwaway test sub-env, all sharing one namespace. +func TestGenerateNamespaceName_SiblingsAreUnique(t *testing.T) { + baseNamespace := "myapp" + parentEnv := "production" + + siblings := []struct { + stackEnv string + expectedNamespace string + }{ + {"production", "myapp"}, + {"tenant-a", "myapp-tenant-a"}, + {"tenant-b", "myapp-tenant-b"}, + {"tenant-c", "myapp-tenant-c"}, + {"tenant-d", "myapp-tenant-d"}, + {"preview-test", "myapp-preview-test"}, + } + + seen := make(map[string]string) + for _, s := range siblings { + got := GenerateNamespaceName(baseNamespace, s.stackEnv, parentEnv) + if got != s.expectedNamespace { + t.Errorf("stackEnv=%q: got namespace %q, expected %q", + s.stackEnv, got, s.expectedNamespace) + } + if prior, dup := seen[got]; dup { + t.Errorf("stackEnv=%q produced namespace %q already used by stackEnv=%q — "+ + "siblings must not share a namespace", s.stackEnv, got, prior) + } + seen[got] = s.stackEnv + } +} + // TestComplexScenarios tests real-world deployment scenarios func TestComplexScenarios(t *testing.T) { t.Run("multiple preview environments with unique namespaces", func(t *testing.T) { diff --git a/pkg/clouds/pulumi/kubernetes/simple_container.go b/pkg/clouds/pulumi/kubernetes/simple_container.go index c8aa47b1..74d35796 100644 --- a/pkg/clouds/pulumi/kubernetes/simple_container.go +++ b/pkg/clouds/pulumi/kubernetes/simple_container.go @@ -216,7 +216,25 @@ func NewSimpleContainer(ctx *sdk.Context, args *SimpleContainerArgs, opts ...sdk // Sanitize namespace name to comply with Kubernetes RFC 1123 requirements sanitizedNamespace := sanitizeK8sName(args.Namespace) // Use deployment name as Pulumi resource name to ensure uniqueness across environments - // while keeping the actual K8s namespace name as specified by the user + // while 
keeping the actual K8s namespace name as specified by the user. + // + // RetainOnDelete: in legacy deploys, sub-env client stacks (e.g. parentEnv=production + // with stackEnv=tenant-a/tenant-b/...) shared one physical K8s namespace because the + // namespace metadata.Name was derived from stackName, not from stackEnv. Each stack + // tracked its own Pulumi Namespace resource with a unique URN, but they all referenced + // the same physical k8s namespace. Without RetainOnDelete, destroying any single + // sub-env stack would cascade-delete the shared namespace and wipe every sibling + // stack's resources (Deployments, Services, Secrets) — a real production outage when + // a throwaway sub-env destroy took down all live siblings. + // + // GenerateNamespaceName now isolates custom stacks per-stackEnv, but RetainOnDelete + // remains load-bearing for the migration step: when a pre-existing custom stack + // first runs `pulumi up` after this version, Pulumi Replaces the namespace, and the + // old shared namespace must NOT be deleted because the parent stack still lives + // there. Post-migration, RetainOnDelete continues to defend against any case where + // multiple stacks legitimately share a namespace (helm operators, explicit + // `Namespace` overrides). Empty namespaces left after the last referencing stack + // is destroyed must be cleaned up by hand. namespaceResourceName := fmt.Sprintf("%s-ns", sanitizedDeployment) namespace, err := corev1.NewNamespace(ctx, namespaceResourceName, &corev1.NamespaceArgs{ Metadata: &metav1.ObjectMetaArgs{ @@ -224,7 +242,7 @@ func NewSimpleContainer(ctx *sdk.Context, args *SimpleContainerArgs, opts ...sdk Labels: sdk.ToStringMap(appLabels), Annotations: sdk.ToStringMap(appAnnotations), }, - }, opts...) + }, append(opts, sdk.RetainOnDelete(true))...) if err != nil { return nil, err }