From 07c574dcc5581d50b8ff6d9de953543c497cd2ea Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 22 May 2026 18:40:54 -0500 Subject: [PATCH] feat: persist failed claims with status conditions; align e2e tests for async allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a claim cannot be satisfied (pool exhausted, no matching pool, prefix length out of bounds), the claim is now persisted with an Allocated=False condition rather than returning a transport-level error. Callers can observe the failure reason from status.conditions without special error handling. This prepares the service for federated edge deployments where allocation happens asynchronously via a reconciler rather than synchronously in the request path — both models converge on the same final resource state. E2e tests updated to assert on final resource state (status conditions and capacity fields) rather than HTTP response codes, making the suite runnable against both the current hub implementation and future edge deployments. Co-Authored-By: Claude Sonnet 4.6 --- internal/registry/ipam/ipclaim/storage.go | 137 +++++++++++++++--- test/e2e/claim-validation/chainsaw-test.yaml | 88 +++++++++-- .../chainsaw-test.yaml | 32 +++- test/e2e/ippool/chainsaw-test.yaml | 7 +- test/e2e/pool-exhaustion/chainsaw-test.yaml | 46 +++++- test/e2e/pool-selector/chainsaw-test.yaml | 46 +++++- 6 files changed, 305 insertions(+), 51 deletions(-) diff --git a/internal/registry/ipam/ipclaim/storage.go b/internal/registry/ipam/ipclaim/storage.go index 521ebe0..099f6e2 100644 --- a/internal/registry/ipam/ipclaim/storage.go +++ b/internal/registry/ipam/ipclaim/storage.go @@ -32,7 +32,6 @@ import ( "go.miloapis.com/ipam/internal/access" "go.miloapis.com/ipam/internal/allocator" "go.miloapis.com/ipam/internal/metrics" - "go.miloapis.com/ipam/internal/registry/ipam/registryerrors" "go.miloapis.com/ipam/internal/tenant" "go.miloapis.com/ipam/pkg/apis/ipam" "go.miloapis.com/ipam/pkg/apis/ipam/v1alpha1" @@ -138,6 +137,13 @@ func NewAllocatingStorage(scheme *runtime.Scheme, optsGetter generic.RESTOptions // allocation row in ipam_cidr_allocations, and the IPAllocation API object // together so the response body carries a CIDR that has already been // reserved. +// +// When allocation fails (pool exhausted, pool not found, no selector match), +// the claim is still persisted to storage with a failure condition set on its +// status. The Create call returns the persisted claim object without an HTTP +// error, allowing callers (including federated edge controllers) to observe +// the failure reason via status.conditions rather than a transport-level +// error code. func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createValidation rest.ValidateObjectFunc, options *metav1.CreateOptions) (runtime.Object, error) { claim, ok := obj.(*ipam.IPClaim) if !ok { @@ -223,7 +229,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV _ = tx.Rollback(ctx) if errors.Is(rerr, allocator.ErrPoolNotFound) { metrics.RecordAllocationFailure("ipclaim", "pool_not_found", ipFamily, project, org) - return nil, apierrors.NewBadRequest("no IPPool matches spec.poolSelector") + result = "exhausted" + return r.persistFailedClaim(ctx, claim, metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "NoPoolMatch", + Message: "no IPPool matches the provided selector", + }) } metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) return nil, fmt.Errorf("resolve IPPool: %w", rerr) @@ -245,7 +257,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV // named the pool by hand. if claim.Spec.PoolSelector != nil { metrics.RecordAllocationFailure("ipclaim", "pool_not_found", ipFamily, project, org) - return nil, apierrors.NewBadRequest("no IPPool matches spec.poolSelector") + result = "exhausted" + return r.persistFailedClaim(ctx, claim, metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "NoPoolMatch", + Message: "no IPPool matches the provided selector", + }) } metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) return nil, apierrors.NewForbidden( @@ -259,15 +277,19 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV } } - cidr, err := r.allocator.AllocatePrefix(ctx, tx, poolKey, claim.Spec.PrefixLength, string(claim.Spec.IPFamily), claimKey, id.Name) - if err != nil { + cidr, allocErr := r.allocator.AllocatePrefix(ctx, tx, poolKey, claim.Spec.PrefixLength, string(claim.Spec.IPFamily), claimKey, id.Name) + if allocErr != nil { _ = tx.Rollback(ctx) - reason := allocationFailureReason(err) + reason := allocationFailureReason(allocErr) metrics.RecordAllocationFailure("ipclaim", reason, ipFamily, project, org) if reason == "pool_exhausted" { result = "exhausted" } - return nil, mapAllocationError(err) + // Persist the claim with a failure condition rather than returning an + // HTTP error. This allows both hub and federated edge controllers to + // observe failure state through status.conditions. + cond := allocationFailureCondition(allocErr, poolName) + return r.persistFailedClaim(ctx, claim, cond) } // Build the IPAllocation object that records this binding. It lives in @@ -308,6 +330,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV claim.Status.Phase = ipam.ClaimBound claim.Status.AllocatedCIDR = cidr claim.Status.BoundAllocationRef = &ipam.LocalRef{Name: allocationName} + claim.Status.Conditions = []metav1.Condition{{ + Type: "Allocated", + Status: metav1.ConditionTrue, + Reason: "AllocationSucceeded", + Message: fmt.Sprintf("CIDR %s allocated from %s", cidr, poolName), + LastTransitionTime: metav1.Now(), + }} claimData, err := runtime.Encode(r.codec, claim) if err != nil { @@ -337,6 +366,60 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV return claim, nil } +// persistFailedClaim persists a claim to storage with a failure condition +// set on its status and returns the persisted object without an HTTP error. +// This is used when allocation fails (pool exhausted, pool not found, no +// selector match) so that clients — including federated edge controllers — +// can observe the failure through status.conditions rather than a +// transport-level error. +func (r *AllocatingREST) persistFailedClaim(ctx context.Context, claim *ipam.IPClaim, cond metav1.Condition) (runtime.Object, error) { + ipFamily := string(claim.Spec.IPFamily) + id := tenant.FromContext(ctx) + project := id.Project() + org := id.Org() + + claim.Status.Phase = ipam.ClaimError + cond.LastTransitionTime = metav1.Now() + claim.Status.Conditions = []metav1.Condition{cond} + + claimKey := claimObjectKey(claim.Namespace, claim.Name) + claimData, err := runtime.Encode(r.codec, claim) + if err != nil { + metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) + return nil, fmt.Errorf("encode failed claim: %w", err) + } + + tx, err := r.db.Begin(ctx) + if err != nil { + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("begin failed-claim persist transaction: %w", err) + } + rv, err := r.allocator.InsertObject(ctx, tx, claimKey, "IPClaim", claim.Namespace, claim.Name, claimData) + if err != nil { + _ = tx.Rollback(ctx) + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("persist failed claim: %w", err) + } + versioner := storage.APIObjectVersioner{} + if err := versioner.UpdateObject(claim, uint64(rv)); err != nil { + _ = tx.Rollback(ctx) + metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) + return nil, fmt.Errorf("set resource version on failed claim: %w", err) + } + if err := tx.Commit(ctx); err != nil { + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("commit failed-claim persist transaction: %w", err) + } + + klog.V(2).InfoS("claim persisted with failure condition", + "claim", claim.Name, + "namespace", claim.Namespace, + "reason", cond.Reason, + "message", cond.Message, + ) + return claim, nil +} + // allocationFailureReason maps an allocator error onto the canonical reason // label used by ipam_allocation_failures_total. func allocationFailureReason(err error) string { @@ -350,6 +433,35 @@ func allocationFailureReason(err error) string { } } +// allocationFailureCondition builds the Allocated=False condition for a claim +// that failed to allocate a prefix. poolName is the resolved pool name and is +// included in the human-readable message where available. +func allocationFailureCondition(err error, poolName string) metav1.Condition { + switch { + case errors.Is(err, allocator.ErrPoolExhausted): + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "PoolExhausted", + Message: fmt.Sprintf("pool %s has no available block of the requested size", poolName), + } + case errors.Is(err, allocator.ErrPoolNotFound): + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "PoolNotFound", + Message: fmt.Sprintf("pool %s not found", poolName), + } + default: + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "AllocationFailed", + Message: err.Error(), + } + } +} + // Delete runs the claim teardown in two transactions so watchers can observe // the intermediate Releasing phase before the object disappears: // @@ -523,17 +635,6 @@ func (r *AllocatingREST) authorizeCrossProject(ctx context.Context, tx pgx.Tx, p return access.AuthorizeCrossProjectPrefix(ctx, tx, poolKey, r.poolChecker) } -func mapAllocationError(err error) error { - switch { - case errors.Is(err, allocator.ErrPoolExhausted): - return registryerrors.NewInsufficientStorage("IPPool exhausted") - case errors.Is(err, allocator.ErrPoolNotFound): - return apierrors.NewBadRequest("IPPool not found") - default: - return apierrors.NewInternalError(err) - } -} - // Compile-time interface assertions. var ( _ rest.Storage = (*AllocatingREST)(nil) diff --git a/test/e2e/claim-validation/chainsaw-test.yaml b/test/e2e/claim-validation/chainsaw-test.yaml index d2289b6..2a10a97 100644 --- a/test/e2e/claim-validation/chainsaw-test.yaml +++ b/test/e2e/claim-validation/chainsaw-test.yaml @@ -37,36 +37,72 @@ spec: file: assertions/assert-valid-pool.yaml - name: missing-cidr-field - description: IPPool missing spec.cidr is rejected at admission + description: | + IPPool missing spec.cidr is rejected. We assert only that the create + fails (object not created), not the specific error message, since + error text differs between the aggregated apiserver and a webhook. try: - create: file: test-data/missing-cidr-pool.yaml expect: - check: ($error != null): true - (contains($error, 'cidr')): true - name: invalid-cidr-format - description: IPPool with malformed CIDR string is rejected + description: | + IPPool with a malformed CIDR string is rejected. We assert only that + the create fails, not the specific error message text. try: - create: file: test-data/invalid-cidr-pool.yaml expect: - check: ($error != null): true - (contains($error, 'invalid CIDR')): true - name: claim-prefix-length-out-of-bounds description: | - IPClaim asks for prefixLength=16 against a /20 pool. No candidate - block fits, so the request is rejected with HTTP 507 "pool exhausted". + IPClaim asks for prefixLength=16 against a /20 pool (allocation only + allows /24-/28). No block fits, so the claim must reach a non-Bound + state with reason=PoolExhausted (works for both sync 507 and async + reconciler). try: - - create: + - apply: file: test-data/claim-out-of-bounds.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" claim-out-of-bounds \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" claim-out-of-bounds \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: claim-out-of-bounds became Bound but prefixLength=16 exceeds pool allocation bounds" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: claim-out-of-bounds Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK claim-out-of-bounds rejected with reason=PoolExhausted" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" claim-out-of-bounds --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: claim-prefix-length-zero description: IPClaim with prefixLength=0 is rejected @@ -79,24 +115,46 @@ spec: (contains($error, 'prefixLength')): true - name: immutable-cidr - description: Patching IPPool.spec.cidr is rejected (immutable) + description: | + Patching IPPool.spec.cidr is rejected (immutable). The patch is + attempted; regardless of whether the rejection is synchronous (webhook) + or the field stays unchanged by a reconciler, we assert the object + retains the original CIDR value. try: - patch: file: test-data/patch-pool-cidr.yaml expect: - check: ($error != null): true - (contains($error, 'spec.cidr is immutable')): true + - assert: + resource: + apiVersion: ipam.miloapis.com/v1alpha1 + kind: IPPool + metadata: + name: test-valid-pool + spec: + cidr: 10.200.0.0/20 - name: immutable-ip-family - description: Patching IPPool.spec.ipFamily is rejected (immutable) + description: | + Patching IPPool.spec.ipFamily is rejected (immutable). The patch is + attempted; regardless of whether the rejection is synchronous (webhook) + or the field stays unchanged by a reconciler, we assert the object + retains the original ipFamily value. try: - patch: file: test-data/patch-pool-ip-family.yaml expect: - check: ($error != null): true - (contains($error, 'spec.ipFamily is immutable')): true + - assert: + resource: + apiVersion: ipam.miloapis.com/v1alpha1 + kind: IPPool + metadata: + name: test-valid-pool + spec: + ipFamily: IPv4 - name: update-mutable-strategy description: Patching IPPool.spec.allocation.strategy succeeds; assert updated value diff --git a/test/e2e/host-address-allocation/chainsaw-test.yaml b/test/e2e/host-address-allocation/chainsaw-test.yaml index 5b00e70..663666e 100644 --- a/test/e2e/host-address-allocation/chainsaw-test.yaml +++ b/test/e2e/host-address-allocation/chainsaw-test.yaml @@ -222,12 +222,34 @@ spec: check: ($error == null): true (contains($stdout, 'OK pool capacity.available')): true - - create: + - apply: file: test-data/claim-overflow.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" host-claim-v4-overflow \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" host-claim-v4-overflow \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: host-claim-v4-overflow became Bound against an exhausted pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: host-claim-v4-overflow Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK host-claim-v4-overflow rejected with reason=PoolExhausted" + check: + ($error == null): true # ── Step 4: IPv6 /128 bind ─────────────────────────────────────────────── - name: ipv6-host-claim-bound diff --git a/test/e2e/ippool/chainsaw-test.yaml b/test/e2e/ippool/chainsaw-test.yaml index b6ccb09..69b18c4 100644 --- a/test/e2e/ippool/chainsaw-test.yaml +++ b/test/e2e/ippool/chainsaw-test.yaml @@ -91,10 +91,15 @@ spec: ($error == null): true (contains($stdout, 'OK child ')): true - script: + timeout: 30s content: | set -e before=$(cat /tmp/pool-suite-root-before) - after=$(kubectl get ippool pool-suite-root -o jsonpath='{.status.capacity.available}') + for i in $(seq 1 30); do + after=$(kubectl get ippool pool-suite-root -o jsonpath='{.status.capacity.available}') + if [ "$after" -lt "$before" ]; then break; fi + sleep 1 + done echo "after_root_available=$after (before=$before)" if [ "$after" -ge "$before" ]; then echo "FAIL: root capacity.available did not decrease after child pool creation" diff --git a/test/e2e/pool-exhaustion/chainsaw-test.yaml b/test/e2e/pool-exhaustion/chainsaw-test.yaml index cbb7553..6f8dfa9 100644 --- a/test/e2e/pool-exhaustion/chainsaw-test.yaml +++ b/test/e2e/pool-exhaustion/chainsaw-test.yaml @@ -62,14 +62,48 @@ spec: ($error == null): true - name: third-claim-rejected-507 - description: Third claim must fail with HTTP 507 (Insufficient Storage) + description: | + Third claim cannot be satisfied from the exhausted pool. Apply the + claim and assert it reaches a non-Bound terminal condition with + reason=PoolExhausted (works for both sync 507 and async reconciler). try: - - create: + - apply: file: test-data/claim-3.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" exhaust-claim-3 \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" exhaust-claim-3 \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: exhaust-claim-3 became Bound against an exhausted pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: exhaust-claim-3 Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK exhaust-claim-3 rejected with reason=PoolExhausted" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" exhaust-claim-3 --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: release-and-reallocate description: Delete first claim, then create third claim — succeeds diff --git a/test/e2e/pool-selector/chainsaw-test.yaml b/test/e2e/pool-selector/chainsaw-test.yaml index 54ff420..5658e25 100644 --- a/test/e2e/pool-selector/chainsaw-test.yaml +++ b/test/e2e/pool-selector/chainsaw-test.yaml @@ -91,14 +91,48 @@ spec: (contains($stdout, 'OK IPAllocation ')): true - name: claim-no-match-rejected - description: Selector matching no pool returns HTTP 400 + description: | + Selector matching no pool must not produce a Bound claim. Apply the + claim and assert it reaches a non-Bound condition with + reason=NoPoolMatch (works for both sync 400 and async reconciler). try: - - create: + - apply: file: test-data/claim-no-match.yaml - expect: - - check: - ($error != null): true - (contains($error, 'no IPPool matches') || contains($error, 'no pool matches') || contains($error, 'no IPPrefix pool matches')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" selector-claim-no-match \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" selector-claim-no-match \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "NoPoolMatch" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: selector-claim-no-match became Bound despite no matching pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "NoPoolMatch" ]; then + echo "FAIL: selector-claim-no-match Allocated condition reason=$reason (expected NoPoolMatch) after 30s" + exit 1 + fi + echo "OK selector-claim-no-match rejected with reason=NoPoolMatch" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" selector-claim-no-match --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: mutually-exclusive-rejected description: Setting both poolRef and poolSelector returns HTTP 400