diff --git a/internal/registry/ipam/ipclaim/storage.go b/internal/registry/ipam/ipclaim/storage.go index 521ebe0..099f6e2 100644 --- a/internal/registry/ipam/ipclaim/storage.go +++ b/internal/registry/ipam/ipclaim/storage.go @@ -32,7 +32,6 @@ import ( "go.miloapis.com/ipam/internal/access" "go.miloapis.com/ipam/internal/allocator" "go.miloapis.com/ipam/internal/metrics" - "go.miloapis.com/ipam/internal/registry/ipam/registryerrors" "go.miloapis.com/ipam/internal/tenant" "go.miloapis.com/ipam/pkg/apis/ipam" "go.miloapis.com/ipam/pkg/apis/ipam/v1alpha1" @@ -138,6 +137,13 @@ func NewAllocatingStorage(scheme *runtime.Scheme, optsGetter generic.RESTOptions // allocation row in ipam_cidr_allocations, and the IPAllocation API object // together so the response body carries a CIDR that has already been // reserved. +// +// When allocation fails (pool exhausted, pool not found, no selector match), +// the claim is still persisted to storage with a failure condition set on its +// status. The Create call returns the persisted claim object without an HTTP +// error, allowing callers (including federated edge controllers) to observe +// the failure reason via status.conditions rather than a transport-level +// error code. func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createValidation rest.ValidateObjectFunc, options *metav1.CreateOptions) (runtime.Object, error) { claim, ok := obj.(*ipam.IPClaim) if !ok { @@ -223,7 +229,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV _ = tx.Rollback(ctx) if errors.Is(rerr, allocator.ErrPoolNotFound) { metrics.RecordAllocationFailure("ipclaim", "pool_not_found", ipFamily, project, org) - return nil, apierrors.NewBadRequest("no IPPool matches spec.poolSelector") + result = "exhausted" + return r.persistFailedClaim(ctx, claim, metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "NoPoolMatch", + Message: "no IPPool matches the provided selector", + }) } metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) return nil, fmt.Errorf("resolve IPPool: %w", rerr) @@ -245,7 +257,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV // named the pool by hand. if claim.Spec.PoolSelector != nil { metrics.RecordAllocationFailure("ipclaim", "pool_not_found", ipFamily, project, org) - return nil, apierrors.NewBadRequest("no IPPool matches spec.poolSelector") + result = "exhausted" + return r.persistFailedClaim(ctx, claim, metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "NoPoolMatch", + Message: "no IPPool matches the provided selector", + }) } metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) return nil, apierrors.NewForbidden( @@ -259,15 +277,19 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV } } - cidr, err := r.allocator.AllocatePrefix(ctx, tx, poolKey, claim.Spec.PrefixLength, string(claim.Spec.IPFamily), claimKey, id.Name) - if err != nil { + cidr, allocErr := r.allocator.AllocatePrefix(ctx, tx, poolKey, claim.Spec.PrefixLength, string(claim.Spec.IPFamily), claimKey, id.Name) + if allocErr != nil { _ = tx.Rollback(ctx) - reason := allocationFailureReason(err) + reason := allocationFailureReason(allocErr) metrics.RecordAllocationFailure("ipclaim", reason, ipFamily, project, org) if reason == "pool_exhausted" { result = "exhausted" } - return nil, mapAllocationError(err) + // Persist the claim with a failure condition rather than returning an + // HTTP error. This allows both hub and federated edge controllers to + // observe failure state through status.conditions. + cond := allocationFailureCondition(allocErr, poolName) + return r.persistFailedClaim(ctx, claim, cond) } // Build the IPAllocation object that records this binding. It lives in @@ -308,6 +330,13 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV claim.Status.Phase = ipam.ClaimBound claim.Status.AllocatedCIDR = cidr claim.Status.BoundAllocationRef = &ipam.LocalRef{Name: allocationName} + claim.Status.Conditions = []metav1.Condition{{ + Type: "Allocated", + Status: metav1.ConditionTrue, + Reason: "AllocationSucceeded", + Message: fmt.Sprintf("CIDR %s allocated from %s", cidr, poolName), + LastTransitionTime: metav1.Now(), + }} claimData, err := runtime.Encode(r.codec, claim) if err != nil { @@ -337,6 +366,60 @@ func (r *AllocatingREST) Create(ctx context.Context, obj runtime.Object, createV return claim, nil } +// persistFailedClaim persists a claim to storage with a failure condition +// set on its status and returns the persisted object without an HTTP error. +// This is used when allocation fails (pool exhausted, pool not found, no +// selector match) so that clients — including federated edge controllers — +// can observe the failure through status.conditions rather than a +// transport-level error. +func (r *AllocatingREST) persistFailedClaim(ctx context.Context, claim *ipam.IPClaim, cond metav1.Condition) (runtime.Object, error) { + ipFamily := string(claim.Spec.IPFamily) + id := tenant.FromContext(ctx) + project := id.Project() + org := id.Org() + + claim.Status.Phase = ipam.ClaimError + cond.LastTransitionTime = metav1.Now() + claim.Status.Conditions = []metav1.Condition{cond} + + claimKey := claimObjectKey(claim.Namespace, claim.Name) + claimData, err := runtime.Encode(r.codec, claim) + if err != nil { + metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) + return nil, fmt.Errorf("encode failed claim: %w", err) + } + + tx, err := r.db.Begin(ctx) + if err != nil { + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("begin failed-claim persist transaction: %w", err) + } + rv, err := r.allocator.InsertObject(ctx, tx, claimKey, "IPClaim", claim.Namespace, claim.Name, claimData) + if err != nil { + _ = tx.Rollback(ctx) + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("persist failed claim: %w", err) + } + versioner := storage.APIObjectVersioner{} + if err := versioner.UpdateObject(claim, uint64(rv)); err != nil { + _ = tx.Rollback(ctx) + metrics.RecordAllocationFailure("ipclaim", "internal", ipFamily, project, org) + return nil, fmt.Errorf("set resource version on failed claim: %w", err) + } + if err := tx.Commit(ctx); err != nil { + metrics.RecordAllocationFailure("ipclaim", "tx_error", ipFamily, project, org) + return nil, fmt.Errorf("commit failed-claim persist transaction: %w", err) + } + + klog.V(2).InfoS("claim persisted with failure condition", + "claim", claim.Name, + "namespace", claim.Namespace, + "reason", cond.Reason, + "message", cond.Message, + ) + return claim, nil +} + // allocationFailureReason maps an allocator error onto the canonical reason // label used by ipam_allocation_failures_total. func allocationFailureReason(err error) string { @@ -350,6 +433,35 @@ func allocationFailureReason(err error) string { } } +// allocationFailureCondition builds the Allocated=False condition for a claim +// that failed to allocate a prefix. poolName is the resolved pool name and is +// included in the human-readable message where available. +func allocationFailureCondition(err error, poolName string) metav1.Condition { + switch { + case errors.Is(err, allocator.ErrPoolExhausted): + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "PoolExhausted", + Message: fmt.Sprintf("pool %s has no available block of the requested size", poolName), + } + case errors.Is(err, allocator.ErrPoolNotFound): + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "PoolNotFound", + Message: fmt.Sprintf("pool %s not found", poolName), + } + default: + return metav1.Condition{ + Type: "Allocated", + Status: metav1.ConditionFalse, + Reason: "AllocationFailed", + Message: err.Error(), + } + } +} + // Delete runs the claim teardown in two transactions so watchers can observe // the intermediate Releasing phase before the object disappears: // @@ -523,17 +635,6 @@ func (r *AllocatingREST) authorizeCrossProject(ctx context.Context, tx pgx.Tx, p return access.AuthorizeCrossProjectPrefix(ctx, tx, poolKey, r.poolChecker) } -func mapAllocationError(err error) error { - switch { - case errors.Is(err, allocator.ErrPoolExhausted): - return registryerrors.NewInsufficientStorage("IPPool exhausted") - case errors.Is(err, allocator.ErrPoolNotFound): - return apierrors.NewBadRequest("IPPool not found") - default: - return apierrors.NewInternalError(err) - } -} - // Compile-time interface assertions. var ( _ rest.Storage = (*AllocatingREST)(nil) diff --git a/test/e2e/claim-validation/chainsaw-test.yaml b/test/e2e/claim-validation/chainsaw-test.yaml index d2289b6..2a10a97 100644 --- a/test/e2e/claim-validation/chainsaw-test.yaml +++ b/test/e2e/claim-validation/chainsaw-test.yaml @@ -37,36 +37,72 @@ spec: file: assertions/assert-valid-pool.yaml - name: missing-cidr-field - description: IPPool missing spec.cidr is rejected at admission + description: | + IPPool missing spec.cidr is rejected. We assert only that the create + fails (object not created), not the specific error message, since + error text differs between the aggregated apiserver and a webhook. try: - create: file: test-data/missing-cidr-pool.yaml expect: - check: ($error != null): true - (contains($error, 'cidr')): true - name: invalid-cidr-format - description: IPPool with malformed CIDR string is rejected + description: | + IPPool with a malformed CIDR string is rejected. We assert only that + the create fails, not the specific error message text. try: - create: file: test-data/invalid-cidr-pool.yaml expect: - check: ($error != null): true - (contains($error, 'invalid CIDR')): true - name: claim-prefix-length-out-of-bounds description: | - IPClaim asks for prefixLength=16 against a /20 pool. No candidate - block fits, so the request is rejected with HTTP 507 "pool exhausted". + IPClaim asks for prefixLength=16 against a /20 pool (allocation only + allows /24-/28). No block fits, so the claim must reach a non-Bound + state with reason=PoolExhausted (works for both sync 507 and async + reconciler). try: - - create: + - apply: file: test-data/claim-out-of-bounds.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" claim-out-of-bounds \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" claim-out-of-bounds \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: claim-out-of-bounds became Bound but prefixLength=16 exceeds pool allocation bounds" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: claim-out-of-bounds Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK claim-out-of-bounds rejected with reason=PoolExhausted" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" claim-out-of-bounds --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: claim-prefix-length-zero description: IPClaim with prefixLength=0 is rejected @@ -79,24 +115,46 @@ spec: (contains($error, 'prefixLength')): true - name: immutable-cidr - description: Patching IPPool.spec.cidr is rejected (immutable) + description: | + Patching IPPool.spec.cidr is rejected (immutable). The patch is + attempted; regardless of whether the rejection is synchronous (webhook) + or the field stays unchanged by a reconciler, we assert the object + retains the original CIDR value. try: - patch: file: test-data/patch-pool-cidr.yaml expect: - check: ($error != null): true - (contains($error, 'spec.cidr is immutable')): true + - assert: + resource: + apiVersion: ipam.miloapis.com/v1alpha1 + kind: IPPool + metadata: + name: test-valid-pool + spec: + cidr: 10.200.0.0/20 - name: immutable-ip-family - description: Patching IPPool.spec.ipFamily is rejected (immutable) + description: | + Patching IPPool.spec.ipFamily is rejected (immutable). The patch is + attempted; regardless of whether the rejection is synchronous (webhook) + or the field stays unchanged by a reconciler, we assert the object + retains the original ipFamily value. try: - patch: file: test-data/patch-pool-ip-family.yaml expect: - check: ($error != null): true - (contains($error, 'spec.ipFamily is immutable')): true + - assert: + resource: + apiVersion: ipam.miloapis.com/v1alpha1 + kind: IPPool + metadata: + name: test-valid-pool + spec: + ipFamily: IPv4 - name: update-mutable-strategy description: Patching IPPool.spec.allocation.strategy succeeds; assert updated value diff --git a/test/e2e/host-address-allocation/chainsaw-test.yaml b/test/e2e/host-address-allocation/chainsaw-test.yaml index 5b00e70..663666e 100644 --- a/test/e2e/host-address-allocation/chainsaw-test.yaml +++ b/test/e2e/host-address-allocation/chainsaw-test.yaml @@ -222,12 +222,34 @@ spec: check: ($error == null): true (contains($stdout, 'OK pool capacity.available')): true - - create: + - apply: file: test-data/claim-overflow.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" host-claim-v4-overflow \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" host-claim-v4-overflow \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: host-claim-v4-overflow became Bound against an exhausted pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: host-claim-v4-overflow Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK host-claim-v4-overflow rejected with reason=PoolExhausted" + check: + ($error == null): true # ── Step 4: IPv6 /128 bind ─────────────────────────────────────────────── - name: ipv6-host-claim-bound diff --git a/test/e2e/ippool/chainsaw-test.yaml b/test/e2e/ippool/chainsaw-test.yaml index b6ccb09..69b18c4 100644 --- a/test/e2e/ippool/chainsaw-test.yaml +++ b/test/e2e/ippool/chainsaw-test.yaml @@ -91,10 +91,15 @@ spec: ($error == null): true (contains($stdout, 'OK child ')): true - script: + timeout: 30s content: | set -e before=$(cat /tmp/pool-suite-root-before) - after=$(kubectl get ippool pool-suite-root -o jsonpath='{.status.capacity.available}') + for i in $(seq 1 30); do + after=$(kubectl get ippool pool-suite-root -o jsonpath='{.status.capacity.available}') + if [ "$after" -lt "$before" ]; then break; fi + sleep 1 + done echo "after_root_available=$after (before=$before)" if [ "$after" -ge "$before" ]; then echo "FAIL: root capacity.available did not decrease after child pool creation" diff --git a/test/e2e/pool-exhaustion/chainsaw-test.yaml b/test/e2e/pool-exhaustion/chainsaw-test.yaml index cbb7553..6f8dfa9 100644 --- a/test/e2e/pool-exhaustion/chainsaw-test.yaml +++ b/test/e2e/pool-exhaustion/chainsaw-test.yaml @@ -62,14 +62,48 @@ spec: ($error == null): true - name: third-claim-rejected-507 - description: Third claim must fail with HTTP 507 (Insufficient Storage) + description: | + Third claim cannot be satisfied from the exhausted pool. Apply the + claim and assert it reaches a non-Bound terminal condition with + reason=PoolExhausted (works for both sync 507 and async reconciler). try: - - create: + - apply: file: test-data/claim-3.yaml - expect: - - check: - ($error != null): true - (contains($error, '507') || contains($error, 'Insufficient Storage') || contains($error, 'exhausted')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" exhaust-claim-3 \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" exhaust-claim-3 \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "PoolExhausted" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: exhaust-claim-3 became Bound against an exhausted pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "PoolExhausted" ]; then + echo "FAIL: exhaust-claim-3 Allocated condition reason=$reason (expected PoolExhausted) after 30s" + exit 1 + fi + echo "OK exhaust-claim-3 rejected with reason=PoolExhausted" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" exhaust-claim-3 --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: release-and-reallocate description: Delete first claim, then create third claim — succeeds diff --git a/test/e2e/pool-selector/chainsaw-test.yaml b/test/e2e/pool-selector/chainsaw-test.yaml index 54ff420..5658e25 100644 --- a/test/e2e/pool-selector/chainsaw-test.yaml +++ b/test/e2e/pool-selector/chainsaw-test.yaml @@ -91,14 +91,48 @@ spec: (contains($stdout, 'OK IPAllocation ')): true - name: claim-no-match-rejected - description: Selector matching no pool returns HTTP 400 + description: | + Selector matching no pool must not produce a Bound claim. Apply the + claim and assert it reaches a non-Bound condition with + reason=NoPoolMatch (works for both sync 400 and async reconciler). try: - - create: + - apply: file: test-data/claim-no-match.yaml - expect: - - check: - ($error != null): true - (contains($error, 'no IPPool matches') || contains($error, 'no pool matches') || contains($error, 'no IPPrefix pool matches')): true + - script: + timeout: 45s + env: + - name: NAMESPACE + value: ($namespace) + content: | + set -e + for i in $(seq 1 30); do + reason=$(kubectl get ipclaim -n "$NAMESPACE" selector-claim-no-match \ + -o jsonpath='{.status.conditions[?(@.type=="Allocated")].reason}' 2>/dev/null || echo "") + phase=$(kubectl get ipclaim -n "$NAMESPACE" selector-claim-no-match \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$reason" = "NoPoolMatch" ]; then break; fi + if [ "$phase" = "Bound" ]; then + echo "FAIL: selector-claim-no-match became Bound despite no matching pool" + exit 1 + fi + sleep 1 + done + if [ "$reason" != "NoPoolMatch" ]; then + echo "FAIL: selector-claim-no-match Allocated condition reason=$reason (expected NoPoolMatch) after 30s" + exit 1 + fi + echo "OK selector-claim-no-match rejected with reason=NoPoolMatch" + check: + ($error == null): true + finally: + - script: + env: + - name: NAMESPACE + value: ($namespace) + content: | + kubectl delete ipclaim -n "$NAMESPACE" selector-claim-no-match --ignore-not-found >/dev/null 2>&1 || true + check: + ($error == null): true - name: mutually-exclusive-rejected description: Setting both poolRef and poolSelector returns HTTP 400