From f414dae4778f01381f453faf0a1d8ce0d7a112d2 Mon Sep 17 00:00:00 2001 From: Matt Jenkinson <75292329+mattdjenkinson@users.noreply.github.com> Date: Fri, 15 May 2026 15:21:26 +0100 Subject: [PATCH 1/2] fix: detect discovery response format from body kind, not request Accept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filterAPIIndex previously decided whether to decode the response as APIGroupDiscoveryList based solely on the request Accept header. The intent of that change (6ef5237) was to work around Milo not echoing the as=APIGroupDiscoveryList Content-Type, but the trade-off silently broke clients when the inner apiserver emitted legacy APIGroupList JSON in response to an aggregated request: json.Unmarshal into APIGroupDiscoveryList succeeds with items: nil (Go JSON is lenient about extra/missing fields), and the filter re-encoded that empty list and sent it back to the client. The user-visible effect was every API group disappearing from project- context discovery — including built-in groups like coordination.k8s.io that no client has any way to register. controller-runtime's REST mapper then failed with `no matches for kind "Lease" in version "coordination.k8s.io/v1"`, and downstream operators that watch Lease (network-services-operator's connector + iroh-dns controllers, plus anything similar) couldn't engage. ~221 staging project control planes were affected. Probe the response body's `kind` field instead. Body kind is the only reliable signal for the format that's actually in `cw.body`; the Accept header is kept as a tie-break when the body lacks an explicit kind. Unknown or unrecognised bodies pass through verbatim so the filter never silently emits a malformed response. New regression test covers the four format/header combinations. Fixes: #616 Related: datum-cloud/network-services-operator#160 (downstream manifestation), datum-cloud/network-services-operator#161 (NSO-side defensive fix that no longer crash-loops on this). --- pkg/server/discovery/filter.go | 41 +++++--- pkg/server/discovery/filter_test.go | 141 ++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 12 deletions(-) diff --git a/pkg/server/discovery/filter.go b/pkg/server/discovery/filter.go index 14c2201b..a6891239 100644 --- a/pkg/server/discovery/filter.go +++ b/pkg/server/discovery/filter.go @@ -93,7 +93,17 @@ func filterAPIResourceList(w http.ResponseWriter, req *http.Request, next http.H } // filterAPIIndex filters /apis. Handles both the legacy APIGroupList and the -// modern aggregated APIGroupDiscoveryList based on Content-Type. +// modern aggregated APIGroupDiscoveryList. We must decide which format we +// were given before re-encoding, otherwise the filter will silently emit an +// empty list. +// +// The request's Accept header tells us what kubectl/controller-runtime +// *wants* but not what Milo actually *returned* — Milo doesn't always echo +// the matching Content-Type, and the underlying apiserver may emit legacy +// APIGroupList JSON even when aggregated was requested. So we probe the +// response body's `kind` field first; that's the only reliable signal for +// the format that's actually in `cw.body`. The Accept header is a tie-break +// when the body lacks an explicit kind. func filterAPIIndex(w http.ResponseWriter, req *http.Request, next http.Handler, registry *Registry) { cw := newCaptureWriter(w) next.ServeHTTP(cw, req) @@ -105,11 +115,14 @@ func filterAPIIndex(w http.ResponseWriter, req *http.Request, next http.Handler, parentCtx := FromRequest(req.Context()) - // Detect aggregated discovery from the request Accept header. kubectl sends - // Accept: application/json;...;as=APIGroupDiscoveryList and a conformant - // server echoes it in Content-Type — but checking the request is more - // reliable since Milo returns plain application/json in the response. - if isAggregatedDiscovery(req.Header.Get("Accept")) { + var kindProbe struct { + Kind string `json:"kind"` + } + _ = json.Unmarshal(cw.body.Bytes(), &kindProbe) + + switch { + case kindProbe.Kind == "APIGroupDiscoveryList" || + (kindProbe.Kind == "" && isAggregatedDiscovery(req.Header.Get("Accept"))): var list apidiscoveryv2.APIGroupDiscoveryList if err := json.Unmarshal(cw.body.Bytes(), &list); err != nil { cw.flushUnchanged() @@ -117,13 +130,17 @@ func filterAPIIndex(w http.ResponseWriter, req *http.Request, next http.Handler, } filterAggregatedGroups(&list, registry, parentCtx) cw.flushJSON(&list) - return + case kindProbe.Kind == "APIGroupList": + // Legacy APIGroupList — we can't tell from the index alone which + // resources live in each group, so we leave the group list intact. + // The per-(group,version) request will be filtered by + // filterAPIResourceList. + cw.flushUnchanged() + default: + // Unknown/unrecognised body — pass through verbatim rather than + // risk emitting a malformed response. + cw.flushUnchanged() } - - // Legacy APIGroupList — we can't tell from the index alone which - // resources live in each group, so we leave the group list intact. The - // per-(group,version) request will be filtered by filterAPIResourceList. - cw.flushUnchanged() } func filterAggregatedGroups(list *apidiscoveryv2.APIGroupDiscoveryList, registry *Registry, parentCtx ParentContext) { diff --git a/pkg/server/discovery/filter_test.go b/pkg/server/discovery/filter_test.go index c1fd11c8..ed097bf4 100644 --- a/pkg/server/discovery/filter_test.go +++ b/pkg/server/discovery/filter_test.go @@ -8,6 +8,7 @@ import ( "strings" "testing" + apidiscoveryv2 "k8s.io/api/apidiscovery/v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -129,6 +130,146 @@ func TestFilterAPIResourceListByContext(t *testing.T) { } } +// TestFilterAPIIndex_FormatDetection covers the body-kind probe added to +// filterAPIIndex so that responses with a kind mismatch between the request +// Accept header and the actual body are not silently coerced to an empty +// list. +// +// Regression test for the bug where an upstream apiserver emitting legacy +// APIGroupList JSON in response to an aggregated-discovery request caused +// filterAPIIndex to unmarshal into APIGroupDiscoveryList (succeeds, but +// produces items: nil) and re-emit an empty list — dropping every group from +// discovery, including built-ins like coordination.k8s.io that no client had +// any way to register. +func TestFilterAPIIndex_FormatDetection(t *testing.T) { + aggregatedBody := func() []byte { + body := apidiscoveryv2.APIGroupDiscoveryList{ + TypeMeta: metav1.TypeMeta{Kind: "APIGroupDiscoveryList", APIVersion: "apidiscovery.k8s.io/v2"}, + Items: []apidiscoveryv2.APIGroupDiscovery{ + { + ObjectMeta: metav1.ObjectMeta{Name: "coordination.k8s.io"}, + Versions: []apidiscoveryv2.APIVersionDiscovery{ + {Version: "v1", Resources: []apidiscoveryv2.APIResourceDiscovery{{Resource: "leases"}}}, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "resourcemanager.miloapis.com"}, + Versions: []apidiscoveryv2.APIVersionDiscovery{ + {Version: "v1alpha1", Resources: []apidiscoveryv2.APIResourceDiscovery{ + {Resource: "projects"}, {Resource: "untagged"}, + }}, + }, + }, + }, + } + raw, _ := json.Marshal(body) + return raw + }() + legacyBody := []byte(`{ + "kind":"APIGroupList","apiVersion":"v1", + "groups":[ + {"name":"coordination.k8s.io","versions":[{"groupVersion":"coordination.k8s.io/v1","version":"v1"}]}, + {"name":"resourcemanager.miloapis.com","versions":[{"groupVersion":"resourcemanager.miloapis.com/v1alpha1","version":"v1alpha1"}]} + ] + }`) + + cases := []struct { + name string + body []byte + accept string + wantPassThru bool // expect the body verbatim + wantGroups []string // for aggregated, the kept group names + wantContains []string // for pass-through, substrings expected in body + }{ + { + name: "aggregated body, aggregated Accept -> filtered, all visible kept", + body: aggregatedBody, + accept: "application/json;g=apidiscovery.k8s.io;v=v2;as=APIGroupDiscoveryList", + wantGroups: []string{"coordination.k8s.io", "resourcemanager.miloapis.com"}, + }, + { + name: "legacy body, aggregated Accept -> pass through verbatim (no silent emptying)", + body: legacyBody, + accept: "application/json;g=apidiscovery.k8s.io;v=v2;as=APIGroupDiscoveryList", + wantPassThru: true, + wantContains: []string{`"kind":"APIGroupList"`, `"coordination.k8s.io"`}, + }, + { + name: "legacy body, legacy Accept -> pass through", + body: legacyBody, + accept: "application/json", + wantPassThru: true, + wantContains: []string{`"kind":"APIGroupList"`, `"coordination.k8s.io"`}, + }, + { + name: "aggregated body, legacy Accept -> still filtered (body kind wins)", + body: aggregatedBody, + accept: "application/json", + wantGroups: []string{"coordination.k8s.io", "resourcemanager.miloapis.com"}, + }, + } + + registry := NewRegistry() + // "projects" is Organization-tagged; in Project context it must be hidden. + registry.crd[schema.GroupResource{Group: "resourcemanager.miloapis.com", Resource: "projects"}] = []ParentContext{ContextOrganization} + registry.hasInit = true + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + inner := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write(tc.body) + }) + handler := DiscoveryContextFilter(inner, registry) + + req := httptest.NewRequest(http.MethodGet, "/apis", nil) + req.Header.Set("Accept", tc.accept) + req = req.WithContext(request.WithProject(context.Background(), "p1")) + + rr := httptest.NewRecorder() + handler.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body=%s", rr.Code, rr.Body.String()) + } + + if tc.wantPassThru { + body := rr.Body.String() + for _, want := range tc.wantContains { + if !strings.Contains(body, want) { + t.Errorf("body missing %q\nbody=%s", want, body) + } + } + return + } + + var got apidiscoveryv2.APIGroupDiscoveryList + if err := json.Unmarshal(rr.Body.Bytes(), &got); err != nil { + t.Fatalf("decode response: %v; body=%s", err, rr.Body.String()) + } + names := make([]string, 0, len(got.Items)) + for _, g := range got.Items { + // Inside resourcemanager, "projects" should be filtered out + // in Project context. We only check the group survives at + // all by including it here if any version has any resource. + keepGroup := false + for _, v := range g.Versions { + if len(v.Resources) > 0 { + keepGroup = true + } + } + if keepGroup { + names = append(names, g.Name) + } + } + if strings.Join(names, ",") != strings.Join(tc.wantGroups, ",") { + t.Errorf("groups = %v, want %v", names, tc.wantGroups) + } + }) + } +} + func equalContexts(a, b []ParentContext) bool { if len(a) != len(b) { return false From f6b4ce0049967e22c3bc2ac9126c1d1e12d585df Mon Sep 17 00:00:00 2001 From: Matt Jenkinson <75292329+mattdjenkinson@users.noreply.github.com> Date: Fri, 15 May 2026 16:35:34 +0100 Subject: [PATCH 2/2] fix: allow coordination.k8s.io/leases in Project context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The core-kubernetes-resources DiscoveryContextPolicy listed Lease as Platform-only alongside genuinely admin-only groups (rbac, authn, authz, admissionregistration). That was incorrect: Leases are a namespace-scoped primitive used legitimately by per-project workloads — leader election in any operator that targets project apiservers, or custom heartbeats like Datum Connect's connector health Lease. The Platform-only restriction made Project-context aggregated discovery omit coordination.k8s.io entirely, which broke any controller-runtime client (REST mapper resolves the group/kind from discovery). Downstream impact in datum-cloud/network-services-operator#160: the connector and iroh-dns controllers' Lease watches fail to engage, Connector status freezes, and the operator burns an engage-retry storm against every affected project control plane. The companion body-kind-probe fix in this PR is still valuable as defensive handling for discovery responses where the Accept header and body format disagree, but the user-visible symptom — Lease missing from project discovery — is caused entirely by this policy configuration. --- config/discovery/core/discovery-context-policy.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/config/discovery/core/discovery-context-policy.yaml b/config/discovery/core/discovery-context-policy.yaml index a9ed75c6..84609f1f 100644 --- a/config/discovery/core/discovery-context-policy.yaml +++ b/config/discovery/core/discovery-context-policy.yaml @@ -27,7 +27,12 @@ spec: contexts: ["Platform"] - group: "coordination.k8s.io" resources: ["leases"] - contexts: ["Platform"] + # Leases are used by per-project workloads (leader election in + # operators, custom heartbeat mechanisms like Datum Connect's connector + # health Lease) so they must be discoverable in Project context. + # Datum's network-services-operator depends on this: see + # datum-cloud/network-services-operator#160. + contexts: ["Platform", "Project"] - group: "admissionregistration.k8s.io" resources: ["mutatingwebhookconfigurations", "validatingadmissionpolicies", "validatingadmissionpolicybindings", "validatingwebhookconfigurations"] contexts: ["Platform"]