From 492eb6c6c5e19061941239d4c8c4bbeb41b0cf3c Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 20 May 2026 16:14:23 -0500 Subject: [PATCH] feat: implement federated deployment scheduling across POP cells Workloads targeting a city location are now automatically routed to the correct physical site via a Karmada-based federation layer. Each POP cell operates independently, instance health is surfaced back to the control plane in real time, and the platform remains available even when parts of the control plane are temporarily unreachable. Controllers added: - WorkloadDeploymentFederator: replicates WDs into Karmada and manages PropagationPolicies per city code - InstanceProjector: mirrors Instance write-backs from Karmada into the project namespace on the control plane ResourceInterpreterCustomization deployed at config time teaches Karmada how to aggregate replica counts and conditions across POP cells. Operator flags --enable-management-controllers and --enable-cell-controllers allow each deployment to opt into only the controllers it needs. Includes a 6-test Chainsaw e2e suite covering federation, deletion cascade, propagation policy lifecycle, instance projection, instance write-back, and the full end-to-end chain. 
Resolves #85 Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/publish.yaml | 3 + .gitignore | 3 + Taskfile.yaml | 481 ++++++++++++++++++ api/v1alpha/workloaddeployment_types.go | 9 - api/v1alpha/zz_generated.deepcopy.go | 5 - cmd/main.go | 109 +++- .../base/downstream-rbac/kustomization.yaml | 5 + config/base/downstream-rbac/rbac.yaml | 32 ++ config/base/federation/kustomization.yaml | 10 + config/base/manager/manager.yaml | 28 +- config/base/manager/service_account.yaml | 2 +- .../cell-controllers/kustomization.yaml | 20 + .../metrics_auth_role_binding.yaml | 2 +- .../controller_rbac/role_binding.yaml | 2 +- .../components/federation/kustomization.yaml | 5 + .../workloaddeployment-interpreter.yaml | 51 ++ .../leader_election_role_binding.yaml | 2 +- .../management-controllers/kustomization.yaml | 20 + go.mod | 17 +- go.sum | 42 +- hack/e2e/kind-control-plane.yaml | 17 + hack/e2e/make-internal-kubeconfig.sh | 60 +++ hack/e2e/patch-cluster-secret.sh | 90 ++++ internal/controller/indexers.go | 24 +- internal/controller/instance_controller.go | 286 ++++++++--- .../controller/instance_controller_test.go | 78 ++- internal/controller/instance_projector.go | 164 ++++++ .../controller/instance_projector_test.go | 361 +++++++++++++ .../stateful/stateful_control.go | 2 - internal/controller/testing_helpers_test.go | 100 ++++ .../workloaddeployment_controller.go | 192 ++++--- .../workloaddeployment_federator.go | 402 +++++++++++++++ .../workloaddeployment_federator_test.go | 398 +++++++++++++++ .../workloaddeployment_scheduler.go | 153 ------ test/e2e/chainsaw-config.yaml | 47 ++ .../assert-downstream-wd-exists.yaml | 7 + test/e2e/deletion-cascade/chainsaw-test.yaml | 79 +++ .../deletion-cascade/workload-deployment.yaml | 21 + test/e2e/env/README.md | 251 +++++++++ test/e2e/env/env.go | 233 +++++++++ test/e2e/full-federation/chainsaw-test.yaml | 150 ++++++ .../full-federation/workload-deployment.yaml | 21 + .../assert-downstream-wd.yaml | 6 + 
.../assert-projected-instance.yaml | 19 + .../instance-projection/chainsaw-test.yaml | 123 +++++ .../workload-deployment.yaml | 21 + .../assert-downstream-instance.yaml | 16 + .../e2e/instance-writeback/chainsaw-test.yaml | 112 ++++ .../instance-writeback/instance-pop-dfw.yaml | 15 + .../assert-pp-exists.yaml | 6 + .../chainsaw-test.yaml | 133 +++++ .../workload-deployment-alpha.yaml | 21 + .../workload-deployment-beta.yaml | 21 + .../assert-downstream-pp.yaml | 20 + .../assert-downstream-wd.yaml | 9 + .../chainsaw-test.yaml | 84 +++ .../workload-deployment.yaml | 22 + 57 files changed, 4208 insertions(+), 404 deletions(-) create mode 100644 Taskfile.yaml create mode 100644 config/base/downstream-rbac/kustomization.yaml create mode 100644 config/base/downstream-rbac/rbac.yaml create mode 100644 config/base/federation/kustomization.yaml create mode 100644 config/components/cell-controllers/kustomization.yaml create mode 100644 config/components/federation/kustomization.yaml create mode 100644 config/components/federation/workloaddeployment-interpreter.yaml create mode 100644 config/components/management-controllers/kustomization.yaml create mode 100644 hack/e2e/kind-control-plane.yaml create mode 100755 hack/e2e/make-internal-kubeconfig.sh create mode 100755 hack/e2e/patch-cluster-secret.sh create mode 100644 internal/controller/instance_projector.go create mode 100644 internal/controller/instance_projector_test.go create mode 100644 internal/controller/testing_helpers_test.go create mode 100644 internal/controller/workloaddeployment_federator.go create mode 100644 internal/controller/workloaddeployment_federator_test.go delete mode 100644 internal/controller/workloaddeployment_scheduler.go create mode 100644 test/e2e/chainsaw-config.yaml create mode 100644 test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml create mode 100644 test/e2e/deletion-cascade/chainsaw-test.yaml create mode 100644 test/e2e/deletion-cascade/workload-deployment.yaml create mode 100644 
test/e2e/env/README.md create mode 100644 test/e2e/env/env.go create mode 100644 test/e2e/full-federation/chainsaw-test.yaml create mode 100644 test/e2e/full-federation/workload-deployment.yaml create mode 100644 test/e2e/instance-projection/assert-downstream-wd.yaml create mode 100644 test/e2e/instance-projection/assert-projected-instance.yaml create mode 100644 test/e2e/instance-projection/chainsaw-test.yaml create mode 100644 test/e2e/instance-projection/workload-deployment.yaml create mode 100644 test/e2e/instance-writeback/assert-downstream-instance.yaml create mode 100644 test/e2e/instance-writeback/chainsaw-test.yaml create mode 100644 test/e2e/instance-writeback/instance-pop-dfw.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml create mode 100644 test/e2e/workload-deployment-federation/assert-downstream-pp.yaml create mode 100644 test/e2e/workload-deployment-federation/assert-downstream-wd.yaml create mode 100644 test/e2e/workload-deployment-federation/chainsaw-test.yaml create mode 100644 test/e2e/workload-deployment-federation/workload-deployment.yaml diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76..5dcc90b 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.gitignore b/.gitignore index 2b0c6e4..05b47b6 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 
+25,6 @@ go.work.sum .env bin/ + +# Local e2e environment artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 0000000..bcfbb0f --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,481 @@ +version: '3' + +# ─── Variables ────────────────────────────────────────────────────────────── + +vars: + # Karmada Helm chart version to install (karmada-charts/karmada) + KARMADA_VERSION: v1.16.0 + + # karmadactl CLI version for cluster registration + KARMADACTL_VERSION: v1.16.0 + + # Chainsaw version for e2e testing (kyverno/chainsaw) + CHAINSAW_VERSION: v0.2.15 + + # Local tool directory (mirrors Makefile convention) + LOCALBIN: '{{.ROOT_DIR}}/bin' + KARMADACTL: '{{.ROOT_DIR}}/bin/karmadactl' + CHAINSAW: '{{.ROOT_DIR}}/bin/chainsaw' + + # Kind cluster names + KIND_CONTROL_PLANE: compute-control-plane + KIND_POP_DFW: compute-pop-dfw + KIND_POP_ORD: compute-pop-ord + + # All cluster names (for CRD installation loops) + KIND_ALL_CLUSTERS: '{{.KIND_CONTROL_PLANE}} {{.KIND_POP_DFW}} {{.KIND_POP_ORD}}' + + # Working directory for e2e artefacts (gitignored) + E2E_DIR: '{{.ROOT_DIR}}/tmp/e2e' + KUBECONFIG_DIR: '{{.ROOT_DIR}}/tmp/e2e/kubeconfigs' + + # Fixed NodePort for the Karmada API server. + # The Kind management cluster is created with an extraPortMapping for this port + # so it is reachable at https://localhost:32443 from the developer's machine. 
+ KARMADA_API_NODEPORT: "32443" + +# ─── Tasks ────────────────────────────────────────────────────────────────── + +tasks: + + default: + cmds: + - task --list + silent: true + + # ════════════════════════════════════════════════════════════════════════ + # e2e environment lifecycle + # ════════════════════════════════════════════════════════════════════════ + + e2e:up: + desc: "Create the full local Kind+Karmada e2e environment (idempotent)" + cmds: + - task: e2e:tools + - task: e2e:clusters:create + - task: e2e:karmada:install + - task: e2e:karmada:configure + - task: e2e:karmada:join-clusters + - task: e2e:crds:install + - cmd: | + echo "" + echo "╔══════════════════════════════════════════════════════════╗" + echo "║ e2e environment ready ║" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Control plane: {{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "║ Karmada API: {{.KUBECONFIG_DIR}}/karmada.yaml" + echo "║ POP DFW: {{.KUBECONFIG_DIR}}/pop-dfw.yaml" + echo "║ POP ORD: {{.KUBECONFIG_DIR}}/pop-ord.yaml" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Export for kubectl: ║" + echo "║ export KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "╚══════════════════════════════════════════════════════════╝" + silent: false + + e2e:down: + desc: "Tear down the local e2e environment" + cmds: + - kind delete cluster --name {{.KIND_CONTROL_PLANE}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_DFW}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_ORD}} 2>/dev/null || true + - rm -rf {{.E2E_DIR}} + - cmd: echo "✓ e2e environment torn down" + silent: false + + e2e:test: + desc: "Run Chainsaw e2e tests against the local Kind+Karmada environment" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + test/e2e/ \ + {{.CLI_ARGS}} + + e2e:test:filter: + desc: "Run a 
subset of e2e tests by name regex (e.g. task e2e:test:filter -- --include-test-regex federation)" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + {{.CLI_ARGS}} \ + test/e2e/ + + # ════════════════════════════════════════════════════════════════════════ + # Tool installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:tools: + desc: "Install e2e-specific tooling (karmadactl, chainsaw, helm repo)" + cmds: + - task: e2e:tools:karmadactl + - task: e2e:tools:chainsaw + - task: e2e:tools:helm-repo + + e2e:tools:karmadactl: + desc: "Download karmadactl {{.KARMADACTL_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.KARMADACTL}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/karmada-io/karmada/releases/download/{{.KARMADACTL_VERSION}}/karmadactl-${OS}-${ARCH}.tgz" + echo "Downloading karmadactl {{.KARMADACTL_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} karmadactl + chmod +x {{.KARMADACTL}} + echo "karmadactl installed → {{.KARMADACTL}}" + else + echo "karmadactl already present at {{.KARMADACTL}}" + fi + status: + - test -f {{.KARMADACTL}} + + e2e:tools:chainsaw: + desc: "Download chainsaw {{.CHAINSAW_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.CHAINSAW}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/kyverno/chainsaw/releases/download/{{.CHAINSAW_VERSION}}/chainsaw_${OS}_${ARCH}.tar.gz" + echo "Downloading chainsaw {{.CHAINSAW_VERSION}} (${OS}/${ARCH}) from ${URL}..." 
+ curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} chainsaw + chmod +x {{.CHAINSAW}} + echo "chainsaw installed → {{.CHAINSAW}}" + else + echo "chainsaw already present at {{.CHAINSAW}}" + fi + status: + - test -f {{.CHAINSAW}} + + e2e:tools:helm-repo: + desc: "Add/update karmada-charts Helm repository" + cmds: + - | + if ! helm repo list 2>/dev/null | grep -q karmada-charts; then + helm repo add karmada-charts https://raw.githubusercontent.com/karmada-io/karmada/master/charts + echo "Added karmada-charts Helm repository" + fi + helm repo update karmada-charts + status: + - helm repo list 2>/dev/null | grep -q karmada-charts + + # ════════════════════════════════════════════════════════════════════════ + # Kind cluster management + # ════════════════════════════════════════════════════════════════════════ + + e2e:clusters:create: + desc: "Create all Kind clusters (idempotent)" + cmds: + # Management / control-plane cell cluster — needs extraPortMappings for + # the Karmada API server NodePort so it is accessible at localhost:32443. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_CONTROL_PLANE}}" + KIND_CONFIG: hack/e2e/kind-control-plane.yaml + # POP cell clusters — default Kind config is sufficient. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + KIND_CONFIG: "" + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + KIND_CONFIG: "" + - mkdir -p {{.KUBECONFIG_DIR}} + - task: _e2e:kubeconfigs:export + + _e2e:cluster:create: + internal: true + cmds: + - | + if kind get clusters 2>/dev/null | grep -qx '{{.CLUSTER_NAME}}'; then + echo "Kind cluster '{{.CLUSTER_NAME}}' already exists — skipping" + else + echo "Creating Kind cluster '{{.CLUSTER_NAME}}'..." 
+ CONFIG_FLAG="" + if [ -n "{{.KIND_CONFIG}}" ]; then + CONFIG_FLAG="--config {{.KIND_CONFIG}}" + fi + kind create cluster \ + --name {{.CLUSTER_NAME}} \ + $CONFIG_FLAG \ + --wait 90s + fi + + _e2e:kubeconfigs:export: + internal: true + desc: "Export Kind kubeconfigs and create Docker-IP variants for cross-cluster use" + cmds: + # Standard kubeconfigs (localhost-based, for developer kubectl use) + - kind export kubeconfig --name {{.KIND_CONTROL_PLANE}} --kubeconfig {{.KUBECONFIG_DIR}}/control-plane.yaml + - kind export kubeconfig --name {{.KIND_POP_DFW}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-dfw.yaml + - kind export kubeconfig --name {{.KIND_POP_ORD}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-ord.yaml + # Docker-IP kubeconfigs (used by Karmada controller, running inside Docker, + # to reach POP cell API servers across the kind bridge network) + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml \ + {{.KIND_POP_DFW}} + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord-internal.yaml \ + {{.KIND_POP_ORD}} + + # ════════════════════════════════════════════════════════════════════════ + # Karmada installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:install: + desc: "Install Karmada into the management cluster via Helm (idempotent)" + cmds: + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get ns karmada-system &>/dev/null; then + echo "Karmada already installed (karmada-system namespace exists)" + else + echo "Installing Karmada {{.KARMADA_VERSION}} via Helm..." 
+ helm install karmada karmada-charts/karmada \ + --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + --namespace karmada-system \ + --create-namespace \ + --version {{.KARMADA_VERSION}} \ + --set apiServer.serviceType=NodePort \ + --set apiServer.nodePort={{.KARMADA_API_NODEPORT}} \ + --wait \ + --timeout 5m + echo "Karmada installed" + fi + - task: _e2e:karmada:build-kubeconfig + + e2e:karmada:configure: + desc: "Apply federation component config to the Karmada API server (idempotent)" + cmds: + - | + echo "Applying federation component to Karmada..." + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml apply \ + -k config/components/federation/ + echo "Federation component applied" + + _e2e:karmada:build-kubeconfig: + internal: true + desc: "Extract Karmada kubeconfig from secret and patch server to localhost:{{.KARMADA_API_NODEPORT}}" + cmds: + - | + echo "Building Karmada kubeconfig → {{.KUBECONFIG_DIR}}/karmada.yaml" + # Extract raw kubeconfig from the secret the Helm chart creates + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get secret karmada-kubeconfig \ + -n karmada-system \ + -o jsonpath='{.data.kubeconfig}' \ + | base64 -d > {{.KUBECONFIG_DIR}}/karmada-raw.yaml + # Rewrite the server address to the NodePort exposed on localhost + python3 - {{.KUBECONFIG_DIR}}/karmada-raw.yaml {{.KUBECONFIG_DIR}}/karmada.yaml 127.0.0.1 {{.KARMADA_API_NODEPORT}} << 'PYEOF' + import sys, yaml + + src, dst, host, port = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + + with open(src) as f: + cfg = yaml.safe_load(f) + + for cluster in cfg.get('clusters', []): + old = cluster['cluster'].get('server', '') + cluster['cluster']['server'] = f'https://{host}:{port}' + # The cert is for the internal cluster IP, so skip TLS verification. + # This is a local dev-only environment. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + print(f" karmada server: {old} → https://{host}:{port}", file=sys.stderr) + + with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + PYEOF + rm {{.KUBECONFIG_DIR}}/karmada-raw.yaml + + # ════════════════════════════════════════════════════════════════════════ + # POP cell cluster registration + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:join-clusters: + desc: "Register POP cell clusters with Karmada and apply city-code labels" + cmds: + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + CITY_CODE: dfw + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml" + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + CITY_CODE: ord + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord-internal.yaml" + + _e2e:karmada:join-cluster: + internal: true + cmds: + # ── Register with karmadactl join ────────────────────────────────── + # We pass the EXTERNAL kubeconfig (localhost-based) here so karmadactl + # can reach the member cluster from this macOS host to set up initial + # RBAC. The stored secret is patched below to the Docker-IP variant. + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + get cluster {{.CLUSTER_NAME}} &>/dev/null; then + echo "Cluster '{{.CLUSTER_NAME}}' already registered in Karmada — skipping join" + else + echo "Joining '{{.CLUSTER_NAME}}' to Karmada..." 
+ {{.KARMADACTL}} join {{.CLUSTER_NAME}} \ + --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --cluster-kubeconfig={{.EXTERNAL_KUBECONFIG}} \ + --cluster-context=kind-{{.CLUSTER_NAME}} + echo "Cluster '{{.CLUSTER_NAME}}' registered" + fi + # ── Patch cluster secret → Docker-IP kubeconfig ─────────────────── + # The Karmada controller manager runs inside Docker; it cannot use + # localhost to reach POP cell API servers. We update the stored secret + # with a kubeconfig whose server address uses the Kind container IP so + # container-to-container communication works across the kind bridge. + - | + hack/e2e/patch-cluster-secret.sh \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.CLUSTER_NAME}} \ + {{.INTERNAL_KUBECONFIG}} + # ── Apply city-code label ────────────────────────────────────────── + - | + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + label cluster {{.CLUSTER_NAME}} \ + topology.datum.net/city-code={{.CITY_CODE}} \ + --overwrite + echo "Labeled cluster '{{.CLUSTER_NAME}}' with topology.datum.net/city-code={{.CITY_CODE}}" + + # ════════════════════════════════════════════════════════════════════════ + # CRD installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:crds:install: + desc: "Install compute + NSO CRDs to all clusters" + cmds: + - task: _e2e:crds:compute + - task: _e2e:crds:nso + + _e2e:crds:compute: + internal: true + desc: "Apply compute CRDs to all clusters and the Karmada API server" + cmds: + # All three Kind clusters + the Karmada API server get the compute CRDs. + # The Karmada API server needs them so it can store and propagate + # WorkloadDeployment objects. + - | + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing compute CRDs → $(basename $KC .yaml)..." 
+ kubectl --kubeconfig="$KC" apply -k config/base/crd --server-side + done + + _e2e:crds:nso: + internal: true + desc: "Apply NSO CRDs to control-plane and POP cell clusters" + cmds: + # NSO CRDs (NetworkBinding, SubnetClaim, etc.) are installed on the + # control-plane as well as POP cells. The control-plane operator needs them + # so that Subnet/SubnetClaim informer watches can start without cache errors, + # even though NSO controllers themselves only run on POP cells. + - | + go mod download go.datum.net/network-services-operator + NSO_VERSION=$(go list -m -json go.datum.net/network-services-operator \ + | python3 -c "import sys, json; print(json.load(sys.stdin)['Version'])") + NSO_CRD_PATH="$(go env GOMODCACHE)/go.datum.net/network-services-operator@${NSO_VERSION}/config/crd" + echo "NSO CRDs from: ${NSO_CRD_PATH}" + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing NSO CRDs → $(basename $KC .yaml)..." + kubectl --kubeconfig="$KC" apply -k "${NSO_CRD_PATH}" --server-side + done + + # ════════════════════════════════════════════════════════════════════════ + # Operator lifecycle (background processes for federation e2e) + # ════════════════════════════════════════════════════════════════════════ + + e2e:operator:start: + desc: "Start management (control-plane) and cell (pop-dfw) operator instances in the background" + cmds: + - mkdir -p {{.E2E_DIR}}/logs {{.E2E_DIR}}/pids + - | + echo "Starting management operator (control-plane)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + go run ./cmd/main.go \ + --downstream-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-cell-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9091 \ + > {{.E2E_DIR}}/logs/operator-management.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-management.pid + echo "Management operator PID: $!" 
+ - | + echo "Waiting for management operator health check on :9091..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9091/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: management operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-management.log || true + exit 1 + fi + sleep 1 + done + echo "Management operator is healthy" + - | + echo "Starting cell operator (pop-dfw)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + go run ./cmd/main.go \ + --downstream-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-management-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9092 \ + > {{.E2E_DIR}}/logs/operator-cell-dfw.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-cell-dfw.pid + echo "Cell operator PID: $!" + - | + echo "Waiting for cell operator health check on :9092..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9092/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: cell operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-cell-dfw.log || true + exit 1 + fi + sleep 1 + done + echo "Cell operator is healthy" + + e2e:operator:stop: + desc: "Stop background operator instances" + cmds: + - | + for PIDFILE in \ + {{.E2E_DIR}}/pids/operator-management.pid \ + {{.E2E_DIR}}/pids/operator-cell-dfw.pid; do + if [ -f "$PIDFILE" ]; then + PID=$(cat "$PIDFILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Stopping PID $PID ($(basename $PIDFILE .pid))..." 
+ kill -TERM "$PID" || true + else + echo "Process $PID ($(basename $PIDFILE .pid)) is not running" + fi + rm -f "$PIDFILE" + else + echo "PID file not found: $PIDFILE" + fi + done diff --git a/api/v1alpha/workloaddeployment_types.go b/api/v1alpha/workloaddeployment_types.go index 7da27c8..03ac341 100644 --- a/api/v1alpha/workloaddeployment_types.go +++ b/api/v1alpha/workloaddeployment_types.go @@ -2,8 +2,6 @@ package v1alpha import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) // WorkloadDeploymentSpec defines the desired state of WorkloadDeployment @@ -37,11 +35,6 @@ type WorkloadDeploymentSpec struct { // WorkloadDeploymentStatus defines the observed state of WorkloadDeployment type WorkloadDeploymentStatus struct { - // The location which the deployment has been scheduled to - // - // +kubebuilder:validation:Optional - Location *networkingv1alpha.LocationReference `json:"location,omitempty"` - // Represents the observations of a deployment's current state. 
// Known condition types are: "Available", "Progressing" Conditions []metav1.Condition `json:"conditions,omitempty"` @@ -80,8 +73,6 @@ const ( // +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.readyReplicas` // +kubebuilder:printcolumn:name="Desired",type=string,JSONPath=`.status.desiredReplicas` // +kubebuilder:printcolumn:name="Up-to-date",type=string,JSONPath=`.status.currentReplicas` -// +kubebuilder:printcolumn:name="Location Namespace",type=string,JSONPath=`.status.location.namespace`,priority=1 -// +kubebuilder:printcolumn:name="Location Name",type=string,JSONPath=`.status.location.name`,priority=1 type WorkloadDeployment struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` diff --git a/api/v1alpha/zz_generated.deepcopy.go b/api/v1alpha/zz_generated.deepcopy.go index 8ecc1ba..6bd72a2 100644 --- a/api/v1alpha/zz_generated.deepcopy.go +++ b/api/v1alpha/zz_generated.deepcopy.go @@ -917,11 +917,6 @@ func (in *WorkloadDeploymentSpec) DeepCopy() *WorkloadDeploymentSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *WorkloadDeploymentStatus) DeepCopyInto(out *WorkloadDeploymentStatus) { *out = *in - if in.Location != nil { - in, out := &in.Location, &out.Location - *out = new(apiv1alpha.LocationReference) - **out = **in - } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc..32e0c5f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -18,17 +18,22 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" @@ -51,6 +56,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // downstreamRestConfig holds the REST config for the downstream control plane. + // It is populated from --downstream-kubeconfig when set, and is nil when the + // flag is omitted (e.g. in non-federation deployments). 
+ downstreamRestConfig *rest.Config ) func init() { @@ -61,6 +71,8 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } @@ -71,12 +83,27 @@ func main() { var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var downstreamKubeconfig string + var downstreamContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&downstreamKubeconfig, "downstream-kubeconfig", "", + "Path to the kubeconfig file for the downstream control plane. "+ + "When omitted, downstream federation features are disabled.") + flag.StringVar(&downstreamContext, "downstream-context", "", + "Context to use from the downstream kubeconfig. When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", true, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector). "+ + "Disable when running a cell-only operator instance.") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", true, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler). 
"+ + "Disable when running a management-only operator instance.") opts := zap.Options{ Development: true, @@ -89,6 +116,23 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Load the downstream REST config when --downstream-kubeconfig is provided. + // When the flag is omitted, downstreamRestConfig remains nil and federation + // features will be skipped at controller setup time. + if downstreamKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: downstreamKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: downstreamContext}, + ) + var err error + downstreamRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load downstream kubeconfig", "path", downstreamKubeconfig) + os.Exit(1) + } + setupLog.Info("downstream kubeconfig loaded", "path", downstreamKubeconfig) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -180,17 +224,63 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Workload") os.Exit(1) } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single downstream client shared across all controllers that need + // to read or write to the downstream control plane. Nil when federation is disabled. 
+ var downstreamClient client.Client + if downstreamRestConfig != nil { + downstreamClient, err = client.New(downstreamRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create downstream client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + instanceReconciler := &controller.InstanceReconciler{DownstreamClient: downstreamClient} + if err = instanceReconciler.SetupWithManager(mgr, deploymentCluster); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // WorkloadDeploymentFederator and InstanceProjector are management-plane + // controllers that run on the control-plane cluster. They require a downstream + // control plane to be configured (--downstream-kubeconfig provided). 
+ if enableManagementControllers && downstreamRestConfig != nil { + federator := &controller.WorkloadDeploymentFederator{DownstreamClient: downstreamClient} + if err = federator.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentFederator") + os.Exit(1) + } + + // InstanceProjector: runs in the Control Plane Cell, watches Instances + // written back to the downstream control plane by POP-cell operators, and + // projects them into the corresponding project namespaces via the + // multicluster manager. + downstreamMgr, err := manager.New(downstreamRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + setupLog.Error(err, "unable to create downstream manager for InstanceProjector") + os.Exit(1) + } + if err = (&controller.InstanceProjector{ + DownstreamClient: downstreamClient, + MCManager: mgr, + }).SetupWithManager(downstreamMgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "InstanceProjector") + os.Exit(1) + } + runnables = append(runnables, downstreamMgr) } if serverConfig.WebhookServer != nil { @@ -284,6 +374,7 @@ func initializeClusterDiscovery( } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 0000000..4c4dbe4 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 0000000..d214abc --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,32 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: ["compute.datumapis.com"] + resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 0000000..1261dac --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index 0302817..9528f3c 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,30 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - 
/manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + - --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --downstream-kubeconfig=$(DOWNSTREAM_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: DOWNSTREAM_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -69,7 +85,7 @@ spec: - name: webhook-cert mountPath: /tmp/k8s-webhook-server/serving-certs readOnly: true - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config diff --git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711de..cc6bd6c 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 0000000..3f32da3 --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: 
ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d97..ada1a1d 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3..2f3e267 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 0000000..3ba207f --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 0000000..3e6a9e2 --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,51 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if 
statusItems == nil then + return desiredObj + end + local replicas = 0 + local currentReplicas = 0 + local desiredReplicas = 0 + local readyReplicas = 0 + local conditions = nil + for i = 1, #statusItems do + local item = statusItems[i] + if item.status ~= nil then + replicas = replicas + (item.status.replicas or 0) + currentReplicas = currentReplicas + (item.status.currentReplicas or 0) + desiredReplicas = desiredReplicas + (item.status.desiredReplicas or 0) + readyReplicas = readyReplicas + (item.status.readyReplicas or 0) + if conditions == nil and item.status.conditions ~= nil then + conditions = item.status.conditions + end + end + end + desiredObj.status = { + replicas = replicas, + currentReplicas = currentReplicas, + desiredReplicas = desiredReplicas, + readyReplicas = readyReplicas, + } + if conditions ~= nil then + desiredObj.status.conditions = conditions + end + return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml b/config/components/leader_election/leader_election_role_binding.yaml index a5fe999..d6783c0 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 0000000..d1e29e7 --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git 
a/go.mod b/go.mod index 19fc010..286c1d9 100644 --- a/go.mod +++ b/go.mod @@ -1,22 +1,21 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.24.6 require ( github.com/google/go-cmp v0.7.0 + github.com/karmada-io/api v1.15.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/stretchr/testify v1.11.1 go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 + go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253 golang.org/x/crypto v0.39.0 golang.org/x/sync v0.16.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 + k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 + k8s.io/client-go v0.33.2 k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/gateway-api v1.2.1 @@ -95,9 +94,9 @@ require ( gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.33.2 // indirect + k8s.io/apiserver v0.33.2 // indirect + k8s.io/component-base v0.33.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect diff --git a/go.sum b/go.sum index c472bd8..fb725cf 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,6 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= 
-github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -72,6 +70,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/karmada-io/api v1.15.0 h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -129,8 +129,6 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 
h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= @@ -139,10 +137,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= +go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253 h1:0GIQZMFWzUf3XkyWahnMGDdl2+7n69NwYdwYAyo0i5Y= +go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= @@ -195,8 +191,6 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= golang.org/x/sync v0.16.0/go.mod 
h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -224,20 +218,12 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 
-google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -250,18 +236,18 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= +k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= +k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= +k8s.io/apiextensions-apiserver v0.33.2 h1:6gnkIbngnaUflR3XwE1mCefN3YS8yTD631JXQhsU6M8= +k8s.io/apiextensions-apiserver v0.33.2/go.mod h1:IvVanieYsEHJImTKXGP6XCOjTwv2LUMos0YWc9O+QP8= k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod 
h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/apiserver v0.33.2 h1:KGTRbxn2wJagJowo29kKBp4TchpO1DRO3g+dB/KOJN4= +k8s.io/apiserver v0.33.2/go.mod h1:9qday04wEAMLPWWo9AwqCZSiIn3OYSZacDyu/AcoM/M= +k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= +k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= +k8s.io/component-base v0.33.2 h1:sCCsn9s/dG3ZrQTX/Us0/Sx2R0G5kwa0wbZFYoVp/+0= +k8s.io/component-base v0.33.2/go.mod h1:/41uw9wKzuelhN+u+/C59ixxf4tYQKW7p32ddkYNe2k= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= diff --git a/hack/e2e/kind-control-plane.yaml b/hack/e2e/kind-control-plane.yaml new file mode 100644 index 0000000..47f3c63 --- /dev/null +++ b/hack/e2e/kind-control-plane.yaml @@ -0,0 +1,17 @@ +# Kind cluster configuration for the compute-control-plane management cluster. +# +# extraPortMappings exposes port 32443 on the macOS host so that the Karmada +# API server NodePort service (nodePort: 32443) is accessible at +# https://localhost:32443 without any additional port-forwarding. +# +# This matches KARMADA_API_NODEPORT in Taskfile.yaml. + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 32443 # Karmada API server NodePort + hostPort: 32443 + protocol: TCP + listenAddress: "127.0.0.1" diff --git a/hack/e2e/make-internal-kubeconfig.sh b/hack/e2e/make-internal-kubeconfig.sh new file mode 100755 index 0000000..3303a5b --- /dev/null +++ b/hack/e2e/make-internal-kubeconfig.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# make-internal-kubeconfig.sh +# +# Produces a kubeconfig variant that uses the Kind node's Docker container IP +# instead of localhost. 
This variant is stored in Karmada so the controller +# manager (running inside Docker) can reach member cluster API servers across +# the kind bridge network. +# +# Background: Kind maps each cluster's API server to a random localhost port +# on the developer machine. Inside Docker containers, "localhost" refers to the +# container's own loopback — not the host. We therefore swap the server address +# to the Kind control-plane container's Docker bridge IP (e.g. 172.18.0.x) and +# set insecure-skip-tls-verify because the node certificate does not include +# the Docker bridge IP in its SANs. +# +# Usage: +# hack/e2e/make-internal-kubeconfig.sh \ +# tmp/e2e/kubeconfigs/pop-dfw.yaml \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml \ +# compute-pop-dfw + +set -euo pipefail + +INPUT="${1:?usage: $0 }" +OUTPUT="${2:?usage: $0 }" +CLUSTER_NAME="${3:?usage: $0 }" + +CONTAINER_NAME="${CLUSTER_NAME}-control-plane" + +# Resolve the container's Docker bridge IP. +DOCKER_IP=$(docker inspect \ + -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ + "${CONTAINER_NAME}" 2>/dev/null || true) + +if [ -z "${DOCKER_IP}" ]; then + echo "ERROR: Could not resolve Docker IP for container '${CONTAINER_NAME}'." >&2 + echo " Is the Kind cluster '${CLUSTER_NAME}' running?" >&2 + exit 1 +fi + +echo " ${CLUSTER_NAME}: Docker IP ${DOCKER_IP} → ${OUTPUT}" + +python3 - "${INPUT}" "${OUTPUT}" "${DOCKER_IP}" <<'PYEOF' +import sys, yaml + +src, dst, docker_ip = sys.argv[1], sys.argv[2], sys.argv[3] + +with open(src) as f: + cfg = yaml.safe_load(f) + +for cluster in cfg.get('clusters', []): + # Kind API server always listens on port 6443 inside the container. + cluster['cluster']['server'] = f'https://{docker_ip}:6443' + # The node cert only covers localhost / 127.0.0.1, not the bridge IP. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + +with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) +PYEOF diff --git a/hack/e2e/patch-cluster-secret.sh b/hack/e2e/patch-cluster-secret.sh new file mode 100755 index 0000000..e29ed38 --- /dev/null +++ b/hack/e2e/patch-cluster-secret.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# patch-cluster-secret.sh +# +# After "karmadactl join", Karmada stores the member cluster's kubeconfig in a +# Secret referenced by the Cluster object's spec.secretRef, and sets +# spec.apiEndpoint to the localhost address it resolved from the external +# kubeconfig. The Karmada controller manager runs inside Docker and cannot use +# localhost to reach POP cell API servers. +# +# This script: +# 1. Replaces the kubeconfig in the Secret with the Docker-IP variant so that +# the Karmada controller can make API calls to the member cluster. +# 2. Patches spec.apiEndpoint on the Cluster object so that health checks also +# use the Docker bridge IP instead of localhost. +# +# Usage: +# hack/e2e/patch-cluster-secret.sh \ +# tmp/e2e/kubeconfigs/karmada.yaml \ +# compute-pop-dfw \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml + +set -euo pipefail + +KARMADA_KUBECONFIG="${1:?usage: $0 }" +CLUSTER_NAME="${2:?usage: $0 }" +INTERNAL_KUBECONFIG="${3:?usage: $0 }" + +# ------------------------------------------------------------------ +# Read the Cluster object's secretRef (name + namespace) +# ------------------------------------------------------------------ +SECRET_NAME=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.name}' 2>/dev/null || true) + +if [ -z "${SECRET_NAME}" ]; then + echo "ERROR: Could not find spec.secretRef.name on cluster '${CLUSTER_NAME}'." >&2 + echo " Has karmadactl join completed successfully?" 
>&2 + exit 1 +fi + +SECRET_NAMESPACE=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.namespace}' 2>/dev/null || true) + +SECRET_NAMESPACE="${SECRET_NAMESPACE:-karmada-system}" + +echo " Patching secret ${SECRET_NAMESPACE}/${SECRET_NAME} with Docker-IP kubeconfig..." + +# ------------------------------------------------------------------ +# Replace the kubeconfig data in the secret +# ------------------------------------------------------------------ +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + create secret generic "${SECRET_NAME}" \ + --namespace="${SECRET_NAMESPACE}" \ + --from-file=kubeconfig="${INTERNAL_KUBECONFIG}" \ + --dry-run=client -o yaml \ + | kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + apply -f - + +echo " Secret ${SECRET_NAMESPACE}/${SECRET_NAME} updated — Karmada controller will use Docker bridge IP" + +# ------------------------------------------------------------------ +# Extract the Docker-IP server URL from the internal kubeconfig and +# patch spec.apiEndpoint on the Cluster object so that Karmada's +# cluster-status controller uses the same reachable address for health +# checks. Without this patch the controller continues to probe the +# localhost address stored by karmadactl join and the cluster never +# transitions to Ready. +# ------------------------------------------------------------------ +DOCKER_SERVER=$(kubectl \ + --kubeconfig="${INTERNAL_KUBECONFIG}" \ + config view --minify -o jsonpath='{.clusters[0].cluster.server}') + +if [ -z "${DOCKER_SERVER}" ]; then + echo "ERROR: Could not read server URL from ${INTERNAL_KUBECONFIG}" >&2 + exit 1 +fi + +echo " Patching spec.apiEndpoint on cluster '${CLUSTER_NAME}' → ${DOCKER_SERVER}..." 
+kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + patch cluster "${CLUSTER_NAME}" \ + --type=merge \ + -p "{\"spec\":{\"apiEndpoint\":\"${DOCKER_SERVER}\"}}" + +echo " Cluster '${CLUSTER_NAME}' patched — health checks will now use Docker bridge IP" diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe8..7d9e1ae 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city as a changed networking resource. + deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,9 +33,10 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + // Index workload deployments by city code so that SubnetClaim/Subnet watch + // handlers can efficiently find deployments targeting the same city. 
+ if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil @@ -44,18 +48,12 @@ func deploymentWorkloadUIDIndexFunc(o client.Object) []string { } } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index e5bc356..e90e695 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -17,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -29,11 +30,20 @@ import ( computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is 
removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. + instanceControllerFinalizer = "compute.datumapis.com/instance-controller" +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. @@ -45,6 +55,13 @@ type clusterGetter interface { type InstanceReconciler struct { mgr clusterGetter managementCluster cluster.Cluster + // DownstreamClient is an optional client pointing at the downstream control plane. + // When non-nil, the reconciler writes a copy of each Instance back to the + // downstream control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). + DownstreamClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete @@ -69,29 +86,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. 
+ finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,84 +114,232 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, err := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return ctrl.Result{}, err + } + + readyChanged, 
err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) + if err != nil { + return ctrl.Result{}, err + } + + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + if err := r.writeBackToDownstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + // Return after the status update so that the next reconcile sees the + // updated QuotaGranted condition before attempting spec changes. + return ctrl.Result{}, nil + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + var claim quotav1alpha1.ResourceClaim + if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + } else { + if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } } - statusChanged := false + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status 
condition. It returns true when the condition +// changed and a status update is required. +func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, err := r.reconcileQuotaClaim(ctx, clusterName, instance) + if err != nil { + return false, fmt.Errorf("failed reconciling quota claim: %w", err) + } switch { case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, - }) + }), nil case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), nil - case grantedCondition.Status == metav1.ConditionFalse: + default: // grantedCondition.Status == metav1.ConditionFalse reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, 
Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), nil } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. 
+func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.DownstreamClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.DownstreamClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + // Already gone — nothing to do. + return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.DownstreamClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToDownstream copies the Instance spec and status to the downstream +// control plane so that the InstanceProjector can aggregate state from all POP +// cells. It is a no-op when DownstreamClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToDownstream(ctx context.Context, clusterName string, instance *computev1alpha.Instance) error { + if r.DownstreamClient == nil { + return nil + } + + // Encode the POP-cell cluster name using the same convention as NSO's + // MappedNamespaceResourceStrategy: "cluster-" with "/" → "_". + encodedClusterName := "cluster-" + strings.ReplaceAll(clusterName, "/", "_") + + // Read the upstream project namespace name from the downstream namespace label + // stamped by the WorkloadDeploymentFederator. This lets the InstanceProjector + // resolve the target namespace via a direct label lookup on the Instance rather + // than scanning all project cluster namespaces by UID. 
+ upstreamNamespace := instance.Namespace // fallback: cell namespace (ns-) + var downstreamNS corev1.Namespace + if err := r.DownstreamClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err == nil { + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]; v != "" { + upstreamNamespace = v } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.DownstreamClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // Ensure the namespace exists in the downstream control plane 
before creating the Instance. + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: instance.Namespace}} + if err := r.DownstreamClient.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed ensuring downstream namespace: %w", err) + } + if err := r.DownstreamClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.DownstreamClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) } - return ctrl.Result{}, nil + // Update spec + labels on the existing object, then push status separately. + existing.Spec = instance.Spec + existing.Labels = writeBack.Labels + if err := r.DownstreamClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + + existing.Status = instance.Status + if err := r.DownstreamClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + + return nil } func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { @@ -344,6 +504,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { readyCondition.Reason = "NetworkFailedToCreate" readyCondition.Message = networkCreationFailureMessage @@ -360,6 +521,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not 
programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != pendingReason { readyCondition.Reason = programmedCondition.Reason @@ -379,6 +541,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { logger.Info("instance is not running", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason if runningCondition != nil && runningCondition.Reason != pendingReason { readyCondition.Reason = runningCondition.Reason @@ -441,6 +604,11 @@ func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementC r.mgr = mgr r.managementCluster = managementCluster + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + // Watch ResourceClaim objects on the management cluster directly, bypassing // the multicluster clusterInjectingQueue which would overwrite ClusterName. 
// Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1a15090..3537e53 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -2,8 +2,6 @@ package controller import ( "context" - "fmt" - "net/http" "testing" "github.com/stretchr/testify/assert" @@ -13,12 +11,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -27,36 +23,6 @@ import ( quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. 
-type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} - // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { t.Helper() @@ -508,12 +474,15 @@ func TestReconcileQuota(t *testing.T) { // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. 
makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { APIVersion: "compute.datumapis.com/v1alpha", @@ -590,14 +559,21 @@ func TestReconcileQuota(t *testing.T) { mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } r := &InstanceReconciler{ mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + managementCluster: newFakeCluster(mgmtClient), } + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient, mgmtClient } @@ -737,10 +713,28 @@ func TestReconcileQuota(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 0000000..db26e84 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the downstream +// control plane by POP-cell InstanceReconcilers and creates read-only +// projections in the corresponding project namespace within each project cluster. +// +// Namespace resolution: a downstream Instance lives in namespace +// `ns-`. The UID portion is matched against the UID of +// namespaces in the project cluster to find the target namespace. 
+// +// Ownership: each projected Instance is owned by the project WorkloadDeployment +// so that it is garbage-collected via cascading deletion when the deployment is +// removed from the project cluster. +// +// The controller is registered with a standard manager.Manager pointed at the +// downstream control plane — NOT the multicluster-runtime manager — so informer +// watches are scoped to the downstream control plane. +type InstanceProjector struct { + // DownstreamClient reads Instance objects from the downstream control plane. + // Must be set before SetupWithManager is called. + DownstreamClient client.Client + + // MCManager provides access to project cluster clients via GetCluster. + MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch + +func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName) + + // 1. Fetch the Instance from the downstream control plane. + var downstreamInstance computev1alpha.Instance + if err := r.DownstreamClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil { + if apierrors.IsNotFound(err) { + // Instance was deleted from the downstream control plane. Projections + // are owned by the project WorkloadDeployment, so cascading deletion + // handles cleanup. + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Only project Instances that carry the upstream tracking label; others were + // not written by our InstanceReconciler write-back logic. 
+ encodedClusterName, ok := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if !ok { + logger.V(1).Info("skipping instance without upstream cluster label") + return ctrl.Result{}, nil + } + + // 2. Resolve the project cluster name. + // The encoded form is "cluster-" with "/" replaced by "_". + clusterName := strings.TrimPrefix(encodedClusterName, "cluster-") + clusterName = strings.ReplaceAll(clusterName, "_", "/") + + // 3. Obtain the project cluster client. + projectCluster, err := r.MCManager.GetCluster(ctx, clusterName) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // 4. Resolve the target project namespace from the Instance label. + // The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project + // namespace name (read from the downstream namespace label set by the federator), + // so we can resolve the target namespace directly without scanning. + targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + logger.Info("Instance missing upstream-namespace label, requeueing", + "namespace", downstreamInstance.Namespace, "name", downstreamInstance.Name) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + // 5. Find the owning WorkloadDeployment in the project cluster by UID. + // The downstream Instance carries WorkloadDeploymentUIDLabel so we can find + // the owning deployment without relying on field selectors. 
+ wdUID := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel] + + var wdList computev1alpha.WorkloadDeploymentList + if err := projectClient.List(ctx, &wdList, client.InNamespace(targetNamespace)); err != nil { + return ctrl.Result{}, fmt.Errorf("failed listing WorkloadDeployments in %s/%s: %w", clusterName, targetNamespace, err) + } + + var ownerWD *computev1alpha.WorkloadDeployment + for i := range wdList.Items { + if string(wdList.Items[i].UID) == wdUID { + ownerWD = &wdList.Items[i] + break + } + } + + // 6. Create or update the projection in the project namespace. + projection := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: downstreamInstance.Name, + Namespace: targetNamespace, + }, + } + + operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error { + // Propagate upstream tracking labels so consumers can filter by origin. + if projection.Labels == nil { + projection.Labels = make(map[string]string) + } + for k, v := range downstreamInstance.Labels { + projection.Labels[k] = v + } + + projection.Spec = downstreamInstance.Spec + + // Attach an owner reference to the WorkloadDeployment so the projection + // is garbage-collected when the deployment is removed. + if ownerWD != nil { + return controllerutil.SetOwnerReference(ownerWD, projection, projectCluster.GetScheme()) + } + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err) + } + + logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName) + + // 7. Sync status — status is a separate subresource. 
+ projection.Status = downstreamInstance.Status + if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err) + } + + return ctrl.Result{}, nil +} + +// SetupWithManager registers the InstanceProjector with downstreamMgr, a standard +// manager.Manager configured against the downstream control plane REST config. +// DownstreamClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(downstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(downstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector"). + Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 0000000..67e3a46 --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below. 
+ projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns- convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + projTestInstanceName = "inst-abc" + + // projTestWDUID is the UID of the owning WorkloadDeployment. + projTestWDUID = types.UID("wd-uid-9999-aaaa-bbbb-cccc") + + // projTestWDName is the name of the owning WorkloadDeployment. + projTestWDName = "my-wd" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster ("cluster-"). +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID. +func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances. +func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: "default", + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last. 
+func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestWDUID), + } + for k, v := range labelOverrides { + labels[k] = v + } + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection. + }, + } +} + +// newTestProjector wires an InstanceProjector with the given downstream client and +// a project cluster that serves the supplied project client. +func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + DownstreamClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // wantProjection controls whether a projected Instance should appear. 
+ wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantRequeue controls whether the reconcile result should request a requeue. + wantRequeue bool + + // wantErr controls whether the reconcile should return an error. + wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + name: "projection created without owner ref when WD UID label absent", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the WD UID label. + computev1alpha.WorkloadDeploymentUIDLabel: "", + }), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment in project cluster. + }, + wantProjection: true, + wantOwnerRef: false, + }, + { + name: "missing upstream-cluster-name label — skipped, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel. + Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + name: "missing upstream-namespace label — requeue", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the upstream namespace label. 
+ downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Build Karmada client. + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + // Build project client. + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newTestProjector(karmadaClient, projectClient) + + result, err := r.Reconcile(context.Background(), projectorRequest()) + + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + + if tt.wantRequeue { + assert.NotZero(t, result.RequeueAfter, "expected RequeueAfter to be set") + } else { + assert.Equal(t, ctrl.Result{}, result) + } + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. + var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. 
+ if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Owner reference check. + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID should match the WorkloadDeployment UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference when WD not found") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace. +func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound returns true when err is a Kubernetes not-found error or is nil +// (object not found means Get returned NotFound, not that err is nil). +// Used to distinguish "no projection created" from "projection exists but Get failed". 
+func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + // Import apierrors to check — we already have it via the fake client package. + return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652..e259eda 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -68,8 +68,6 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } - desiredInstances[i].Spec.Location = deployment.Status.Location - // TODO(jreese) consider adding scheduling gates via mutating webhooks desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ TemplateHash: instanceTemplateHash, diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 0000000..ff48a8c --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). 
+func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). +func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). + Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. 
Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. +type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name string) (cluster.Cluster, error) { + if c, ok := m.clusters[name]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. +func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef..5d87dae 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -37,6 +37,11 @@ import ( type WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + // KarmadaClient is an optional client pointing at the Karmada control plane. + // When non-nil, the reconciler writes the WorkloadDeployment status back to + // the Karmada namespace after each reconcile so the WorkloadDeploymentFederator + // can aggregate it into the project-namespace object. Set to nil to disable. 
+ KarmadaClient client.Client } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete @@ -86,10 +91,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -143,59 +144,59 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - 
}) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: 
"ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + if err := r.writeStatusToKarmada(ctx, &deployment); err != nil { + return ctrl.Result{}, err + } + + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -240,6 +241,34 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// writeStatusToKarmada copies the WorkloadDeployment status to the matching +// object in the Karmada namespace so the WorkloadDeploymentFederator can +// sync it back to the project-namespace object on the control plane. +// It is a no-op when KarmadaClient is nil. +func (r *WorkloadDeploymentReconciler) writeStatusToKarmada(ctx context.Context, deployment *computev1alpha.WorkloadDeployment) error { + if r.KarmadaClient == nil { + return nil + } + + var kd computev1alpha.WorkloadDeployment + if err := r.KarmadaClient.Get(ctx, client.ObjectKeyFromObject(deployment), &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed getting Karmada WD for status writeback: %w", err) + } + + kd.Status = deployment.Status + // Use Update (not Patch) so all required status fields are present in the + // request body; MergeFrom omits unchanged zero-value int32 fields which + // would fail the CRD's required constraints on currentReplicas/readyReplicas. 
+ if err := r.KarmadaClient.Status().Update(ctx, &kd); err != nil { + return fmt.Errorf("failed updating Karmada WD status: %w", err) + } + + return nil +} + func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, @@ -247,6 +276,30 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( ) (bool, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + logger.Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. @@ -271,7 +324,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } @@ -347,8 +400,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. 
- if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,7 +424,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } @@ -490,25 +543,34 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + // Resolve the Location to find its city code, then look up WorkloadDeployments + // that target the same city via the deploymentCityCodeIndex. 
+ var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := clusterClient.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 0000000..d7437a1 --- /dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + 
karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. Determines the downstream namespace via the ns- +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator; this logic will migrate to Milo +// once the shared library is promoted). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code=. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies. 
+type WorkloadDeploymentFederator struct { + mgr mcmanager.Manager + // DownstreamClient is a client pointed at the downstream control plane. The + // caller (cmd/main.go) is responsible for constructing it from + // --downstream-kubeconfig. + DownstreamClient client.Client + finalizers finalizer.Finalizers +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list + +func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + if r.DownstreamClient == nil { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx) + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + + var deployment computev1alpha.WorkloadDeployment + if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + + if !deployment.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + logger.Info("federating deployment to downstream control plane") + + // Determine the downstream namespace for this project namespace using the + // ns- convention (MappedNamespaceResourceStrategy). 
+ // Using strategy.GetClient() for writes ensures the downstream namespace is + // created with UpstreamOwnerNamespaceLabel so the InstanceProjector can + // resolve the target project namespace without scanning all namespaces. + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(req.ClusterName, cl.GetClient(), r.DownstreamClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err) + } + + // Ensure the downstream namespace exists and carries the upstream tracking + // labels so the InstanceProjector can resolve the project namespace by label + // lookup instead of scanning all namespaces. + if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, req.ClusterName); err != nil { + return ctrl.Result{}, err + } + + // Upsert the WorkloadDeployment in the downstream control plane via the + // strategy client so any future Create calls also go through + // ensureDownstreamNamespace automatically. + if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + // Lazily create the PropagationPolicy that targets clusters with the matching + // city-code label. + if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return ctrl.Result{}, err + } + + // Pull aggregated status from the downstream control plane back into the + // project namespace. + if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + logger.Info("federation complete") + return ctrl.Result{}, nil +} + +// Finalize removes the downstream WorkloadDeployment and, if no other +// deployments with the same city code remain in the downstream namespace, deletes +// the PropagationPolicy as well. 
+func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.DownstreamClient == nil { + return finalizer.Result{}, nil + } + + deployment := obj.(*computev1alpha.WorkloadDeployment) + logger := log.FromContext(ctx).WithValues( + "deployment", deployment.Name, + "namespace", deployment.Namespace, + ) + + clusterName, ok := mccontext.ClusterFrom(ctx) + if !ok { + return finalizer.Result{}, fmt.Errorf("cluster name not found in context") + } + + cl, err := r.mgr.GetCluster(ctx, clusterName) + if err != nil { + return finalizer.Result{}, err + } + + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(clusterName, cl.GetClient(), r.DownstreamClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err) + } + + // Delete the downstream WorkloadDeployment. + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + if err := r.DownstreamClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + // Clean up the PropagationPolicy if no other deployments with the same city + // code remain in this downstream namespace. + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. 
+// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.DownstreamClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = fmt.Sprintf("cluster-%s", strings.ReplaceAll(clusterName, "/", "_")) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.DownstreamClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: "WorkloadDeployment", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.DownstreamClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.DownstreamClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.DownstreamClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when DownstreamClient is non-nil. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + return mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator"). + Complete(r) +} + +// propagationPolicyNameFor returns the PropagationPolicy name for a given city +// code. The name is stable and deterministic so that multiple reconciles of +// different deployments sharing the same city code converge on the same policy. +func propagationPolicyNameFor(cityCode string) string { + // Sanitize the city code to a valid Kubernetes name: lower-case, spaces → hyphens. 
+ sanitized := strings.ToLower(strings.ReplaceAll(cityCode, " ", "-")) + return fmt.Sprintf("city-%s", sanitized) +} diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go new file mode 100644 index 0000000..143f975 --- /dev/null +++ b/internal/controller/workloaddeployment_federator_test.go @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + "time" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Shared test constants ──────────────────────────────────────────────────── + +const ( + testCluster = "test-project-cluster" + testProjNS = "my-project" + testProjNSUID = types.UID("aabbccdd-0000-1111-2222-333344445555") + testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. 
+func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: "default", + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. +func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + DownstreamClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. 
+func reconcileRequest() mcreconcile.Request { + return mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, + }, + } +} + +// ─── Unit tests ─────────────────────────────────────────────────────────────── + +func TestPropagationPolicyNameFor(t *testing.T) { + t.Parallel() + + tests := []struct { + cityCode string + want string + }{ + {"LAX", "city-lax"}, + {"lax", "city-lax"}, + {"New York", "city-new-york"}, + {"LOS ANGELES", "city-los-angeles"}, + {"SEA", "city-sea"}, + } + + for _, tt := range tests { + t.Run(tt.cityCode, func(t *testing.T) { + t.Parallel() + got := propagationPolicyNameFor(tt.cityCode) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestWorkloadDeploymentFederator_NoDownstreamClient verifies that the reconciler +// is a no-op when DownstreamClient is nil. +func TestWorkloadDeploymentFederator_NoDownstreamClient(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + r := newTestFederator(projectClient, nil) + r.DownstreamClient = nil // explicitly nil + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the +// first reconcile of a brand-new WorkloadDeployment adds the finalizer and +// returns without federating (the finalizer update triggers a re-queue). 
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment() // no finalizer yet + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // The project WD should now have the finalizer persisted. + var updated computev1alpha.WorkloadDeployment + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updated)) + assert.Contains(t, updated.Finalizers, federatorFinalizer) + + // Karmada should be untouched – federation happens on the next reconcile. + var wdList computev1alpha.WorkloadDeploymentList + require.NoError(t, karmadaClient.List(context.Background(), &wdList)) + assert.Empty(t, wdList.Items, "no Karmada WD should be created on first-seen reconcile") +} + +// TestWorkloadDeploymentFederator_FederatesToKarmada verifies that a +// WorkloadDeployment with the finalizer already set is fully federated: +// the Karmada namespace, WorkloadDeployment (with city-code label), and +// PropagationPolicy are all created. +func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // Karmada namespace must exist. 
+ var karmadaNS corev1.Namespace + err = karmadaClient.Get(ctx, types.NamespacedName{Name: testKarmadaNSStr}, &karmadaNS) + require.NoError(t, err, "Karmada namespace %q should exist", testKarmadaNSStr) + + // Karmada WorkloadDeployment must exist with the city-code label. + var karmadaWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &karmadaWD) + require.NoError(t, err, "Karmada WorkloadDeployment should exist") + assert.Equal(t, testCityCodeLAX, karmadaWD.Labels[cityCodeLabel], + "city-code label should be set on Karmada WD") + assert.Equal(t, testCityCodeLAX, karmadaWD.Spec.CityCode, + "spec.cityCode should be copied from project WD") + + // PropagationPolicy for the city code must exist. + ppName := propagationPolicyNameFor(testCityCodeLAX) + var pp karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &pp) + require.NoError(t, err, "PropagationPolicy %q should exist", ppName) + + // The PP must select WorkloadDeployments by the city-code label. + require.Len(t, pp.Spec.ResourceSelectors, 1) + sel := pp.Spec.ResourceSelectors[0] + assert.Equal(t, computev1alpha.GroupVersion.String(), sel.APIVersion) + assert.Equal(t, "WorkloadDeployment", sel.Kind) + require.NotNil(t, sel.LabelSelector) + assert.Equal(t, testCityCodeLAX, sel.LabelSelector.MatchLabels[cityCodeLabel]) + + // The PP cluster affinity must target clusters carrying the same city-code. + require.NotNil(t, pp.Spec.Placement.ClusterAffinity) + require.NotNil(t, pp.Spec.Placement.ClusterAffinity.LabelSelector) + assert.Equal(t, testCityCodeLAX, + pp.Spec.Placement.ClusterAffinity.LabelSelector.MatchLabels[cityCodeLabel]) +} + +// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios: +// cleanup of Karmada resources and conditional PropagationPolicy removal. 
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + // karmadaExtra holds additional Karmada objects beyond the "own" WD and PP. + karmadaExtra []client.Object + wantPPGone bool + }{ + { + name: "last WD for city — PropagationPolicy removed", + karmadaExtra: nil, + wantPPGone: true, + }, + { + name: "other WD for same city remains — PropagationPolicy kept", + karmadaExtra: []client.Object{ + // A sibling WD in the same Karmada namespace with the same city-code. + &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-deployment", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "other", + WorkloadRef: computev1alpha.WorkloadReference{Name: "other"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + }, + }, + wantPPGone: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Project cluster: namespace + WD with finalizer and deletion timestamp. + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + // Karmada cluster: the mirrored WD + its PropagationPolicy + any extras. 
+ karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "default", + WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaObjs := []client.Object{ + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + } + karmadaObjs = append(karmadaObjs, tt.karmadaExtra...) + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // The Karmada-side WD must be gone. + var remainingWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &remainingWD) + assert.True(t, apierrors.IsNotFound(err), + "Karmada WD %q should be deleted after finalization", testWDName) + + // PropagationPolicy presence depends on whether siblings remain. 
+ var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &remainingPP) + if tt.wantPPGone { + assert.True(t, apierrors.IsNotFound(err), + "PropagationPolicy should be deleted when no city siblings remain") + } else { + assert.NoError(t, err, + "PropagationPolicy should be kept when other city siblings remain") + } + + // The project WD should be gone: once the federator finalizer is removed + // from an object that already has a DeletionTimestamp, the API server + // (and the fake client) garbage-collects the object. + var updatedWD computev1alpha.WorkloadDeployment + err = projectClient.Get(ctx, + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD) + assert.True(t, apierrors.IsNotFound(err), + "project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)") + }) + } +} + +// TestWorkloadDeploymentFederator_NotFound verifies that a missing +// WorkloadDeployment is handled gracefully (no error, no action). +func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize +// method directly, ensuring the cluster name is required in context. +func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + wd := testWorkloadDeployment(withFinalizer) + + // Without cluster in context → must return an error. 
+ _, err := r.Finalize(context.Background(), wd) + require.Error(t, err, "Finalize without cluster context should fail") + assert.Contains(t, err.Error(), "cluster name not found") + + // With cluster in context → must succeed (karmada client returns not-found, which is OK). + ctx := mccontext.WithCluster(context.Background(), testCluster) + result, err := r.Finalize(ctx, wd) + require.NoError(t, err) + assert.False(t, result.Updated) +} diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d6..0000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if 
err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} diff --git a/test/e2e/chainsaw-config.yaml b/test/e2e/chainsaw-config.yaml new file mode 100644 index 0000000..cd3a995 --- /dev/null +++ b/test/e2e/chainsaw-config.yaml @@ -0,0 +1,47 @@ +# Chainsaw global configuration for the compute federation e2e test suite. +# +# Prerequisites +# ───────────── +# Run `task e2e:up` to create the Kind clusters and populate kubeconfigs under +# tmp/e2e/kubeconfigs/ before running Chainsaw. +# +# Running +# ─────── +# From the repository root via Taskfile (recommended): +# +# task e2e:test +# +# Or directly: +# +# KUBECONFIG=tmp/e2e/kubeconfigs/control-plane.yaml \ +# chainsaw test --config test/e2e/chainsaw-config.yaml test/e2e/ +# +# The KUBECONFIG env var sets the "default" cluster (control-plane cell). +# Additional clusters (downstream, pop-dfw, pop-ord) are declared below and +# referenced by name in individual test steps via `cluster: downstream` etc. +# +# Kubeconfig paths below are relative to the working directory where Chainsaw is +# invoked (the project root), NOT relative to this config file's location. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 30s + assert: 60s + cleanup: 60s + delete: 30s + error: 30s + exec: 30s + clusters: + # Downstream control plane. WorkloadDeployments, PropagationPolicies, + # and Instance write-backs live here. 
+ downstream: + kubeconfig: tmp/e2e/kubeconfigs/downstream.yaml + # POP DFW cell — downstream member cluster labelled topology.datum.net/city-code=dfw. + pop-dfw: + kubeconfig: tmp/e2e/kubeconfigs/pop-dfw.yaml + # POP ORD cell — downstream member cluster labelled topology.datum.net/city-code=ord. + pop-ord: + kubeconfig: tmp/e2e/kubeconfigs/pop-ord.yaml diff --git a/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml new file mode 100644 index 0000000..aae65da --- /dev/null +++ b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml @@ -0,0 +1,7 @@ +# Assert the WorkloadDeployment is present in the Karmada API server. +# Used both to confirm federation succeeded and as the target for the error: check. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-cascade-wd diff --git a/test/e2e/deletion-cascade/chainsaw-test.yaml b/test/e2e/deletion-cascade/chainsaw-test.yaml new file mode 100644 index 0000000..03a11ea --- /dev/null +++ b/test/e2e/deletion-cascade/chainsaw-test.yaml @@ -0,0 +1,79 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: deletion-cascade +spec: + description: | + Verifies that deleting a WorkloadDeployment from the project namespace causes + the federator to remove the corresponding WorkloadDeployment from Karmada. + + The WorkloadDeploymentFederator adds a finalizer + (compute.datumapis.com/federator) to every project WD it manages. When the + project WD is deleted: + 1. The finalizer's Finalize method runs (blocking deletion until complete). + 2. It deletes the Karmada-side WorkloadDeployment. + 3. It removes the PropagationPolicy if no other WDs for the city remain. + 4. It removes the finalizer, allowing the project WD to be garbage-collected. + + This test validates: project WD deletion → Karmada WD deletion. 
+ + template: true + + steps: + - name: create-wd + description: Create a WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-federation + description: Wait for the WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-cascade-wd + + - name: delete-wd + description: Delete the WorkloadDeployment from the control-plane cluster. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: test-cascade-wd + + - name: assert-downstream-wd-deleted + description: Confirm the Karmada copy is removed by the finalizer. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($downstreamNS) + name: test-cascade-wd + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/deletion-cascade/workload-deployment.yaml b/test/e2e/deletion-cascade/workload-deployment.yaml new file mode 100644 index 0000000..39d68a1 --- /dev/null +++ b/test/e2e/deletion-cascade/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-cascade-wd +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/env/README.md b/test/e2e/env/README.md new file mode 100644 index 0000000..671e705 --- /dev/null +++ b/test/e2e/env/README.md @@ -0,0 +1,251 @@ +# Local Kind + Karmada e2e Environment + +This document describes the local multi-cluster environment used for end-to-end +testing of the compute federation layer. 
+ +--- + +## Prerequisites + +| Tool | Minimum version | Install | +|------|----------------|---------| +| [Docker Desktop](https://www.docker.com/products/docker-desktop/) | 4.x | required for Kind | +| [kind](https://kind.sigs.k8s.io/) | v0.23+ | `brew install kind` | +| [kubectl](https://kubernetes.io/docs/tasks/tools/) | v1.28+ | `brew install kubernetes-cli` | +| [helm](https://helm.sh/) | v3.14+ | `brew install helm` | +| [task](https://taskfile.dev/) | v3 | `brew install go-task` | +| Python 3 | 3.9+ | pre-installed on macOS | +| go | 1.24+ | `brew install go` | + +`karmadactl` is downloaded automatically by `task e2e:up` into `./bin/`. + +--- + +## Cluster Topology + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ compute-control-plane (Kind cluster) │ +│ │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ karmada-system namespace │ │ +│ │ Karmada API Server ←── https://localhost:32443 │ │ +│ │ Karmada Controller Manager │ │ +│ │ Karmada Scheduler │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +│ │ +│ compute operator (WorkloadReconciler, Federator, InstanceProjector)│ +└──────────────────────────┬──────────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployments + ┌────────────────┴─────────────────┐ + │ │ +┌─────────▼──────────┐ ┌──────────▼─────────┐ +│ compute-pop-dfw │ │ compute-pop-ord │ +│ (Kind cluster) │ │ (Kind cluster) │ +│ │ │ │ +│ city-code=dfw │ │ city-code=ord │ +│ Compute CRDs │ │ Compute CRDs │ +│ NSO CRDs │ │ NSO CRDs │ +└────────────────────┘ └────────────────────┘ +``` + +### What lives where + +| Resource | Cluster | +|----------|---------| +| `Workload`, `WorkloadDeployment` (consumer-facing) | Control Plane Cell | +| `WorkloadDeployment` (federation intent), `PropagationPolicy` | Karmada API Server | +| `WorkloadDeployment` (propagated), `Instance`, `NetworkBinding`, `SubnetClaim` | POP cells | +| `Instance` (write-back for 
visibility) | Karmada API Server | + +--- + +## Running the environment + +### Start + +```bash +task e2e:up +``` + +This is fully idempotent — running it twice will not fail. + +What it does, in order: + +1. Downloads `karmadactl v1.16.0` into `./bin/` (once). +2. Adds the `karmada-charts` Helm repository. +3. Creates Kind clusters `compute-control-plane`, `compute-pop-dfw`, + `compute-pop-ord` (skips any that already exist). +4. Exports kubeconfigs to `./tmp/e2e/kubeconfigs/`. +5. Installs Karmada v1.16.0 via the `karmada-charts/karmada` Helm chart into + `compute-control-plane`, with the API server exposed on NodePort 32443. +6. Registers `compute-pop-dfw` and `compute-pop-ord` as member clusters and + labels each with `topology.datum.net/city-code`. +7. Installs compute CRDs to all clusters and the Karmada API server. +8. Installs NSO CRDs to the POP cell clusters. + +### Stop + +```bash +task e2e:down +``` + +Deletes all three Kind clusters and removes `./tmp/e2e/`. + +--- + +## Kubeconfigs + +After `task e2e:up`: + +| File | Cluster | Use for | +|------|---------|---------| +| `tmp/e2e/kubeconfigs/control-plane.yaml` | `compute-control-plane` | kubectl, deploying the compute operator | +| `tmp/e2e/kubeconfigs/karmada.yaml` | Karmada API server | kubectl, karmadactl | +| `tmp/e2e/kubeconfigs/pop-dfw.yaml` | `compute-pop-dfw` | kubectl, inspecting POP cell state | +| `tmp/e2e/kubeconfigs/pop-ord.yaml` | `compute-pop-ord` | kubectl, inspecting POP cell state | + +The `-internal.yaml` variants use the Kind container's Docker bridge IP and are +intended for the Karmada controller running inside Docker — not for direct +developer use. + +### Quick check + +```bash +# Verify cluster list in Karmada +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get clusters + +# Expected output: +# NAME READY AGE +# compute-pop-dfw True ... +# compute-pop-ord True ... 
+ +# Verify city-code labels +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get clusters -L topology.datum.net/city-code +``` + +--- + +## Using the environment from e2e tests + +Import `go.datum.net/compute/test/e2e/env` in your test suite: + +```go +package myfeature_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + computev1alpha1 "go.datum.net/compute/api/v1alpha" + + "go.datum.net/compute/test/e2e/env" +) + +var testEnv *env.Environment + +func TestMyFeature(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "MyFeature Suite") +} + +var _ = BeforeSuite(func() { + scheme := runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) + + var err error + testEnv, err = env.New(scheme) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = It("creates a workload and propagates it", func() { + // Control plane cluster client + cpClient := testEnv.ControlPlane.Client + + // Karmada API server client + karmadaClient := testEnv.Downstream.Client + + // POP DFW cluster client + dfwCell, err := testEnv.POPCell(env.CityCodeDFW) + Expect(err).NotTo(HaveOccurred()) + dfwClient := dfwCell.Client + + _ = cpClient + _ = karmadaClient + _ = dfwClient +}) +``` + +### Environment variable override + +Set `E2E_KUBECONFIG_DIR` to an absolute path to load kubeconfigs from a +different directory (useful in CI): + +```bash +E2E_KUBECONFIG_DIR=/path/to/kubeconfigs go test ./test/e2e/... +``` + +--- + +## Networking notes (macOS) + +On macOS with Docker Desktop, Kind clusters run as Docker containers.
The +container-to-container networking works as follows: + +| From | To | Address used | +|------|----|--------------| +| macOS host | Any Kind cluster API server | `localhost:PORT` | +| macOS host | Karmada API server | `https://localhost:32443` (NodePort) | +| Karmada controller (in Docker) | POP cell API servers | Docker bridge IP (`172.18.x.x:6443`) | + +The `-internal.yaml` kubeconfig variants use Docker bridge IPs with +`insecure-skip-tls-verify: true` because the node certificates do not include +bridge IPs in their SANs. This is acceptable for a local dev environment. + +--- + +## Troubleshooting + +### Karmada API server not reachable + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get ns +``` + +If this times out, check: +1. The Kind cluster is running: `kind get clusters` +2. Port 32443 is mapped: `docker port compute-control-plane-control-plane` +3. The karmada-apiserver pod is running: + ```bash + kubectl --kubeconfig tmp/e2e/kubeconfigs/control-plane.yaml \ + get pods -n karmada-system + ``` + +### POP cluster shows NotReady in Karmada + +The Karmada controller manager uses the Docker bridge IP kubeconfig to reach +POP cells. Check: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + describe cluster compute-pop-dfw +``` + +Then verify the cluster secret contains the expected Docker IP: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get secret -n karmada-system | grep pop-dfw +``` + +### Start fresh + +```bash +task e2e:down && task e2e:up +``` diff --git a/test/e2e/env/env.go b/test/e2e/env/env.go new file mode 100644 index 0000000..7d2c59c --- /dev/null +++ b/test/e2e/env/env.go @@ -0,0 +1,233 @@ +// Package env provides helpers for connecting to the local Kind e2e environment +// created by "task e2e:up".
+// +// # Environment layout +// +// The environment consists of three Kind clusters and one downstream API server: +// +// - Control plane cell — hosts the compute operator (WorkloadReconciler, +// WorkloadDeploymentFederator, InstanceProjector). +// - Downstream control plane — the federation API server; WorkloadDeployments +// are written here so they can be propagated to POP cells. +// - POP DFW (compute-pop-dfw) — member cluster labelled city-code=dfw. +// - POP ORD (compute-pop-ord) — member cluster labelled city-code=ord. +// +// # Kubeconfig resolution +// +// Kubeconfigs are read from the directory at [DefaultKubeconfigDir] (relative +// to the repository root), unless overridden via the [EnvKubeconfigDir] +// environment variable. +// +// Expected files inside that directory: +// +// control-plane.yaml — management / control-plane cell +// downstream.yaml — downstream federation API server (https://localhost:32443) +// pop-dfw.yaml — POP DFW cell (standard Kind localhost-based kubeconfig) +// pop-ord.yaml — POP ORD cell (standard Kind localhost-based kubeconfig) +// +// # Typical usage in a Ginkgo suite +// +// var ( +// testEnv *env.Environment +// ) +// +// var _ = BeforeSuite(func() { +// scheme := runtime.NewScheme() +// Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) +// Expect(corev1.AddToScheme(scheme)).To(Succeed()) +// +// var err error +// testEnv, err = env.New(scheme) +// Expect(err).NotTo(HaveOccurred()) +// }) +package env + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// EnvKubeconfigDir names the environment variable that overrides the kubeconfig directory. +const EnvKubeconfigDir = "E2E_KUBECONFIG_DIR" + +// DefaultKubeconfigDir is the kubeconfig directory used when [EnvKubeconfigDir] +// is not set.
It is resolved relative to the repository root (three directories +// above this source file). +const DefaultKubeconfigDir = "tmp/e2e/kubeconfigs" + +// City codes for the two POP cells created by "task e2e:up". +const ( + CityCodeDFW = "dfw" + CityCodeORD = "ord" +) + +// Environment holds a [ClusterAccess] for each cluster in the local e2e +// environment. All fields are populated by [New]; none are nil on success. +type Environment struct { + // ControlPlane is the management / control-plane cell cluster. + // The compute operator runs here (WorkloadReconciler, + // WorkloadDeploymentFederator, InstanceProjector). + ControlPlane *ClusterAccess + + // Downstream is the downstream control plane (the federation API server). + // WorkloadDeployments and PropagationPolicies live here. + Downstream *ClusterAccess + + // POPCells maps city-code strings (e.g. "dfw", "ord") to the + // corresponding POP cell cluster. Use [Environment.POPCell] for + // safe, error-returning access. + POPCells map[string]*ClusterAccess +} + +// ClusterAccess bundles a REST config and a controller-runtime Client for a +// single cluster. +type ClusterAccess struct { + // Config is the REST config used to build the client. + Config *rest.Config + + // Client is a controller-runtime client scoped to this cluster. + // The client is built with the scheme supplied to [New]. + Client ctrlclient.Client +} + +// New creates an [Environment] by loading kubeconfigs from the configured +// directory and building a controller-runtime client for each cluster using +// the provided scheme. It stops at the first failure and returns an error that names the cluster whose kubeconfig or client could not be built. +// +// The scheme should have all relevant types registered before calling New; +// for example compute types, networking types, and core Kubernetes types.
+func New(scheme *k8sruntime.Scheme) (*Environment, error) { + dir := kubeconfigDir() + + controlPlane, err := loadCluster(filepath.Join(dir, "control-plane.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("control-plane cluster: %w", err) + } + + downstream, err := loadCluster(filepath.Join(dir, "downstream.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("downstream control plane: %w", err) + } + + popDFW, err := loadCluster(filepath.Join(dir, "pop-dfw.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP DFW cluster: %w", err) + } + + popORD, err := loadCluster(filepath.Join(dir, "pop-ord.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP ORD cluster: %w", err) + } + + return &Environment{ + ControlPlane: controlPlane, + Downstream: downstream, + POPCells: map[string]*ClusterAccess{ + CityCodeDFW: popDFW, + CityCodeORD: popORD, + }, + }, nil +} + +// POPCell returns the [ClusterAccess] for the POP cell with the given city +// code. It returns an error if no POP cell is registered for that code; the error lists the known codes in unspecified (map iteration) order. +func (e *Environment) POPCell(cityCode string) (*ClusterAccess, error) { + ca, ok := e.POPCells[cityCode] + if !ok { + known := make([]string, 0, len(e.POPCells)) + for k := range e.POPCells { + known = append(known, k) + } + return nil, fmt.Errorf("no POP cell registered for city code %q (known: %v)", cityCode, known) + } + return ca, nil +} + +// MustPOPCell is like [Environment.POPCell] but panics on error. +// Useful in test setup where a missing POP cell is always a fatal misconfiguration. +func (e *Environment) MustPOPCell(cityCode string) *ClusterAccess { + ca, err := e.POPCell(cityCode) + if err != nil { + panic(err) + } + return ca +} + +// RESTConfigFor is a convenience function that returns a [rest.Config] for the +// kubeconfig file at kubeconfigPath without constructing a client. Useful when the caller needs to +// build a typed clientset directly.
+func RESTConfigFor(kubeconfigPath string) (*rest.Config, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + return cfg, nil +} + +// KubeconfigPath returns the path to the kubeconfig file for the +// named cluster; the path is absolute only if the resolved kubeconfig directory is. name must be one of "control-plane", "downstream", "pop-dfw", +// or "pop-ord". +func KubeconfigPath(name string) string { + return filepath.Join(kubeconfigDir(), name+".yaml") +} + +// ─── internal helpers ──────────────────────────────────────────────────────── + +func loadCluster(kubeconfigPath string, scheme *k8sruntime.Scheme) (*ClusterAccess, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + + c, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("building client from %s: %w", kubeconfigPath, err) + } + + return &ClusterAccess{ + Config: cfg, + Client: c, + }, nil +} + +// kubeconfigDir returns the directory containing e2e kubeconfigs. +// It honours the E2E_KUBECONFIG_DIR environment variable, otherwise falls +// back to DefaultKubeconfigDir joined onto the repository root. +func kubeconfigDir() string { + if dir := os.Getenv(EnvKubeconfigDir); dir != "" { + return dir + } + return filepath.Join(repoRoot(), DefaultKubeconfigDir) +} + +// repoRoot walks up from this source file to find the repository root +// (identified by the presence of go.mod). +func repoRoot() string { + // Use the file path of this source file as a starting point so the helper + // works regardless of the caller's working directory. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + // Fallback: assume tests are run from the repo root. + return "."
+ } + + dir := filepath.Dir(thisFile) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + // Reached filesystem root without finding go.mod. + return "." + } + dir = parent + } +} diff --git a/test/e2e/full-federation/chainsaw-test.yaml b/test/e2e/full-federation/chainsaw-test.yaml new file mode 100644 index 0000000..020a2bc --- /dev/null +++ b/test/e2e/full-federation/chainsaw-test.yaml @@ -0,0 +1,150 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: full-federation +spec: + description: | + End-to-end federation chain test. + + Exercises the complete path from WorkloadDeployment creation through to + Instance projection on the control-plane cluster: + + 1. Create WorkloadDeployment on control-plane. + 2. WorkloadDeploymentFederator replicates it to Karmada (ns- namespace). + 3. Karmada PropagationPolicy routes the WD to pop-dfw. + 4. WorkloadDeploymentReconciler on pop-dfw creates Instance test-full-fed-wd-0. + 5. InstanceReconciler on pop-dfw writes Instance back to Karmada with + label meta.datumapis.com/upstream-cluster-name: cluster-single. + 6. InstanceProjector on control-plane creates a projection of the Instance + in the project namespace. + + Prerequisites: both operator instances must be running (task e2e:operator:start). + + template: true + + steps: + - name: create-workload-deployment + description: Create the WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeploymentFederator replicated the WD to Karmada and status is synced back. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + - assert: + # Wait for the cell operator to write status back to the Karmada WD. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-wd-on-pop-dfw + description: Assert Karmada propagated the WD to pop-dfw and the cell reconciler set status. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + # Karmada propagation can take longer than a local apply. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-instance-on-pop-dfw + description: Assert WorkloadDeploymentReconciler created an Instance on pop-dfw with a Ready condition. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" + + - name: assert-instance-writeback-in-downstream + description: Assert InstanceReconciler wrote the Instance back to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + + - name: assert-instance-projected-to-control-plane + description: Assert InstanceProjector created a projection with status on the control-plane. 
+ try: + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($namespace) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" diff --git a/test/e2e/full-federation/workload-deployment.yaml b/test/e2e/full-federation/workload-deployment.yaml new file mode 100644 index 0000000..70b4cb9 --- /dev/null +++ b/test/e2e/full-federation/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-full-fed-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/instance-projection/assert-downstream-wd.yaml b/test/e2e/instance-projection/assert-downstream-wd.yaml new file mode 100644 index 0000000..705d089 --- /dev/null +++ b/test/e2e/instance-projection/assert-downstream-wd.yaml @@ -0,0 +1,6 @@ +# Assert the WorkloadDeployment is federated to Karmada (and the Karmada namespace created). +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-projector-wd diff --git a/test/e2e/instance-projection/assert-projected-instance.yaml b/test/e2e/instance-projection/assert-projected-instance.yaml new file mode 100644 index 0000000..0542194 --- /dev/null +++ b/test/e2e/instance-projection/assert-projected-instance.yaml @@ -0,0 +1,19 @@ +# Assert the InstanceProjector created a projection in the project namespace. 
+# +# The InstanceProjector (internal/controller/instance_projector.go): +# - Watches Instances in Karmada that carry upstreamClusterNameLabel +# - Strips "cluster-" prefix to get the cluster name ("single" in single-provider mode) +# - Finds the project namespace by matching ns- to namespace UIDs +# - Creates/updates the Instance projection in the project namespace +# - Sets an owner reference to the WorkloadDeployment for cascading deletion +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + # namespace is the Chainsaw test namespace (the project namespace on control-plane) + name: test-projected-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + ownerReferences: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + name: test-projector-wd diff --git a/test/e2e/instance-projection/chainsaw-test.yaml b/test/e2e/instance-projection/chainsaw-test.yaml new file mode 100644 index 0000000..16fa9f9 --- /dev/null +++ b/test/e2e/instance-projection/chainsaw-test.yaml @@ -0,0 +1,123 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-projection +spec: + description: | + Verifies that the InstanceProjector watches Instances written back to the + Karmada API server and creates corresponding read-only projections in the + project namespace on the control-plane cluster. + + Flow: + 1. Create a WorkloadDeployment → triggers federator → Karmada namespace created. + 2. Write an Instance to Karmada (simulating a POP-cell InstanceReconciler write-back). + 3. InstanceProjector detects the Karmada Instance and creates a projection in the + project namespace (the Chainsaw test namespace on the control-plane cluster). + 4. Assert the projection exists with the upstream tracking label and an owner + reference to the WorkloadDeployment (for cascading deletion). 
+ + Cluster name label: "cluster-single" + The compute operator runs in single-provider mode for this e2e environment, + registering the control-plane cluster with the multicluster-runtime manager + under the name "single" (see cmd/main.go, wrappedSingleClusterProvider). + + template: true + + steps: + - name: create-wd + description: Create the WorkloadDeployment to trigger federation and namespace creation. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-downstream-namespace + description: Wait for the federated WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-projector-wd + + - name: write-instance-to-downstream + description: | + Write an Instance to Karmada simulating InstanceReconciler write-back. + Uses explicit control-plane kubeconfig to derive downstreamNS and WD UID. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get workloaddeployment test-projector-wd \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.metadata.uid}' + outputs: + - name: wdUID + value: ($stdout) + - script: + env: + - name: KARMADA_NS + value: ($downstreamNS) + - name: WD_UID + value: ($wdUID) + content: | + kubectl apply -f - < is the multicluster-runtime cluster name registered by +# wrappedSingleClusterProvider (always "single" in single-cluster mode) +# - Label meta.datumapis.com/upstream-namespace = the POP-cell namespace +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/chainsaw-test.yaml b/test/e2e/instance-writeback/chainsaw-test.yaml new file mode 100644 index 0000000..32dbbc5 --- /dev/null +++ b/test/e2e/instance-writeback/chainsaw-test.yaml @@ -0,0 +1,112 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-writeback +spec: + description: | + Verifies that the InstanceReconciler running in a POP-cell cluster writes + Instance objects back to the Karmada API server after reconciling the Ready + condition for the first time. + + Write-back convention (internal/controller/instance_controller.go): + - The Instance is written to Karmada at the same namespace/name as the POP-cell Instance. + - Label meta.datumapis.com/upstream-cluster-name is set to + "cluster-" (e.g. "cluster-compute-pop-dfw"). + - Label meta.datumapis.com/upstream-namespace records the originating namespace. 
+ + Note: this test requires the compute operator (InstanceReconciler) to be running + in the DFW POP cell cluster. + + template: true + + steps: + - name: setup-namespaces + description: Create the Instance namespace in the DFW POP cell and Karmada. + try: + - script: + content: | + kubectl get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml apply -f - + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml apply -f - + cleanup: + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + + - name: create-instance-on-pop-dfw + description: Create the Instance on the DFW POP cell cluster. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - apply: + file: instance-pop-dfw.yaml + cleanup: + - script: + content: | + INSTANCE_NS=$(kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}') + kubectl delete instance test-writeback-instance \ + --namespace "$INSTANCE_NS" --ignore-not-found + + - name: assert-instance-in-downstream + description: Wait for the InstanceReconciler to write back the Instance to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/instance-pop-dfw.yaml b/test/e2e/instance-writeback/instance-pop-dfw.yaml new file mode 100644 index 0000000..250eb7d --- /dev/null +++ b/test/e2e/instance-writeback/instance-pop-dfw.yaml @@ -0,0 +1,15 @@ +# Instance created in the DFW POP cell. +# ($instanceNS) is the namespace derived from the Chainsaw test namespace UID, +# matching the ns- convention so the InstanceProjector can resolve it later. 
+apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + name: test-writeback-instance + namespace: ($instanceNS) +spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network diff --git a/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml new file mode 100644 index 0000000..77a817a --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml @@ -0,0 +1,6 @@ +# Asserts that the PropagationPolicy for city dfw exists in the Karmada namespace. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw diff --git a/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml new file mode 100644 index 0000000..5678c39 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml @@ -0,0 +1,133 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: propagation-policy-lifecycle +spec: + description: | + Verifies the PropagationPolicy lifecycle managed by the WorkloadDeploymentFederator: + + - A PropagationPolicy (city-dfw) is lazily created when the first WorkloadDeployment + for city code "dfw" is federated to Karmada. + - The PropagationPolicy is RETAINED while at least one WorkloadDeployment for + that city code remains in the Karmada namespace. + - The PropagationPolicy is DELETED when the last deployment for the city is removed. + + The test creates two WDs (wd-alpha, wd-beta) both targeting cityCode=dfw, verifies + the PP appears, deletes wd-alpha and asserts the PP is still present, then deletes + wd-beta and waits for the PP to disappear. + + template: true + + steps: + - name: create-deployments + description: Create two WorkloadDeployments targeting dfw on the control-plane. 
+ try: + - apply: + file: workload-deployment-alpha.yaml + - apply: + file: workload-deployment-beta.yaml + + - name: assert-policy-created + description: | + Assert both WDs are federated to Karmada and the PropagationPolicy exists. + Both WDs must be present in Karmada before proceeding to the deletion steps; + otherwise wd-alpha's finalizer could see an empty Karmada list and prematurely + delete the PP before wd-beta has been federated. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-alpha + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-beta + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-alpha + description: Delete wd-alpha; wd-beta still targets dfw so the PP must be retained. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-alpha + + - name: assert-policy-retained + description: Assert the PropagationPolicy is still present after wd-alpha is deleted. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - sleep: + duration: 8s + - assert: + timeout: 5s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-beta + description: Delete wd-beta (the last WD for city dfw). + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-beta + + - name: assert-policy-deleted + description: Wait for the PropagationPolicy to be removed once no WDs remain. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + namespace: ($downstreamNS) + name: city-dfw + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml new file mode 100644 index 0000000..f9eb27f --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-alpha +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml 
b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml new file mode 100644 index 0000000..fd1d65c --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-beta +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml new file mode 100644 index 0000000..98f8d0f --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml @@ -0,0 +1,20 @@ +# Assert the PropagationPolicy was created in the Karmada namespace. +# The name follows propagationPolicyNameFor("dfw") = "workload-deployments-dfw". +# ($downstreamNS) is substituted by Chainsaw's template engine. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw +spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml new file mode 100644 index 0000000..23c308f --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml @@ -0,0 +1,9 @@ +# Assert the WorkloadDeployment exists in Karmada with the city-code label. +# ($downstreamNS) is substituted by Chainsaw's template engine from the script binding. 
+apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/chainsaw-test.yaml b/test/e2e/workload-deployment-federation/chainsaw-test.yaml new file mode 100644 index 0000000..302d89c --- /dev/null +++ b/test/e2e/workload-deployment-federation/chainsaw-test.yaml @@ -0,0 +1,84 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: workload-deployment-federation +spec: + description: | + Verifies that the WorkloadDeploymentFederator replicates a WorkloadDeployment + from the project namespace (control-plane cluster) to the Karmada API server + with the correct city-code label and PropagationPolicy. + + The federator follows the ns-<uid> convention for Karmada namespaces, + matching the MappedNamespaceResourceStrategy used by NSO. The test derives + the expected Karmada namespace dynamically from the Chainsaw test namespace UID. + + Verified: + - WorkloadDeployment exists in Karmada at ns-<uid> + - Karmada copy carries label topology.datum.net/city-code: dfw + - PropagationPolicy city-dfw exists in the Karmada namespace, + selecting WDs by city-code and routing them to matching POP-cell clusters. + + template: true + + steps: + - name: derive-ns-and-create-wd + description: Derive Karmada namespace and create the WorkloadDeployment. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeployment federated to Karmada with city-code label. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw + + - name: assert-propagation-policy-in-downstream + description: Assert PropagationPolicy created for city-dfw. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/workload-deployment.yaml b/test/e2e/workload-deployment-federation/workload-deployment.yaml new file mode 100644 index 0000000..0cd2347 --- /dev/null +++ b/test/e2e/workload-deployment-federation/workload-deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-federation-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + 
name: test-network + + scaleSettings: + minReplicas: 1