diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76..5dcc90b 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.gitignore b/.gitignore index 2b0c6e4..05b47b6 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ go.work.sum .env bin/ + +# Local e2e environment artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 0000000..bcfbb0f --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,481 @@ +version: '3' + +# ─── Variables ────────────────────────────────────────────────────────────── + +vars: + # Karmada Helm chart version to install (karmada-charts/karmada) + KARMADA_VERSION: v1.16.0 + + # karmadactl CLI version for cluster registration + KARMADACTL_VERSION: v1.16.0 + + # Chainsaw version for e2e testing (kyverno/chainsaw) + CHAINSAW_VERSION: v0.2.15 + + # Local tool directory (mirrors Makefile convention) + LOCALBIN: '{{.ROOT_DIR}}/bin' + KARMADACTL: '{{.ROOT_DIR}}/bin/karmadactl' + CHAINSAW: '{{.ROOT_DIR}}/bin/chainsaw' + + # Kind cluster names + KIND_CONTROL_PLANE: compute-control-plane + KIND_POP_DFW: compute-pop-dfw + KIND_POP_ORD: compute-pop-ord + + # All cluster names (for CRD installation loops) + KIND_ALL_CLUSTERS: '{{.KIND_CONTROL_PLANE}} {{.KIND_POP_DFW}} {{.KIND_POP_ORD}}' + + # Working directory for e2e artefacts (gitignored) + E2E_DIR: '{{.ROOT_DIR}}/tmp/e2e' + KUBECONFIG_DIR: '{{.ROOT_DIR}}/tmp/e2e/kubeconfigs' + + # Fixed NodePort for the Karmada API server. 
+ # The Kind management cluster is created with an extraPortMapping for this port + # so it is reachable at https://localhost:32443 from the developer's machine. + KARMADA_API_NODEPORT: "32443" + +# ─── Tasks ────────────────────────────────────────────────────────────────── + +tasks: + + default: + cmds: + - task --list + silent: true + + # ════════════════════════════════════════════════════════════════════════ + # e2e environment lifecycle + # ════════════════════════════════════════════════════════════════════════ + + e2e:up: + desc: "Create the full local Kind+Karmada e2e environment (idempotent)" + cmds: + - task: e2e:tools + - task: e2e:clusters:create + - task: e2e:karmada:install + - task: e2e:karmada:configure + - task: e2e:karmada:join-clusters + - task: e2e:crds:install + - cmd: | + echo "" + echo "╔══════════════════════════════════════════════════════════╗" + echo "║ e2e environment ready ║" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Control plane: {{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "║ Karmada API: {{.KUBECONFIG_DIR}}/karmada.yaml" + echo "║ POP DFW: {{.KUBECONFIG_DIR}}/pop-dfw.yaml" + echo "║ POP ORD: {{.KUBECONFIG_DIR}}/pop-ord.yaml" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Export for kubectl: ║" + echo "║ export KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "╚══════════════════════════════════════════════════════════╝" + silent: false + + e2e:down: + desc: "Tear down the local e2e environment" + cmds: + - kind delete cluster --name {{.KIND_CONTROL_PLANE}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_DFW}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_ORD}} 2>/dev/null || true + - rm -rf {{.E2E_DIR}} + - cmd: echo "✓ e2e environment torn down" + silent: false + + e2e:test: + desc: "Run Chainsaw e2e tests against the local Kind+Karmada environment" + deps: [e2e:tools:chainsaw] + cmds: + - | + 
KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + test/e2e/ \ + {{.CLI_ARGS}} + + e2e:test:filter: + desc: "Run a subset of e2e tests by name regex (e.g. task e2e:test:filter -- --include-test-regex federation)" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + {{.CLI_ARGS}} \ + test/e2e/ + + # ════════════════════════════════════════════════════════════════════════ + # Tool installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:tools: + desc: "Install e2e-specific tooling (karmadactl, chainsaw, helm repo)" + cmds: + - task: e2e:tools:karmadactl + - task: e2e:tools:chainsaw + - task: e2e:tools:helm-repo + + e2e:tools:karmadactl: + desc: "Download karmadactl {{.KARMADACTL_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.KARMADACTL}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/karmada-io/karmada/releases/download/{{.KARMADACTL_VERSION}}/karmadactl-${OS}-${ARCH}.tgz" + echo "Downloading karmadactl {{.KARMADACTL_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} karmadactl + chmod +x {{.KARMADACTL}} + echo "karmadactl installed → {{.KARMADACTL}}" + else + echo "karmadactl already present at {{.KARMADACTL}}" + fi + status: + - test -f {{.KARMADACTL}} + + e2e:tools:chainsaw: + desc: "Download chainsaw {{.CHAINSAW_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! 
-f "{{.CHAINSAW}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/kyverno/chainsaw/releases/download/{{.CHAINSAW_VERSION}}/chainsaw_${OS}_${ARCH}.tar.gz" + echo "Downloading chainsaw {{.CHAINSAW_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} chainsaw + chmod +x {{.CHAINSAW}} + echo "chainsaw installed → {{.CHAINSAW}}" + else + echo "chainsaw already present at {{.CHAINSAW}}" + fi + status: + - test -f {{.CHAINSAW}} + + e2e:tools:helm-repo: + desc: "Add/update karmada-charts Helm repository" + cmds: + - | + if ! helm repo list 2>/dev/null | grep -q karmada-charts; then + helm repo add karmada-charts https://raw.githubusercontent.com/karmada-io/karmada/master/charts + echo "Added karmada-charts Helm repository" + fi + helm repo update karmada-charts + status: + - helm repo list 2>/dev/null | grep -q karmada-charts + + # ════════════════════════════════════════════════════════════════════════ + # Kind cluster management + # ════════════════════════════════════════════════════════════════════════ + + e2e:clusters:create: + desc: "Create all Kind clusters (idempotent)" + cmds: + # Management / control-plane cell cluster — needs extraPortMappings for + # the Karmada API server NodePort so it is accessible at localhost:32443. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_CONTROL_PLANE}}" + KIND_CONFIG: hack/e2e/kind-control-plane.yaml + # POP cell clusters — default Kind config is sufficient. 
+ - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + KIND_CONFIG: "" + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + KIND_CONFIG: "" + - mkdir -p {{.KUBECONFIG_DIR}} + - task: _e2e:kubeconfigs:export + + _e2e:cluster:create: + internal: true + cmds: + - | + if kind get clusters 2>/dev/null | grep -qx '{{.CLUSTER_NAME}}'; then + echo "Kind cluster '{{.CLUSTER_NAME}}' already exists — skipping" + else + echo "Creating Kind cluster '{{.CLUSTER_NAME}}'..." + CONFIG_FLAG="" + if [ -n "{{.KIND_CONFIG}}" ]; then + CONFIG_FLAG="--config {{.KIND_CONFIG}}" + fi + kind create cluster \ + --name {{.CLUSTER_NAME}} \ + $CONFIG_FLAG \ + --wait 90s + fi + + _e2e:kubeconfigs:export: + internal: true + desc: "Export Kind kubeconfigs and create Docker-IP variants for cross-cluster use" + cmds: + # Standard kubeconfigs (localhost-based, for developer kubectl use) + - kind export kubeconfig --name {{.KIND_CONTROL_PLANE}} --kubeconfig {{.KUBECONFIG_DIR}}/control-plane.yaml + - kind export kubeconfig --name {{.KIND_POP_DFW}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-dfw.yaml + - kind export kubeconfig --name {{.KIND_POP_ORD}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-ord.yaml + # Docker-IP kubeconfigs (used by Karmada controller, running inside Docker, + # to reach POP cell API servers across the kind bridge network) + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml \ + {{.KIND_POP_DFW}} + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord-internal.yaml \ + {{.KIND_POP_ORD}} + + # ════════════════════════════════════════════════════════════════════════ + # Karmada installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:install: + desc: "Install Karmada into the management cluster via Helm (idempotent)" + cmds: + - | + if kubectl 
--kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get ns karmada-system &>/dev/null; then + echo "Karmada already installed (karmada-system namespace exists)" + else + echo "Installing Karmada {{.KARMADA_VERSION}} via Helm..." + helm install karmada karmada-charts/karmada \ + --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + --namespace karmada-system \ + --create-namespace \ + --version {{.KARMADA_VERSION}} \ + --set apiServer.serviceType=NodePort \ + --set apiServer.nodePort={{.KARMADA_API_NODEPORT}} \ + --wait \ + --timeout 5m + echo "Karmada installed" + fi + - task: _e2e:karmada:build-kubeconfig + + e2e:karmada:configure: + desc: "Apply federation component config to the Karmada API server (idempotent)" + cmds: + - | + echo "Applying federation component to Karmada..." + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml apply \ + -k config/components/federation/ + echo "Federation component applied" + + _e2e:karmada:build-kubeconfig: + internal: true + desc: "Extract Karmada kubeconfig from secret and patch server to localhost:{{.KARMADA_API_NODEPORT}}" + cmds: + - | + echo "Building Karmada kubeconfig → {{.KUBECONFIG_DIR}}/karmada.yaml" + # Extract raw kubeconfig from the secret the Helm chart creates + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get secret karmada-kubeconfig \ + -n karmada-system \ + -o jsonpath='{.data.kubeconfig}' \ + | base64 -d > {{.KUBECONFIG_DIR}}/karmada-raw.yaml + # Rewrite the server address to the NodePort exposed on localhost + python3 - {{.KUBECONFIG_DIR}}/karmada-raw.yaml {{.KUBECONFIG_DIR}}/karmada.yaml 127.0.0.1 {{.KARMADA_API_NODEPORT}} << 'PYEOF' + import sys, yaml + + src, dst, host, port = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + + with open(src) as f: + cfg = yaml.safe_load(f) + + for cluster in cfg.get('clusters', []): + old = cluster['cluster'].get('server', '') + cluster['cluster']['server'] = f'https://{host}:{port}' + # The cert is for the internal cluster IP, so skip 
TLS verification. + # This is a local dev-only environment. + cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + print(f" karmada server: {old} → https://{host}:{port}", file=sys.stderr) + + with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + PYEOF + rm {{.KUBECONFIG_DIR}}/karmada-raw.yaml + + # ════════════════════════════════════════════════════════════════════════ + # POP cell cluster registration + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:join-clusters: + desc: "Register POP cell clusters with Karmada and apply city-code labels" + cmds: + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + CITY_CODE: dfw + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml" + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + CITY_CODE: ord + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord-internal.yaml" + + _e2e:karmada:join-cluster: + internal: true + cmds: + # ── Register with karmadactl join ────────────────────────────────── + # We pass the EXTERNAL kubeconfig (localhost-based) here so karmadactl + # can reach the member cluster from this macOS host to set up initial + # RBAC. The stored secret is patched below to the Docker-IP variant. + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + get cluster {{.CLUSTER_NAME}} &>/dev/null; then + echo "Cluster '{{.CLUSTER_NAME}}' already registered in Karmada — skipping join" + else + echo "Joining '{{.CLUSTER_NAME}}' to Karmada..." 
+ {{.KARMADACTL}} join {{.CLUSTER_NAME}} \ + --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --cluster-kubeconfig={{.EXTERNAL_KUBECONFIG}} \ + --cluster-context=kind-{{.CLUSTER_NAME}} + echo "Cluster '{{.CLUSTER_NAME}}' registered" + fi + # ── Patch cluster secret → Docker-IP kubeconfig ─────────────────── + # The Karmada controller manager runs inside Docker; it cannot use + # localhost to reach POP cell API servers. We update the stored secret + # with a kubeconfig whose server address uses the Kind container IP so + # container-to-container communication works across the kind bridge. + - | + hack/e2e/patch-cluster-secret.sh \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.CLUSTER_NAME}} \ + {{.INTERNAL_KUBECONFIG}} + # ── Apply city-code label ────────────────────────────────────────── + - | + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + label cluster {{.CLUSTER_NAME}} \ + topology.datum.net/city-code={{.CITY_CODE}} \ + --overwrite + echo "Labeled cluster '{{.CLUSTER_NAME}}' with topology.datum.net/city-code={{.CITY_CODE}}" + + # ════════════════════════════════════════════════════════════════════════ + # CRD installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:crds:install: + desc: "Install compute + NSO CRDs to all clusters" + cmds: + - task: _e2e:crds:compute + - task: _e2e:crds:nso + + _e2e:crds:compute: + internal: true + desc: "Apply compute CRDs to all clusters and the Karmada API server" + cmds: + # All three Kind clusters + the Karmada API server get the compute CRDs. + # The Karmada API server needs them so it can store and propagate + # WorkloadDeployment objects. + - | + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing compute CRDs → $(basename $KC .yaml)..." 
+          kubectl --kubeconfig="$KC" apply -k config/base/crd --server-side
+        done
+
+  _e2e:crds:nso:
+    internal: true
+    desc: "Apply NSO CRDs to control-plane and POP cell clusters"
+    cmds:
+      # NSO CRDs (NetworkBinding, SubnetClaim, etc.) are installed on the
+      # control-plane as well as POP cells. The control-plane operator needs them
+      # so that Subnet/SubnetClaim informer watches can start without cache errors,
+      # even though NSO controllers themselves only run on POP cells.
+      - |
+        go mod download go.datum.net/network-services-operator
+        NSO_VERSION=$(go list -m -json go.datum.net/network-services-operator \
+          | python3 -c "import sys, json; print(json.load(sys.stdin)['Version'])")
+        NSO_CRD_PATH="$(go env GOMODCACHE)/go.datum.net/network-services-operator@${NSO_VERSION}/config/crd"
+        echo "NSO CRDs from: ${NSO_CRD_PATH}"
+        for KC in \
+          {{.KUBECONFIG_DIR}}/control-plane.yaml \
+          {{.KUBECONFIG_DIR}}/pop-dfw.yaml \
+          {{.KUBECONFIG_DIR}}/pop-ord.yaml; do
+          echo "Installing NSO CRDs → $(basename $KC .yaml)..."
+          kubectl --kubeconfig="$KC" apply -k "${NSO_CRD_PATH}" --server-side
+        done
+
+  # ════════════════════════════════════════════════════════════════════════
+  # Operator lifecycle (background processes for federation e2e)
+  # ════════════════════════════════════════════════════════════════════════
+
+  e2e:operator:start:
+    desc: "Start management (control-plane) and cell (pop-dfw) operator instances in the background"
+    cmds:
+      - mkdir -p {{.E2E_DIR}}/logs {{.E2E_DIR}}/pids
+      - |
+        echo "Starting management operator (control-plane)..."
+        KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \
+          go run ./cmd/main.go \
+            --downstream-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \
+            --enable-cell-controllers=false \
+            --leader-elect=false \
+            --health-probe-bind-address=:9091 \
+            > {{.E2E_DIR}}/logs/operator-management.log 2>&1 &
+        echo $! > {{.E2E_DIR}}/pids/operator-management.pid
+        echo "Management operator PID: $!"
+      - |
+        echo "Waiting for management operator health check on :9091..."
+        deadline=$((SECONDS + 15))
+        until curl -sf http://localhost:9091/healthz >/dev/null 2>&1; do
+          if [ $SECONDS -ge $deadline ]; then
+            echo "ERROR: management operator did not become healthy within 15s"
+            cat {{.E2E_DIR}}/logs/operator-management.log || true
+            exit 1
+          fi
+          sleep 1
+        done
+        echo "Management operator is healthy"
+      - |
+        echo "Starting cell operator (pop-dfw)..."
+        KUBECONFIG={{.KUBECONFIG_DIR}}/pop-dfw.yaml \
+          go run ./cmd/main.go \
+            --downstream-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \
+            --enable-management-controllers=false \
+            --leader-elect=false \
+            --health-probe-bind-address=:9092 \
+            > {{.E2E_DIR}}/logs/operator-cell-dfw.log 2>&1 &
+        echo $! > {{.E2E_DIR}}/pids/operator-cell-dfw.pid
+        echo "Cell operator PID: $!"
+      - |
+        echo "Waiting for cell operator health check on :9092..."
+        deadline=$((SECONDS + 15))
+        until curl -sf http://localhost:9092/healthz >/dev/null 2>&1; do
+          if [ $SECONDS -ge $deadline ]; then
+            echo "ERROR: cell operator did not become healthy within 15s"
+            cat {{.E2E_DIR}}/logs/operator-cell-dfw.log || true
+            exit 1
+          fi
+          sleep 1
+        done
+        echo "Cell operator is healthy"
+
+  e2e:operator:stop:
+    desc: "Stop background operator instances"
+    cmds:
+      - |
+        for PIDFILE in \
+          {{.E2E_DIR}}/pids/operator-management.pid \
+          {{.E2E_DIR}}/pids/operator-cell-dfw.pid; do
+          if [ -f "$PIDFILE" ]; then
+            PID=$(cat "$PIDFILE")
+            if kill -0 "$PID" 2>/dev/null; then
+              echo "Stopping PID $PID ($(basename $PIDFILE .pid))..."
+ kill -TERM "$PID" || true + else + echo "Process $PID ($(basename $PIDFILE .pid)) is not running" + fi + rm -f "$PIDFILE" + else + echo "PID file not found: $PIDFILE" + fi + done diff --git a/api/v1alpha/workloaddeployment_types.go b/api/v1alpha/workloaddeployment_types.go index 7da27c8..03ac341 100644 --- a/api/v1alpha/workloaddeployment_types.go +++ b/api/v1alpha/workloaddeployment_types.go @@ -2,8 +2,6 @@ package v1alpha import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) // WorkloadDeploymentSpec defines the desired state of WorkloadDeployment @@ -37,11 +35,6 @@ type WorkloadDeploymentSpec struct { // WorkloadDeploymentStatus defines the observed state of WorkloadDeployment type WorkloadDeploymentStatus struct { - // The location which the deployment has been scheduled to - // - // +kubebuilder:validation:Optional - Location *networkingv1alpha.LocationReference `json:"location,omitempty"` - // Represents the observations of a deployment's current state. 
// Known condition types are: "Available", "Progressing" Conditions []metav1.Condition `json:"conditions,omitempty"` @@ -80,8 +73,6 @@ const ( // +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.readyReplicas` // +kubebuilder:printcolumn:name="Desired",type=string,JSONPath=`.status.desiredReplicas` // +kubebuilder:printcolumn:name="Up-to-date",type=string,JSONPath=`.status.currentReplicas` -// +kubebuilder:printcolumn:name="Location Namespace",type=string,JSONPath=`.status.location.namespace`,priority=1 -// +kubebuilder:printcolumn:name="Location Name",type=string,JSONPath=`.status.location.name`,priority=1 type WorkloadDeployment struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` diff --git a/api/v1alpha/zz_generated.deepcopy.go b/api/v1alpha/zz_generated.deepcopy.go index 8ecc1ba..6bd72a2 100644 --- a/api/v1alpha/zz_generated.deepcopy.go +++ b/api/v1alpha/zz_generated.deepcopy.go @@ -917,11 +917,6 @@ func (in *WorkloadDeploymentSpec) DeepCopy() *WorkloadDeploymentSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *WorkloadDeploymentStatus) DeepCopyInto(out *WorkloadDeploymentStatus) { *out = *in - if in.Location != nil { - in, out := &in.Location, &out.Location - *out = new(apiv1alpha.LocationReference) - **out = **in - } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc..32e0c5f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -18,17 +18,22 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" @@ -51,6 +56,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // downstreamRestConfig holds the REST config for the downstream control plane. + // It is populated from --downstream-kubeconfig when set, and is nil when the + // flag is omitted (e.g. in non-federation deployments). 
+ downstreamRestConfig *rest.Config ) func init() { @@ -61,6 +71,8 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } @@ -71,12 +83,27 @@ func main() { var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var downstreamKubeconfig string + var downstreamContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&downstreamKubeconfig, "downstream-kubeconfig", "", + "Path to the kubeconfig file for the downstream control plane. "+ + "When omitted, downstream federation features are disabled.") + flag.StringVar(&downstreamContext, "downstream-context", "", + "Context to use from the downstream kubeconfig. When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", true, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector). "+ + "Disable when running a cell-only operator instance.") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", true, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler). 
"+ + "Disable when running a management-only operator instance.") opts := zap.Options{ Development: true, @@ -89,6 +116,23 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Load the downstream REST config when --downstream-kubeconfig is provided. + // When the flag is omitted, downstreamRestConfig remains nil and federation + // features will be skipped at controller setup time. + if downstreamKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: downstreamKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: downstreamContext}, + ) + var err error + downstreamRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load downstream kubeconfig", "path", downstreamKubeconfig) + os.Exit(1) + } + setupLog.Info("downstream kubeconfig loaded", "path", downstreamKubeconfig) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -180,17 +224,63 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Workload") os.Exit(1) } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single downstream client shared across all controllers that need + // to read or write to the downstream control plane. Nil when federation is disabled. 
+ var downstreamClient client.Client + if downstreamRestConfig != nil { + downstreamClient, err = client.New(downstreamRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create downstream client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + instanceReconciler := &controller.InstanceReconciler{DownstreamClient: downstreamClient} + if err = instanceReconciler.SetupWithManager(mgr, deploymentCluster); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // WorkloadDeploymentFederator and InstanceProjector are management-plane + // controllers that run on the control-plane cluster. They require a downstream + // control plane to be configured (--downstream-kubeconfig provided). 
+ if enableManagementControllers && downstreamRestConfig != nil { + federator := &controller.WorkloadDeploymentFederator{DownstreamClient: downstreamClient} + if err = federator.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentFederator") + os.Exit(1) + } + + // InstanceProjector: runs in the Control Plane Cell, watches Instances + // written back to the downstream control plane by POP-cell operators, and + // projects them into the corresponding project namespaces via the + // multicluster manager. + downstreamMgr, err := manager.New(downstreamRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + setupLog.Error(err, "unable to create downstream manager for InstanceProjector") + os.Exit(1) + } + if err = (&controller.InstanceProjector{ + DownstreamClient: downstreamClient, + MCManager: mgr, + }).SetupWithManager(downstreamMgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "InstanceProjector") + os.Exit(1) + } + runnables = append(runnables, downstreamMgr) } if serverConfig.WebhookServer != nil { @@ -284,6 +374,7 @@ func initializeClusterDiscovery( } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 0000000..4c4dbe4 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 0000000..d214abc --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,32 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: ["compute.datumapis.com"] + resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 0000000..1261dac --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index 0302817..9528f3c 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,30 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - 
/manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + - --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --downstream-kubeconfig=$(DOWNSTREAM_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: DOWNSTREAM_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -69,7 +85,7 @@ spec: - name: webhook-cert mountPath: /tmp/k8s-webhook-server/serving-certs readOnly: true - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config diff --git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711de..cc6bd6c 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 0000000..3f32da3 --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: 
ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d97..ada1a1d 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3..2f3e267 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 0000000..3ba207f --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 0000000..3e6a9e2 --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,51 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if 
statusItems == nil then + return desiredObj + end + local replicas = 0 + local currentReplicas = 0 + local desiredReplicas = 0 + local readyReplicas = 0 + local conditions = nil + for i = 1, #statusItems do + local item = statusItems[i] + if item.status ~= nil then + replicas = replicas + (item.status.replicas or 0) + currentReplicas = currentReplicas + (item.status.currentReplicas or 0) + desiredReplicas = desiredReplicas + (item.status.desiredReplicas or 0) + readyReplicas = readyReplicas + (item.status.readyReplicas or 0) + if conditions == nil and item.status.conditions ~= nil then + conditions = item.status.conditions + end + end + end + desiredObj.status = { + replicas = replicas, + currentReplicas = currentReplicas, + desiredReplicas = desiredReplicas, + readyReplicas = readyReplicas, + } + if conditions ~= nil then + desiredObj.status.conditions = conditions + end + return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml b/config/components/leader_election/leader_election_role_binding.yaml index a5fe999..d6783c0 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 0000000..d1e29e7 --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git 
a/go.mod b/go.mod index 19fc010..286c1d9 100644 --- a/go.mod +++ b/go.mod @@ -1,22 +1,21 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.24.6 require ( github.com/google/go-cmp v0.7.0 + github.com/karmada-io/api v1.15.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/stretchr/testify v1.11.1 go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 + go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253 golang.org/x/crypto v0.39.0 golang.org/x/sync v0.16.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 + k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 + k8s.io/client-go v0.33.2 k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/gateway-api v1.2.1 @@ -95,9 +94,9 @@ require ( gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.33.2 // indirect + k8s.io/apiserver v0.33.2 // indirect + k8s.io/component-base v0.33.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect diff --git a/go.sum b/go.sum index c472bd8..fb725cf 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,6 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= 
-github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -72,6 +70,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/karmada-io/api v1.15.0 h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -129,8 +129,6 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 
h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= @@ -139,10 +137,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= +go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253 h1:0GIQZMFWzUf3XkyWahnMGDdl2+7n69NwYdwYAyo0i5Y= +go.miloapis.com/milo v0.25.2-0.20260518184803-e6ac7ea55253/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= @@ -195,8 +191,6 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= golang.org/x/sync v0.16.0/go.mod 
h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -224,20 +218,12 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 
-google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -250,18 +236,18 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= +k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= +k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= +k8s.io/apiextensions-apiserver v0.33.2 h1:6gnkIbngnaUflR3XwE1mCefN3YS8yTD631JXQhsU6M8= +k8s.io/apiextensions-apiserver v0.33.2/go.mod h1:IvVanieYsEHJImTKXGP6XCOjTwv2LUMos0YWc9O+QP8= k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod 
h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/apiserver v0.33.2 h1:KGTRbxn2wJagJowo29kKBp4TchpO1DRO3g+dB/KOJN4= +k8s.io/apiserver v0.33.2/go.mod h1:9qday04wEAMLPWWo9AwqCZSiIn3OYSZacDyu/AcoM/M= +k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= +k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= +k8s.io/component-base v0.33.2 h1:sCCsn9s/dG3ZrQTX/Us0/Sx2R0G5kwa0wbZFYoVp/+0= +k8s.io/component-base v0.33.2/go.mod h1:/41uw9wKzuelhN+u+/C59ixxf4tYQKW7p32ddkYNe2k= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= diff --git a/hack/e2e/kind-control-plane.yaml b/hack/e2e/kind-control-plane.yaml new file mode 100644 index 0000000..47f3c63 --- /dev/null +++ b/hack/e2e/kind-control-plane.yaml @@ -0,0 +1,17 @@ +# Kind cluster configuration for the compute-control-plane management cluster. +# +# extraPortMappings exposes port 32443 on the macOS host so that the Karmada +# API server NodePort service (nodePort: 32443) is accessible at +# https://localhost:32443 without any additional port-forwarding. +# +# This matches KARMADA_API_NODEPORT in Taskfile.yaml. + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 32443 # Karmada API server NodePort + hostPort: 32443 + protocol: TCP + listenAddress: "127.0.0.1" diff --git a/hack/e2e/make-internal-kubeconfig.sh b/hack/e2e/make-internal-kubeconfig.sh new file mode 100755 index 0000000..3303a5b --- /dev/null +++ b/hack/e2e/make-internal-kubeconfig.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# make-internal-kubeconfig.sh +# +# Produces a kubeconfig variant that uses the Kind node's Docker container IP +# instead of localhost. 
This variant is stored in Karmada so the controller +# manager (running inside Docker) can reach member cluster API servers across +# the kind bridge network. +# +# Background: Kind maps each cluster's API server to a random localhost port +# on the developer machine. Inside Docker containers, "localhost" refers to the +# container's own loopback — not the host. We therefore swap the server address +# to the Kind control-plane container's Docker bridge IP (e.g. 172.18.0.x) and +# set insecure-skip-tls-verify because the node certificate does not include +# the Docker bridge IP in its SANs. +# +# Usage: +# hack/e2e/make-internal-kubeconfig.sh \ +# tmp/e2e/kubeconfigs/pop-dfw.yaml \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml \ +# compute-pop-dfw + +set -euo pipefail + +INPUT="${1:?usage: $0 }" +OUTPUT="${2:?usage: $0 }" +CLUSTER_NAME="${3:?usage: $0 }" + +CONTAINER_NAME="${CLUSTER_NAME}-control-plane" + +# Resolve the container's Docker bridge IP. +DOCKER_IP=$(docker inspect \ + -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ + "${CONTAINER_NAME}" 2>/dev/null || true) + +if [ -z "${DOCKER_IP}" ]; then + echo "ERROR: Could not resolve Docker IP for container '${CONTAINER_NAME}'." >&2 + echo " Is the Kind cluster '${CLUSTER_NAME}' running?" >&2 + exit 1 +fi + +echo " ${CLUSTER_NAME}: Docker IP ${DOCKER_IP} → ${OUTPUT}" + +python3 - "${INPUT}" "${OUTPUT}" "${DOCKER_IP}" <<'PYEOF' +import sys, yaml + +src, dst, docker_ip = sys.argv[1], sys.argv[2], sys.argv[3] + +with open(src) as f: + cfg = yaml.safe_load(f) + +for cluster in cfg.get('clusters', []): + # Kind API server always listens on port 6443 inside the container. + cluster['cluster']['server'] = f'https://{docker_ip}:6443' + # The node cert only covers localhost / 127.0.0.1, not the bridge IP. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + +with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) +PYEOF diff --git a/hack/e2e/patch-cluster-secret.sh b/hack/e2e/patch-cluster-secret.sh new file mode 100755 index 0000000..e29ed38 --- /dev/null +++ b/hack/e2e/patch-cluster-secret.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# patch-cluster-secret.sh +# +# After "karmadactl join", Karmada stores the member cluster's kubeconfig in a +# Secret referenced by the Cluster object's spec.secretRef, and sets +# spec.apiEndpoint to the localhost address it resolved from the external +# kubeconfig. The Karmada controller manager runs inside Docker and cannot use +# localhost to reach POP cell API servers. +# +# This script: +# 1. Replaces the kubeconfig in the Secret with the Docker-IP variant so that +# the Karmada controller can make API calls to the member cluster. +# 2. Patches spec.apiEndpoint on the Cluster object so that health checks also +# use the Docker bridge IP instead of localhost. +# +# Usage: +# hack/e2e/patch-cluster-secret.sh \ +# tmp/e2e/kubeconfigs/karmada.yaml \ +# compute-pop-dfw \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml + +set -euo pipefail + +KARMADA_KUBECONFIG="${1:?usage: $0 }" +CLUSTER_NAME="${2:?usage: $0 }" +INTERNAL_KUBECONFIG="${3:?usage: $0 }" + +# ------------------------------------------------------------------ +# Read the Cluster object's secretRef (name + namespace) +# ------------------------------------------------------------------ +SECRET_NAME=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.name}' 2>/dev/null || true) + +if [ -z "${SECRET_NAME}" ]; then + echo "ERROR: Could not find spec.secretRef.name on cluster '${CLUSTER_NAME}'." >&2 + echo " Has karmadactl join completed successfully?" 
>&2 + exit 1 +fi + +SECRET_NAMESPACE=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.namespace}' 2>/dev/null || true) + +SECRET_NAMESPACE="${SECRET_NAMESPACE:-karmada-system}" + +echo " Patching secret ${SECRET_NAMESPACE}/${SECRET_NAME} with Docker-IP kubeconfig..." + +# ------------------------------------------------------------------ +# Replace the kubeconfig data in the secret +# ------------------------------------------------------------------ +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + create secret generic "${SECRET_NAME}" \ + --namespace="${SECRET_NAMESPACE}" \ + --from-file=kubeconfig="${INTERNAL_KUBECONFIG}" \ + --dry-run=client -o yaml \ + | kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + apply -f - + +echo " Secret ${SECRET_NAMESPACE}/${SECRET_NAME} updated — Karmada controller will use Docker bridge IP" + +# ------------------------------------------------------------------ +# Extract the Docker-IP server URL from the internal kubeconfig and +# patch spec.apiEndpoint on the Cluster object so that Karmada's +# cluster-status controller uses the same reachable address for health +# checks. Without this patch the controller continues to probe the +# localhost address stored by karmadactl join and the cluster never +# transitions to Ready. +# ------------------------------------------------------------------ +DOCKER_SERVER=$(kubectl \ + --kubeconfig="${INTERNAL_KUBECONFIG}" \ + config view --minify -o jsonpath='{.clusters[0].cluster.server}') + +if [ -z "${DOCKER_SERVER}" ]; then + echo "ERROR: Could not read server URL from ${INTERNAL_KUBECONFIG}" >&2 + exit 1 +fi + +echo " Patching spec.apiEndpoint on cluster '${CLUSTER_NAME}' → ${DOCKER_SERVER}..." 
+kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + patch cluster "${CLUSTER_NAME}" \ + --type=merge \ + -p "{\"spec\":{\"apiEndpoint\":\"${DOCKER_SERVER}\"}}" + +echo " Cluster '${CLUSTER_NAME}' patched — health checks will now use Docker bridge IP" diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe8..7d9e1ae 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city as a changed networking resource. + deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,9 +33,10 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + // Index workload deployments by city code so that SubnetClaim/Subnet watch + // handlers can efficiently find deployments targeting the same city. 
+ if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil @@ -44,18 +48,12 @@ func deploymentWorkloadUIDIndexFunc(o client.Object) []string { } } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index e5bc356..e90e695 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -17,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -29,11 +30,20 @@ import ( computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is 
removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. + instanceControllerFinalizer = "compute.datumapis.com/instance-controller" +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. @@ -45,6 +55,13 @@ type clusterGetter interface { type InstanceReconciler struct { mgr clusterGetter managementCluster cluster.Cluster + // DownstreamClient is an optional client pointing at the downstream control plane. + // When non-nil, the reconciler writes a copy of each Instance back to the + // downstream control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). + DownstreamClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete @@ -69,29 +86,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. 
+ finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,84 +114,232 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, err := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return ctrl.Result{}, err + } + + readyChanged, 
err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) + if err != nil { + return ctrl.Result{}, err + } + + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + if err := r.writeBackToDownstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + // Return after the status update so that the next reconcile sees the + // updated QuotaGranted condition before attempting spec changes. + return ctrl.Result{}, nil + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + var claim quotav1alpha1.ResourceClaim + if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + } else { + if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } } - statusChanged := false + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status 
condition. It returns true when the condition +// changed and a status update is required. +func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, err := r.reconcileQuotaClaim(ctx, clusterName, instance) + if err != nil { + return false, fmt.Errorf("failed reconciling quota claim: %w", err) + } switch { case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, - }) + }), nil case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), nil - case grantedCondition.Status == metav1.ConditionFalse: + default: // grantedCondition.Status == metav1.ConditionFalse reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, 
Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), nil } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. 
+func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.DownstreamClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.DownstreamClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + // Already gone — nothing to do. + return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.DownstreamClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToDownstream copies the Instance spec and status to the downstream +// control plane so that the InstanceProjector can aggregate state from all POP +// cells. It is a no-op when DownstreamClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToDownstream(ctx context.Context, clusterName string, instance *computev1alpha.Instance) error { + if r.DownstreamClient == nil { + return nil + } + + // Encode the POP-cell cluster name using the same convention as NSO's + // MappedNamespaceResourceStrategy: "cluster-" with "/" → "_". + encodedClusterName := "cluster-" + strings.ReplaceAll(clusterName, "/", "_") + + // Read the upstream project namespace name from the downstream namespace label + // stamped by the WorkloadDeploymentFederator. This lets the InstanceProjector + // resolve the target namespace via a direct label lookup on the Instance rather + // than scanning all project cluster namespaces by UID. 
+ upstreamNamespace := instance.Namespace // fallback: cell namespace (ns-) + var downstreamNS corev1.Namespace + if err := r.DownstreamClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err == nil { + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]; v != "" { + upstreamNamespace = v } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.DownstreamClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // Ensure the namespace exists in the downstream control plane 
before creating the Instance. + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: instance.Namespace}} + if err := r.DownstreamClient.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed ensuring downstream namespace: %w", err) + } + if err := r.DownstreamClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.DownstreamClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) } - return ctrl.Result{}, nil + // Update spec + labels on the existing object, then push status separately. + existing.Spec = instance.Spec + existing.Labels = writeBack.Labels + if err := r.DownstreamClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + + existing.Status = instance.Status + if err := r.DownstreamClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + + return nil } func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { @@ -344,6 +504,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { readyCondition.Reason = "NetworkFailedToCreate" readyCondition.Message = networkCreationFailureMessage @@ -360,6 +521,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not 
programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != pendingReason { readyCondition.Reason = programmedCondition.Reason @@ -379,6 +541,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { logger.Info("instance is not running", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason if runningCondition != nil && runningCondition.Reason != pendingReason { readyCondition.Reason = runningCondition.Reason @@ -441,6 +604,11 @@ func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementC r.mgr = mgr r.managementCluster = managementCluster + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + // Watch ResourceClaim objects on the management cluster directly, bypassing // the multicluster clusterInjectingQueue which would overwrite ClusterName. 
// Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1a15090..3537e53 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -2,8 +2,6 @@ package controller import ( "context" - "fmt" - "net/http" "testing" "github.com/stretchr/testify/assert" @@ -13,12 +11,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -27,36 +23,6 @@ import ( quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. 
-type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} - // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { t.Helper() @@ -508,12 +474,15 @@ func TestReconcileQuota(t *testing.T) { // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. 
makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { APIVersion: "compute.datumapis.com/v1alpha", @@ -590,14 +559,21 @@ func TestReconcileQuota(t *testing.T) { mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } r := &InstanceReconciler{ mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + managementCluster: newFakeCluster(mgmtClient), } + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient, mgmtClient } @@ -737,10 +713,28 @@ func TestReconcileQuota(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 0000000..db26e84 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the downstream +// control plane by POP-cell InstanceReconcilers and creates read-only +// projections in the corresponding project namespace within each project cluster. +// +// Namespace resolution: a downstream Instance lives in namespace `ns-<uid>`, +// but the target project namespace is read directly from the Instance's +// UpstreamOwnerNamespaceLabel rather than by matching that UID.
+// +// Ownership: each projected Instance is owned by the project WorkloadDeployment +// so that it is garbage-collected via cascading deletion when the deployment is +// removed from the project cluster. +// +// The controller is registered with a standard manager.Manager pointed at the +// downstream control plane — NOT the multicluster-runtime manager — so informer +// watches are scoped to the downstream control plane. +type InstanceProjector struct { + // DownstreamClient reads Instance objects from the downstream control plane. + // Must be set before SetupWithManager is called. + DownstreamClient client.Client + + // MCManager provides access to project cluster clients via GetCluster. + MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch + +func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName) + + // 1. Fetch the Instance from the downstream control plane. + var downstreamInstance computev1alpha.Instance + if err := r.DownstreamClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil { + if apierrors.IsNotFound(err) { + // Instance was deleted from the downstream control plane. Projections + // are owned by the project WorkloadDeployment, so cascading deletion + // handles cleanup. + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Only project Instances that carry the upstream tracking label; others were + // not written by our InstanceReconciler write-back logic. 
+ encodedClusterName, ok := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if !ok { + logger.V(1).Info("skipping instance without upstream cluster label") + return ctrl.Result{}, nil + } + + // 2. Resolve the project cluster name. + // The encoded form is "cluster-" with "/" replaced by "_". + clusterName := strings.TrimPrefix(encodedClusterName, "cluster-") + clusterName = strings.ReplaceAll(clusterName, "_", "/") + + // 3. Obtain the project cluster client. + projectCluster, err := r.MCManager.GetCluster(ctx, clusterName) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // 4. Resolve the target project namespace from the Instance label. + // The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project + // namespace name (read from the downstream namespace label set by the federator), + // so we can resolve the target namespace directly without scanning. + targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + logger.Info("Instance missing upstream-namespace label, requeueing", + "namespace", downstreamInstance.Namespace, "name", downstreamInstance.Name) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + // 5. Find the owning WorkloadDeployment in the project cluster by UID. + // The downstream Instance carries WorkloadDeploymentUIDLabel so we can find + // the owning deployment without relying on field selectors. 
+ wdUID := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel] + + var wdList computev1alpha.WorkloadDeploymentList + if err := projectClient.List(ctx, &wdList, client.InNamespace(targetNamespace)); err != nil { + return ctrl.Result{}, fmt.Errorf("failed listing WorkloadDeployments in %s/%s: %w", clusterName, targetNamespace, err) + } + + var ownerWD *computev1alpha.WorkloadDeployment + for i := range wdList.Items { + if string(wdList.Items[i].UID) == wdUID { + ownerWD = &wdList.Items[i] + break + } + } + + // 6. Create or update the projection in the project namespace. + projection := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: downstreamInstance.Name, + Namespace: targetNamespace, + }, + } + + operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error { + // Propagate upstream tracking labels so consumers can filter by origin. + if projection.Labels == nil { + projection.Labels = make(map[string]string) + } + for k, v := range downstreamInstance.Labels { + projection.Labels[k] = v + } + + projection.Spec = downstreamInstance.Spec + + // Attach an owner reference to the WorkloadDeployment so the projection + // is garbage-collected when the deployment is removed. + if ownerWD != nil { + return controllerutil.SetOwnerReference(ownerWD, projection, projectCluster.GetScheme()) + } + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err) + } + + logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName) + + // 7. Sync status — status is a separate subresource. 
+ projection.Status = downstreamInstance.Status + if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err) + } + + return ctrl.Result{}, nil +} + +// SetupWithManager registers the InstanceProjector with downstreamMgr, a standard +// manager.Manager configured against the downstream control plane REST config. +// DownstreamClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(downstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(downstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector"). + Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 0000000..67e3a46 --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below. 
+ projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns- convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + projTestInstanceName = "inst-abc" + + // projTestWDUID is the UID of the owning WorkloadDeployment. + projTestWDUID = types.UID("wd-uid-9999-aaaa-bbbb-cccc") + + // projTestWDName is the name of the owning WorkloadDeployment. + projTestWDName = "my-wd" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster ("cluster-"). +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID. +func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances. +func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: "default", + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last. 
+func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestWDUID), + } + for k, v := range labelOverrides { + labels[k] = v + } + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection. + }, + } +} + +// newTestProjector wires an InstanceProjector with the given downstream client and +// a project cluster that serves the supplied project client. +func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + DownstreamClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // wantProjection controls whether a projected Instance should appear. 
+ wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantRequeue controls whether the reconcile result should request a requeue. + wantRequeue bool + + // wantErr controls whether the reconcile should return an error. + wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + name: "projection created without owner ref when WD UID label absent", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the WD UID label. + computev1alpha.WorkloadDeploymentUIDLabel: "", + }), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment in project cluster. + }, + wantProjection: true, + wantOwnerRef: false, + }, + { + name: "missing upstream-cluster-name label — skipped, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel. + Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + name: "missing upstream-namespace label — requeue", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the upstream namespace label. 
+ downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Build Karmada client. + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + // Build project client. + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newTestProjector(karmadaClient, projectClient) + + result, err := r.Reconcile(context.Background(), projectorRequest()) + + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + + if tt.wantRequeue { + assert.NotZero(t, result.RequeueAfter, "expected RequeueAfter to be set") + } else { + assert.Equal(t, ctrl.Result{}, result) + } + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. + var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. 
+ if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Owner reference check. + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID should match the WorkloadDeployment UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference when WD not found") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace. +func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound returns true only when err is a Kubernetes not-found error; a nil +// err (the object exists) and any other error both yield false. +// Used to distinguish "no projection created" from "projection exists or Get failed".
+func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + // Import apierrors to check — we already have it via the fake client package. + return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652..e259eda 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -68,8 +68,6 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } - desiredInstances[i].Spec.Location = deployment.Status.Location - // TODO(jreese) consider adding scheduling gates via mutating webhooks desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ TemplateHash: instanceTemplateHash, diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 0000000..ff48a8c --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). 
+func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). +func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). + Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. 
Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. +type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name string) (cluster.Cluster, error) { + if c, ok := m.clusters[name]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. +func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef..5d87dae 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -37,6 +37,11 @@ import ( type WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + // KarmadaClient is an optional client pointing at the Karmada control plane. + // When non-nil, the reconciler writes the WorkloadDeployment status back to + // the Karmada namespace after each reconcile so the WorkloadDeploymentFederator + // can aggregate it into the project-namespace object. Set to nil to disable. 
+ KarmadaClient client.Client } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete @@ -86,10 +91,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -143,59 +144,59 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - 
}) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: 
"ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + if err := r.writeStatusToKarmada(ctx, &deployment); err != nil { + return ctrl.Result{}, err + } + + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -240,6 +241,34 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// writeStatusToKarmada copies the WorkloadDeployment status to the matching +// object in the Karmada namespace so the WorkloadDeploymentFederator can +// sync it back to the project-namespace object on the control plane. +// It is a no-op when KarmadaClient is nil. +func (r *WorkloadDeploymentReconciler) writeStatusToKarmada(ctx context.Context, deployment *computev1alpha.WorkloadDeployment) error { + if r.KarmadaClient == nil { + return nil + } + + var kd computev1alpha.WorkloadDeployment + if err := r.KarmadaClient.Get(ctx, client.ObjectKeyFromObject(deployment), &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed getting Karmada WD for status writeback: %w", err) + } + + kd.Status = deployment.Status + // Use Update (not Patch) so all required status fields are present in the + // request body; MergeFrom omits unchanged zero-value int32 fields which + // would fail the CRD's required constraints on currentReplicas/readyReplicas. 
+ if err := r.KarmadaClient.Status().Update(ctx, &kd); err != nil { + return fmt.Errorf("failed updating Karmada WD status: %w", err) + } + + return nil +} + func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, @@ -247,6 +276,30 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( ) (bool, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + logger.Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. @@ -271,7 +324,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } @@ -347,8 +400,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. 
- if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,7 +424,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } @@ -490,25 +543,34 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + // Resolve the Location to find its city code, then look up WorkloadDeployments + // that target the same city via the deploymentCityCodeIndex. 
+ var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := clusterClient.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 0000000..d7437a1 --- /dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + 
karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. Determines the downstream namespace via the ns- +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator; this logic will migrate to Milo +// once the shared library is promoted). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code=. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies. 
+type WorkloadDeploymentFederator struct { + mgr mcmanager.Manager + // DownstreamClient is a client pointed at the downstream control plane. The + // caller (cmd/main.go) is responsible for constructing it from + // --downstream-kubeconfig. + DownstreamClient client.Client + finalizers finalizer.Finalizers +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list + +func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + if r.DownstreamClient == nil { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx) + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + + var deployment computev1alpha.WorkloadDeployment + if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + + if !deployment.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + logger.Info("federating deployment to downstream control plane") + + // Determine the downstream namespace for this project namespace using the + // ns- convention (MappedNamespaceResourceStrategy). 
+ // Using strategy.GetClient() for writes ensures the downstream namespace is + // created with UpstreamOwnerNamespaceLabel so the InstanceProjector can + // resolve the target project namespace without scanning all namespaces. + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(req.ClusterName, cl.GetClient(), r.DownstreamClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err) + } + + // Ensure the downstream namespace exists and carries the upstream tracking + // labels so the InstanceProjector can resolve the project namespace by label + // lookup instead of scanning all namespaces. + if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, req.ClusterName); err != nil { + return ctrl.Result{}, err + } + + // Upsert the WorkloadDeployment in the downstream control plane via the + // strategy client so any future Create calls also go through + // ensureDownstreamNamespace automatically. + if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + // Lazily create the PropagationPolicy that targets clusters with the matching + // city-code label. + if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return ctrl.Result{}, err + } + + // Pull aggregated status from the downstream control plane back into the + // project namespace. + if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + logger.Info("federation complete") + return ctrl.Result{}, nil +} + +// Finalize removes the downstream WorkloadDeployment and, if no other +// deployments with the same city code remain in the downstream namespace, deletes +// the PropagationPolicy as well. 
+func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.DownstreamClient == nil { + return finalizer.Result{}, nil + } + + deployment := obj.(*computev1alpha.WorkloadDeployment) + logger := log.FromContext(ctx).WithValues( + "deployment", deployment.Name, + "namespace", deployment.Namespace, + ) + + clusterName, ok := mccontext.ClusterFrom(ctx) + if !ok { + return finalizer.Result{}, fmt.Errorf("cluster name not found in context") + } + + cl, err := r.mgr.GetCluster(ctx, clusterName) + if err != nil { + return finalizer.Result{}, err + } + + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(clusterName, cl.GetClient(), r.DownstreamClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err) + } + + // Delete the downstream WorkloadDeployment. + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + if err := r.DownstreamClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + // Clean up the PropagationPolicy if no other deployments with the same city + // code remain in this downstream namespace. + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. 
+// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.DownstreamClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = fmt.Sprintf("cluster-%s", strings.ReplaceAll(clusterName, "/", "_")) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.DownstreamClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: "WorkloadDeployment", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.DownstreamClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.DownstreamClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.DownstreamClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when DownstreamClient is non-nil. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + return mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator"). + Complete(r) +} + +// propagationPolicyNameFor returns the PropagationPolicy name for a given city +// code. The name is stable and deterministic so that multiple reconciles of +// different deployments sharing the same city code converge on the same policy. +func propagationPolicyNameFor(cityCode string) string { + // Sanitize the city code to a valid Kubernetes name: lower-case, spaces → hyphens. 
+ sanitized := strings.ToLower(strings.ReplaceAll(cityCode, " ", "-")) + return fmt.Sprintf("city-%s", sanitized) +} diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go new file mode 100644 index 0000000..143f975 --- /dev/null +++ b/internal/controller/workloaddeployment_federator_test.go @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + "time" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Shared test constants ──────────────────────────────────────────────────── + +const ( + testCluster = "test-project-cluster" + testProjNS = "my-project" + testProjNSUID = types.UID("aabbccdd-0000-1111-2222-333344445555") + testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. 
+func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: "default", + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. +func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + DownstreamClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. 
+func reconcileRequest() mcreconcile.Request { + return mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, + }, + } +} + +// ─── Unit tests ─────────────────────────────────────────────────────────────── + +// TestPropagationPolicyNameFor checks that city codes are lower-cased, that +// spaces become hyphens, and that every name carries the "city-" prefix. +func TestPropagationPolicyNameFor(t *testing.T) { + t.Parallel() + + tests := []struct { + cityCode string + want string + }{ + {"LAX", "city-lax"}, + {"lax", "city-lax"}, + {"New York", "city-new-york"}, + {"LOS ANGELES", "city-los-angeles"}, + {"SEA", "city-sea"}, + } + + for _, tt := range tests { + // Each subtest is named after the raw city code it exercises. + t.Run(tt.cityCode, func(t *testing.T) { + t.Parallel() + got := propagationPolicyNameFor(tt.cityCode) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestWorkloadDeploymentFederator_NoDownstreamClient verifies that the reconciler +// is a no-op when DownstreamClient is nil: Reconcile must return no error and +// an empty ctrl.Result. +func TestWorkloadDeploymentFederator_NoDownstreamClient(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + r := newTestFederator(projectClient, nil) + r.DownstreamClient = nil // explicitly nil; restates the nil passed to newTestFederator above + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the +// first reconcile of a brand-new WorkloadDeployment adds the finalizer and +// returns without federating (the finalizer update triggers a re-queue). 
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment() // no finalizer yet + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // The project WD should now have the finalizer persisted. + var updated computev1alpha.WorkloadDeployment + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updated)) + assert.Contains(t, updated.Finalizers, federatorFinalizer) + + // Karmada should be untouched – federation happens on the next reconcile. + var wdList computev1alpha.WorkloadDeploymentList + require.NoError(t, karmadaClient.List(context.Background(), &wdList)) + assert.Empty(t, wdList.Items, "no Karmada WD should be created on first-seen reconcile") +} + +// TestWorkloadDeploymentFederator_FederatesToKarmada verifies that a +// WorkloadDeployment with the finalizer already set is fully federated: +// the Karmada namespace, WorkloadDeployment (with city-code label), and +// PropagationPolicy are all created. +func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // Karmada namespace must exist. 
+ var karmadaNS corev1.Namespace + err = karmadaClient.Get(ctx, types.NamespacedName{Name: testKarmadaNSStr}, &karmadaNS) + require.NoError(t, err, "Karmada namespace %q should exist", testKarmadaNSStr) + + // Karmada WorkloadDeployment must exist with the city-code label. + var karmadaWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &karmadaWD) + require.NoError(t, err, "Karmada WorkloadDeployment should exist") + assert.Equal(t, testCityCodeLAX, karmadaWD.Labels[cityCodeLabel], + "city-code label should be set on Karmada WD") + assert.Equal(t, testCityCodeLAX, karmadaWD.Spec.CityCode, + "spec.cityCode should be copied from project WD") + + // PropagationPolicy for the city code must exist. + ppName := propagationPolicyNameFor(testCityCodeLAX) + var pp karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &pp) + require.NoError(t, err, "PropagationPolicy %q should exist", ppName) + + // The PP must select WorkloadDeployments by the city-code label. + require.Len(t, pp.Spec.ResourceSelectors, 1) + sel := pp.Spec.ResourceSelectors[0] + assert.Equal(t, computev1alpha.GroupVersion.String(), sel.APIVersion) + assert.Equal(t, "WorkloadDeployment", sel.Kind) + require.NotNil(t, sel.LabelSelector) + assert.Equal(t, testCityCodeLAX, sel.LabelSelector.MatchLabels[cityCodeLabel]) + + // The PP cluster affinity must target clusters carrying the same city-code. + require.NotNil(t, pp.Spec.Placement.ClusterAffinity) + require.NotNil(t, pp.Spec.Placement.ClusterAffinity.LabelSelector) + assert.Equal(t, testCityCodeLAX, + pp.Spec.Placement.ClusterAffinity.LabelSelector.MatchLabels[cityCodeLabel]) +} + +// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios: +// cleanup of Karmada resources and conditional PropagationPolicy removal. 
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + // karmadaExtra holds additional Karmada objects beyond the "own" WD and PP. + karmadaExtra []client.Object + wantPPGone bool + }{ + { + name: "last WD for city — PropagationPolicy removed", + karmadaExtra: nil, + wantPPGone: true, + }, + { + name: "other WD for same city remains — PropagationPolicy kept", + karmadaExtra: []client.Object{ + // A sibling WD in the same Karmada namespace with the same city-code. + &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-deployment", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "other", + WorkloadRef: computev1alpha.WorkloadReference{Name: "other"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + }, + }, + wantPPGone: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Project cluster: namespace + WD with finalizer and deletion timestamp. + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + // Karmada cluster: the mirrored WD + its PropagationPolicy + any extras. 
+ karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "default", + WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaObjs := []client.Object{ + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + } + karmadaObjs = append(karmadaObjs, tt.karmadaExtra...) + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // The Karmada-side WD must be gone. + var remainingWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &remainingWD) + assert.True(t, apierrors.IsNotFound(err), + "Karmada WD %q should be deleted after finalization", testWDName) + + // PropagationPolicy presence depends on whether siblings remain. 
+ var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &remainingPP) + if tt.wantPPGone { + assert.True(t, apierrors.IsNotFound(err), + "PropagationPolicy should be deleted when no city siblings remain") + } else { + assert.NoError(t, err, + "PropagationPolicy should be kept when other city siblings remain") + } + + // The project WD should be gone: once the federator finalizer is removed + // from an object that already has a DeletionTimestamp, the API server + // (and the fake client) garbage-collects the object. + var updatedWD computev1alpha.WorkloadDeployment + err = projectClient.Get(ctx, + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD) + assert.True(t, apierrors.IsNotFound(err), + "project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)") + }) + } +} + +// TestWorkloadDeploymentFederator_NotFound verifies that a missing +// WorkloadDeployment is handled gracefully (no error, no action). +func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize +// method directly, ensuring the cluster name is required in context. +func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + wd := testWorkloadDeployment(withFinalizer) + + // Without cluster in context → must return an error. 
+ _, err := r.Finalize(context.Background(), wd) + require.Error(t, err, "Finalize without cluster context should fail") + assert.Contains(t, err.Error(), "cluster name not found") + + // With cluster in context → must succeed (karmada client returns not-found, which is OK). + ctx := mccontext.WithCluster(context.Background(), testCluster) + result, err := r.Finalize(ctx, wd) + require.NoError(t, err) + assert.False(t, result.Updated) +} diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d6..0000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if 
err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} diff --git a/test/e2e/chainsaw-config.yaml b/test/e2e/chainsaw-config.yaml new file mode 100644 index 0000000..cd3a995 --- /dev/null +++ b/test/e2e/chainsaw-config.yaml @@ -0,0 +1,47 @@ +# Chainsaw global configuration for the compute federation e2e test suite. +# +# Prerequisites +# ───────────── +# Run `task e2e:up` to create the Kind clusters and populate kubeconfigs under +# tmp/e2e/kubeconfigs/ before running Chainsaw. +# +# Running +# ─────── +# From the repository root via Taskfile (recommended): +# +# task e2e:test +# +# Or directly: +# +# KUBECONFIG=tmp/e2e/kubeconfigs/control-plane.yaml \ +# chainsaw test --config test/e2e/chainsaw-config.yaml test/e2e/ +# +# The KUBECONFIG env var sets the "default" cluster (control-plane cell). +# Additional clusters (downstream, pop-dfw, pop-ord) are declared below and +# referenced by name in individual test steps via `cluster: downstream` etc. +# +# Kubeconfig paths below are relative to the working directory where Chainsaw is +# invoked (the project root), NOT relative to this config file's location. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 30s + assert: 60s + cleanup: 60s + delete: 30s + error: 30s + exec: 30s + clusters: + # Downstream control plane. WorkloadDeployments, PropagationPolicies, + # and Instance write-backs live here. 
+ downstream: + kubeconfig: tmp/e2e/kubeconfigs/downstream.yaml + # POP DFW cell — downstream member cluster labelled topology.datum.net/city-code=dfw. + pop-dfw: + kubeconfig: tmp/e2e/kubeconfigs/pop-dfw.yaml + # POP ORD cell — downstream member cluster labelled topology.datum.net/city-code=ord. + pop-ord: + kubeconfig: tmp/e2e/kubeconfigs/pop-ord.yaml diff --git a/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml new file mode 100644 index 0000000..aae65da --- /dev/null +++ b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml @@ -0,0 +1,7 @@ +# Assert the WorkloadDeployment is present in the Karmada API server. +# Used both to confirm federation succeeded and as the target for the error: check. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-cascade-wd diff --git a/test/e2e/deletion-cascade/chainsaw-test.yaml b/test/e2e/deletion-cascade/chainsaw-test.yaml new file mode 100644 index 0000000..03a11ea --- /dev/null +++ b/test/e2e/deletion-cascade/chainsaw-test.yaml @@ -0,0 +1,79 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: deletion-cascade +spec: + description: | + Verifies that deleting a WorkloadDeployment from the project namespace causes + the federator to remove the corresponding WorkloadDeployment from Karmada. + + The WorkloadDeploymentFederator adds a finalizer + (compute.datumapis.com/federator) to every project WD it manages. When the + project WD is deleted: + 1. The finalizer's Finalize method runs (blocking deletion until complete). + 2. It deletes the Karmada-side WorkloadDeployment. + 3. It removes the PropagationPolicy if no other WDs for the city remain. + 4. It removes the finalizer, allowing the project WD to be garbage-collected. + + This test validates: project WD deletion → Karmada WD deletion. 
+ + template: true + + steps: + - name: create-wd + description: Create a WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-federation + description: Wait for the WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-cascade-wd + + - name: delete-wd + description: Delete the WorkloadDeployment from the control-plane cluster. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: test-cascade-wd + + - name: assert-downstream-wd-deleted + description: Confirm the Karmada copy is removed by the finalizer. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($downstreamNS) + name: test-cascade-wd + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/deletion-cascade/workload-deployment.yaml b/test/e2e/deletion-cascade/workload-deployment.yaml new file mode 100644 index 0000000..39d68a1 --- /dev/null +++ b/test/e2e/deletion-cascade/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-cascade-wd +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/env/README.md b/test/e2e/env/README.md new file mode 100644 index 0000000..671e705 --- /dev/null +++ b/test/e2e/env/README.md @@ -0,0 +1,251 @@ +# Local Kind + Karmada e2e Environment + +This document describes the local multi-cluster environment used for end-to-end +testing of the compute federation layer. 
+ +--- + +## Prerequisites + +| Tool | Minimum version | Install | +|------|----------------|---------| +| [Docker Desktop](https://www.docker.com/products/docker-desktop/) | 4.x | required for Kind | +| [kind](https://kind.sigs.k8s.io/) | v0.23+ | `brew install kind` | +| [kubectl](https://kubernetes.io/docs/tasks/tools/) | v1.28+ | `brew install kubernetes-cli` | +| [helm](https://helm.sh/) | v3.14+ | `brew install helm` | +| [task](https://taskfile.dev/) | v3 | `brew install go-task` | +| Python 3 | 3.9+ | pre-installed on macOS | +| go | 1.24+ | `brew install go` | + +`karmadactl` is downloaded automatically by `task e2e:up` into `./bin/`. + +--- + +## Cluster Topology + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ compute-control-plane (Kind cluster) │ +│ │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ karmada-system namespace │ │ +│ │ Karmada API Server ←── https://localhost:32443 │ │ +│ │ Karmada Controller Manager │ │ +│ │ Karmada Scheduler │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +│ │ +│ compute operator (WorkloadReconciler, Federator, InstanceProjector)│ +└──────────────────────────┬──────────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployments + ┌────────────────┴─────────────────┐ + │ │ +┌─────────▼──────────┐ ┌──────────▼─────────┐ +│ compute-pop-dfw │ │ compute-pop-ord │ +│ (Kind cluster) │ │ (Kind cluster) │ +│ │ │ │ +│ city-code=dfw │ │ city-code=ord │ +│ Compute CRDs │ │ Compute CRDs │ +│ NSO CRDs │ │ NSO CRDs │ +└────────────────────┘ └────────────────────┘ +``` + +### What lives where + +| Resource | Cluster | +|----------|---------| +| `Workload`, `WorkloadDeployment` (consumer-facing) | Control Plane Cell | +| `WorkloadDeployment` (federation intent), `PropagationPolicy` | Karmada API Server | +| `WorkloadDeployment` (propagated), `Instance`, `NetworkBinding`, `SubnetClaim` | POP cells | +| `Instance` (write-back for 
visibility) | Karmada API Server | + +--- + +## Running the environment + +### Start + +```bash +task e2e:up +``` + +This is fully idempotent — running it twice will not fail. + +What it does, in order: + +1. Downloads `karmadactl v1.16.0` into `./bin/` (once). +2. Adds the `karmada-charts` Helm repository. +3. Creates Kind clusters `compute-control-plane`, `compute-pop-dfw`, + `compute-pop-ord` (skips any that already exist). +4. Exports kubeconfigs to `./tmp/e2e/kubeconfigs/`. +5. Installs Karmada v1.16.0 via the `karmada-charts/karmada` Helm chart into + `compute-control-plane`, with the API server exposed on NodePort 32443. +6. Registers `compute-pop-dfw` and `compute-pop-ord` as member clusters and + labels each with `topology.datum.net/city-code`. +7. Installs compute CRDs to all clusters and the Karmada API server. +8. Installs NSO CRDs to the POP cell clusters. + +### Stop + +```bash +task e2e:down +``` + +Deletes all three Kind clusters and removes `./tmp/e2e/`. + +--- + +## Kubeconfigs + +After `task e2e:up`: + +| File | Cluster | Use for | +|------|---------|---------| +| `tmp/e2e/kubeconfigs/control-plane.yaml` | `compute-control-plane` | kubectl, deploying the compute operator | +| `tmp/e2e/kubeconfigs/karmada.yaml` | Karmada API server | kubectl, karmadactl | +| `tmp/e2e/kubeconfigs/pop-dfw.yaml` | `compute-pop-dfw` | kubectl, inspecting POP cell state | +| `tmp/e2e/kubeconfigs/pop-ord.yaml` | `compute-pop-ord` | kubectl, inspecting POP cell state | + +The `-internal.yaml` variants use the Kind container's Docker bridge IP and are +intended for the Karmada controller running inside Docker — not for direct +developer use. + +### Quick check + +```bash +# Verify cluster list in Karmada +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get clusters + +# Expected output: +# NAME READY AGE +# compute-pop-dfw True ... +# compute-pop-ord True ... 
+ +# Verify city-code labels +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get clusters -L topology.datum.net/city-code +``` + +--- + +## Using the environment from e2e tests + +Import `go.datum.net/compute/test/e2e/env` in your test suite: + +```go +package myfeature_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + computev1alpha "go.datum.net/compute/api/v1alpha" + + "go.datum.net/compute/test/e2e/env" +) + +var testEnv *env.Environment + +func TestMyFeature(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "MyFeature Suite") +} + +var _ = BeforeSuite(func() { + scheme := runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(computev1alpha.AddToScheme(scheme)).To(Succeed()) + + var err error + testEnv, err = env.New(scheme) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = It("creates a workload and propagates it", func() { + // Control plane cluster client + cpClient := testEnv.ControlPlane.Client + + // Downstream (Karmada) API server client + karmadaClient := testEnv.Downstream.Client + + // POP DFW cluster client + dfwCell, err := testEnv.POPCell(env.CityCodeDFW) + Expect(err).NotTo(HaveOccurred()) + dfwClient := dfwCell.Client + + _ = cpClient + _ = karmadaClient + _ = dfwClient +}) +``` + +### Environment variable override + +Set `E2E_KUBECONFIG_DIR` to an absolute path to load kubeconfigs from a +different directory (useful in CI): + +```bash +E2E_KUBECONFIG_DIR=/path/to/kubeconfigs go test ./test/e2e/... +``` + +--- + +## Networking notes (macOS) + +On macOS with Docker Desktop, Kind clusters run as Docker containers. 
The +container-to-container networking works as follows: + +| From | To | Address used | +|------|----|--------------| +| macOS host | Any Kind cluster API server | `localhost:<port>` | +| macOS host | Karmada API server | `https://localhost:32443` (NodePort) | +| Karmada controller (in Docker) | POP cell API servers | Docker bridge IP (`172.18.x.x:6443`) | + +The `-internal.yaml` kubeconfig variants use Docker bridge IPs with +`insecure-skip-tls-verify: true` because the node certificates do not include +bridge IPs in their SANs. This is acceptable for a local dev environment. + +--- + +## Troubleshooting + +### Karmada API server not reachable + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get ns +``` + +If this times out, check: +1. The Kind cluster is running: `kind get clusters` +2. Port 32443 is mapped: `docker port compute-control-plane-control-plane` +3. The karmada-apiserver pod is running: + ```bash + kubectl --kubeconfig tmp/e2e/kubeconfigs/control-plane.yaml \ + get pods -n karmada-system + ``` + +### POP cluster shows NotReady in Karmada + +The Karmada controller manager uses the Docker bridge IP kubeconfig to reach +POP cells. Check: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + describe cluster compute-pop-dfw +``` + +Then verify the cluster secret contains the expected Docker IP: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get secret -n karmada-system | grep pop-dfw +``` + +### Start fresh + +```bash +task e2e:down && task e2e:up +``` diff --git a/test/e2e/env/env.go b/test/e2e/env/env.go new file mode 100644 index 0000000..7d2c59c --- /dev/null +++ b/test/e2e/env/env.go @@ -0,0 +1,233 @@ +// Package env provides helpers for connecting to the local Kind e2e environment +// created by "task e2e:up". 
+// +// # Environment layout +// +// The environment consists of three Kind clusters and one downstream API server: +// +// - Control plane cell — hosts the compute operator (WorkloadReconciler, +// WorkloadDeploymentFederator, InstanceProjector). +// - Downstream control plane — the federation API server; WorkloadDeployments +// are written here so they can be propagated to POP cells. +// - POP DFW (compute-pop-dfw) — member cluster labelled city-code=dfw. +// - POP ORD (compute-pop-ord) — member cluster labelled city-code=ord. +// +// # Kubeconfig resolution +// +// Kubeconfigs are read from the directory at [DefaultKubeconfigDir] (relative +// to the repository root), unless overridden via the [EnvKubeconfigDir] +// environment variable. +// +// Expected files inside that directory: +// +// control-plane.yaml — management / control-plane cell +// downstream.yaml — downstream federation API server (https://localhost:32443) +// pop-dfw.yaml — POP DFW cell (standard Kind localhost-based kubeconfig) +// pop-ord.yaml — POP ORD cell (standard Kind localhost-based kubeconfig) +// +// # Typical usage in a Ginkgo suite +// +// var ( +// testEnv *env.Environment +// ) +// +// var _ = BeforeSuite(func() { +// scheme := runtime.NewScheme() +// Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) +// Expect(corev1.AddToScheme(scheme)).To(Succeed()) +// +// var err error +// testEnv, err = env.New(scheme) +// Expect(err).NotTo(HaveOccurred()) +// }) +package env + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Environment variable name that overrides the kubeconfig directory. +const EnvKubeconfigDir = "E2E_KUBECONFIG_DIR" + +// DefaultKubeconfigDir is the kubeconfig directory used when [EnvKubeconfigDir] +// is not set. 
It is resolved relative to the repository root (three directories +// above this source file). +const DefaultKubeconfigDir = "tmp/e2e/kubeconfigs" + +// City codes for the two POP cells created by "task e2e:up". +const ( + CityCodeDFW = "dfw" + CityCodeORD = "ord" +) + +// Environment holds a [ClusterAccess] for each cluster in the local e2e +// environment. All fields are populated by [New]; none are nil on success. +type Environment struct { + // ControlPlane is the management / control-plane cell cluster. + // The compute operator runs here (WorkloadReconciler, + // WorkloadDeploymentFederator, InstanceProjector). + ControlPlane *ClusterAccess + + // Downstream is the downstream control plane. + // WorkloadDeployments and PropagationPolicies live here. + Downstream *ClusterAccess + + // POPCells maps city-code strings (e.g. "dfw", "ord") to the + // corresponding POP cell cluster. Use [Environment.POPCell] for + // safe, error-returning access. + POPCells map[string]*ClusterAccess +} + +// ClusterAccess bundles a REST config and a controller-runtime Client for a +// single cluster. +type ClusterAccess struct { + // Config is the REST config used to build the client. + Config *rest.Config + + // Client is a controller-runtime client scoped to this cluster. + // The client is built with the scheme supplied to [New]. + Client ctrlclient.Client +} + +// New creates an [Environment] by loading kubeconfigs from the configured +// directory and building a controller-runtime client for each cluster using +// the provided scheme. +// +// The scheme should have all relevant types registered before calling New; +// for example compute types, networking types, and core Kubernetes types. 
+func New(scheme *k8sruntime.Scheme) (*Environment, error) { + dir := kubeconfigDir() + + controlPlane, err := loadCluster(filepath.Join(dir, "control-plane.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("control-plane cluster: %w", err) + } + + downstream, err := loadCluster(filepath.Join(dir, "downstream.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("downstream control plane: %w", err) + } + + popDFW, err := loadCluster(filepath.Join(dir, "pop-dfw.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP DFW cluster: %w", err) + } + + popORD, err := loadCluster(filepath.Join(dir, "pop-ord.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP ORD cluster: %w", err) + } + + return &Environment{ + ControlPlane: controlPlane, + Downstream: downstream, + POPCells: map[string]*ClusterAccess{ + CityCodeDFW: popDFW, + CityCodeORD: popORD, + }, + }, nil +} + +// POPCell returns the [ClusterAccess] for the POP cell with the given city +// code. If no POP cell is registered for that code it returns an error that lists the known codes (in map-iteration order, i.e. unordered). +func (e *Environment) POPCell(cityCode string) (*ClusterAccess, error) { + ca, ok := e.POPCells[cityCode] + if !ok { + known := make([]string, 0, len(e.POPCells)) + for k := range e.POPCells { + known = append(known, k) + } + return nil, fmt.Errorf("no POP cell registered for city code %q (known: %v)", cityCode, known) + } + return ca, nil +} + +// MustPOPCell is like [Environment.POPCell] but panics on error. +// Useful in test setup where a missing POP cell is always a fatal misconfiguration. +func (e *Environment) MustPOPCell(cityCode string) *ClusterAccess { + ca, err := e.POPCell(cityCode) + if err != nil { + panic(err) + } + return ca +} + +// RESTConfigFor is a convenience function that returns a [rest.Config] built from +// the kubeconfig file at kubeconfigPath, without constructing a client. Useful when the caller needs to +// build a typed clientset directly. 
+func RESTConfigFor(kubeconfigPath string) (*rest.Config, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + return cfg, nil +} + +// KubeconfigPath returns the path to the kubeconfig file for the +// named cluster (relative if E2E_KUBECONFIG_DIR is relative or the repository root is not found). name must be one of "control-plane", "downstream", "pop-dfw", +// or "pop-ord". +func KubeconfigPath(name string) string { + return filepath.Join(kubeconfigDir(), name+".yaml") +} + +// ─── internal helpers ──────────────────────────────────────────────────────── + +func loadCluster(kubeconfigPath string, scheme *k8sruntime.Scheme) (*ClusterAccess, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + + c, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("building client from %s: %w", kubeconfigPath, err) + } + + return &ClusterAccess{ + Config: cfg, + Client: c, + }, nil +} + +// kubeconfigDir returns the directory containing e2e kubeconfigs. +// It honours the E2E_KUBECONFIG_DIR environment variable, otherwise falls +// back to tmp/e2e/kubeconfigs under the repository root (see repoRoot). +func kubeconfigDir() string { + if dir := os.Getenv(EnvKubeconfigDir); dir != "" { + return dir + } + return filepath.Join(repoRoot(), DefaultKubeconfigDir) +} + +// repoRoot walks up from this source file to find the repository root +// (identified by the presence of go.mod). +func repoRoot() string { + // Use the file path of this source file as a starting point so the helper + // works regardless of the caller's working directory. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + // Fallback: assume tests are run from the repo root. + return "." 
+ } + + dir := filepath.Dir(thisFile) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + // Reached filesystem root without finding go.mod. + return "." + } + dir = parent + } +} diff --git a/test/e2e/full-federation/chainsaw-test.yaml b/test/e2e/full-federation/chainsaw-test.yaml new file mode 100644 index 0000000..020a2bc --- /dev/null +++ b/test/e2e/full-federation/chainsaw-test.yaml @@ -0,0 +1,150 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: full-federation +spec: + description: | + End-to-end federation chain test. + + Exercises the complete path from WorkloadDeployment creation through to + Instance projection on the control-plane cluster: + + 1. Create WorkloadDeployment on control-plane. + 2. WorkloadDeploymentFederator replicates it to Karmada (ns-{uid} namespace, {uid} being the test namespace's UID). + 3. Karmada PropagationPolicy routes the WD to pop-dfw. + 4. WorkloadDeploymentReconciler on pop-dfw creates Instance test-full-fed-wd-0. + 5. InstanceReconciler on pop-dfw writes Instance back to Karmada with + label meta.datumapis.com/upstream-cluster-name: cluster-single. + 6. InstanceProjector on control-plane creates a projection of the Instance + in the project namespace. + + Prerequisites: both operator instances must be running (task e2e:operator:start). + + template: true + + steps: + - name: create-workload-deployment + description: Create the WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeploymentFederator replicated the WD to Karmada and status is synced back. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + - assert: + # Wait for the cell operator to write status back to the Karmada WD. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-wd-on-pop-dfw + description: Assert Karmada propagated the WD to pop-dfw and the cell reconciler set status. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + # Karmada propagation can take longer than a local apply. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-instance-on-pop-dfw + description: Assert WorkloadDeploymentReconciler created an Instance on pop-dfw with a Ready condition. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" + + - name: assert-instance-writeback-in-downstream + description: Assert InstanceReconciler wrote the Instance back to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + + - name: assert-instance-projected-to-control-plane + description: Assert InstanceProjector created a projection with status on the control-plane. 
+ try: + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($namespace) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" diff --git a/test/e2e/full-federation/workload-deployment.yaml b/test/e2e/full-federation/workload-deployment.yaml new file mode 100644 index 0000000..70b4cb9 --- /dev/null +++ b/test/e2e/full-federation/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-full-fed-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/instance-projection/assert-downstream-wd.yaml b/test/e2e/instance-projection/assert-downstream-wd.yaml new file mode 100644 index 0000000..705d089 --- /dev/null +++ b/test/e2e/instance-projection/assert-downstream-wd.yaml @@ -0,0 +1,6 @@ +# Assert the WorkloadDeployment is federated to Karmada (and the Karmada namespace created). +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-projector-wd diff --git a/test/e2e/instance-projection/assert-projected-instance.yaml b/test/e2e/instance-projection/assert-projected-instance.yaml new file mode 100644 index 0000000..0542194 --- /dev/null +++ b/test/e2e/instance-projection/assert-projected-instance.yaml @@ -0,0 +1,19 @@ +# Assert the InstanceProjector created a projection in the project namespace. 
+# +# The InstanceProjector (internal/controller/instance_projector.go): +# - Watches Instances in Karmada that carry upstreamClusterNameLabel +# - Strips "cluster-" prefix to get the cluster name ("single" in single-provider mode) +# - Finds the project namespace by matching ns-{uid} to namespace UIDs +# - Creates/updates the Instance projection in the project namespace +# - Sets an owner reference to the WorkloadDeployment for cascading deletion +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + # namespace is the Chainsaw test namespace (the project namespace on control-plane) + name: test-projected-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + ownerReferences: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + name: test-projector-wd diff --git a/test/e2e/instance-projection/chainsaw-test.yaml b/test/e2e/instance-projection/chainsaw-test.yaml new file mode 100644 index 0000000..16fa9f9 --- /dev/null +++ b/test/e2e/instance-projection/chainsaw-test.yaml @@ -0,0 +1,123 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-projection +spec: + description: | + Verifies that the InstanceProjector watches Instances written back to the + Karmada API server and creates corresponding read-only projections in the + project namespace on the control-plane cluster. + + Flow: + 1. Create a WorkloadDeployment → triggers federator → Karmada namespace created. + 2. Write an Instance to Karmada (simulating a POP-cell InstanceReconciler write-back). + 3. InstanceProjector detects the Karmada Instance and creates a projection in the + project namespace (the Chainsaw test namespace on the control-plane cluster). + 4. Assert the projection exists with the upstream tracking label and an owner + reference to the WorkloadDeployment (for cascading deletion). 
+ + Cluster name label: "cluster-single" + The compute operator runs in single-provider mode for this e2e environment, + registering the control-plane cluster with the multicluster-runtime manager + under the name "single" (see cmd/main.go, wrappedSingleClusterProvider). + + template: true + + steps: + - name: create-wd + description: Create the WorkloadDeployment to trigger federation and namespace creation. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-downstream-namespace + description: Wait for the federated WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-projector-wd + + - name: write-instance-to-downstream + description: | + Write an Instance to Karmada simulating InstanceReconciler write-back. + Uses explicit control-plane kubeconfig to derive downstreamNS and WD UID. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get workloaddeployment test-projector-wd \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.metadata.uid}' + outputs: + - name: wdUID + value: ($stdout) + - script: + env: + - name: KARMADA_NS + value: ($downstreamNS) + - name: WD_UID + value: ($wdUID) + content: | + kubectl apply -f - < is the multicluster-runtime cluster name registered by +# wrappedSingleClusterProvider (always "single" in single-cluster mode) +# - Label meta.datumapis.com/upstream-namespace = the POP-cell namespace +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/chainsaw-test.yaml b/test/e2e/instance-writeback/chainsaw-test.yaml new file mode 100644 index 0000000..32dbbc5 --- /dev/null +++ b/test/e2e/instance-writeback/chainsaw-test.yaml @@ -0,0 +1,112 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-writeback +spec: + description: | + Verifies that the InstanceReconciler running in a POP-cell cluster writes + Instance objects back to the Karmada API server after reconciling the Ready + condition for the first time. + + Write-back convention (internal/controller/instance_controller.go): + - The Instance is written to Karmada at the same namespace/name as the POP-cell Instance. + - Label meta.datumapis.com/upstream-cluster-name is set to + "cluster-" (e.g. "cluster-compute-pop-dfw"). + - Label meta.datumapis.com/upstream-namespace records the originating namespace. 
+ + Note: this test requires the compute operator (InstanceReconciler) to be running + in the DFW POP cell cluster. + + template: true + + steps: + - name: setup-namespaces + description: Create the Instance namespace in the DFW POP cell and Karmada. + try: + - script: + content: | + kubectl get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml apply -f - + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml apply -f - + cleanup: + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + + - name: create-instance-on-pop-dfw + description: Create the Instance on the DFW POP cell cluster. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - apply: + file: instance-pop-dfw.yaml + cleanup: + - script: + content: | + INSTANCE_NS=$(kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}') + kubectl delete instance test-writeback-instance \ + --namespace "$INSTANCE_NS" --ignore-not-found + + - name: assert-instance-in-downstream + description: Wait for the InstanceReconciler to write back the Instance to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/instance-pop-dfw.yaml b/test/e2e/instance-writeback/instance-pop-dfw.yaml new file mode 100644 index 0000000..250eb7d --- /dev/null +++ b/test/e2e/instance-writeback/instance-pop-dfw.yaml @@ -0,0 +1,15 @@ +# Instance created in the DFW POP cell. +# ($instanceNS) is the namespace derived from the Chainsaw test namespace UID, +# matching the ns- convention so the InstanceProjector can resolve it later. 
+apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + name: test-writeback-instance + namespace: ($instanceNS) +spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network diff --git a/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml new file mode 100644 index 0000000..77a817a --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml @@ -0,0 +1,6 @@ +# Asserts that the PropagationPolicy for city dfw exists in the Karmada namespace. NOTE(review): chainsaw-test.yaml in this directory asserts the policy name city-dfw inline, while this file uses workload-deployments-dfw; confirm which name the federator generates. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw diff --git a/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml new file mode 100644 index 0000000..5678c39 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml @@ -0,0 +1,133 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: propagation-policy-lifecycle +spec: + description: | + Verifies the PropagationPolicy lifecycle managed by the WorkloadDeploymentFederator: + + - A PropagationPolicy (city-dfw) is lazily created when the first WorkloadDeployment + for city code "dfw" is federated to Karmada. + - The PropagationPolicy is RETAINED while at least one WorkloadDeployment for + that city code remains in the Karmada namespace. + - The PropagationPolicy is DELETED when the last deployment for the city is removed. + + The test creates two WDs (wd-alpha, wd-beta) both targeting cityCode=dfw, verifies + the PP appears, deletes wd-alpha and asserts the PP is still present, then deletes + wd-beta and waits for the PP to disappear. + + template: true + + steps: + - name: create-deployments + description: Create two WorkloadDeployments targeting dfw on the control-plane. 
+ try: + - apply: + file: workload-deployment-alpha.yaml + - apply: + file: workload-deployment-beta.yaml + + - name: assert-policy-created + description: | + Assert both WDs are federated to Karmada and the PropagationPolicy exists. + Both WDs must be present in Karmada before proceeding to the deletion steps; + otherwise wd-alpha's finalizer could see an empty Karmada list and prematurely + delete the PP before wd-beta has been federated. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-alpha + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-beta + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-alpha + description: Delete wd-alpha; wd-beta still targets dfw so the PP must be retained. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-alpha + + - name: assert-policy-retained + description: Assert the PropagationPolicy is still present after wd-alpha is deleted. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - sleep: + duration: 8s + - assert: + timeout: 5s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-beta + description: Delete wd-beta (the last WD for city dfw). + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-beta + + - name: assert-policy-deleted + description: Wait for the PropagationPolicy to be removed once no WDs remain. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + namespace: ($downstreamNS) + name: city-dfw + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml new file mode 100644 index 0000000..f9eb27f --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-alpha +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml 
b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml new file mode 100644 index 0000000..fd1d65c --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-beta +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml new file mode 100644 index 0000000..98f8d0f --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml @@ -0,0 +1,20 @@ +# Assert the PropagationPolicy was created in the Karmada namespace. +# The name follows propagationPolicyNameFor("dfw") = "workload-deployments-dfw". +# ($downstreamNS) is substituted by Chainsaw's template engine. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw +spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml new file mode 100644 index 0000000..23c308f --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml @@ -0,0 +1,9 @@ +# Assert the WorkloadDeployment exists in Karmada with the city-code label. +# ($downstreamNS) is substituted by Chainsaw's template engine from the script binding. 
+apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/chainsaw-test.yaml b/test/e2e/workload-deployment-federation/chainsaw-test.yaml new file mode 100644 index 0000000..302d89c --- /dev/null +++ b/test/e2e/workload-deployment-federation/chainsaw-test.yaml @@ -0,0 +1,84 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: workload-deployment-federation +spec: + description: | + Verifies that the WorkloadDeploymentFederator replicates a WorkloadDeployment + from the project namespace (control-plane cluster) to the Karmada API server + with the correct city-code label and PropagationPolicy. + + The federator follows the ns-{uid} convention for Karmada namespaces, + matching the MappedNamespaceResourceStrategy used by NSO. The test derives + the expected Karmada namespace dynamically from the Chainsaw test namespace UID. + + Verified: + - WorkloadDeployment exists in Karmada at ns-{uid} + - Karmada copy carries label topology.datum.net/city-code: dfw + - PropagationPolicy city-dfw exists in the Karmada namespace, + selecting WDs by city-code and routing them to matching POP-cell clusters. + + template: true + + steps: + - name: derive-ns-and-create-wd + description: Create the WorkloadDeployment (the Karmada namespace is derived in the assert steps). + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeployment federated to Karmada with city-code label. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw + + - name: assert-propagation-policy-in-downstream + description: Assert PropagationPolicy created for city-dfw. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/workload-deployment.yaml b/test/e2e/workload-deployment-federation/workload-deployment.yaml new file mode 100644 index 0000000..0cd2347 --- /dev/null +++ b/test/e2e/workload-deployment-federation/workload-deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-federation-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + 
name: test-network + + scaleSettings: + minReplicas: 1