From 54693db54ac89cc25caf98624126b8c736bf46d3 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 08:59:26 +0200 Subject: [PATCH 01/21] fix: calico-lab template support for cluster-api v1.11 --- config/clusterctl-templates/cluster-template-calico-lab.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/clusterctl-templates/cluster-template-calico-lab.yaml b/config/clusterctl-templates/cluster-template-calico-lab.yaml index 1e07261..b5053d4 100644 --- a/config/clusterctl-templates/cluster-template-calico-lab.yaml +++ b/config/clusterctl-templates/cluster-template-calico-lab.yaml @@ -133,8 +133,6 @@ spec: kubeadmConfigSpec: format: ignition clusterConfiguration: - apiServer: - extraArgs: {} controllerManager: extraArgs: cloud-provider: external @@ -258,8 +256,6 @@ spec: spec: format: ignition clusterConfiguration: - apiServer: - extraArgs: {} controllerManager: extraArgs: cloud-provider: external From bd3ccf4d8e2f68ea36a5ca474692a94a35f5b080 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 21 Apr 2026 17:25:19 +0200 Subject: [PATCH 02/21] feat: smoke tests for capms sonic and dell_sonic flavors --- .github/workflows/integration.yaml | 2 + capi-lab/Makefile | 60 ++++++++++++++++-------------- capi-lab/compose.kamaji.yaml | 18 --------- capi-lab/compose.yaml | 18 ++++++++- capi-lab/mini-lab | 2 +- capi-lab/test/integration.sh | 41 +++++++++++++++++++- 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 07b987a..90ca32a 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -17,6 +17,8 @@ jobs: strategy: matrix: flavors: + - name: capms_dell_sonic + - name: capms_sonic - name: kamaji steps: diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 6ff0985..ab5b128 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig -MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms) +MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_sonic) CLUSTER_NAME ?= metal-test KUBERNETES_VERSION ?= 1.33.5 @@ -35,7 +35,9 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD) MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA)) MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA)) -ifeq ($(MINI_LAB_FLAVOR),capms) +ifeq ($(MINI_LAB_FLAVOR),capms_sonic) +DEPLOY_TARGET=deploy-kubeadm +else ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) DEPLOY_TARGET=deploy-kubeadm else ifeq ($(MINI_LAB_FLAVOR),kamaji) DEPLOY_TARGET=deploy-kamaji @@ -101,24 +103,26 @@ control-plane-ip: apply-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) echo $(CLUSTER_NAME) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --kubeconfig=$(KUBECONFIG) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-sample-cluster delete-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --kubeconfig=$(KUBECONFIG) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: mtu-fix mtu-fix: @@ -133,22 +137,22 @@ create-kamaji-tenant: # let MetalLB assign the IP to the tenant cluster control plane service envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-kamaji-tenant delete-kamaji-tenant: docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: kamaji-tenant-kubeconfig kamaji-tenant-kubeconfig: diff --git a/capi-lab/compose.kamaji.yaml b/capi-lab/compose.kamaji.yaml index a66ec34..a1aa589 100644 --- a/capi-lab/compose.kamaji.yaml +++ b/capi-lab/compose.kamaji.yaml @@ -3,25 +3,7 @@ services: clusterctl: image: registry.k8s.io/cluster-api/clusterctl:v1.12.3 environment: - - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - - METAL_API_HMAC=${METAL_API_HMAC} - - METAL_API_URL=${METAL_API_URL} - - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} - - CLUSTER_NAME=${CLUSTER_NAME} - TENANT_NAMESPACE=${TENANT_NAMESPACE} - - KUBERNETES_VERSION=${KUBERNETES_VERSION} - - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} - PODS_CIDR=${PODS_CIDR} - SERVICES_CIDR=${SERVICES_CIDR} - - METAL_PARTITION=${METAL_PARTITION} - - METAL_PROJECT_ID=${METAL_PROJECT_ID} - - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} - - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} - - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} - - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} - - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} - - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} - METAL_NODE_NETWORK_ID=${METAL_NODE_NETWORK_ID} - - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} - volumes: - - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/compose.yaml b/capi-lab/compose.yaml index f680835..104fc52 100644 --- a/capi-lab/compose.yaml +++ b/capi-lab/compose.yaml @@ -5,8 +5,24 @@ services: environment: - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - KUBECONFIG=/kubeconfig - image: registry.k8s.io/cluster-api/clusterctl:v1.9.5 + - METAL_API_HMAC=${METAL_API_HMAC} + - METAL_API_URL=${METAL_API_URL} + - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} + - CLUSTER_NAME=${CLUSTER_NAME} + - KUBERNETES_VERSION=${KUBERNETES_VERSION} + - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} + - METAL_PARTITION=${METAL_PARTITION} + - METAL_PROJECT_ID=${METAL_PROJECT_ID} + - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} + - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} + - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} + - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} + - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} + - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} + - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} + image: registry.k8s.io/cluster-api/clusterctl:v1.11.4 network_mode: host user: root volumes: - ${KUBECONFIG}:/kubeconfig:ro + - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index f7d4a9b..1664eff 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit f7d4a9b4aec09c454fe638d49b9bc8493d6385b5 +Subproject commit 1664eff4656afcf5cf2e0bae74e5a2a5f7939f14 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index f6938d4..3623728 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -24,9 +24,48 @@ echo "$waiting/$minWaiting machines are waiting" make push-to-capi-lab +if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + + if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then + 2>&1 echo "Starting capms dell sonic flavor tests" + else + 2>&1 echo "Starting capms sonic flavor tests" + fi + + echo "Creating control plane IP" + make -C capi-lab control-plane-ip + + echo "Applying sample cluster" + make -C capi-lab apply-sample-cluster + + echo "Waiting for control-plane to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=2 + declare -i attempts=0 + until [ "$phoned" -ge $minPhoned ] + do + if [ "$attempts" -ge 120 ]; then + echo "not enough machines phoned home - timeout reached" + exit 1 + fi + echo "$phoned/$minPhoned machines have phoned home" + sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + attempts+=1 + done + echo "$phoned/$minPhoned machines have phoned home" + + echo "Applying mtu fix" + make -C capi-lab mtu-fix + + # TODO further checks + +fi + + if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then - echo "Starting kamaji tests" + echo "Starting kamaji flavor tests" echo "Creating control plane IP" export CLUSTER_NAME=kamaji-tenant-test From f21cdc361ab96f0b94ec848058cfb9f9df16159a Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 21 Apr 2026 22:41:33 +0200 Subject: [PATCH 03/21] fix: remove redundant kubeconfig for clusterctl sample cluster --- capi-lab/Makefile | 6 ++---- capi-lab/test/integration.sh | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index ab5b128..35cfef2 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -105,11 +105,10 @@ apply-sample-cluster: echo $(CLUSTER_NAME) docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ --worker-machine-count 1 \ --control-plane-machine-count 1 \ --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + --from /templates/cluster-template-calico-lab.yaml \ | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-sample-cluster @@ -117,11 +116,10 @@ delete-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ --worker-machine-count 1 \ --control-plane-machine-count 1 \ --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + --from /templates/cluster-template-calico-lab.yaml \ | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: mtu-fix diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 3623728..a6612f0 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -18,7 +18,7 @@ do echo "$waiting/$minWaiting machines are waiting" sleep 5 waiting=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Waiting | wc -l) - attempts=$attempts+1 + attempts+=1 done echo "$waiting/$minWaiting machines are waiting" @@ -27,9 +27,9 @@ make push-to-capi-lab if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then - 2>&1 echo "Starting capms dell sonic flavor tests" + echo "Starting capms dell sonic flavor tests" else - 2>&1 echo "Starting capms sonic flavor tests" + echo "Starting capms sonic flavor tests" fi echo "Creating control plane IP" From d37b278c35369e93bc54bc1bbab966483643d095 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:05:28 +0200 Subject: [PATCH 04/21] chore: do not continue lab tests on error and do not fail fast --- .github/workflows/integration.yaml | 5 +++-- capi-lab/mini-lab | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 90ca32a..ea76b05 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -12,9 +12,10 @@ jobs: test: name: Run tests runs-on: ubuntu-latest - - continue-on-error: true + # TODO should we run on self-hosted? + strategy: + fail-fast: false matrix: flavors: - name: capms_dell_sonic diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 1664eff..5a5bbeb 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 1664eff4656afcf5cf2e0bae74e5a2a5f7939f14 +Subproject commit 5a5bbeb08d74efbd105aed0b7b4a4eb5dab82418 From f8a25d22b74003fe5b03a2266cab51131fb42894 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:14:15 +0200 Subject: [PATCH 05/21] chore: add timeout for lab integration tests --- .github/workflows/integration.yaml | 3 ++- capi-lab/test/integration.sh | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index ea76b05..a0b2537 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -13,7 +13,7 @@ jobs: name: Run tests runs-on: ubuntu-latest # TODO should we run on self-hosted? - + strategy: fail-fast: false matrix: @@ -49,6 +49,7 @@ jobs: - name: Run integration tests shell: bash + timeout-minutes: 45 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index a6612f0..d153691 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -11,7 +11,7 @@ minWaiting=2 declare -i attempts=0 until [ "$waiting" -ge $minWaiting ] do - if [ "$attempts" -ge 60 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines in waiting state - timeout reached" exit 1 fi @@ -44,7 +44,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 120 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi @@ -81,7 +81,7 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 120 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi From 0edbc39b946e979776326504774b5456b19fb223 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:46:05 +0200 Subject: [PATCH 06/21] test(lab): wait for capms nodes to become ready --- capi-lab/Makefile | 12 ++++++++++- capi-lab/test/integration.sh | 39 +++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 35cfef2..d9fd0eb 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -127,6 +127,16 @@ mtu-fix: cd mini-lab && ssh -F files/ssh/config leaf01 'ip link set dev vtep-1001 mtu 9100 && echo done' cd mini-lab && ssh -F files/ssh/config leaf02 'ip link set dev vtep-1001 mtu 9100 && echo done' +.PHONY: sample-cluster-kubeconfig +sample-cluster-kubeconfig: + kubectl --kubeconfig=$(KUBECONFIG) get secret $(CLUSTER_NAME)-kubeconfig -o jsonpath='{.data.value}' | base64 -d > ../$(CLUSTER_NAME).kubeconfig + @echo "Sample cluster kubeconfig written to $(CLUSTER_NAME).kubeconfig" + +.PHONY: sample-cluster-deploy-metal-ccm +sample-cluster-deploy-metal-ccm: + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + .PHONY: create-kamaji-tenant create-kamaji-tenant: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) @@ -164,4 +174,4 @@ kamaji-tenant-deploy-calico: .PHONY: kamaji-tenant-deploy-metal-ccm kamaji-tenant-deploy-metal-ccm: - kustomize build ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + kubectl kustomize ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index d153691..cda9d54 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -32,6 +32,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Starting capms sonic flavor tests" fi + export CLUSTER_NAME=metal-test + echo "Creating control plane IP" make -C capi-lab control-plane-ip @@ -58,7 +60,42 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying mtu fix" make -C capi-lab mtu-fix - # TODO further checks + echo "Waiting for cluster to be provisioned" + declare -i attempts=0 + until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + do + if [ "$attempts" -ge 180 ]; then + echo "cluster was not provisioned - timeout reached" + kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + exit 1 + fi + echo "cluster ${CLUSTER_NAME} is not yet provisioned" + sleep 5 + attempts+=1 + done + echo "Cluster ${CLUSTER_NAME} is provisioned" + + echo "Generating kubeconfig for sample cluster" + make -C capi-lab sample-cluster-kubeconfig + + echo "Deploying metal-ccm to sample cluster" + make -C capi-lab sample-cluster-deploy-metal-ccm + + echo "Waiting for nodes to become Ready" + declare -i attempts=0 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -q "^Ready$" + do + if [ "$attempts" -ge 180 ]; then + echo "no nodes became Ready - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + exit 1 + fi + echo "no nodes are Ready yet" + sleep 5 + attempts+=1 + done + echo "At least one node is Ready" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi From 4a8e0bef8e3035f96c76cc1a91de4802d0ad3cb2 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 11:19:02 +0200 Subject: [PATCH 07/21] test(lab): wait for tenant API server to be reachable --- capi-lab/Makefile | 4 ++-- capi-lab/test/integration.sh | 32 +++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index d9fd0eb..b5bcc52 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -134,13 +134,13 @@ sample-cluster-kubeconfig: .PHONY: sample-cluster-deploy-metal-ccm sample-cluster-deploy-metal-ccm: - $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - .PHONY: create-kamaji-tenant create-kamaji-tenant: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) kubectl --kubeconfig=$(KUBECONFIG) create namespace $(TENANT_NAMESPACE) --dry-run=client -o yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - # let MetalLB assign the IP to the tenant cluster control plane service envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index cda9d54..b1cdc49 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,9 +40,9 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster - echo "Waiting for control-plane to get to Phoned Home state" + echo "Waiting for firewall, control-plane and worker to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) - minPhoned=2 + minPhoned=3 declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do @@ -78,23 +78,41 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Generating kubeconfig for sample cluster" make -C capi-lab sample-cluster-kubeconfig + echo "Waiting for tenant API server to be reachable" + declare -i attempts=0 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 + do + if [ "$attempts" -ge 180 ]; then + echo "tenant API server not reachable - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version || true + exit 1 + fi + echo "tenant API server not reachable yet" + sleep 5 + attempts+=1 + done + echo "Tenant API server is reachable" + echo "Deploying metal-ccm to sample cluster" make -C capi-lab sample-cluster-deploy-metal-ccm - echo "Waiting for nodes to become Ready" + echo "Waiting for control-plane and worker node to become Ready" + minReady=2 + ready=0 declare -i attempts=0 - until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -q "^Ready$" + until [ "$ready" -ge $minReady ] do if [ "$attempts" -ge 180 ]; then - echo "no nodes became Ready - timeout reached" + echo "not enough nodes became Ready - timeout reached" kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true exit 1 fi - echo "no nodes are Ready yet" + echo "$ready/$minReady nodes are Ready" sleep 5 + ready=$(kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -c "^Ready$" || true) attempts+=1 done - echo "At least one node is Ready" + echo "$ready/$minReady nodes are Ready" kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi From 4201da5bfb2120a6deca41e2fa2e406f06eefc0f Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 11:42:25 +0200 Subject: [PATCH 08/21] chore: update submodule --- capi-lab/mini-lab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 5a5bbeb..6789fb6 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 5a5bbeb08d74efbd105aed0b7b4a4eb5dab82418 +Subproject commit 6789fb6bb5d2c10ccf08759bd58a05a6ee127852 From e4bb4113a88dee8e580126110228a48f5a281a69 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 12:54:01 +0200 Subject: [PATCH 09/21] fix: give capms machines more time to phone home --- capi-lab/test/integration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index b1cdc49..556f219 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -46,7 +46,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 180 ]; then + if [ "$attempts" -ge 240 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi From 0cb077f25d1586fb0c023bf78a9e9de3d33b187b Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Fri, 24 Apr 2026 23:55:28 +0200 Subject: [PATCH 10/21] chore: update submodule --- capi-lab/mini-lab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 6789fb6..7367c66 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 6789fb6bb5d2c10ccf08759bd58a05a6ee127852 +Subproject commit 7367c66ce46623415aaf09a568fb3e76c45279f9 From 1900c4cf56ac4dc717613015210e63594ebd915a Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Mon, 27 Apr 2026 12:20:36 +0200 Subject: [PATCH 11/21] fix: expect only 2 workers --- capi-lab/test/integration.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 556f219..a5b9c06 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,9 +40,9 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster - echo "Waiting for firewall, control-plane and worker to get to Phoned Home state" + echo "Waiting for firewall and control-plane to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) - minPhoned=3 + minPhoned=2 declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do @@ -57,8 +57,10 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ done echo "$phoned/$minPhoned machines have phoned home" - echo "Applying mtu fix" - make -C capi-lab mtu-fix + if [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + echo "Applying mtu fix" + make -C capi-lab mtu-fix + fi echo "Waiting for cluster to be provisioned" declare -i attempts=0 @@ -96,8 +98,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Deploying metal-ccm to sample cluster" make -C capi-lab sample-cluster-deploy-metal-ccm - echo "Waiting for control-plane and worker node to become Ready" - minReady=2 + echo "Waiting for control-plane node to become Ready" + minReady=1 ready=0 declare -i attempts=0 until [ "$ready" -ge $minReady ] From 78168df9ff949d1b49ffbbdbcd798ee74626fa3b Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 12:21:18 +0200 Subject: [PATCH 12/21] fix: metal-ccm is already deployed via template --- capi-lab/mini-lab | 2 +- capi-lab/test/integration.sh | 47 ++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 7367c66..195c7a1 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 7367c66ce46623415aaf09a568fb3e76c45279f9 +Subproject commit 195c7a1975f846b76f5c55b00540dd05886c2887 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index a5b9c06..45c098c 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,6 +40,22 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster + echo "Waiting for cluster to be provisioned" + declare -i attempts=0 + until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + do + if [ "$attempts" -ge 180 ]; then + echo "cluster was not provisioned - timeout reached" + kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + exit 1 + fi + echo "cluster ${CLUSTER_NAME} is not yet provisioned" + sleep 5 + attempts+=1 + done + echo "Cluster ${CLUSTER_NAME} is provisioned" + + echo "Waiting for firewall and control-plane to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) minPhoned=2 @@ -62,24 +78,30 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ make -C capi-lab mtu-fix fi - echo "Waiting for cluster to be provisioned" + echo "Waiting for worker to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=3 declare -i attempts=0 - until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 180 ]; then - echo "cluster was not provisioned - timeout reached" - kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + if [ "$attempts" -ge 240 ]; then + echo "not enough machines phoned home - timeout reached" exit 1 fi - echo "cluster ${CLUSTER_NAME} is not yet provisioned" + echo "$phoned/$minPhoned machines have phoned home" sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) attempts+=1 done - echo "Cluster ${CLUSTER_NAME} is provisioned" + echo "$phoned/$minPhoned machines have phoned home" echo "Generating kubeconfig for sample cluster" make -C capi-lab sample-cluster-kubeconfig + # TODO remove once we can reliably check for the nodes to be ready + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A + echo "Waiting for tenant API server to be reachable" declare -i attempts=0 until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 @@ -95,11 +117,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ done echo "Tenant API server is reachable" - echo "Deploying metal-ccm to sample cluster" - make -C capi-lab sample-cluster-deploy-metal-ccm - - echo "Waiting for control-plane node to become Ready" - minReady=1 + echo "Waiting for control-plane node and worker node to become Ready" + minReady=2 ready=0 declare -i attempts=0 until [ "$ready" -ge $minReady ] @@ -115,7 +134,6 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ attempts+=1 done echo "$ready/$minReady nodes are Ready" - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi @@ -163,6 +181,9 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then echo "Checking if tenant cluster exists" if kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "Ready"; then + # Currently this also catches NotReady nodes, but that's good enough for now to verify + # that the node has joined. + # Only metal-ccm will be able to set the node to Ready but we do not go that far here echo "Nodes have joined the cluster and are ready" elif kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "No resources found"; then echo "Nodes have not joined yet" From b7dc088effa403e0dca2797a7591f5b95e3887ec Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 13:21:54 +0200 Subject: [PATCH 13/21] fix: ccm tolerations and timeouts for third machine --- .github/workflows/integration.yaml | 2 +- capi-lab/test/integration.sh | 6 +++--- .../clusterctl-templates/cluster-template-calico-lab.yaml | 5 +++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index a0b2537..99d1a42 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -49,7 +49,7 @@ jobs: - name: Run integration tests shell: bash - timeout-minutes: 45 + timeout-minutes: 60 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 45c098c..7f0ba0a 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -84,7 +84,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 240 ]; then + if [ "$attempts" -ge 360 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi @@ -99,8 +99,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ make -C capi-lab sample-cluster-kubeconfig # TODO remove once we can reliably check for the nodes to be ready - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A || true echo "Waiting for tenant API server to be reachable" declare -i attempts=0 diff --git a/config/clusterctl-templates/cluster-template-calico-lab.yaml b/config/clusterctl-templates/cluster-template-calico-lab.yaml index b5053d4..7dce5e2 100644 --- a/config/clusterctl-templates/cluster-template-calico-lab.yaml +++ b/config/clusterctl-templates/cluster-template-calico-lab.yaml @@ -557,6 +557,11 @@ data: - effect: NoSchedule key: node.cloudprovider.kubernetes.io/uninitialized value: "true" + - effect: NoSchedule + key: node.cluster.x-k8s.io/uninitialized + operator: Exists + - key: node.kubernetes.io/not-ready + operator: Exists restartPolicy: Always volumes: - name: cloud-controller-manager From 0e79be04ec404f389d5f9f752e380c6a22be2d07 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 16:14:20 +0200 Subject: [PATCH 14/21] debug: higher timeouts and debug output --- .github/workflows/integration.yaml | 2 +- capi-lab/test/integration.sh | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 99d1a42..9871c5f 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -49,7 +49,7 @@ jobs: - name: Run integration tests shell: bash - timeout-minutes: 60 + timeout-minutes: 150 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 7f0ba0a..634a18f 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -84,10 +84,15 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 360 ]; then + if [ "$attempts" -ge 480 ]; then echo "not enough machines phoned home - timeout reached" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true exit 1 fi + if [ $((attempts % 60)) -eq 0 ] && [ "$attempts" -gt 0 ]; then + echo "machine states after $attempts attempts:" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true + fi echo "$phoned/$minPhoned machines have phoned home" sleep 5 phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) From 582c1b37706422c40fb3cb3cb798e1075e5341ce Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 16:52:17 +0200 Subject: [PATCH 15/21] fix(ci): free disk space on gh ubuntu runner --- .github/workflows/integration.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 9871c5f..ed75ff2 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -23,6 +23,15 @@ jobs: - name: kamaji steps: + - name: Free disk space + # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it. + # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB. + # apt-get clean removes cached .deb files (~few hundred MB). + run: | + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + - name: Gain back workspace permissions # https://github.com/actions/checkout/issues/211 run: | [ -d "${GITHUB_WORKSPACE}" ] && sudo chown -R $USER:$USER ${GITHUB_WORKSPACE} From d272e7522aad816b9d5f646c5c0178fec7657906 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 21 May 2026 11:13:42 +0200 Subject: [PATCH 16/21] update submodule --- capi-lab/mini-lab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 195c7a1..007c279 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 195c7a1975f846b76f5c55b00540dd05886c2887 +Subproject commit 007c2791d92d50994d8f6732a75f76345725f40c From ef5a2a8dd0424b87c108f1067aa6e995c5bbb319 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 21 May 2026 11:31:29 +0200 Subject: [PATCH 17/21] chore: remove capms_sonic tests --- .github/workflows/integration.yaml | 1 - capi-lab/Makefile | 6 ++---- capi-lab/test/integration.sh | 14 ++------------ 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index ed75ff2..fe2178e 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -19,7 +19,6 @@ jobs: matrix: flavors: - name: capms_dell_sonic - - name: capms_sonic - name: kamaji steps: diff --git a/capi-lab/Makefile b/capi-lab/Makefile index b5bcc52..7a46d12 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig -MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_sonic) +MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_dell_sonic) CLUSTER_NAME ?= metal-test KUBERNETES_VERSION ?= 1.33.5 @@ -35,9 +35,7 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD) MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA)) MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA)) -ifeq ($(MINI_LAB_FLAVOR),capms_sonic) -DEPLOY_TARGET=deploy-kubeadm -else ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) +ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) DEPLOY_TARGET=deploy-kubeadm else ifeq ($(MINI_LAB_FLAVOR),kamaji) DEPLOY_TARGET=deploy-kamaji diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 634a18f..a4684ff 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -24,13 +24,8 @@ echo "$waiting/$minWaiting machines are waiting" make push-to-capi-lab -if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then - - if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then - echo "Starting capms dell sonic flavor tests" - else - echo "Starting capms sonic flavor tests" - fi +if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then + echo "Starting capms dell sonic flavor tests" export CLUSTER_NAME=metal-test @@ -73,11 +68,6 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ done echo "$phoned/$minPhoned machines have phoned home" - if [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then - echo "Applying mtu fix" - make -C capi-lab mtu-fix - fi - echo "Waiting for worker to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) minPhoned=3 From 295fe795214283ef715c7d31a75535db7c91da54 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 21 May 2026 13:03:25 +0200 Subject: [PATCH 18/21] chore: run smoke tests on self-hosted runner --- .github/workflows/integration.yaml | 19 +++++++++---------- capi-lab/test/integration.sh | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index fe2178e..86d0d52 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -11,8 +11,7 @@ on: jobs: test: name: Run tests - runs-on: ubuntu-latest - # TODO should we run on self-hosted? + runs-on: self-hosted strategy: fail-fast: false @@ -22,14 +21,14 @@ jobs: - name: kamaji steps: - - name: Free disk space - # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it. - # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB. - # apt-get clean removes cached .deb files (~few hundred MB). - run: | - sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL - sudo apt-get clean - df -h + # - name: Free disk space + # # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it. + # # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB. + # # apt-get clean removes cached .deb files (~few hundred MB). + # run: | + # sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL + # sudo apt-get clean + # df -h - name: Gain back workspace permissions # https://github.com/actions/checkout/issues/211 run: | diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index a4684ff..57f6f62 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -99,7 +99,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then echo "Waiting for tenant API server to be reachable" declare -i attempts=0 - until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version 2>&1 | grep -q "Server Version" do if [ "$attempts" -ge 180 ]; then echo "tenant API server not reachable - timeout reached" From 9f433f9b9008f7b9ffec7584bc78cae5563c0d10 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Fri, 22 May 2026 10:31:07 +0200 Subject: [PATCH 19/21] test(integration): install go for self-hosted runner --- .github/workflows/integration.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 86d0d52..4546cda 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -54,6 +54,11 @@ jobs: with: submodules: true + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - name: Run integration tests shell: bash timeout-minutes: 150 From 67d4686c2f9fad83fc129f1964706adb7479645c Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Fri, 22 May 2026 11:01:09 +0200 Subject: [PATCH 20/21] test(integration): print out node and pod state --- capi-lab/test/integration.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 57f6f62..b4e8df9 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -123,6 +123,11 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true exit 1 fi + if [ $((attempts % 60)) -eq 0 ] && [ "$attempts" -gt 0 ]; then + echo "node states after $attempts attempts:" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A || true + fi echo "$ready/$minReady nodes are Ready" sleep 5 ready=$(kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -c "^Ready$" || true) From bf86d41a318fc2bf6fae9a5b61b3b9b7b51a2d83 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Fri, 22 May 2026 15:39:00 +0200 Subject: [PATCH 21/21] test(integration): smoke tests for capms_sonic --- .github/workflows/integration.yaml | 1 + capi-lab/Makefile | 4 ++-- capi-lab/mini-lab | 2 +- capi-lab/test/integration.sh | 14 ++++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 4546cda..7903f1a 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -18,6 +18,7 @@ jobs: matrix: flavors: - name: capms_dell_sonic + - name: capms_sonic - name: kamaji steps: diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 7a46d12..31417fa 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig -MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_dell_sonic) +MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_sonic) CLUSTER_NAME ?= metal-test KUBERNETES_VERSION ?= 1.33.5 @@ -35,7 +35,7 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD) MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA)) MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA)) -ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) +ifneq (,$(filter $(MINI_LAB_FLAVOR),capms_sonic capms_dell_sonic)) DEPLOY_TARGET=deploy-kubeadm else ifeq ($(MINI_LAB_FLAVOR),kamaji) DEPLOY_TARGET=deploy-kamaji diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 007c279..3179691 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 007c2791d92d50994d8f6732a75f76345725f40c +Subproject commit 3179691de8bd6982baefef0b2445d3af7fa55032 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index b4e8df9..20961e2 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -24,8 +24,13 @@ echo "$waiting/$minWaiting machines are waiting" make push-to-capi-lab -if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then - echo "Starting capms dell sonic flavor tests" +if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + + if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then + echo "Starting capms dell sonic flavor tests" + else + echo "Starting capms sonic flavor tests" + fi export CLUSTER_NAME=metal-test @@ -68,6 +73,11 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then done echo "$phoned/$minPhoned machines have phoned home" + if [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + echo "Applying mtu fix" + make -C capi-lab mtu-fix + fi + echo "Waiting for worker to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) minPhoned=3