Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
54693db
fix: calico-lab template support for cluster-api v1.11
ma-hartma Apr 28, 2026
bd3ccf4
feat: smoke tests for capms sonic and dell_sonic flavors
ma-hartma Apr 21, 2026
f21cdc3
fix: remove redundant kubeconfig for clusterctl sample cluster
ma-hartma Apr 21, 2026
d37b278
chore: do not continue lab tests on error and do not fail fast
ma-hartma Apr 21, 2026
f8a25d2
chore: add timeout for lab integration tests
ma-hartma Apr 21, 2026
0edbc39
test(lab): wait for capms nodes to become ready
ma-hartma Apr 21, 2026
4a8e0be
test(lab): wait for tenant API server to be reachable
ma-hartma Apr 23, 2026
4201da5
chore: update submodule
ma-hartma Apr 23, 2026
e4bb411
fix: give capms machines more time to phone home
ma-hartma Apr 23, 2026
0cb077f
chore: update submodule
ma-hartma Apr 24, 2026
1900c4c
fix: expect only 2 workers
ma-hartma Apr 27, 2026
78168df
fix: metal-ccm is already deployed via template
ma-hartma Apr 28, 2026
b7dc088
fix: ccm tolerations and timeouts for third machine
ma-hartma Apr 28, 2026
0e79be0
debug: higher timeouts and debug output
ma-hartma Apr 28, 2026
582c1b3
fix(ci): free disk space on gh ubuntu runner
ma-hartma Apr 28, 2026
d272e75
update submodule
ma-hartma May 21, 2026
ef5a2a8
chore: remove capms_sonic tests
ma-hartma May 21, 2026
295fe79
chore: run smoke tests on self-hosted runner
ma-hartma May 21, 2026
9f433f9
test(integration): install go for self-hosted runner
ma-hartma May 22, 2026
67d4686
test(integration): print out node and pod state
ma-hartma May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions .github/workflows/integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,25 @@ on:
jobs:
test:
name: Run tests
runs-on: ubuntu-latest
runs-on: self-hosted

continue-on-error: true
strategy:
fail-fast: false
matrix:
flavors:
- name: capms_dell_sonic
- name: kamaji

steps:
# - name: Free disk space
# # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it.
# # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB.
# # apt-get clean removes cached .deb files (~few hundred MB).
# run: |
# sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL
# sudo apt-get clean
# df -h

- name: Gain back workspace permissions # https://github.com/actions/checkout/issues/211
run: |
[ -d "${GITHUB_WORKSPACE}" ] && sudo chown -R $USER:$USER ${GITHUB_WORKSPACE}
Expand All @@ -44,8 +54,14 @@ jobs:
with:
submodules: true

- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod

- name: Run integration tests
shell: bash
timeout-minutes: 150
run: |
eval $(make -C capi-lab --silent dev-env)
./capi-lab/test/ci-cleanup.sh
Expand Down
70 changes: 40 additions & 30 deletions capi-lab/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml
KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext
KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig

MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms)
MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_dell_sonic)

CLUSTER_NAME ?= metal-test
KUBERNETES_VERSION ?= 1.33.5
Expand Down Expand Up @@ -35,7 +35,7 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD)
MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA))
MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA))

ifeq ($(MINI_LAB_FLAVOR),capms)
ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic)
DEPLOY_TARGET=deploy-kubeadm
else ifeq ($(MINI_LAB_FLAVOR),kamaji)
DEPLOY_TARGET=deploy-kamaji
Expand Down Expand Up @@ -101,54 +101,64 @@ control-plane-ip:
apply-sample-cluster:
$(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}'))
echo $(CLUSTER_NAME)
clusterctl generate cluster $(CLUSTER_NAME) \
--kubeconfig=$(KUBECONFIG) \
--worker-machine-count 1 \
--control-plane-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) apply -f -
docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \
clusterctl generate cluster $(CLUSTER_NAME) \
--worker-machine-count 1 \
--control-plane-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-calico-lab.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) apply -f -

.PHONY: delete-sample-cluster
delete-sample-cluster:
$(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}'))
clusterctl generate cluster $(CLUSTER_NAME) \
--kubeconfig=$(KUBECONFIG) \
--worker-machine-count 1 \
--control-plane-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) delete -f -
docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \
clusterctl generate cluster $(CLUSTER_NAME) \
--worker-machine-count 1 \
--control-plane-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-calico-lab.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) delete -f -

# Set the MTU of device vtep-1001 to 9100 on leaf01 and leaf02 over ssh.
# The ssh config is referenced relative to mini-lab, hence the cd. The loop
# stops at the first host that fails and propagates ssh's exit status.
.PHONY: mtu-fix
mtu-fix:
	cd mini-lab && for leaf in leaf01 leaf02; do \
		ssh -F files/ssh/config "$$leaf" 'ip link set dev vtep-1001 mtu 9100 && echo done' || exit $$?; \
	done

# Read the base64-encoded kubeconfig from the $(CLUSTER_NAME)-kubeconfig secret
# (field .data.value) and write the decoded file to ../$(CLUSTER_NAME).kubeconfig.
#
# The secret is fetched into a shell variable and checked before the output
# file is written. In a plain `kubectl ... | base64 -d > file` pipeline the
# recipe's exit status is that of base64, so a failed or empty lookup would
# leave an empty kubeconfig behind while the target still reports success.
.PHONY: sample-cluster-kubeconfig
sample-cluster-kubeconfig:
	set -e; \
	encoded=$$(kubectl --kubeconfig=$(KUBECONFIG) get secret $(CLUSTER_NAME)-kubeconfig -o jsonpath='{.data.value}'); \
	test -n "$$encoded" || { echo "secret $(CLUSTER_NAME)-kubeconfig has no .data.value" >&2; exit 1; }; \
	printf '%s' "$$encoded" | base64 -d > ../$(CLUSTER_NAME).kubeconfig
	@echo "Sample cluster kubeconfig written to $(CLUSTER_NAME).kubeconfig"

# Render the kustomize overlay ../config/target-cluster/overlays/kubeadm, pass it
# through envsubst and apply the result with ../$(CLUSTER_NAME).kubeconfig.
#
# Before that, METAL_NODE_NETWORK_ID is set as a make variable to the id of the
# first entry metalctl lists for project $(METAL_PROJECT_ID) (jq '.[0].id').
# NOTE(review): envsubst reads the process environment, so this only has an effect
# if METAL_NODE_NETWORK_ID is exported elsewhere in this Makefile - confirm.
.PHONY: sample-cluster-deploy-metal-ccm
sample-cluster-deploy-metal-ccm:
$(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id'))
kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f -

.PHONY: create-kamaji-tenant
create-kamaji-tenant:
$(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}'))
$(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}'))
$(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id'))
kubectl --kubeconfig=$(KUBECONFIG) create namespace $(TENANT_NAMESPACE) --dry-run=client -o yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f -
# let MetalLB assign the IP to the tenant cluster control plane service
envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f -
docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \
clusterctl generate cluster $(CLUSTER_NAME) \
--target-namespace $(TENANT_NAMESPACE) \
--worker-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-kamaji-tenant.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) apply -f -
clusterctl generate cluster $(CLUSTER_NAME) \
--target-namespace $(TENANT_NAMESPACE) \
--worker-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-kamaji-tenant.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) apply -f -

.PHONY: delete-kamaji-tenant
delete-kamaji-tenant:
docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \
clusterctl generate cluster $(CLUSTER_NAME) \
--target-namespace $(TENANT_NAMESPACE) \
--worker-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-kamaji-tenant.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) delete -f -
clusterctl generate cluster $(CLUSTER_NAME) \
--target-namespace $(TENANT_NAMESPACE) \
--worker-machine-count 1 \
--kubernetes-version $(KUBERNETES_VERSION) \
--from /templates/cluster-template-kamaji-tenant.yaml \
| kubectl --kubeconfig=$(KUBECONFIG) delete -f -

.PHONY: kamaji-tenant-kubeconfig
kamaji-tenant-kubeconfig:
Expand All @@ -162,4 +172,4 @@ kamaji-tenant-deploy-calico:

.PHONY: kamaji-tenant-deploy-metal-ccm
kamaji-tenant-deploy-metal-ccm:
kustomize build ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f -
kubectl kustomize ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f -
18 changes: 0 additions & 18 deletions capi-lab/compose.kamaji.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,7 @@ services:
clusterctl:
image: registry.k8s.io/cluster-api/clusterctl:v1.12.3
environment:
- EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true
- METAL_API_HMAC=${METAL_API_HMAC}
- METAL_API_URL=${METAL_API_URL}
- METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE}
- CLUSTER_NAME=${CLUSTER_NAME}
- TENANT_NAMESPACE=${TENANT_NAMESPACE}
- KUBERNETES_VERSION=${KUBERNETES_VERSION}
- CONTROL_PLANE_IP=${CONTROL_PLANE_IP}
- PODS_CIDR=${PODS_CIDR}
- SERVICES_CIDR=${SERVICES_CIDR}
- METAL_PARTITION=${METAL_PARTITION}
- METAL_PROJECT_ID=${METAL_PROJECT_ID}
- CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE}
- CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE}
- WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE}
- WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE}
- FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE}
- FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE}
- METAL_NODE_NETWORK_ID=${METAL_NODE_NETWORK_ID}
- FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS}
volumes:
- ../config/clusterctl-templates:/templates:ro
18 changes: 17 additions & 1 deletion capi-lab/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,24 @@ services:
environment:
- EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true
- KUBECONFIG=/kubeconfig
image: registry.k8s.io/cluster-api/clusterctl:v1.9.5
- METAL_API_HMAC=${METAL_API_HMAC}
- METAL_API_URL=${METAL_API_URL}
- METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE}
- CLUSTER_NAME=${CLUSTER_NAME}
- KUBERNETES_VERSION=${KUBERNETES_VERSION}
- CONTROL_PLANE_IP=${CONTROL_PLANE_IP}
- METAL_PARTITION=${METAL_PARTITION}
- METAL_PROJECT_ID=${METAL_PROJECT_ID}
- CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE}
- CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE}
- WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE}
- WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE}
- FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE}
- FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE}
- FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS}
image: registry.k8s.io/cluster-api/clusterctl:v1.11.4
network_mode: host
user: root
volumes:
- ${KUBECONFIG}:/kubeconfig:ro
- ../config/clusterctl-templates:/templates:ro
125 changes: 121 additions & 4 deletions capi-lab/test/integration.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,136 @@ minWaiting=2
declare -i attempts=0
until [ "$waiting" -ge $minWaiting ]
do
if [ "$attempts" -ge 60 ]; then
if [ "$attempts" -ge 180 ]; then
echo "not enough machines in waiting state - timeout reached"
exit 1
fi
echo "$waiting/$minWaiting machines are waiting"
sleep 5
waiting=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Waiting | wc -l)
attempts=$attempts+1
attempts+=1
done
echo "$waiting/$minWaiting machines are waiting"

make push-to-capi-lab

# Smoke test for the capms_dell_sonic flavor.
#
# Creates the sample cluster through the capi-lab make targets and then polls,
# with a 5 second sleep between attempts, until in turn:
#   1. the Cluster resource reports phase "Provisioned"   (max 180 attempts)
#   2. two machines are in the "Phoned Home" state        (max 240 attempts)
#   3. three machines are in the "Phoned Home" state      (max 480 attempts)
#   4. the tenant API server answers `kubectl version`    (max 180 attempts)
#   5. two nodes report status "Ready"                    (max 180 attempts)
# Any phase that runs out of attempts aborts the script with exit code 1.
#
# All variable expansions used as command arguments are quoted so that paths or
# names containing whitespace or glob characters are passed through unchanged.
if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then
  echo "Starting capms dell sonic flavor tests"

  export CLUSTER_NAME=metal-test

  echo "Creating control plane IP"
  make -C capi-lab control-plane-ip

  echo "Applying sample cluster"
  make -C capi-lab apply-sample-cluster

  # Phase 1: wait for the management cluster to report the Cluster as Provisioned.
  echo "Waiting for cluster to be provisioned"
  declare -i attempts=0
  until kubectl --kubeconfig "${KUBECONFIG}" get cluster "${CLUSTER_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned"
  do
    if [ "$attempts" -ge 180 ]; then
      echo "cluster was not provisioned - timeout reached"
      # Dump the resource for post-mortem analysis; never mask the timeout exit.
      kubectl --kubeconfig "${KUBECONFIG}" get cluster "${CLUSTER_NAME}" -o yaml || true
      exit 1
    fi
    echo "cluster ${CLUSTER_NAME} is not yet provisioned"
    sleep 5
    attempts+=1
  done
  echo "Cluster ${CLUSTER_NAME} is provisioned"


  # Phase 2: the first two machines (firewall and control-plane) must phone home.
  echo "Waiting for firewall and control-plane to get to Phoned Home state"
  phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l)
  minPhoned=2
  declare -i attempts=0
  until [ "$phoned" -ge "$minPhoned" ]
  do
    if [ "$attempts" -ge 240 ]; then
      echo "not enough machines phoned home - timeout reached"
      exit 1
    fi
    echo "$phoned/$minPhoned machines have phoned home"
    sleep 5
    phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l)
    attempts+=1
  done
  echo "$phoned/$minPhoned machines have phoned home"

  # Phase 3: a third machine (the worker) must phone home as well.
  echo "Waiting for worker to get to Phoned Home state"
  phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l)
  minPhoned=3
  declare -i attempts=0
  until [ "$phoned" -ge "$minPhoned" ]
  do
    if [ "$attempts" -ge 480 ]; then
      echo "not enough machines phoned home - timeout reached"
      docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true
      exit 1
    fi
    # Every 60 attempts print the full machine list as a progress snapshot.
    if [ "$((attempts % 60))" -eq 0 ] && [ "$attempts" -gt 0 ]; then
      echo "machine states after $attempts attempts:"
      docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true
    fi
    echo "$phoned/$minPhoned machines have phoned home"
    sleep 5
    phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l)
    attempts+=1
  done
  echo "$phoned/$minPhoned machines have phoned home"

  echo "Generating kubeconfig for sample cluster"
  make -C capi-lab sample-cluster-kubeconfig

  # TODO remove once we can reliably check for the nodes to be ready
  kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get nodes || true
  kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get pods -A || true

  # Phase 4: `kubectl version` only prints "Server Version" once the API server answers.
  echo "Waiting for tenant API server to be reachable"
  declare -i attempts=0
  until kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" version 2>&1 | grep -q "Server Version"
  do
    if [ "$attempts" -ge 180 ]; then
      echo "tenant API server not reachable - timeout reached"
      kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" version || true
      exit 1
    fi
    echo "tenant API server not reachable yet"
    sleep 5
    attempts+=1
  done
  echo "Tenant API server is reachable"

  # Phase 5: count nodes whose STATUS column is exactly "Ready".
  echo "Waiting for control-plane node and worker node to become Ready"
  minReady=2
  ready=0
  declare -i attempts=0
  until [ "$ready" -ge "$minReady" ]
  do
    if [ "$attempts" -ge 180 ]; then
      echo "not enough nodes became Ready - timeout reached"
      kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get nodes || true
      exit 1
    fi
    # Every 60 attempts print node and pod state as a progress snapshot.
    if [ "$((attempts % 60))" -eq 0 ] && [ "$attempts" -gt 0 ]; then
      echo "node states after $attempts attempts:"
      kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get nodes || true
      kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get pods -A || true
    fi
    echo "$ready/$minReady nodes are Ready"
    sleep 5
    # grep -c exits non-zero when it counts 0 matches; `|| true` keeps that from failing the assignment.
    ready=$(kubectl --kubeconfig "${CLUSTER_NAME}.kubeconfig" get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -c "^Ready$" || true)
    attempts+=1
  done
  echo "$ready/$minReady nodes are Ready"

fi


if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then

echo "Starting kamaji tests"
echo "Starting kamaji flavor tests"

echo "Creating control plane IP"
export CLUSTER_NAME=kamaji-tenant-test
Expand All @@ -42,7 +156,7 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then
declare -i attempts=0
until [ "$phoned" -ge $minPhoned ]
do
if [ "$attempts" -ge 120 ]; then
if [ "$attempts" -ge 180 ]; then
echo "not enough machines phoned home - timeout reached"
exit 1
fi
Expand All @@ -67,6 +181,9 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then

echo "Checking if tenant cluster exists"
if kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "Ready"; then
# Currently this also catches NotReady nodes, but that's good enough for now to verify
# that the node has joined.
# Only metal-ccm will be able to set the node to Ready but we do not go that far here
echo "Nodes have joined the cluster and are ready"
elif kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "No resources found"; then
echo "Nodes have not joined yet"
Expand Down
Loading
Loading