From f90836da8dade415bea2d0670bf78c523175f17f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 14:08:04 +0100 Subject: [PATCH 01/33] CFN-6544: set Noble as default stemcell for acceptance tests --- .gitignore | 2 +- .../{bionic_test.go => jammy_test.go} | 12 +- acceptance-tests/run-local.sh | 41 ++-- ci/Dockerfile | 13 +- ci/bosh-scaled-out.yml | 3 + ci/pipeline.yml | 14 +- ci/scripts/acceptance-tests | 14 +- ci/scripts/functions-ci.sh | 14 +- ci/scripts/start-bosh.sh | 217 +++++++++++------- .../.gitkeep | 0 manifests/haproxy.yml | 2 +- 11 files changed, 194 insertions(+), 138 deletions(-) rename acceptance-tests/{bionic_test.go => jammy_test.go} (73%) create mode 100644 ci/bosh-scaled-out.yml rename ci/scripts/{stemcell-bionic => stemcell-jammy}/.gitkeep (100%) diff --git a/.gitignore b/.gitignore index 5b6f7d4b..881ead69 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ config/settings.yml releases/*.tgz releases/**/*.tgz ci/scripts/stemcell/*.tgz -ci/scripts/stemcell-bionic/*.tgz +ci/scripts/stemcell-jammy/*.tgz dev_releases blobs/* .blobs diff --git a/acceptance-tests/bionic_test.go b/acceptance-tests/jammy_test.go similarity index 73% rename from acceptance-tests/bionic_test.go rename to acceptance-tests/jammy_test.go index 1823cd98..44ffc175 100644 --- a/acceptance-tests/bionic_test.go +++ b/acceptance-tests/jammy_test.go @@ -7,14 +7,14 @@ import ( . "github.com/onsi/ginkgo/v2" ) -var _ = Describe("Bionic", func() { - It("Correctly proxies HTTP requests when using the Bionic stemcell", func() { +var _ = Describe("Jammy", func() { + It("Correctly proxies HTTP requests when using the Jammy stemcell", func() { - opsfileBionic := `--- -# Configure Bionic stemcell + opsfileJammy := `--- +# Configure Jammy stemcell - type: replace path: /stemcells/alias=default/os - value: ubuntu-bionic + value: ubuntu-jammy ` haproxyBackendPort := 12000 @@ -22,7 +22,7 @@ var _ = Describe("Bionic", func() { haproxyBackendPort: haproxyBackendPort, haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: deploymentNameForTestNode(), - }, []string{opsfileBionic}, map[string]interface{}{}, true) + }, []string{opsfileJammy}, map[string]interface{}{}, true) closeLocalServer, localPort := startDefaultTestServer() defer closeLocalServer() diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 391a8acd..01d402f8 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -3,21 +3,27 @@ set -eu REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" source "${REPO_DIR}/ci/scripts/functions-ci.sh" +FOCUS="" +PARALLELISM="" KEEP_RUNNING="" usage() { - echo -e "Usage: $0 [-F ] [-k] + echo -e "Usage: $0 [-F ] [-P ] [-k] -F Focus on a particular test. Expects a Ginkgo test name. Keep bosh running afterwards. + -P Set Ginkgo parallel node count. Default is '-p' (smart parallelism). -k Keep bosh container running. Useful for debug." 1>&2; exit 1; } -while getopts ":F:k" o; do +while getopts ":F:P:k" o; do case "${o}" in F) FOCUS=${OPTARG} KEEP_RUNNING=true ;; + P) + PARALLELISM=${OPTARG} + ;; k) KEEP_RUNNING=true ;; @@ -28,25 +34,11 @@ while getopts ":F:k" o; do done shift $((OPTIND-1)) -docker_mac_check_cgroupsv1() { - # Force cgroups v1 on Docker for Mac - # inspired by https://github.com/docker/for-mac/issues/6073#issuecomment-1018793677 - - SETTINGS=~/Library/Group\ Containers/group.com.docker/settings.json - - cgroupsV1Enabled=$(jq '.deprecatedCgroupv1' "$SETTINGS") - if [ "$cgroupsV1Enabled" != "true" ]; then - echo "deprecatedCgroupv1 should be enabled in $SETTINGS. Otherwise the acceptance tests will not run on Docker for Mac." - echo "Check in the README.md for a convenient script to set deprecatedCgroupv1 and restart Docker." - exit 1 - fi -} - check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( - ci/scripts/stemcell/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent - ci/scripts/stemcell-bionic/bosh-stemcell-*-ubuntu-bionic-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-bionic-go_agent + ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble + ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent ) for entry in "${REQUIRED_FILE_PATTERNS[@]}"; do @@ -63,9 +55,10 @@ check_required_files() { fi ( - echo "$filepattern not found, downloading latest." + echo "$filepattern not found, downloading." cd "$folder" && \ - resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1) && \ + resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') && \ + echo "Resolved URL: $resolved" && \ curl -s --remote-name --remote-header-name --location "$resolved" && \ echo "Downloaded '$url' successfully." && \ ls -1lh "$folder/"$filepattern @@ -81,10 +74,6 @@ check_required_files() { check_required_files -if [ "$(uname)" == "Darwin" ]; then - docker_mac_check_cgroupsv1 -fi - build_image "${REPO_DIR}/ci" prepare_docker_scratch @@ -93,9 +82,9 @@ if [ -n "$KEEP_RUNNING" ] ; then echo echo "*** KEEP_RUNNING enabled. Please clean up docker scratch after removing containers: ${DOCKER_SCRATCH}" echo - docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="$FOCUS" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" + docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="${FOCUS}" -e PARALLELISM="${PARALLELISM}" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" else - docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" + docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" -e PARALLELISM="${PARALLELISM}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" echo "Cleaning up docker scratch: ${DOCKER_SCRATCH}" sudo rm -rf "${DOCKER_SCRATCH}" fi diff --git a/ci/Dockerfile b/ci/Dockerfile index 56c7550b..34414d78 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,18 +1,23 @@ -FROM bosh/docker-cpi:main +FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest # Install all necessary tools for haproxy testflight and dependency autobump ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget jq git vim nano python3-pip && \ + apt-get install -y wget jq git vim nano python3-pip python3-venv && \ apt-get clean # Set bosh env at login RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc +# Copy ops files +COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml + # Install Python libraries needed for scripts +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:${PATH}" COPY scripts/requirements.txt /requirements.txt -RUN /usr/bin/python3 -m pip install -r /requirements.txt +RUN pip install -r /requirements.txt # Install go dependencies ENV GOBIN=/usr/local/bin -RUN go install github.com/geofffranks/spruce/cmd/spruce@latest +RUN go install github.com/geofffranks/spruce/cmd/spruce@latest \ No newline at end of file diff --git a/ci/bosh-scaled-out.yml b/ci/bosh-scaled-out.yml new file mode 100644 index 00000000..93937df3 --- /dev/null +++ b/ci/bosh-scaled-out.yml @@ -0,0 +1,3 @@ +- type: replace + path: /instance_groups/name=bosh/properties/director/workers? + value: 12 \ No newline at end of file diff --git a/ci/pipeline.yml b/ci/pipeline.yml index bc374b1f..75436cfd 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -121,7 +121,7 @@ jobs: - in_parallel: - { get: git, trigger: true, passed: [unit-tests] } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - task: acceptance-tests privileged: true @@ -131,7 +131,7 @@ jobs: inputs: - { name: git } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git/ci/scripts/acceptance-tests args: [] @@ -152,7 +152,7 @@ jobs: - do: - { get: git-pull-requests, trigger: true, version: every } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - put: git-pull-requests params: @@ -169,7 +169,7 @@ jobs: inputs: - { name: git-pull-requests } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git-pull-requests/ci/scripts/acceptance-tests args: [] @@ -403,15 +403,15 @@ resources: - "dependabot" - "CFN-CI" - - name: stemcell-bionic + - name: stemcell-jammy type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-bionic-go_agent + name: bosh-warden-boshlite-ubuntu-jammy-go_agent - name: stemcell type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-jammy-go_agent + name: bosh-warden-boshlite-ubuntu-noble - name: version type: semver diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 9cc17e83..579a864a 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -35,11 +35,19 @@ echo "----- Running tests" export PATH=$PATH:$GOPATH/bin ginkgo version -PARALLELISM="-p" -if [ -n "$FOCUS" ]; then +echo "------------------------------------------------------------------" +if [ -n "${FOCUS:-}" ]; then PARALLELISM="--nodes=1" + echo "FOCUS is set, thus PARALLELISM is set to '$PARALLELISM'" +elif [ -n "${PARALLELISM:-}" ]; then + PARALLELISM="--nodes=$PARALLELISM" + echo "PARALLELISM is set. Will run ginkgo with '$PARALLELISM'" +else + PARALLELISM="-p" + echo "PARALLELISM is not set. Using default '$PARALLELISM'" fi +echo "------------------------------------------------------------------" ginkgo -v "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" -keep_running_info +keep_running_info \ No newline at end of file diff --git a/ci/scripts/functions-ci.sh b/ci/scripts/functions-ci.sh index d3e64d50..fa8e5465 100755 --- a/ci/scripts/functions-ci.sh +++ b/ci/scripts/functions-ci.sh @@ -62,18 +62,18 @@ function bosh_release() { } function bosh_assets() { - stemcell_jammy_path="$START_DIR/stemcell/*.tgz" - stemcell_bionic_path="$START_DIR/stemcell-bionic/*.tgz" + stemcell_noble_path="$START_DIR/stemcell/*.tgz" + stemcell_jammy_path="$START_DIR/stemcell-jammy/*.tgz" + + echo "----- Uploading Noble stemcell" + bosh -n upload-stemcell $stemcell_noble_path echo "----- Uploading Jammy stemcell" bosh -n upload-stemcell $stemcell_jammy_path - echo "----- Uploading Bionic stemcell" - bosh -n upload-stemcell $stemcell_bionic_path - echo "----- Uploading os-conf (used for tests only)" - bosh -n upload-release --sha1 386293038ae3d00813eaa475b4acf63f8da226ef \ - https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=22.1.2 + bosh -n upload-release --sha1 sha256:efcf30754ce4c5f308aedab3329d8d679f5967b2a4c3c453204c7cb10c7c5ed9 \ + https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=23.0.0 export BOSH_PATH=$(command -v bosh) export BASE_MANIFEST_PATH="$PWD/manifests/haproxy.yml" diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 3bda28f6..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -6,44 +6,30 @@ function generate_certs() { local certs_dir certs_dir="${1}" - pushd "${certs_dir}" - - jq -ner --arg "ip" "${OUTER_CONTAINER_IP}" '{ - "variables": [ - { - "name": "docker_ca", - "type": "certificate", - "options": { - "is_ca": true, - "common_name": "ca" - } - }, - { - "name": "docker_tls", - "type": "certificate", - "options": { - "extended_key_usage": [ - "server_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - }, - { - "name": "client_docker_tls", - "type": "certificate", - "options": { - "extended_key_usage": [ - "client_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - } - ] - }' > ./bosh-vars.yml + pushd "${certs_dir}" > /dev/null + cat < ./bosh-vars.yml +--- +variables: +- name: docker_ca + type: certificate + options: + is_ca: true + common_name: ca +- name: docker_tls + type: certificate + options: + extended_key_usage: [server_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +- name: client_docker_tls + type: certificate + options: + extended_key_usage: [client_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +EOF bosh int ./bosh-vars.yml --vars-store=./certs.yml bosh int ./certs.yml --path=/docker_ca/ca > ./ca.pem @@ -51,12 +37,13 @@ function generate_certs() { bosh int ./certs.yml --path=/docker_tls/private_key > ./server-key.pem bosh int ./certs.yml --path=/client_docker_tls/certificate > ./cert.pem bosh int ./certs.yml --path=/client_docker_tls/private_key > ./key.pem - # generate certs in json format - # - ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "$certs_dir/ca_json_safe.pem" - ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "$certs_dir/client_certificate_json_safe.pem" - ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "$certs_dir/client_private_key_json_safe.pem" - popd + + # generate certs in json format + ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "${certs_dir}/ca_json_safe.pem" + ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "${certs_dir}/client_certificate_json_safe.pem" + ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "${certs_dir}/client_private_key_json_safe.pem" + + popd > /dev/null } function sanitize_cgroups() { @@ -64,15 +51,28 @@ function sanitize_cgroups() { mountpoint -q /sys/fs/cgroup || \ mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup + if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # cgroups v2: enable nesting (based on moby/moby hack/dind) + mkdir -p /sys/fs/cgroup/init + # Loop to handle races from concurrent process creation (e.g. docker exec) + while ! { + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || : + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do true; done + return + fi + mount -o remount,rw /sys/fs/cgroup - sed -e 1d /proc/cgroups | while read sys hierarchy num enabled; do + # shellcheck disable=SC2034 + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue fi - grouping="$(cat /proc/self/cgroup | cut -d: -f2 | grep "\\<$sys\\>")" + grouping="$(cut -d: -f2 < /proc/self/cgroup | grep "\\<$sys\\>")" if [ -z "$grouping" ]; then # subsystem not mounted anywhere; mount it on its own grouping="$sys" @@ -102,17 +102,32 @@ function sanitize_cgroups() { source "ci/scripts/functions-ci.sh" function start_docker() { - generate_certs "$1" - local mtu + local certs_dir + certs_dir="${1}" + + export DNS_IP="8.8.8.8" + + # docker will fail starting with the new iptables. it throws: + # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... + update-alternatives --set iptables /usr/sbin/iptables-legacy + + generate_certs "${certs_dir}" + mkdir -p /var/log mkdir -p /var/run sanitize_cgroups + echo "Sanitized cgroups for docker" >&2 + + # systemd inside nested Docker containers requires shared mount propagation + mount --make-rshared / - # ensure systemd cgroup is present - mkdir -p /sys/fs/cgroup/systemd - if ! mountpoint -q /sys/fs/cgroup/systemd ; then - mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + # ensure systemd cgroup is present (cgroups v1 only) + if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then + mkdir -p /sys/fs/cgroup/systemd + if ! mountpoint -q /sys/fs/cgroup/systemd ; then + mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + fi fi # check for /proc/sys being mounted readonly, as systemd does @@ -120,12 +135,13 @@ function start_docker() { mount -o remount,rw /proc/sys fi - mtu=$(cat /sys/class/net/$(ip route get 8.8.8.8|awk '{ print $5 }')/mtu) + local mtu + mtu=$(cat "/sys/class/net/$(ip route get ${DNS_IP} | awk '{ print $5 }')/mtu") [[ ! -d /etc/docker ]] && mkdir /etc/docker cat < /etc/docker/daemon.json { - "hosts": ["${DOCKER_HOST}","unix:///var/run/docker.sock"], + "hosts": ["${DOCKER_HOST}"], "tls": true, "tlscert": "${certs_dir}/server-cert.pem", "tlskey": "${certs_dir}/server-key.pem", @@ -137,15 +153,14 @@ function start_docker() { EOF service docker start - - export DOCKER_TLS_VERIFY=1 - export DOCKER_CERT_PATH=$1 + echo "Started docker service" >&2 rc=1 - for i in $(seq 1 10); do - echo waiting for docker to come up... - sleep 10 + for i in $(seq 1 100); do + echo "waiting for docker to come up... (${i})" + sleep 1 set +e + echo "Docker started, checking if it's responsive..." docker info rc=$? set -e @@ -165,66 +180,102 @@ EOF if [ -z "${KEEP_RUNNING}" ] ; then trap stop_docker ERR fi - echo "$certs_dir" + + echo "${certs_dir}" } function main() { - export OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list - .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } - .map { |addr| addr.ip_address }.first') - - export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" + # ".first" - original code could return multiple IPs (e.g., container IP + docker0 bridge IP) + # which breaks the docker_tls JSON variable formatting + OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list + .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } + .map { |addr| addr.ip_address }.first') + export OUTER_CONTAINER_IP + echo "Determined OUTER_CONTAINER_IP: ${OUTER_CONTAINER_IP}" >&2 local certs_dir certs_dir=$(mktemp -d) - start_docker "${certs_dir}" local local_bosh_dir local_bosh_dir="/tmp/local-bosh/director" + mkdir -p ${local_bosh_dir} + + cat < "${local_bosh_dir}/docker-env" +export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" +export DOCKER_TLS_VERIFY=1 +export DOCKER_CERT_PATH="${certs_dir}" +EOF + echo "Source '${local_bosh_dir}/docker-env' to run docker" >&2 + source "${local_bosh_dir}/docker-env" - if ! docker network ls | grep director_network; then - docker network create -d bridge --subnet=10.245.0.0/16 director_network + start_docker "${certs_dir}" + echo "Docker is up and running with TLS configured" >&2 + + local docker_network_name="director_network" + local docker_network_cidr="10.245.0.0/16" + if docker network ls | grep -q "${docker_network_name}"; then + echo "A docker network named '${docker_network_name}' already exists, skipping creation" >&2 + else + docker network create -d bridge --subnet=${docker_network_cidr} "${docker_network_name}" + echo "Created docker network '${docker_network_name}' with subnet '${docker_network_cidr}'" >&2 fi - compilation_ops="$PWD/ci/compilation.yml" pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null + echo "Current directory: $(pwd)" >&2 + export BOSH_DIRECTOR_IP="10.245.0.3" export BOSH_ENVIRONMENT="docker-director" - mkdir -p ${local_bosh_dir} + cat < "${local_bosh_dir}/docker_tls.json" +{ + "ca": "$(cat "${certs_dir}/ca_json_safe.pem")", + "certificate": "$(cat "${certs_dir}/client_certificate_json_safe.pem")", + "private_key": "$(cat "${certs_dir}/client_private_key_json_safe.pem")" +} +EOF - command bosh int bosh.yml \ + echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 + bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ + -o /usr/local/local-releases.yml \ + -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ - -v internal_cidr=10.245.0.0/16 \ + -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ -v internal_ip="${BOSH_DIRECTOR_IP}" \ -v docker_host="${DOCKER_HOST}" \ - -v network=director_network \ - -v docker_tls="{\"ca\": \"$(cat "${certs_dir}"/ca_json_safe.pem)\",\"certificate\": \"$(cat "${certs_dir}"/client_certificate_json_safe.pem)\",\"private_key\": \"$(cat "${certs_dir}"/client_private_key_json_safe.pem)\"}" \ - ${@} > "${local_bosh_dir}/bosh-director.yml" + -v network="${docker_network_name}" \ + -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ + "${@}" > "${local_bosh_dir}/bosh-director.yml" - command bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + echo "Creating BOSH director environment..." >&2 + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" + echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" + bosh_client_secret="$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password)" + + echo "Setting up BOSH CLI environment..." >&2 bosh -e "${BOSH_DIRECTOR_IP}" --ca-cert "${local_bosh_dir}/ca.crt" alias-env "${BOSH_ENVIRONMENT}" cat < "${local_bosh_dir}/env" + export BOSH_DIRECTOR_IP="${BOSH_DIRECTOR_IP}" export BOSH_ENVIRONMENT="${BOSH_ENVIRONMENT}" export BOSH_CLIENT=admin - export BOSH_CLIENT_SECRET=$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password) + export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" - EOF + echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" - bosh -n update-cloud-config docker/cloud-config.yml -v network=director_network -o "${compilation_ops}" + echo "Updating BOSH cloud config with Docker network..." >&2 + bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" popd > /dev/null } echo "----- Starting BOSH" -main $@ +main "${@}" \ No newline at end of file diff --git a/ci/scripts/stemcell-bionic/.gitkeep b/ci/scripts/stemcell-jammy/.gitkeep similarity index 100% rename from ci/scripts/stemcell-bionic/.gitkeep rename to ci/scripts/stemcell-jammy/.gitkeep diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 92a99154..16fcb911 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -31,7 +31,7 @@ update: stemcells: - alias: default - os: ubuntu-jammy + os: ubuntu-noble version: latest releases: From 81054066ccce34145f852941609e2487a34ce7d3 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 17:32:52 +0100 Subject: [PATCH 02/33] CFN-6544: bump to the last bpm version --- manifests/haproxy.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 16fcb911..3b09636e 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -36,9 +36,9 @@ stemcells: releases: - name: bpm - version: 1.2.14 - url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.2.14 - sha1: 1e357a533654e2067e15231dd8ac5bad2e697dff + version: 1.4.26 + url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 + sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 - name: haproxy version: 16.4.0+3.2.13 url: https://github.com/cloudfoundry/haproxy-boshrelease/releases/download/v16.4.0+3.2.13/haproxy-16.4.0+3.2.13.tgz From 14394dc3264c38ba90cf96d2fd40272b5fee46ff Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 5 Mar 2026 10:51:42 +0100 Subject: [PATCH 03/33] CFN-6544: fix Docker CPI image in pipeline --- ci/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 75436cfd..8b58f69e 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -453,7 +453,7 @@ resources: - name: docker-cpi-image type: docker-image source: - repository: bosh/docker-cpi + repository: ghcr.io/cloudfoundry/bosh/docker-cpi - name: git-ci type: git From c07aa62d072cd68bd179f7679b9a77c2b8f1f682 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 13:11:17 +0100 Subject: [PATCH 04/33] CFN-6544: fixes --- acceptance-tests/run-local.sh | 18 ++++++++++++++++++ ci/Dockerfile | 1 + ci/scripts/start-bosh.sh | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 01d402f8..33946376 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -34,6 +34,20 @@ while getopts ":F:P:k" o; do done shift $((OPTIND-1)) +docker_mac_check_cgroupsv1() { + # Force cgroups v1 on Docker for Mac + # inspired by https://github.com/docker/for-mac/issues/6073#issuecomment-1018793677 + + SETTINGS=~/Library/Group\ Containers/group.com.docker/settings.json + + cgroupsV1Enabled=$(jq '.deprecatedCgroupv1' "$SETTINGS") + if [ "$cgroupsV1Enabled" != "true" ]; then + echo "deprecatedCgroupv1 should be enabled in $SETTINGS. Otherwise the acceptance tests will not run on Docker for Mac." + echo "Check in the README.md for a convenient script to set deprecatedCgroupv1 and restart Docker." + exit 1 + fi +} + check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( @@ -74,6 +88,10 @@ check_required_files() { check_required_files +if [ "$(uname)" == "Darwin" ]; then + docker_mac_check_cgroupsv1 +fi + build_image "${REPO_DIR}/ci" prepare_docker_scratch diff --git a/ci/Dockerfile b/ci/Dockerfile index 34414d78..c61ec60d 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get update && \ apt-get clean # Set bosh env at login +RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..d1aacf61 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do + sed -e 1d /proc/cgroups | while read -r sys enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 6094de114922c06bc14d7194fb70cd258d502808 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 13:50:47 +0100 Subject: [PATCH 05/33] CFN-6544: rollback cgroup check change --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d1aacf61..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys enabled; do + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 024e286e85d80284c40ff9993c1c96866327e6ff Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 15:42:49 +0100 Subject: [PATCH 06/33] CFN-6544: new ops file (start with systemd) --- ci/Dockerfile | 3 ++- ci/{ => ops}/bosh-scaled-out.yml | 0 ci/{ => ops}/compilation.yml | 0 ci/ops/noble-support.yml | 6 ++++++ ci/scripts/start-bosh.sh | 1 + 5 files changed, 9 insertions(+), 1 deletion(-) rename ci/{ => ops}/bosh-scaled-out.yml (100%) rename ci/{ => ops}/compilation.yml (100%) create mode 100644 ci/ops/noble-support.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index c61ec60d..537fdaf8 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,8 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml +COPY ops/noble-support.yml /usr/local/bosh-deployment/noble-support.yml +COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv diff --git a/ci/bosh-scaled-out.yml b/ci/ops/bosh-scaled-out.yml similarity index 100% rename from ci/bosh-scaled-out.yml rename to ci/ops/bosh-scaled-out.yml diff --git a/ci/compilation.yml b/ci/ops/compilation.yml similarity index 100% rename from ci/compilation.yml rename to ci/ops/compilation.yml diff --git a/ci/ops/noble-support.yml b/ci/ops/noble-support.yml new file mode 100644 index 00000000..beec1cbb --- /dev/null +++ b/ci/ops/noble-support.yml @@ -0,0 +1,6 @@ +- type: replace + path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? + value: true +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? + value: true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..cfa57dd3 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,6 +239,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$PWD/noble-support.yml" \ -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From 6ce8e764e072408ef1622ff501347128523fac67 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 17:13:59 +0100 Subject: [PATCH 07/33] CFN-6544: rollback new ops file (start with systemd) and add verbosity --- ci/Dockerfile | 1 - ci/ops/noble-support.yml | 6 ------ ci/scripts/acceptance-tests | 12 +++++++++++- ci/scripts/start-bosh.sh | 1 - 4 files changed, 11 insertions(+), 9 deletions(-) delete mode 100644 ci/ops/noble-support.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 537fdaf8..da6d1ff3 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,6 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY ops/noble-support.yml /usr/local/bosh-deployment/noble-support.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml # Install Python libraries needed for scripts diff --git a/ci/ops/noble-support.yml b/ci/ops/noble-support.yml deleted file mode 100644 index beec1cbb..00000000 --- a/ci/ops/noble-support.yml +++ /dev/null @@ -1,6 +0,0 @@ -- type: replace - path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? - value: true -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? - value: true diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 579a864a..589a46d7 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -48,6 +48,16 @@ else fi echo "------------------------------------------------------------------" -ginkgo -v "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" +echo "------------------------------------------------------------------" +if [ "${VERBOSITY:-}" = "vv" ]; then + VERBOSITY_FLAG="-vv" + echo "VERBOSITY is set to 'vv'. Will run ginkgo with '$VERBOSITY_FLAG'" +else + VERBOSITY_FLAG="-v" + echo "VERBOSITY is not set or unrecognised. Using default '$VERBOSITY_FLAG'" +fi +echo "------------------------------------------------------------------" + +ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" keep_running_info \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index cfa57dd3..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,7 +239,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/noble-support.yml" \ -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From e78b590427cf47ec8d42e16cef984f12d7129cee Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 19:28:57 +0100 Subject: [PATCH 08/33] CFN-6544: debug cgroups --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..d1aacf61 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do + sed -e 1d /proc/cgroups | while read -r sys enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From c05baf1d18d6a8e44ab310fa85b4c3208d58c7bd Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 9 Mar 2026 19:36:16 +0100 Subject: [PATCH 09/33] CFN-6544: rollback cgroups --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d1aacf61..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -66,7 +66,7 @@ function sanitize_cgroups() { mount -o remount,rw /sys/fs/cgroup # shellcheck disable=SC2034 - sed -e 1d /proc/cgroups | while read -r sys enabled; do + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue From 34dca32e4e365b6537afe2214c850e4efa2be4b2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 08:52:29 +0100 Subject: [PATCH 10/33] CFN-6544: debugging --- acceptance-tests/bosh_helpers.go | 31 +++++++++++++++++++++++++++++-- ci/scripts/start-bosh.sh | 1 + 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 8197b47c..645723b7 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -5,6 +5,7 @@ import ( "fmt" "io/ioutil" "os/exec" + "regexp" "strings" "time" @@ -152,10 +153,14 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) Expect(err).NotTo(HaveOccurred()) + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) + if expectSuccess { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) + if session.ExitCode() != 0 { + dumpBoshTaskDebug(session) + Fail(fmt.Sprintf("bosh deploy exited with code %d", session.ExitCode())) + } } else { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) Expect(session.ExitCode()).NotTo(BeZero()) } @@ -173,6 +178,28 @@ func dumpCmd(cmd *exec.Cmd) { writeLog("------------------------------------") } +// dumpBoshTaskDebug extracts the BOSH task number from session output and runs +// "bosh tasks --debug" to stream the full debug log into GinkgoWriter. +func dumpBoshTaskDebug(session *gexec.Session) { + combined := string(session.Out.Contents()) + string(session.Err.Contents()) + // Lines like: "Task 67 | 19:24:12 | ..." + re := regexp.MustCompile(`(?m)^\s*Task (\d+) \|`) + matches := re.FindStringSubmatch(combined) + if len(matches) < 2 { + writeLog("(could not extract BOSH task number from output for debug dump)") + return + } + taskNumber := matches[1] + By(fmt.Sprintf("Dumping BOSH task %s debug log", taskNumber)) + cmd := config.boshCmd("", "task", taskNumber, "--debug") + debugSession, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + if err != nil { + writeLog(fmt.Sprintf("Failed to start bosh task debug: %s", err)) + return + } + Eventually(debugSession, 2*time.Minute, time.Second).Should(gexec.Exit()) +} + func dumpHAProxyConfig(haproxyInfo haproxyInfo) { By("Checking /var/vcap/jobs/haproxy/config/haproxy.config") haProxyConfig, _, err := runOnRemote(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, "cat /var/vcap/jobs/haproxy/config/haproxy.config") diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..82ea4080 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -267,6 +267,7 @@ EOF export BOSH_CLIENT=admin export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" + export BOSH_LOG_LEVEL=debug EOF echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" From afa66fe80177133c02e157013c6f4c7bb25c600f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 10:50:29 +0100 Subject: [PATCH 11/33] CFN-6544: debugging one test --- acceptance-tests/healthcheck_test.go | 12 +++++++----- ci/scripts/acceptance-tests | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 93c20a75..60e74ea7 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -62,13 +62,15 @@ var _ = Describe("HTTP Health Check", func() { haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: backendDeploymentName, }, []string{}, map[string]interface{}{}, true) - defer deleteDeployment(backendDeploymentName) + // defer deleteDeployment(backendDeploymentName) - closeLocalServer, backendLocalPort := startDefaultTestServer() - defer closeLocalServer() + _, backendLocalPort := startDefaultTestServer() + //closeLocalServer, backendLocalPort := startDefaultTestServer() + // defer closeLocalServer() - closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - defer closeTunnel() + _ := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + // defer closeTunnel() // Now deploy test HAProxy with 'haproxy-backend' configured as backend haproxyInfo, _ := deployHAProxy(baseManifestVars{ diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 589a46d7..e47fa0f3 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -4,6 +4,9 @@ set -e source "${REPO_ROOT}/ci/scripts/functions-ci.sh" START_DIR="${PWD}" # Differs for CI and manual execution + +FOCUS="Correctly starts if there is a healthy backend" + if [ -n "$FOCUS" ]; then echo "------------------------------------------------------------------" echo "FOCUS is set. Will only run tests matching '$FOCUS'" From bef1960588516597b1e3e9c56cc7d342f1e0957f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 11:05:15 +0100 Subject: [PATCH 12/33] CFN-6544: debugging one test, fixes --- acceptance-tests/healthcheck_test.go | 2 +- ci/scripts/start-bosh.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 60e74ea7..29aedc25 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -68,7 +68,7 @@ var _ = Describe("HTTP Health Check", func() { //closeLocalServer, backendLocalPort := startDefaultTestServer() // defer closeLocalServer() - _ := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) // defer closeTunnel() diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 82ea4080..ad6bd418 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -267,7 +267,6 @@ EOF export BOSH_CLIENT=admin export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" - export BOSH_LOG_LEVEL=debug EOF echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" From e4898b7e6aa22abf41cd7601d8379c8c89f88353 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 11:35:09 +0100 Subject: [PATCH 13/33] CFN-6544: rollback task log dumping --- acceptance-tests/bosh_helpers.go | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 645723b7..8197b47c 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -5,7 +5,6 @@ import ( "fmt" "io/ioutil" "os/exec" - "regexp" "strings" "time" @@ -153,14 +152,10 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) Expect(err).NotTo(HaveOccurred()) - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) - if expectSuccess { - if session.ExitCode() != 0 { - dumpBoshTaskDebug(session) - Fail(fmt.Sprintf("bosh deploy exited with code %d", session.ExitCode())) - } + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) } else { + Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) Expect(session.ExitCode()).NotTo(BeZero()) } @@ -178,28 +173,6 @@ func dumpCmd(cmd *exec.Cmd) { writeLog("------------------------------------") } -// dumpBoshTaskDebug extracts the BOSH task number from session output and runs -// "bosh tasks --debug" to stream the full debug log into GinkgoWriter. -func dumpBoshTaskDebug(session *gexec.Session) { - combined := string(session.Out.Contents()) + string(session.Err.Contents()) - // Lines like: "Task 67 | 19:24:12 | ..." - re := regexp.MustCompile(`(?m)^\s*Task (\d+) \|`) - matches := re.FindStringSubmatch(combined) - if len(matches) < 2 { - writeLog("(could not extract BOSH task number from output for debug dump)") - return - } - taskNumber := matches[1] - By(fmt.Sprintf("Dumping BOSH task %s debug log", taskNumber)) - cmd := config.boshCmd("", "task", taskNumber, "--debug") - debugSession, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - if err != nil { - writeLog(fmt.Sprintf("Failed to start bosh task debug: %s", err)) - return - } - Eventually(debugSession, 2*time.Minute, time.Second).Should(gexec.Exit()) -} - func dumpHAProxyConfig(haproxyInfo haproxyInfo) { By("Checking /var/vcap/jobs/haproxy/config/haproxy.config") haProxyConfig, _, err := runOnRemote(haproxyInfo.SSHUser, haproxyInfo.PublicIP, haproxyInfo.SSHPrivateKey, "cat /var/vcap/jobs/haproxy/config/haproxy.config") From 30e7a0cc8c7138658d97106309cb9048d0b333fb Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 12:51:48 +0100 Subject: [PATCH 14/33] CFN-6544: outbound fixes --- ci/scripts/start-bosh.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ad6bd418..b90f9ee3 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -105,11 +105,14 @@ function start_docker() { local certs_dir certs_dir="${1}" - export DNS_IP="8.8.8.8" + # Raise inotify limits so nested containers running systemd don't exhaust + # file descriptors. Systemd and containerd's cgroup-v2 event monitor both + # use inotify; the default max_user_instances (128) was too low. + sysctl -w fs.inotify.max_user_instances=1024 + sysctl -w fs.inotify.max_user_watches=524288 + sysctl -w net.ipv4.ip_forward=1 - # docker will fail starting with the new iptables. it throws: - # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... - update-alternatives --set iptables /usr/sbin/iptables-legacy + export DNS_IP="8.8.8.8" generate_certs "${certs_dir}" @@ -147,8 +150,10 @@ function start_docker() { "tlskey": "${certs_dir}/server-key.pem", "tlscacert": "${certs_dir}/ca.pem", "mtu": ${mtu}, + "dns": ["8.8.8.8", "8.8.4.4"], "data-root": "/scratch/docker", - "tlsverify": true + "tlsverify": true, + "ip-forward-no-drop": true } EOF From 6a20cd30423c243f9caed8c3406d7959b6905fdc Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 13:07:09 +0100 Subject: [PATCH 15/33] CFN-6544: outbound fixes --- ci/Dockerfile | 5 +- ci/ops/bosh-dns.yml | 98 ++++++++++++++++++++++++++++++++++++++++ ci/scripts/start-bosh.sh | 10 +++- 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 ci/ops/bosh-dns.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index da6d1ff3..8835c870 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,7 +11,10 @@ RUN echo "source /tmp/local-bosh/director/docker-env" >> /root/.bashrc RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml +RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml +COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml +COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv diff --git a/ci/ops/bosh-dns.yml b/ci/ops/bosh-dns.yml new file mode 100644 index 00000000..11113091 --- /dev/null +++ b/ci/ops/bosh-dns.yml @@ -0,0 +1,98 @@ +- type: replace + path: /addons?/name=bosh-dns-systemd? + value: + include: + stemcell: + - os: ubuntu-noble + jobs: + - name: bosh-dns + properties: + configure_systemd_resolved: true + disable_recursors: true + override_nameserver: false + api: + client: + tls: ((dns_api_client_tls)) + server: + tls: ((dns_api_server_tls)) + health: + client: + tls: ((dns_healthcheck_client_tls)) + enabled: true + server: + tls: ((dns_healthcheck_server_tls)) + cache: + enabled: true + release: bosh-dns + name: bosh-dns-systemd +- type: replace + path: /releases/name=bosh-dns? + value: + name: bosh-dns + sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 + url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 + version: 1.39.21 +- type: replace + path: /variables/name=dns_healthcheck_tls_ca? + value: + name: dns_healthcheck_tls_ca + options: + common_name: dns-healthcheck-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=dns_healthcheck_server_tls? + value: + name: dns_healthcheck_server_tls + options: + alternative_names: + - health.bosh-dns + ca: dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=dns_healthcheck_client_tls? + value: + name: dns_healthcheck_client_tls + options: + alternative_names: + - health.bosh-dns + ca: dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - client_auth + type: certificate +- type: replace + path: /variables/name=dns_api_tls_ca? + value: + name: dns_api_tls_ca + options: + common_name: dns-api-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=dns_api_server_tls? + value: + name: dns_api_server_tls + options: + alternative_names: + - api.bosh-dns + ca: dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=dns_api_client_tls? + value: + name: dns_api_client_tls + options: + alternative_names: + - api.bosh-dns + ca: dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - client_auth + type: certificate \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b90f9ee3..e4877ec1 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -239,12 +239,15 @@ EOF } EOF + local ops_files_dir="$PWD/haproxy-boshrelease" + echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/bosh-scaled-out.yml" \ + -o "$ops_files_dir/bosh-dns.yml" \ + -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ @@ -277,7 +280,10 @@ EOF source "${local_bosh_dir}/env" echo "Updating BOSH cloud config with Docker network..." >&2 - bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" + bosh -n update-cloud-config \ + docker/cloud-config.yml \ + -o "$ops_files_dir/compilation.yml" \ + -v network="${docker_network_name}" popd > /dev/null } From a3a8866b3f7fc6a9d09c43f344909d4c3b7b3a89 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 16:13:50 +0100 Subject: [PATCH 16/33] CFN-6544: increase canary watch timeout --- manifests/haproxy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 3b09636e..a80b5430 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -25,8 +25,8 @@ instance_groups: update: canaries: 1 max_in_flight: 1 - canary_watch_time: 1000-30000 - update_watch_time: 1000-30000 + canary_watch_time: 1000-60000 + update_watch_time: 1000-60000 serial: false stemcells: From 65c6e0c892ce2da34788d1cd8d8e57419a97f685 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 16:32:10 +0100 Subject: [PATCH 17/33] CFN-6544: increase canary watch timeout --- manifests/haproxy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index a80b5430..e0de40a1 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -25,8 +25,8 @@ instance_groups: update: canaries: 1 max_in_flight: 1 - canary_watch_time: 1000-60000 - update_watch_time: 1000-60000 + canary_watch_time: 1000-120000 + update_watch_time: 1000-120000 serial: false stemcells: From 369fc83faa7aa7277652d4915711e6f4fbecb454 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 19:43:55 +0100 Subject: [PATCH 18/33] CFN-6544: increase canary watch timeout for Bosh deployment --- ci/Dockerfile | 1 + ci/ops/bosh-watch-time.yml | 6 ++++++ ci/scripts/start-bosh.sh | 1 + 3 files changed, 8 insertions(+) create mode 100644 ci/ops/bosh-watch-time.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 8835c870..9647e2f1 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -14,6 +14,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml +COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml new file mode 100644 index 00000000..cae9c925 --- /dev/null +++ b/ci/ops/bosh-watch-time.yml @@ -0,0 +1,6 @@ +- type: replace + path: /update?/canary_watch_time? + value: 60000-1200000 +- type: replace + path: /update?/update_watch_time? + value: 60000-1200000 diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e4877ec1..4055c8c7 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -248,6 +248,7 @@ EOF -o /usr/local/local-releases.yml \ -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ + -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 267a285788e00d4c394a9eae5dfa1d1410413992 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 21:07:34 +0100 Subject: [PATCH 19/33] CFN-6544: increase canary watch timeout for Bosh deployment --- ci/ops/bosh-watch-time.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index cae9c925..249f65f2 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -1,6 +1,9 @@ - type: replace path: /update?/canary_watch_time? - value: 60000-1200000 + value: 60000-600000 - type: replace path: /update?/update_watch_time? - value: 60000-1200000 + value: 60000-600000 +- type: replace + path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? + value: 60 \ No newline at end of file From bf45ecddd49c62b7bd63641a119433f26591e81e Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 10 Mar 2026 23:43:45 +0100 Subject: [PATCH 20/33] CFN-6544: rollback dns ops file --- ci/scripts/start-bosh.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 4055c8c7..d151f693 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,7 +246,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From c22ca235f13b3de1fd3fe73451abf9c85a6d9ee2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 00:01:44 +0100 Subject: [PATCH 21/33] CFN-6544: increase director db connection timeout --- ci/ops/bosh-watch-time.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index 249f65f2..6234b1c5 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -6,4 +6,4 @@ value: 60000-600000 - type: replace path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? - value: 60 \ No newline at end of file + value: 120 \ No newline at end of file From 30d5013956c0a8ed9c0d139231202a1a1f2ddb63 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 09:13:13 +0100 Subject: [PATCH 22/33] CFN-6544: extra param --- ci/scripts/acceptance-tests | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index e47fa0f3..92c0a600 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -61,6 +61,15 @@ else fi echo "------------------------------------------------------------------" -ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" +echo "------------------------------------------------------------------" +if [ -n "${FLAKE_ATTEMPTS:-}" ]; then + echo "FLAKE_ATTEMPTS is set. Will run ginkgo with '--flake-attempts=$FLAKE_ATTEMPTS'" +else + FLAKE_ATTEMPTS=5 + echo "FLAKE_ATTEMPTS is not set. Using default '$FLAKE_ATTEMPTS'" +fi +echo "------------------------------------------------------------------" + +ginkgo "$VERBOSITY_FLAG" "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts "$FLAKE_ATTEMPTS" "${ADDITIONAL_ARGS[@]}" keep_running_info \ No newline at end of file From f1d2cd0302690bace8bc7e35ce5d6d40ce24163f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 11:10:11 +0100 Subject: [PATCH 23/33] CFN-6544: cgroupns mode --- ci/Dockerfile | 1 + ci/ops/bosh-cgroup.yml | 14 ++++++++++++++ ci/ops/bosh-watch-time.yml | 5 ++++- ci/scripts/start-bosh.sh | 2 ++ 4 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 ci/ops/bosh-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 9647e2f1..301c0c43 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,6 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml new file mode 100644 index 00000000..b5238806 --- /dev/null +++ b/ci/ops/bosh-cgroup.yml @@ -0,0 +1,14 @@ +# When the Concourse worker host uses cgroups v2 (unified hierarchy), +# Docker containers started by the Docker CPI default to a private cgroup +# namespace. BPM's runc then tries to create scopes under +# /sys/fs/cgroup/systemd/ (a cgroups v1 path) which does not exist in +# the container, causing: +# openat2 /sys/fs/cgroup/systemd/.../cgroup.procs: no such file or directory +# +# Setting cgroupns_mode to "host" makes the BOSH director container share +# the host cgroup namespace so that /sys/fs/cgroup/systemd/ is visible +# and BPM/runc can write to it. +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? + value: host + diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml index 6234b1c5..ac774784 100644 --- a/ci/ops/bosh-watch-time.yml +++ b/ci/ops/bosh-watch-time.yml @@ -6,4 +6,7 @@ value: 60000-600000 - type: replace path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? - value: 120 \ No newline at end of file + value: 120 +- type: replace + path: /instance_groups/name=bosh/properties/director/db/connection_options?/connect_timeout? + value: 30 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index d151f693..e1c75390 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,6 +246,8 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$ops_files_dir/bosh-cgroup.yml" \ + -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From 803e842443d6a4b72c0e6d1e51f64f6b7665e0b1 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 12:21:51 +0100 Subject: [PATCH 24/33] CFN-6544: privileged --- ci/ops/bosh-cgroup.yml | 14 +++----------- ci/scripts/start-bosh.sh | 1 - 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml index b5238806..1796f4a0 100644 --- a/ci/ops/bosh-cgroup.yml +++ b/ci/ops/bosh-cgroup.yml @@ -1,14 +1,6 @@ -# When the Concourse worker host uses cgroups v2 (unified hierarchy), -# Docker containers started by the Docker CPI default to a private cgroup -# namespace. BPM's runc then tries to create scopes under -# /sys/fs/cgroup/systemd/ (a cgroups v1 path) which does not exist in -# the container, causing: -# openat2 /sys/fs/cgroup/systemd/.../cgroup.procs: no such file or directory -# -# Setting cgroupns_mode to "host" makes the BOSH director container share -# the host cgroup namespace so that /sys/fs/cgroup/systemd/ is visible -# and BPM/runc can write to it. - type: replace path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? value: host - +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/container?/privileged? + value: true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e1c75390..c07d4363 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,7 +247,6 @@ EOF -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ -o "$ops_files_dir/bosh-cgroup.yml" \ - -o "$ops_files_dir/bosh-dns.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ From f4553ce057aa0a81c072f35145cda60786299ff7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 14:05:41 +0100 Subject: [PATCH 25/33] CFN-6544: rollback ops files --- ci/scripts/start-bosh.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index c07d4363..402b6626 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,9 +246,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ - -o "$ops_files_dir/bosh-watch-time.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 8c85a98c07395927189ad27481c2dc82b538f0e6 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 16:33:39 +0100 Subject: [PATCH 26/33] CFN-6544: cgroupns mode for all VMs --- ci/Dockerfile | 1 + ci/ops/cloud-config-cgroup.yml | 7 +++++++ ci/scripts/start-bosh.sh | 2 ++ 3 files changed, 10 insertions(+) create mode 100644 ci/ops/cloud-config-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 301c0c43..0d8d7c92 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,6 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease +COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml new file mode 100644 index 00000000..442254b2 --- /dev/null +++ b/ci/ops/cloud-config-cgroup.yml @@ -0,0 +1,7 @@ +- type: replace + path: /vm_types/name=default/cloud_properties/privileged? + value: true +- type: replace + path: /vm_types/name=default/cloud_properties/cgroupns_mode? + value: host + diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 402b6626..887dd11c 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -246,6 +246,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ @@ -282,6 +283,7 @@ EOF bosh -n update-cloud-config \ docker/cloud-config.yml \ -o "$ops_files_dir/compilation.yml" \ + -o "$ops_files_dir/cloud-config-cgroup.yml" \ -v network="${docker_network_name}" popd > /dev/null From 3fcf4bc6d501fa9e0af085c36868d4cd26721196 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 11 Mar 2026 17:40:39 +0100 Subject: [PATCH 27/33] CFN-6544: cgroupfs driver --- ci/Dockerfile | 4 +- ci/ops/bosh-dns.yml | 98 -------------------------------------- ci/ops/bosh-watch-time.yml | 12 ----- ci/scripts/start-bosh.sh | 3 +- 4 files changed, 3 insertions(+), 114 deletions(-) delete mode 100644 ci/ops/bosh-dns.yml delete mode 100644 ci/ops/bosh-watch-time.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 0d8d7c92..c1eb1a1f 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,11 +12,9 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease -COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml -COPY ops/bosh-dns.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-dns.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml -COPY ops/bosh-watch-time.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-watch-time.yml +COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-dns.yml b/ci/ops/bosh-dns.yml deleted file mode 100644 index 11113091..00000000 --- a/ci/ops/bosh-dns.yml +++ /dev/null @@ -1,98 +0,0 @@ -- type: replace - path: /addons?/name=bosh-dns-systemd? - value: - include: - stemcell: - - os: ubuntu-noble - jobs: - - name: bosh-dns - properties: - configure_systemd_resolved: true - disable_recursors: true - override_nameserver: false - api: - client: - tls: ((dns_api_client_tls)) - server: - tls: ((dns_api_server_tls)) - health: - client: - tls: ((dns_healthcheck_client_tls)) - enabled: true - server: - tls: ((dns_healthcheck_server_tls)) - cache: - enabled: true - release: bosh-dns - name: bosh-dns-systemd -- type: replace - path: /releases/name=bosh-dns? - value: - name: bosh-dns - sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 - url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 - version: 1.39.21 -- type: replace - path: /variables/name=dns_healthcheck_tls_ca? - value: - name: dns_healthcheck_tls_ca - options: - common_name: dns-healthcheck-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_server_tls? - value: - name: dns_healthcheck_server_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_client_tls? - value: - name: dns_healthcheck_client_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - client_auth - type: certificate -- type: replace - path: /variables/name=dns_api_tls_ca? - value: - name: dns_api_tls_ca - options: - common_name: dns-api-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_api_server_tls? - value: - name: dns_api_server_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_api_client_tls? - value: - name: dns_api_client_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - client_auth - type: certificate \ No newline at end of file diff --git a/ci/ops/bosh-watch-time.yml b/ci/ops/bosh-watch-time.yml deleted file mode 100644 index ac774784..00000000 --- a/ci/ops/bosh-watch-time.yml +++ /dev/null @@ -1,12 +0,0 @@ -- type: replace - path: /update?/canary_watch_time? - value: 60000-600000 -- type: replace - path: /update?/update_watch_time? - value: 60000-600000 -- type: replace - path: /instance_groups/name=bosh/properties/director/db/connection_options?/pool_timeout? - value: 120 -- type: replace - path: /instance_groups/name=bosh/properties/director/db/connection_options?/connect_timeout? - value: 30 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 887dd11c..a9e92491 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -153,7 +153,8 @@ function start_docker() { "dns": ["8.8.8.8", "8.8.4.4"], "data-root": "/scratch/docker", "tlsverify": true, - "ip-forward-no-drop": true + "ip-forward-no-drop": true, + "exec-opts": ["native.cgroupdriver=cgroupfs"] } EOF From 3adbce5c5fb783bd741302df859a7500e6dfcdc7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 10:03:31 +0100 Subject: [PATCH 28/33] CFN-6544: run all tests --- ci/ops/cloud-config-cgroup.yml | 4 ++-- ci/scripts/acceptance-tests | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml index 442254b2..2b5d3c58 100644 --- a/ci/ops/cloud-config-cgroup.yml +++ b/ci/ops/cloud-config-cgroup.yml @@ -1,7 +1,7 @@ - type: replace - path: /vm_types/name=default/cloud_properties/privileged? + path: /vm_types/name=default/cloud_properties?/privileged? value: true - type: replace - path: /vm_types/name=default/cloud_properties/cgroupns_mode? + path: /vm_types/name=default/cloud_properties?/cgroupns_mode? value: host diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 92c0a600..d6e35394 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -4,9 +4,6 @@ set -e source "${REPO_ROOT}/ci/scripts/functions-ci.sh" START_DIR="${PWD}" # Differs for CI and manual execution - -FOCUS="Correctly starts if there is a healthy backend" - if [ -n "$FOCUS" ]; then echo "------------------------------------------------------------------" echo "FOCUS is set. Will only run tests matching '$FOCUS'" From 41a9b2107ecece6990765913fd362a5bb9ac4447 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 13:28:26 +0100 Subject: [PATCH 29/33] CFN-6544: add attempts of bosh env creation --- acceptance-tests/healthcheck_test.go | 12 +++++------- ci/scripts/start-bosh.sh | 27 ++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/acceptance-tests/healthcheck_test.go b/acceptance-tests/healthcheck_test.go index 29aedc25..93c20a75 100644 --- a/acceptance-tests/healthcheck_test.go +++ b/acceptance-tests/healthcheck_test.go @@ -62,15 +62,13 @@ var _ = Describe("HTTP Health Check", func() { haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: backendDeploymentName, }, []string{}, map[string]interface{}{}, true) - // defer deleteDeployment(backendDeploymentName) + defer deleteDeployment(backendDeploymentName) - _, backendLocalPort := startDefaultTestServer() - //closeLocalServer, backendLocalPort := startDefaultTestServer() - // defer closeLocalServer() + closeLocalServer, backendLocalPort := startDefaultTestServer() + defer closeLocalServer() - setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - //closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) - // defer closeTunnel() + closeTunnel := setupTunnelFromHaproxyToTestServer(backendHaproxyInfo, haproxyBackendPort, backendLocalPort) + defer closeTunnel() // Now deploy test HAProxy with 'haproxy-backend' configured as backend haproxyInfo, _ := deployHAProxy(baseManifestVars{ diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index a9e92491..50fe829b 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -259,9 +259,30 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" echo "Creating BOSH director environment..." >&2 - bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + local create_env_rc=1 + local max_attempts=${FLAKE_ATTEMPTS:-5} + local attempt_interval=30 + for attempt in $(seq 1 $max_attempts); do + echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" + + create_env_rc=$? + if [ "${create_env_rc}" -eq "0" ]; then + echo "bosh create-env succeeded on attempt ${attempt}" >&2 + break + fi + echo "bosh create-env failed on attempt ${attempt} (exit code ${create_env_rc})" >&2 + if [ "${attempt}" -lt "${max_attempts}" ]; then + echo "Retrying in ${attempt_interval} seconds..." >&2 + sleep ${attempt_interval} + fi + done + if [ "${create_env_rc}" -ne "0" ]; then + echo "bosh create-env failed after ${max_attempts} attempts. Exiting." >&2 + exit 1 + fi echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" From a646ccfa0a7d907171cfcbbfc3aa7f22e74934f2 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 14:44:56 +0100 Subject: [PATCH 30/33] CFN-6544: add attempts of bosh env creation, fix; remove unnecessary ops files --- ci/Dockerfile | 2 -- ci/ops/bosh-cgroup.yml | 6 ------ ci/ops/cloud-config-cgroup.yml | 7 ------- ci/scripts/start-bosh.sh | 6 +++--- 4 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 ci/ops/bosh-cgroup.yml delete mode 100644 ci/ops/cloud-config-cgroup.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index c1eb1a1f..b3bd5c48 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,9 +12,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files RUN mkdir -p /usr/local/bosh-deployment/haproxy-boshrelease -COPY ops/bosh-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-cgroup.yml COPY ops/bosh-scaled-out.yml /usr/local/bosh-deployment/haproxy-boshrelease/bosh-scaled-out.yml -COPY ops/cloud-config-cgroup.yml /usr/local/bosh-deployment/haproxy-boshrelease/cloud-config-cgroup.yml COPY ops/compilation.yml /usr/local/bosh-deployment/haproxy-boshrelease/compilation.yml # Install Python libraries needed for scripts diff --git a/ci/ops/bosh-cgroup.yml b/ci/ops/bosh-cgroup.yml deleted file mode 100644 index 1796f4a0..00000000 --- a/ci/ops/bosh-cgroup.yml +++ /dev/null @@ -1,6 +0,0 @@ -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/container?/cgroupns_mode? - value: host -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/container?/privileged? - value: true diff --git a/ci/ops/cloud-config-cgroup.yml b/ci/ops/cloud-config-cgroup.yml deleted file mode 100644 index 2b5d3c58..00000000 --- a/ci/ops/cloud-config-cgroup.yml +++ /dev/null @@ -1,7 +0,0 @@ -- type: replace - path: /vm_types/name=default/cloud_properties?/privileged? - value: true -- type: replace - path: /vm_types/name=default/cloud_properties?/cgroupns_mode? - value: host - diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 50fe829b..ca6b8a9c 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,7 +247,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$ops_files_dir/bosh-cgroup.yml" \ -o "$ops_files_dir/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ @@ -264,11 +263,13 @@ EOF local attempt_interval=30 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 + set +e. # disables abort-on-error bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" - create_env_rc=$? + set -e + if [ "${create_env_rc}" -eq "0" ]; then echo "bosh create-env succeeded on attempt ${attempt}" >&2 break @@ -305,7 +306,6 @@ EOF bosh -n update-cloud-config \ docker/cloud-config.yml \ -o "$ops_files_dir/compilation.yml" \ - -o "$ops_files_dir/cloud-config-cgroup.yml" \ -v network="${docker_network_name}" popd > /dev/null From 485ed5be2dca2837d3eae1dc0d7d4fd1470c5a6a Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 14:54:32 +0100 Subject: [PATCH 31/33] CFN-6544: typo --- ci/scripts/start-bosh.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index ca6b8a9c..2f84f6b6 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -263,7 +263,7 @@ EOF local attempt_interval=30 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 - set +e. # disables abort-on-error + set +e # disables abort-on-error bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" From c0aa7bd07f828414d51606e83b8faf3b45ba88d0 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 17:25:29 +0100 Subject: [PATCH 32/33] CFN-6544: redeploy after error to protect against flakiness --- .../acceptance_tests_suite_test.go | 7 --- acceptance-tests/bosh_helpers.go | 50 ++++++++++++++++--- acceptance-tests/config.go | 14 ++++++ acceptance-tests/log_helpers.go | 15 ++++++ 4 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 acceptance-tests/log_helpers.go diff --git a/acceptance-tests/acceptance_tests_suite_test.go b/acceptance-tests/acceptance_tests_suite_test.go index 93e2ca2b..ab937416 100644 --- a/acceptance-tests/acceptance_tests_suite_test.go +++ b/acceptance-tests/acceptance_tests_suite_test.go @@ -247,10 +247,3 @@ func checkNetOpErr(err error, expectString string) { Expect(errors.As(tlsErr, &opErr)).To(BeTrue()) Expect(opErr.Err.Error()).To(ContainSubstring(expectString)) } - -func writeLog(s string) { - ginkgoConfig, _ := GinkgoConfiguration() - for _, line := range strings.Split(s, "\n") { - fmt.Printf("node %d/%d: %s\n", ginkgoConfig.ParallelProcess, ginkgoConfig.ParallelTotal, line) - } -} diff --git a/acceptance-tests/bosh_helpers.go b/acceptance-tests/bosh_helpers.go index 8197b47c..f6fa4022 100644 --- a/acceptance-tests/bosh_helpers.go +++ b/acceptance-tests/bosh_helpers.go @@ -149,13 +149,9 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c cmd, varsStoreReader := deployBaseManifestCmd(baseManifestVars.deploymentName, opsfiles, manifestVars) dumpCmd(cmd) - session, err := gexec.Start(cmd, GinkgoWriter, GinkgoWriter) - Expect(err).NotTo(HaveOccurred()) + session := deployWithRetry(baseManifestVars.deploymentName, cmd, 20*time.Minute, expectSuccess) - if expectSuccess { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit(0)) - } else { - Eventually(session, 20*time.Minute, time.Second).Should(gexec.Exit()) + if !expectSuccess { Expect(session.ExitCode()).NotTo(BeZero()) } @@ -167,6 +163,48 @@ func deployHAProxy(baseManifestVars baseManifestVars, customOpsfiles []string, c return haproxyInfo, varsStoreReader } +// deployWithRetry runs a bosh deploy command and retries up to config.FlakeAttempts times. +// On each failed attempt the deployment is deleted before retrying, so the next attempt starts clean. +// If expectSuccess is false the command is run once without retrying (failure is expected by the caller). +func deployWithRetry(boshDeployment string, cmd *exec.Cmd, timeout time.Duration, expectSuccess bool) *gexec.Session { + var session *gexec.Session + var err error + + for attempt := 1; attempt <= config.FlakeAttempts; attempt++ { + if attempt > 1 { + writeLog(fmt.Sprintf("Deployment attempt %d/%d failed, deleting deployment before retry...", attempt-1, config.FlakeAttempts)) + deleteDeployment(boshDeployment) + + writeLog(fmt.Sprintf("Retrying deployment (attempt %d/%d)...", attempt, config.FlakeAttempts)) + newCmd := exec.Command(cmd.Path, cmd.Args[1:]...) + newCmd.Env = cmd.Env + cmd = newCmd + } + + session, err = gexec.Start(cmd, GinkgoWriter, GinkgoWriter) + Expect(err).NotTo(HaveOccurred()) + + // Wait for the process to exit without asserting the exit code + Eventually(session, timeout, time.Second).Should(gexec.Exit()) + + if !expectSuccess { + // caller expects failure — return immediately without retrying + return session + } + + if session.ExitCode() == 0 { + writeLog(fmt.Sprintf("Deployment succeeded on attempt %d/%d", attempt, config.FlakeAttempts)) + return session + } + + writeLog(fmt.Sprintf("Deployment failed on attempt %d/%d (exit code %d)", attempt, config.FlakeAttempts, session.ExitCode())) + } + + // All attempts exhausted — fail the test with a clear message + Expect(session.ExitCode()).To(BeZero(), fmt.Sprintf("Deployment failed after %d attempt(s)", config.FlakeAttempts)) + return session +} + func dumpCmd(cmd *exec.Cmd) { writeLog("---------- Command to run ----------") writeLog(cmd.String()) diff --git a/acceptance-tests/config.go b/acceptance-tests/config.go index 5d3c6bce..aa27d4c8 100644 --- a/acceptance-tests/config.go +++ b/acceptance-tests/config.go @@ -4,10 +4,13 @@ import ( "fmt" "os" "os/exec" + "strconv" ) var config Config +const DEFAULT_FLAKE_ATTEMPTS = 5 + type Config struct { ReleaseRepoPath string `json:"releaseRepoPath"` ReleaseVersion string `json:"releaseVersion"` @@ -18,6 +21,7 @@ type Config struct { BoshPath string `json:"boshPath"` BaseManifestPath string `json:"baseManifestPath"` HomePath string `json:"homePath"` + FlakeAttempts int `json:"flakeAttempts"` } func loadConfig() (Config, error) { @@ -67,6 +71,15 @@ func loadConfig() (Config, error) { return Config{}, err } + flakeAttempts := DEFAULT_FLAKE_ATTEMPTS + if val := os.Getenv("FLAKE_ATTEMPTS"); val != "" { + if flakeAttemptsFromEnv, err := strconv.Atoi(val); err == nil && flakeAttemptsFromEnv > 0 { + flakeAttempts = flakeAttemptsFromEnv + } else { + writeLog(fmt.Sprintf("FLAKE_ATTEMPTS must be a positive integer, but got: %s, so defaulting test suite's flakeAttempts to %d", val, DEFAULT_FLAKE_ATTEMPTS)) + } + } + return Config{ ReleaseRepoPath: releaseRepoPath, ReleaseVersion: releaseVersion, @@ -77,6 +90,7 @@ func loadConfig() (Config, error) { BoshPath: boshPath, BaseManifestPath: baseManifestPath, HomePath: homePath, + FlakeAttempts: flakeAttempts, }, nil } diff --git a/acceptance-tests/log_helpers.go b/acceptance-tests/log_helpers.go new file mode 100644 index 00000000..d134aca5 --- /dev/null +++ b/acceptance-tests/log_helpers.go @@ -0,0 +1,15 @@ +package acceptance_tests + +import ( + "fmt" + "strings" + + . "github.com/onsi/ginkgo/v2" +) + +func writeLog(s string) { + ginkgoConfig, _ := GinkgoConfiguration() + for _, line := range strings.Split(s, "\n") { + fmt.Printf("node %d/%d: %s\n", ginkgoConfig.ParallelProcess, ginkgoConfig.ParallelTotal, line) + } +} From 36dcd2a972087cfb8ae6da6ffc343cd6b3e55987 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 12 Mar 2026 18:34:20 +0100 Subject: [PATCH 33/33] CFN-6544: workaround to make bosh start --- ci/scripts/start-bosh.sh | 84 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 2f84f6b6..a2b62f62 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -260,10 +260,10 @@ EOF echo "Creating BOSH director environment..." >&2 local create_env_rc=1 local max_attempts=${FLAKE_ATTEMPTS:-5} - local attempt_interval=30 + local bpm_restart_timeout=120 for attempt in $(seq 1 $max_attempts); do echo "bosh create-env attempt ${attempt}/${max_attempts}..." >&2 - set +e # disables abort-on-error + set +e bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" @@ -274,12 +274,84 @@ EOF echo "bosh create-env succeeded on attempt ${attempt}" >&2 break fi - echo "bosh create-env failed on attempt ${attempt} (exit code ${create_env_rc})" >&2 - if [ "${attempt}" -lt "${max_attempts}" ]; then - echo "Retrying in ${attempt_interval} seconds..." >&2 - sleep ${attempt_interval} + + echo "bosh create-env failed on attempt ${attempt}/${max_attempts} (exit code ${create_env_rc})" >&2 + if [ "${attempt}" -ge "${max_attempts}" ]; then + break + fi + + # The VM already exists but its jobs are in 'failing' state. + # Re-running create-env from scratch wastes ~6 minutes recompiling packages. + # Instead, find the director container and restart BPM inside it so monit + # can bring the jobs back up — then let create-env verify the running state. + local director_container + director_container=$(docker ps --format "{{.ID}}" | head -1) + if [ -n "${director_container}" ]; then + echo "Found director container ${director_container}, restarting BPM jobs..." >&2 + set +e + # runc delete --force fails because the container cgroup scope dirs + # (system.slice/runc-*.scope) are owned by the host systemd and cannot + # be rmdir-d from inside the nested container, even when they are empty. + # + # BPM only needs the runc state dir to be gone before it can re-create + # the container. So: remove the state dir directly, bypassing runc delete. + # The orphaned cgroup scope dirs will be cleaned up by the host systemd + # garbage collector once there are no more references to them. + docker exec "${director_container}" bash -c ' + runc_bin=/var/vcap/packages/bpm/bin/runc + runc_root=/var/vcap/sys/run/bpm-runc + + for container_id in $(${runc_bin} --root ${runc_root} list -q 2>/dev/null); do + # postgres must keep running — the director depends on it + [ "${container_id}" = "bpm-postgres" ] && continue + echo "Cleaning up runc container: ${container_id}" >&2 + rm -rf "${runc_root:?}/${container_id}" + done + ' + # Restart all monitored jobs except postgres (which must keep running + # as the director database — restarting it would cause data loss risk + # and break the director on the next attempt). + docker exec "${director_container}" bash -c ' + /var/vcap/bosh/bin/monit summary | awk "/Process/{print \$2}" | tr -d "'"'"'" | \ + while read -r job; do + [ "${job}" = "postgres" ] && continue + echo "Restarting monit job: ${job}" >&2 + /var/vcap/bosh/bin/monit restart "${job}" || true + done + ' + set -e + + echo "Waiting up to ${bpm_restart_timeout}s for director jobs to recover..." >&2 + local elapsed=0 + local recovered=false + while [ "${elapsed}" -lt "${bpm_restart_timeout}" ]; do + sleep 10 + elapsed=$((elapsed + 10)) + set +e + local status + status=$(docker exec "${director_container}" /var/vcap/bosh/bin/monit summary 2>/dev/null) + set -e + local failing + failing=$(echo "${status}" | grep -c "not monitored\|does not exist\|failed\|stopped" || true) + if [ "${failing}" -eq "0" ]; then + echo "All director jobs are running after BPM restart (${elapsed}s)" >&2 + recovered=true + break + fi + echo "Still waiting for jobs... (${elapsed}s)" >&2 + done + + if [ "${recovered}" = "true" ]; then + echo "BPM recovery succeeded, re-running create-env to verify state..." >&2 + else + echo "BPM recovery did not complete within ${bpm_restart_timeout}s, re-running create-env anyway..." >&2 + fi + else + echo "Director container not found at ${BOSH_DIRECTOR_IP}, re-running create-env from scratch..." >&2 + sleep 10 fi done + if [ "${create_env_rc}" -ne "0" ]; then echo "bosh create-env failed after ${max_attempts} attempts. Exiting." >&2 exit 1