From e3f6e4e9712465ced1510a6c36e987aac5048793 Mon Sep 17 00:00:00 2001 From: Shiti Saxena Date: Sun, 10 May 2026 16:34:18 -0700 Subject: [PATCH] fix: fix flaky docker e2e tests and update python image --- forge-go/e2e/distributed_client_test.go | 3 +++ forge-go/e2e/echo_test.go | 27 ++++++++++++++++++++++--- forge-go/e2e/main_test.go | 5 +++++ forge-go/supervisor/docker.go | 21 +++++++++++++------ forge-go/supervisor/docker_test.go | 6 +++--- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/forge-go/e2e/distributed_client_test.go b/forge-go/e2e/distributed_client_test.go index d749287..fce6457 100644 --- a/forge-go/e2e/distributed_client_test.go +++ b/forge-go/e2e/distributed_client_test.go @@ -180,6 +180,9 @@ agents: statusKey := fmt.Sprintf("forge:agent:status:%s:%s", guildID, "echo-agent") val, err := rdb.Get(context.Background(), statusKey).Result() if err != nil { + if err == redis.Nil { + return fmt.Errorf("status key %q not written yet", statusKey) + } return err } var status map[string]interface{} diff --git a/forge-go/e2e/echo_test.go b/forge-go/e2e/echo_test.go index 2097feb..4ccc837 100644 --- a/forge-go/e2e/echo_test.go +++ b/forge-go/e2e/echo_test.go @@ -166,6 +166,29 @@ func TestLevel1_EchoAgentIntegration(t *testing.T) { } }() + // For Docker, wait until the supervisor confirms the container is running before + // sending messages. This separates cold-start time from the message-exchange timeout. + if reqSup == "docker" { + statusKey := fmt.Sprintf("forge:agent:status:%s:%s", guildSpec.ID, agentSpec.ID) + require.NoError(t, waitFor(90*time.Second, 500*time.Millisecond, func() error { + val, err := rdb.Get(context.Background(), statusKey).Result() + if err == redis.Nil { + return fmt.Errorf("container not yet started") + } + if err != nil { + return err + } + var st map[string]interface{} + if err := json.Unmarshal([]byte(val), &st); err != nil { + return err + } + if state, _ := st["state"].(string); state != "running" { + return fmt.Errorf("container state: %q", state) + } + return nil + }), "Docker agent container did not reach running state") + } + // 6. Execute the core test assertions // Send a payload to the topic the EchoAgent is configured to listen to. testPayload := map[string]interface{}{ @@ -175,7 +198,7 @@ func TestLevel1_EchoAgentIntegration(t *testing.T) { topicIn := fmt.Sprintf("%s:echo_topic", guildSpec.ID) msg.TopicPublishedTo = topicIn - // Background routine to continually ping the agent until it finishes uvx boots and subscribes + // Background routine to continually ping the agent until it subscribes. done := make(chan struct{}) go func() { for { @@ -188,8 +211,6 @@ func TestLevel1_EchoAgentIntegration(t *testing.T) { } }() - // Wait up to 30 seconds for the EchoAgent to receive, process, and publish a response - // Docker boots usually take ~3s the first time if cached, but we give 30s. topicOut := fmt.Sprintf("%s:default_topic", guildSpec.ID) respMsg, err := probeAgent.WaitForMessage(ctx, topicOut, 30*time.Second) close(done) diff --git a/forge-go/e2e/main_test.go b/forge-go/e2e/main_test.go index 5c76005..187aea2 100644 --- a/forge-go/e2e/main_test.go +++ b/forge-go/e2e/main_test.go @@ -62,6 +62,11 @@ func TestMain(m *testing.M) { fmt.Fprintf(os.Stderr, "failed to build e2e forge binary: %v\n%s\n", buildErr, string(buildOut)) os.Exit(1) } + + // Pre-pull the default Docker agent image so individual tests don't time out on a cold pull. + if out, err := exec.Command("docker", "pull", "ghcr.io/astral-sh/uv:python3.13-bookworm-slim").CombinedOutput(); err != nil { + fmt.Fprintf(os.Stderr, "docker pre-pull (non-fatal): %v\n%s\n", err, out) + } } code := m.Run() if e2eBaseDir != "" { diff --git a/forge-go/supervisor/docker.go b/forge-go/supervisor/docker.go index 08c553e..b8d3bed 100644 --- a/forge-go/supervisor/docker.go +++ b/forge-go/supervisor/docker.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "log/slog" + "os" "os/user" "path/filepath" "strings" @@ -209,7 +210,7 @@ func (d *DockerSupervisor) Launch(ctx context.Context, guildID string, agentSpec imageRef := entry.Image if imageRef == "" { - imageRef = "ghcr.io/astral-sh/uv:python3.12-bookworm-slim" + imageRef = "ghcr.io/astral-sh/uv:python3.13-bookworm-slim" } if err := d.ensureImage(ctx, imageRef); err != nil { @@ -218,13 +219,14 @@ func (d *DockerSupervisor) Launch(ctx context.Context, guildID string, agentSpec env = append(env, "UV_PROJECT_ENVIRONMENT=/tmp/.venv") - var cleanEnv []string + // Extract UV_CACHE_DIR so we can bind-mount it into the container for caching. + var uvCacheDir string for _, e := range env { - if !strings.HasPrefix(e, "UV_CACHE_DIR=") { - cleanEnv = append(cleanEnv, e) + if val, ok := strings.CutPrefix(e, "UV_CACHE_DIR="); ok { + uvCacheDir = val + break } } - env = cleanEnv var cmd []string if entry.Runtime == registry.RuntimeDocker { @@ -266,6 +268,13 @@ func (d *DockerSupervisor) Launch(ctx context.Context, guildID string, agentSpec containerCfg, hostCfg := BuildContainerConfig(agentSpec, entry, guildID, imageRef, cmd, env) + // Mount the UV cache directory so repeated runs reuse cached packages. + if uvCacheDir != "" { + if err := os.MkdirAll(uvCacheDir, 0o755); err == nil { + hostCfg.Binds = append(hostCfg.Binds, fmt.Sprintf("%s:%s:rw,z", uvCacheDir, uvCacheDir)) + } + } + // Adjust container config for bridge connectivity. if bridge != nil { if bridge.Mode() == BridgeTransportIPC { @@ -413,7 +422,7 @@ func (d *DockerSupervisor) relaunchContainer(ctx context.Context, guildID string imageRef := entry.Image if imageRef == "" { - imageRef = "ghcr.io/astral-sh/uv:python3.12-bookworm-slim" + imageRef = "ghcr.io/astral-sh/uv:python3.13-bookworm-slim" } var cmd []string diff --git a/forge-go/supervisor/docker_test.go b/forge-go/supervisor/docker_test.go index 114d1cb..58f4b38 100644 --- a/forge-go/supervisor/docker_test.go +++ b/forge-go/supervisor/docker_test.go @@ -27,11 +27,11 @@ func TestBuildContainerConfig_Airgapped(t *testing.T) { cmd := []string{"python", "main.py"} env := []string{"FOO=BAR"} - cCfg, hCfg := BuildContainerConfig(agentSpec, entry, "test-guild", "ghcr.io/astral-sh/uv:python3.12-bookworm-slim", cmd, env) + cCfg, hCfg := BuildContainerConfig(agentSpec, entry, "test-guild", "ghcr.io/astral-sh/uv:python3.13-bookworm-slim", cmd, env) // Image - if cCfg.Image != "ghcr.io/astral-sh/uv:python3.12-bookworm-slim" { - t.Errorf("expected image ghcr.io/astral-sh/uv:python3.12-bookworm-slim, got %s", cCfg.Image) + if cCfg.Image != "ghcr.io/astral-sh/uv:python3.13-bookworm-slim" { + t.Errorf("expected image ghcr.io/astral-sh/uv:python3.13-bookworm-slim, got %s", cCfg.Image) } // Command