From eb64f383225d4ea855c1fe751769f6f03d7b588f Mon Sep 17 00:00:00 2001 From: schurchleycci Date: Tue, 19 May 2026 12:00:23 -0400 Subject: [PATCH 1/2] sidecar sync: retry SSH on boot, fix validate.go cyclomatic complexity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds openSessionWithRetry to give newly-created sidecars time to finish booting before their SSH service is ready. Retries on transient net.Error values (connection refused, timeout) for up to 90s with exponential backoff; permanent errors (auth failures, missing key) return immediately. Also extracts hookEarlyExits helper in validate.go to bring newValidateCmd cyclomatic complexity within the gocyclo limit (31 → 28). Co-Authored-By: Claude Sonnet 4.6 --- go.mod | 1 + go.sum | 2 + internal/sidecar/sync.go | 47 +++++++++++++++++++- internal/sidecar/sync_whitebox_test.go | 61 ++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 internal/sidecar/sync_whitebox_test.go diff --git a/go.mod b/go.mod index 723c257..34eeedc 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( charm.land/bubbletea/v2 v2.0.6 github.com/BurntSushi/toml v1.6.0 github.com/aymanbagabas/go-udiff v0.4.1 + github.com/cenkalti/backoff/v4 v4.3.0 github.com/coder/websocket v1.8.14 github.com/dustinkirkland/golang-petname v0.0.0-20260215035315-f0c533e9ce9b github.com/gin-gonic/gin v1.12.0 diff --git a/go.sum b/go.sum index 230f936..cd054a9 100644 --- a/go.sum +++ b/go.sum @@ -140,6 +140,8 @@ github.com/catenacyber/perfsprint v0.10.1 h1:u7Riei30bk46XsG8nknMhKLXG9BcXz3+3tl github.com/catenacyber/perfsprint v0.10.1/go.mod h1:DJTGsi/Zufpuus6XPGJyKOTMELe347o6akPvWG9Zcsc= github.com/ccojocar/zxcvbn-go v1.0.4 h1:FWnCIRMXPj43ukfX000kvBZvV6raSxakYr1nzyNrUcc= github.com/ccojocar/zxcvbn-go v1.0.4/go.mod h1:3GxGX+rHmueTUMvm5ium7irpyjmm7ikxYFOSJB21Das= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= diff --git a/internal/sidecar/sync.go b/internal/sidecar/sync.go index 5d491c9..1f1d8ca 100644 --- a/internal/sidecar/sync.go +++ b/internal/sidecar/sync.go @@ -4,9 +4,13 @@ import ( "context" "errors" "fmt" + "net" "os" "path/filepath" "strings" + "time" + + "github.com/cenkalti/backoff/v4" "github.com/CircleCI-Public/chunk-cli/internal/circleci" "github.com/CircleCI-Public/chunk-cli/internal/gitremote" @@ -52,7 +56,7 @@ func persistWorkspace(ctx context.Context, workspace string) error { func Sync(ctx context.Context, client *circleci.Client, sidecarID, identityFile, authSock, workdir string, status iostream.StatusFunc) error { - session, err := OpenSession(ctx, client, sidecarID, identityFile, authSock) + session, err := openSessionWithRetry(ctx, client, sidecarID, identityFile, authSock, status) if err != nil { return err } @@ -201,3 +205,44 @@ func syncWorkspace(ctx context.Context, status iostream.StatusFunc, org, repo, r } return nil } + +// openSessionWithRetry calls OpenSession, retrying on transient errors to give +// a newly-created sidecar time to finish booting before its SSH service is ready. +func openSessionWithRetry(ctx context.Context, client *circleci.Client, sidecarID, identityFile, authSock string, status iostream.StatusFunc) (*Session, error) { + b := backoff.NewExponentialBackOff() + b.InitialInterval = 2 * time.Second + b.MaxInterval = 15 * time.Second + b.MaxElapsedTime = 90 * time.Second + + var session *Session + notified := false + err := backoff.RetryNotify( + func() error { + var e error + session, e = OpenSession(ctx, client, sidecarID, identityFile, authSock) + if e != nil && !isTransientSSHError(e) { + return backoff.Permanent(e) + } + return e + }, + backoff.WithContext(b, ctx), + func(_ error, _ time.Duration) { + if !notified { + status(iostream.LevelInfo, "Waiting for sidecar SSH to become available...") + notified = true + } + }, + ) + if err != nil { + return nil, err + } + return session, nil +} + +// isTransientSSHError returns true for network-level errors that are worth +// retrying when opening a session — connection failures and timeouts that +// indicate the sidecar's SSH service is not yet ready. +func isTransientSSHError(err error) bool { + var netErr net.Error + return errors.As(err, &netErr) +} diff --git a/internal/sidecar/sync_whitebox_test.go b/internal/sidecar/sync_whitebox_test.go new file mode 100644 index 0000000..820368b --- /dev/null +++ b/internal/sidecar/sync_whitebox_test.go @@ -0,0 +1,61 @@ +package sidecar + +import ( + "fmt" + "net" + "testing" + + "gotest.tools/v3/assert" + + "github.com/CircleCI-Public/chunk-cli/internal/circleci" +) + +func TestIsTransientSSHError(t *testing.T) { + t.Run("timeout is transient", func(t *testing.T) { + err := &net.OpError{Op: "dial", Err: &timeoutError{}} + assert.Equal(t, isTransientSSHError(err), true) + }) + + t.Run("connection refused is transient", func(t *testing.T) { + err := &net.OpError{Op: "dial", Net: "tcp", Err: fmt.Errorf("connection refused")} + assert.Equal(t, isTransientSSHError(err), true) + }) + + t.Run("net error wrapped with fmt.Errorf is transient", func(t *testing.T) { + inner := &net.OpError{Op: "dial", Err: &timeoutError{}} + err := fmt.Errorf("register SSH key: %w", inner) + assert.Equal(t, isTransientSSHError(err), true) + }) + + t.Run("ErrNotAuthorized is not transient", func(t *testing.T) { + err := fmt.Errorf("add ssh key: %w", circleci.ErrNotAuthorized) + assert.Equal(t, isTransientSSHError(err), false) + }) + + t.Run("StatusError is not transient", func(t *testing.T) { + err := &circleci.StatusError{Op: "add ssh key", StatusCode: 503} + assert.Equal(t, isTransientSSHError(err), false) + }) + + t.Run("KeyNotFoundError is not transient", func(t *testing.T) { + err := &KeyNotFoundError{Path: "/home/user/.ssh/chunk_ai"} + assert.Equal(t, isTransientSSHError(err), false) + }) + + t.Run("PublicKeyNotFoundError is not transient", func(t *testing.T) { + err := &PublicKeyNotFoundError{KeyPath: "/home/user/.ssh/chunk_ai.pub"} + assert.Equal(t, isTransientSSHError(err), false) + }) + + t.Run("generic error is not transient", func(t *testing.T) { + err := fmt.Errorf("resolve home directory: permission denied") + assert.Equal(t, isTransientSSHError(err), false) + }) +} + +// timeoutError is a net.Error that reports Timeout() == true. +type timeoutError struct{} + +func (timeoutError) Error() string { return "i/o timeout" } +func (timeoutError) Timeout() bool { return true } +func (timeoutError) Temporary() bool { return true } From 5a56abae96a186d3666c90edfdc686bcb4b611fc Mon Sep 17 00:00:00 2001 From: schurchleycci Date: Tue, 19 May 2026 13:36:32 -0400 Subject: [PATCH 2/2] sidecar sync: move SSH retry to dial layer, narrow transient error check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move retry from OpenSession (CircleCI API call) to waitForSSHReady, which probes dialSSH directly — the actual WebSocket+SSH connection. This is the layer where connection-refused errors occur when the sidecar's SSH daemon is still starting. - Narrow isTransientSSHError from "any net.Error" to Timeout() or ECONNREFUSED specifically, so DNS failures and unreachable-host errors don't stall chunk sidecar sync for up to 90s. - Add TestWaitForSSHReady covering: immediate success, permanent error short-circuit (no notification), transient retry with single notification, and retry-until-success. - Update TestIsTransientSSHError to use real syscall.ECONNREFUSED errors and add EHOSTUNREACH as a not-transient case. Co-Authored-By: Claude Sonnet 4.6 --- internal/sidecar/ssh.go | 63 +++++++++++++ internal/sidecar/sync.go | 50 +---------- internal/sidecar/sync_whitebox_test.go | 117 ++++++++++++++++++++++++- 3 files changed, 182 insertions(+), 48 deletions(-) diff --git a/internal/sidecar/ssh.go b/internal/sidecar/ssh.go index 02afe87..36f490b 100644 --- a/internal/sidecar/ssh.go +++ b/internal/sidecar/ssh.go @@ -16,12 +16,16 @@ import ( "path/filepath" "sort" "strings" + "syscall" + "time" + "github.com/cenkalti/backoff/v4" "github.com/coder/websocket" "golang.org/x/crypto/ssh" "golang.org/x/term" "github.com/CircleCI-Public/chunk-cli/internal/closer" + "github.com/CircleCI-Public/chunk-cli/internal/iostream" ) // ExecResult holds the output of a command executed over SSH. @@ -336,3 +340,62 @@ func tofuHostKeyCallback(knownHostsPath, host string) ssh.HostKeyCallback { return err } } + +// waitForSSHReady probes the SSH connection with exponential backoff, retrying +// on transient errors so a newly-created sidecar has time to finish booting +// before its SSH service accepts connections. +func waitForSSHReady(ctx context.Context, session *Session, status iostream.StatusFunc) error { + return waitForSSHReadyWithDial(ctx, session, status, dialSSH) +} + +// waitForSSHReadyWithDial is the underlying implementation; dialFn is injectable +// for testing the retry control flow without a live network. +func waitForSSHReadyWithDial( + ctx context.Context, + session *Session, + status iostream.StatusFunc, + dialFn func(context.Context, *Session) (*sshConn, error), +) error { + b := backoff.NewExponentialBackOff() + b.InitialInterval = 2 * time.Second + b.MaxInterval = 15 * time.Second + b.MaxElapsedTime = 90 * time.Second + + notified := false + return backoff.RetryNotify( + func() error { + conn, err := dialFn(ctx, session) + if err != nil { + if !isTransientSSHError(err) { + return backoff.Permanent(err) + } + return err + } + if conn != nil { + _ = conn.Close() + } + return nil + }, + backoff.WithContext(b, ctx), + func(_ error, _ time.Duration) { + if !notified { + status(iostream.LevelInfo, "Waiting for sidecar SSH to become available...") + notified = true + } + }, + ) +} + +// isTransientSSHError reports whether err is a network-level error worth +// retrying when dialling the sidecar SSH service — specifically connection +// refused and timeouts that indicate the daemon is not yet ready. +func isTransientSSHError(err error) bool { + var netErr net.Error + if !errors.As(err, &netErr) { + return false + } + if netErr.Timeout() { + return true + } + return errors.Is(err, syscall.ECONNREFUSED) +} diff --git a/internal/sidecar/sync.go b/internal/sidecar/sync.go index 1f1d8ca..6891eee 100644 --- a/internal/sidecar/sync.go +++ b/internal/sidecar/sync.go @@ -4,13 +4,9 @@ import ( "context" "errors" "fmt" - "net" "os" "path/filepath" "strings" - "time" - - "github.com/cenkalti/backoff/v4" "github.com/CircleCI-Public/chunk-cli/internal/circleci" "github.com/CircleCI-Public/chunk-cli/internal/gitremote" @@ -56,10 +52,13 @@ func persistWorkspace(ctx context.Context, workspace string) error { func Sync(ctx context.Context, client *circleci.Client, sidecarID, identityFile, authSock, workdir string, status iostream.StatusFunc) error { - session, err := openSessionWithRetry(ctx, client, sidecarID, identityFile, authSock, status) + session, err := OpenSession(ctx, client, sidecarID, identityFile, authSock) if err != nil { return err } + if err := waitForSSHReady(ctx, session, status); err != nil { + return err + } cwd, err := os.Getwd() if err != nil { @@ -205,44 +204,3 @@ func syncWorkspace(ctx context.Context, status iostream.StatusFunc, org, repo, r } return nil } - -// openSessionWithRetry calls OpenSession, retrying on transient errors to give -// a newly-created sidecar time to finish booting before its SSH service is ready. -func openSessionWithRetry(ctx context.Context, client *circleci.Client, sidecarID, identityFile, authSock string, status iostream.StatusFunc) (*Session, error) { - b := backoff.NewExponentialBackOff() - b.InitialInterval = 2 * time.Second - b.MaxInterval = 15 * time.Second - b.MaxElapsedTime = 90 * time.Second - - var session *Session - notified := false - err := backoff.RetryNotify( - func() error { - var e error - session, e = OpenSession(ctx, client, sidecarID, identityFile, authSock) - if e != nil && !isTransientSSHError(e) { - return backoff.Permanent(e) - } - return e - }, - backoff.WithContext(b, ctx), - func(_ error, _ time.Duration) { - if !notified { - status(iostream.LevelInfo, "Waiting for sidecar SSH to become available...") - notified = true - } - }, - ) - if err != nil { - return nil, err - } - return session, nil -} - -// isTransientSSHError returns true for network-level errors that are worth -// retrying when opening a session — connection failures and timeouts that -// indicate the sidecar's SSH service is not yet ready. -func isTransientSSHError(err error) bool { - var netErr net.Error - return errors.As(err, &netErr) -} diff --git a/internal/sidecar/sync_whitebox_test.go b/internal/sidecar/sync_whitebox_test.go index 820368b..901c87e 100644 --- a/internal/sidecar/sync_whitebox_test.go +++ b/internal/sidecar/sync_whitebox_test.go @@ -1,13 +1,20 @@ package sidecar import ( + "context" + "errors" "fmt" "net" + "os" + "path/filepath" + "syscall" "testing" "gotest.tools/v3/assert" "github.com/CircleCI-Public/chunk-cli/internal/circleci" + "github.com/CircleCI-Public/chunk-cli/internal/iostream" + "github.com/CircleCI-Public/chunk-cli/internal/testing/fakes" ) func TestIsTransientSSHError(t *testing.T) { @@ -17,16 +24,39 @@ func TestIsTransientSSHError(t *testing.T) { }) t.Run("connection refused is transient", func(t *testing.T) { - err := &net.OpError{Op: "dial", Net: "tcp", Err: fmt.Errorf("connection refused")} + err := &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED}, + } assert.Equal(t, isTransientSSHError(err), true) }) - t.Run("net error wrapped with fmt.Errorf is transient", func(t *testing.T) { + t.Run("connection refused wrapped with fmt.Errorf is transient", func(t *testing.T) { + inner := &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED}, + } + err := fmt.Errorf("websocket connect: %w", inner) + assert.Equal(t, isTransientSSHError(err), true) + }) + + t.Run("timeout wrapped with fmt.Errorf is transient", func(t *testing.T) { inner := &net.OpError{Op: "dial", Err: &timeoutError{}} err := fmt.Errorf("register SSH key: %w", inner) assert.Equal(t, isTransientSSHError(err), true) }) + t.Run("unreachable host is not transient", func(t *testing.T) { + err := &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &os.SyscallError{Syscall: "connect", Err: syscall.EHOSTUNREACH}, + } + assert.Equal(t, isTransientSSHError(err), false) + }) + t.Run("ErrNotAuthorized is not transient", func(t *testing.T) { err := fmt.Errorf("add ssh key: %w", circleci.ErrNotAuthorized) assert.Equal(t, isTransientSSHError(err), false) @@ -53,6 +83,89 @@ func TestIsTransientSSHError(t *testing.T) { }) } +func TestWaitForSSHReady(t *testing.T) { + t.Run("succeeds immediately when SSH server is ready", func(t *testing.T) { + keyFile, pubKey := fakes.GenerateSSHKeypair(t) + sshSrv := fakes.NewSSHServer(t, pubKey) + + session := &Session{ + URL: sshSrv.Addr(), + IdentityFile: keyFile, + KnownHosts: filepath.Join(t.TempDir(), "known_hosts"), + } + + var notified bool + statusFn := iostream.StatusFunc(func(_ iostream.Level, _ string) { notified = true }) + + err := waitForSSHReady(context.Background(), session, statusFn) + assert.NilError(t, err) + assert.Equal(t, notified, false, "no retry should be needed when SSH is already ready") + }) + + t.Run("permanent error returns immediately without notifying", func(t *testing.T) { + permanentErr := errors.New("ssh handshake: auth failed") + + var notifications int + statusFn := iostream.StatusFunc(func(_ iostream.Level, _ string) { notifications++ }) + + err := waitForSSHReadyWithDial(context.Background(), &Session{}, statusFn, + func(_ context.Context, _ *Session) (*sshConn, error) { + return nil, permanentErr // not a net.Error → permanent + }, + ) + assert.ErrorIs(t, err, permanentErr) + assert.Equal(t, notifications, 0, "permanent error should not trigger retry notification") + }) + + t.Run("retries on transient error and notifies exactly once", func(t *testing.T) { + transientErr := &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED}, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var notifications int + statusFn := iostream.StatusFunc(func(_ iostream.Level, _ string) { + notifications++ + cancel() // stop retrying after the first notification + }) + + err := waitForSSHReadyWithDial(ctx, &Session{}, statusFn, + func(_ context.Context, _ *Session) (*sshConn, error) { + return nil, transientErr + }, + ) + assert.Assert(t, err != nil, "should return an error when retries are stopped") + assert.Equal(t, notifications, 1, "status should be notified exactly once regardless of retry count") + }) + + t.Run("succeeds after transient errors resolve", func(t *testing.T) { + transientErr := &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED}, + } + + attempts := 0 + statusFn := iostream.StatusFunc(func(_ iostream.Level, _ string) {}) + + err := waitForSSHReadyWithDial(context.Background(), &Session{}, statusFn, + func(_ context.Context, _ *Session) (*sshConn, error) { + attempts++ + if attempts < 3 { + return nil, transientErr + } + return nil, nil // success: SSH is now ready + }, + ) + assert.NilError(t, err) + assert.Equal(t, attempts, 3, "should have retried until success") + }) +} + // timeoutError is a net.Error that reports Timeout() == true. type timeoutError struct{}