Cache step results while steps are executing instead of caching all of them at the end (#709)

BolajiOlajide · eseliger · courier-new · web-flow · commit 9387b190cdea · 2022-03-10T11:33:33.000Z
* feat: cache build steps individually

* feat: add tests for individual step caching

* chore: remove unused method

* Update internal/batches/executor/coordinator.go

Co-authored-by: Erik Seliger <erikseliger@me.com>

* Update internal/batches/executor/executor.go

Co-authored-by: Erik Seliger <erikseliger@me.com>

* feat: update callback method in executor

* feat: optimize writeToCache callback

* chore: fix failing tests

* Update internal/batches/executor/run_steps.go

Co-authored-by: Kelli Rockwell <kelli@sourcegraph.com>

* fix: resolve failing tests

Co-authored-by: Erik Seliger <erikseliger@me.com>
Co-authored-by: Kelli Rockwell <kelli@sourcegraph.com>
diff --git a/internal/batches/executor/coordinator.go b/internal/batches/executor/coordinator.go
@@ -63,6 +63,8 @@ type NewCoordinatorOpts struct {
 func NewCoordinator(opts NewCoordinatorOpts) *Coordinator {
 	logManager := log.NewManager(opts.TempDir, opts.KeepLogs)
 
+	globalEnv := os.Environ()
+
 	exec := newExecutor(newExecutorOpts{
 		RepoArchiveRegistry: opts.RepoArchiveRegistry,
 		EnsureImage:         opts.EnsureImage,
@@ -72,6 +74,10 @@ func NewCoordinator(opts NewCoordinatorOpts) *Coordinator {
 		Parallelism: opts.Parallelism,
 		Timeout:     opts.Timeout,
 		TempDir:     opts.TempDir,
+		WriteStepCacheResult: func(ctx context.Context, stepResult execution.AfterStepResult, task *Task) error {
+			cacheKey := task.cacheKey(globalEnv)
+			return writeToCache(ctx, opts.Cache, stepResult, task, cacheKey)
+		},
 	})
 
 	return &Coordinator{
@@ -221,27 +227,28 @@ func (c *Coordinator) loadCachedStepResults(ctx context.Context, task *Task, glo
 	return nil
 }
 
-func (c *Coordinator) writeCache(ctx context.Context, taskResult taskResult, ui TaskExecutionUI) error {
+func writeToCache(ctx context.Context, cache cache.Cache, stepResult execution.AfterStepResult, task *Task, cacheKey *cache.ExecutionKeyWithGlobalEnv) error {
+	key := cacheKeyForStep(cacheKey, stepResult.StepIndex)
+	if err := cache.SetStepResult(ctx, key, stepResult); err != nil {
+		return errors.Wrapf(err, "caching result for step %d in %q", stepResult.StepIndex, task.Repository.Name)
+	}
+
+	return nil
+}
+
+func (c *Coordinator) writeExecutionCacheResult(ctx context.Context, taskResult taskResult, ui TaskExecutionUI) error {
 	// Add to the cache, even if no diff was produced.
 	globalEnv := os.Environ()
 	cacheKey := taskResult.task.cacheKey(globalEnv)
 	if err := c.cache.Set(ctx, cacheKey, taskResult.result); err != nil {
 		return errors.Wrapf(err, "caching result for %q", taskResult.task.Repository.Name)
 	}
 
-	// Save the per-step results
-	for _, stepResult := range taskResult.stepResults {
-		key := cacheKeyForStep(cacheKey, stepResult.StepIndex)
-		if err := c.cache.SetStepResult(ctx, key, stepResult); err != nil {
-			return errors.Wrapf(err, "caching result for step %d in %q", stepResult.StepIndex, taskResult.task.Repository.Name)
-		}
-	}
-
 	return nil
 }
 
 func (c *Coordinator) writeCacheAndBuildSpecs(ctx context.Context, batchSpec *batcheslib.BatchSpec, taskResult taskResult, ui TaskExecutionUI) ([]*batcheslib.ChangesetSpec, error) {
-	c.writeCache(ctx, taskResult, ui)
+	c.writeExecutionCacheResult(ctx, taskResult, ui)
 
 	// If the steps didn't result in any diff, we don't need to create a
 	// changeset spec that's displayed to the user and send to the server.
@@ -265,7 +272,7 @@ func (c *Coordinator) Execute(ctx context.Context, tasks []*Task, ui TaskExecuti
 
 	// Write results to cache.
 	for _, taskResult := range results {
-		if cacheErr := c.writeCache(ctx, taskResult, ui); cacheErr != nil {
+		if cacheErr := c.writeExecutionCacheResult(ctx, taskResult, ui); cacheErr != nil {
 			return cacheErr
 		}
 	}
diff --git a/internal/batches/executor/coordinator_test.go b/internal/batches/executor/coordinator_test.go
@@ -389,8 +389,8 @@ func TestCoordinator_Execute_StepCaching(t *testing.T) {
 	execAndEnsure(t, coord, executor, batchSpec, task, assertNoCachedResult(t))
 	// We now expect the cache to have 1+N entries: 1 for the complete task, N
 	// for the steps.
-	wantCacheSize := len(task.Steps) + 1
-	assertCacheSize(t, cache, wantCacheSize)
+
+	assertCacheSize(t, cache, 1)
 
 	// Reset task
 	task.CachedResultFound = false
@@ -399,23 +399,21 @@ func TestCoordinator_Execute_StepCaching(t *testing.T) {
 	task.Steps[1].Run = `echo "two modified"`
 	// Re-execution should start with the diff produced by steps[0] as the
 	// start state from which steps[1] is then re-executed.
-	execAndEnsure(t, coord, executor, batchSpec, task, assertCachedResultForStep(t, 0))
+	execAndEnsure(t, coord, executor, batchSpec, task, assertNoCachedResult(t))
 	// Cache now contains old entries, plus another "complete task" entry and
 	// two entries for newly executed steps.
-	wantCacheSize += 1 + 2
-	assertCacheSize(t, cache, wantCacheSize)
+	assertCacheSize(t, cache, 2)
 
 	// Reset task
 	task.CachedResultFound = false
 
 	// Change the 3rd step's definition:
 	task.Steps[2].Run = `echo "three modified"`
 	// Re-execution should use the diff from steps[1] as start state
-	execAndEnsure(t, coord, executor, batchSpec, task, assertCachedResultForStep(t, 1))
+	execAndEnsure(t, coord, executor, batchSpec, task, assertNoCachedResult(t))
 	// Cache now contains old entries, plus another "complete task" entry and
 	// a single new step entry
-	wantCacheSize += 1 + 1
-	assertCacheSize(t, cache, wantCacheSize)
+	assertCacheSize(t, cache, 3)
 
 	// Reset task
 	task.CachedResultFound = false
@@ -429,7 +427,7 @@ func TestCoordinator_Execute_StepCaching(t *testing.T) {
 	// Cache should have the same number of entries: the cached step results should
 	// have been cleared (the complete-task-result is cleared in another
 	// code path) and the same amount of cached entries has been added.
-	assertCacheSize(t, cache, wantCacheSize)
+	assertCacheSize(t, cache, 3)
 }
 
 // execAndEnsure executes the given Task with the given cache and dummyExecutor
@@ -475,24 +473,6 @@ func assertCacheSize(t *testing.T, cache *inMemoryExecutionCache, want int) {
 	}
 }
 
-// assertCachedResultForStep returns a function that can be used as a
-// startCallback on dummyExecutor to assert that the first Task has a cached
-// result for the given step.
-func assertCachedResultForStep(t *testing.T, step int) func(context.Context, []*Task, TaskExecutionUI) {
-	return func(c context.Context, tasks []*Task, ui TaskExecutionUI) {
-		t.Helper()
-
-		task := tasks[0]
-		if !task.CachedResultFound {
-			t.Fatalf("CachedResultFound not set")
-		}
-
-		if have, want := task.CachedResult.StepIndex, step; have != want {
-			t.Fatalf("CachedResult.Step wrong. have=%d, want=%d", have, want)
-		}
-	}
-}
-
 // expectCachedResultForStep returns a function that can be used as a
 // startCallback on dummyExecutor to assert that the first Task has no cached results.
 func assertNoCachedResult(t *testing.T) func(context.Context, []*Task, TaskExecutionUI) {
diff --git a/internal/batches/executor/executor.go b/internal/batches/executor/executor.go
@@ -59,9 +59,10 @@ type newExecutorOpts struct {
 	Logger              log.LogManager
 
 	// Config
-	Parallelism int
-	Timeout     time.Duration
-	TempDir     string
+	Parallelism          int
+	Timeout              time.Duration
+	TempDir              string
+	WriteStepCacheResult func(ctx context.Context, stepResult execution.AfterStepResult, task *Task) error
 }
 
 type executor struct {
@@ -177,7 +178,8 @@ func (x *executor) do(ctx context.Context, task *Task, ui TaskExecutionUI) (err
 		ensureImage: x.opts.EnsureImage,
 		tempDir:     x.opts.TempDir,
 
-		ui: ui.StepsExecutionUI(task),
+		ui:                   ui.StepsExecutionUI(task),
+		writeStepCacheResult: x.opts.WriteStepCacheResult,
 	}
 
 	result, stepResults, err := runSteps(runCtx, opts)
diff --git a/internal/batches/executor/executor_test.go b/internal/batches/executor/executor_test.go
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 
@@ -69,6 +70,8 @@ func TestExecutor_Integration(t *testing.T) {
 
 		wantFinished        int
 		wantFinishedWithErr int
+
+		wantCacheCount int
 	}{
 		{
 			name: "success",
@@ -97,7 +100,8 @@ func TestExecutor_Integration(t *testing.T) {
 					rootPath: []string{"README.md"},
 				},
 			},
-			wantFinished: 2,
+			wantFinished:   2,
+			wantCacheCount: 4,
 		},
 		{
 			name: "empty",
@@ -120,7 +124,8 @@ func TestExecutor_Integration(t *testing.T) {
 					rootPath: []string{},
 				},
 			},
-			wantFinished: 1,
+			wantFinished:   1,
+			wantCacheCount: 1,
 		},
 		{
 			name: "timeout",
@@ -178,7 +183,8 @@ func TestExecutor_Integration(t *testing.T) {
 					},
 				},
 			},
-			wantFinished: 1,
+			wantFinished:   1,
+			wantCacheCount: 5,
 		},
 		{
 			name: "workspaces",
@@ -233,7 +239,8 @@ func TestExecutor_Integration(t *testing.T) {
 					"a/b":    []string{"a/b/hello.txt", "a/b/gitignore-exists", "a/b/gitignore-exists-in-a"},
 				},
 			},
-			wantFinished: 3,
+			wantFinished:   3,
+			wantCacheCount: 15,
 		},
 		{
 			name: "step condition",
@@ -268,7 +275,8 @@ func TestExecutor_Integration(t *testing.T) {
 					"sub/directory/of/repo": []string{"README.md", "hello.txt", "in-path.txt"},
 				},
 			},
-			wantFinished: 2,
+			wantFinished:   2,
+			wantCacheCount: 4,
 		},
 		{
 			name: "skips errors",
@@ -300,6 +308,7 @@ func TestExecutor_Integration(t *testing.T) {
 			wantErrInclude:      "execution in github.com/sourcegraph/sourcegraph failed: run: exit 1",
 			wantFinished:        1,
 			wantFinishedWithErr: 1,
+			wantCacheCount:      2,
 		},
 	}
 
@@ -337,6 +346,9 @@ func TestExecutor_Integration(t *testing.T) {
 			// Temp dir for log files and downloaded archives
 			testTempDir := t.TempDir()
 
+			cacheCount := 0
+			var cacheLock sync.Mutex
+
 			// Setup executor
 			opts := newExecutorOpts{
 				Creator:             workspace.NewCreator(context.Background(), "bind", testTempDir, testTempDir, images),
@@ -347,6 +359,12 @@ func TestExecutor_Integration(t *testing.T) {
 				TempDir:     testTempDir,
 				Parallelism: runtime.GOMAXPROCS(0),
 				Timeout:     tc.executorTimeout,
+				WriteStepCacheResult: func(ctx context.Context, stepResult execution.AfterStepResult, task *Task) error {
+					cacheLock.Lock()
+					cacheCount += 1
+					cacheLock.Unlock()
+					return nil
+				},
 			}
 
 			if opts.Timeout == 0 {
@@ -372,6 +390,9 @@ func TestExecutor_Integration(t *testing.T) {
 				}
 			}
 
+			if tc.wantCacheCount != cacheCount {
+				t.Errorf("wrong cache count. have=%d want=%d", cacheCount, tc.wantCacheCount)
+			}
 			wantResults := 0
 			resultsFound := map[string]map[string]bool{}
 			for repo, byPath := range tc.wantFilesChanged {
@@ -695,6 +716,9 @@ func testExecuteTasks(t *testing.T, tasks []*Task, archives ...mock.RepoArchive)
 		TempDir:     testTempDir,
 		Parallelism: runtime.GOMAXPROCS(0),
 		Timeout:     30 * time.Second,
+		WriteStepCacheResult: func(ctx context.Context, stepResult execution.AfterStepResult, task *Task) error {
+			return nil
+		},
 	})
 
 	executor.Start(context.Background(), tasks, newDummyTaskExecutionUI())
diff --git a/internal/batches/executor/run_steps.go b/internal/batches/executor/run_steps.go
@@ -37,6 +37,8 @@ type executionOpts struct {
 	logger log.TaskLogger
 
 	ui StepsExecutionUI
+
+	writeStepCacheResult func(ctx context.Context, stepResult execution.AfterStepResult, task *Task) error
 }
 
 func runSteps(ctx context.Context, opts *executionOpts) (result execution.Result, stepResults []execution.AfterStepResult, err error) {
@@ -186,6 +188,12 @@ func runSteps(ctx context.Context, opts *executionOpts) (result execution.Result
 		stepResults = append(stepResults, stepResult)
 		previousStepResult = result
 
+		// cache the result here
+		err = opts.writeStepCacheResult(ctx, stepResult, opts.task)
+		if err != nil {
+			return execResult, nil, errors.Wrap(err, "failed to cache stepResult")
+		}
+
 		opts.ui.StepFinished(i+1, stepResult.Diff, result.Files, stepResult.Outputs)
 	}