From 81dc810d58a26ddfbfac3063ee3868d9c8431489 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 12:16:03 +0000
Subject: [PATCH 01/19] refactor: move spamoor benchmark into testify suite in
 test/e2e/benchmark

- Create test/e2e/benchmark/ subpackage with SpamoorSuite (testify/suite)
- Move spamoor smoke test into suite as TestSpamoorSmoke
- Split helpers into focused files: traces.go, output.go, metrics.go
- Introduce resultWriter for defer-based benchmark JSON output
- Export shared symbols from evm_test_common.go for cross-package use
- Restructure CI to fan-out benchmark jobs and fan-in publishing
- Run benchmarks on PRs only when benchmark-related files change
---
 .github/workflows/benchmark.yml          | 111 ++++++++-
 test/e2e/benchmark/metrics.go            |  48 ++++
 test/e2e/benchmark/output.go             |  77 +++++++
 test/e2e/benchmark/spamoor_smoke_test.go | 114 +++++++++
 test/e2e/benchmark/suite_test.go         | 144 ++++++++++++
 test/e2e/benchmark/traces.go             |  67 ++++++
 test/e2e/evm_contract_bench_test.go      |   6 +-
 test/e2e/evm_force_inclusion_e2e_test.go |   6 +-
 test/e2e/evm_full_node_e2e_test.go       |  10 +-
 test/e2e/evm_spamoor_smoke_test.go       | 281 -----------------------
 test/e2e/evm_test_common.go              |  75 +++---
 test/e2e/failover_e2e_test.go            |   4 +-
 12 files changed, 605 insertions(+), 338 deletions(-)
 create mode 100644 test/e2e/benchmark/metrics.go
 create mode 100644 test/e2e/benchmark/output.go
 create mode 100644 test/e2e/benchmark/spamoor_smoke_test.go
 create mode 100644 test/e2e/benchmark/suite_test.go
 create mode 100644 test/e2e/benchmark/traces.go
 delete mode 100644 test/e2e/evm_spamoor_smoke_test.go

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 15cb1a4ff6..2f8fcdb1c7 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -5,6 +5,15 @@ permissions: {}
   push:
     branches:
       - main
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'test/e2e/benchmark/**'
+      - 'test/e2e/evm_contract_bench_test.go'
+      - 'test/e2e/evm_test_common.go'
+      - 'test/e2e/sut_helper.go'
+      - '.github/workflows/benchmark.yml'
   workflow_dispatch:
 
 jobs:
@@ -12,9 +21,6 @@ jobs:
     name: EVM Contract Benchmark
     runs-on: ubuntu-latest
     timeout-minutes: 30
-    permissions:
-      contents: write
-      issues: write
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - name: Set up Go
@@ -29,30 +35,113 @@ jobs:
         run: |
           cd test/e2e && go test -tags evm -bench=. -benchmem -run='^$' \
             -timeout=10m --evm-binary=../../build/evm | tee output.txt
-      - name: Store benchmark result
+      - name: Run Block Executor benchmarks
+        run: |
+          go test -bench=BenchmarkProduceBlock -benchmem -run='^$' \
+            ./block/internal/executing/... > block_executor_output.txt
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: evm-benchmark-results
+          path: |
+            test/e2e/output.txt
+            block_executor_output.txt
+
+  spamoor-benchmark:
+    name: Spamoor Trace Benchmark
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Set up Go
+        uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0
+        with:
+          go-version-file: ./go.mod
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+      - name: Build binaries
+        run: make build-evm build-da
+      - name: Run Spamoor smoke test
+        run: |
+          cd test/e2e && BENCH_JSON_OUTPUT=benchmark/spamoor_bench.json go test -tags evm \
+            -run='^TestSpamoorSuite$/^TestSpamoorSmoke$' -v -timeout=15m \
+            --evm-binary=../../build/evm ./benchmark/
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: spamoor-benchmark-results
+          path: test/e2e/benchmark/spamoor_bench.json
+
+  # single job to push all results to gh-pages sequentially, avoiding race conditions
+  publish-benchmarks:
+    name: Publish Benchmark Results
+    needs: [evm-benchmark, spamoor-benchmark]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Download EVM benchmark results
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: evm-benchmark-results
+      - name: Download Spamoor benchmark results
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: spamoor-benchmark-results
+          path: test/e2e/benchmark/
+
+      # only update the benchmark baseline on push/dispatch, not on PRs
+      - name: Store EVM Contract Roundtrip result
+        if: always()
         uses: benchmark-action/github-action-benchmark@4bdcce38c94cec68da58d012ac24b7b1155efe8b # v1.20.7
         with:
           name: EVM Contract Roundtrip
           tool: 'go'
           output-file-path: test/e2e/output.txt
-          auto-push: true
+          auto-push: ${{ github.event_name != 'pull_request' }}
+          save-data-file: ${{ github.event_name != 'pull_request' }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
           alert-threshold: '150%'
           fail-on-alert: true
           comment-on-alert: true
 
-      - name: Run Block Executor benchmarks
-        run: |
-          go test -bench=BenchmarkProduceBlock -benchmem -run='^$' \
-            ./block/internal/executing/... > block_executor_output.txt
-      - name: Store Block Executor benchmark result
+      # delete local gh-pages so the next benchmark action step fetches fresh from remote
+      - name: Reset local gh-pages branch
+        if: always()
+        run: git branch -D gh-pages || true
+
+      - name: Store Block Executor result
+        if: always()
         uses: benchmark-action/github-action-benchmark@4bdcce38c94cec68da58d012ac24b7b1155efe8b # v1.20.7
         with:
           name: Block Executor Benchmark
           tool: 'go'
           output-file-path: block_executor_output.txt
-          auto-push: true
+          auto-push: ${{ github.event_name != 'pull_request' }}
+          save-data-file: ${{ github.event_name != 'pull_request' }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
           alert-threshold: '150%'
           fail-on-alert: true
           comment-on-alert: true
+
+      # delete local gh-pages so the next benchmark action step fetches fresh from remote
+      - name: Reset local gh-pages branch
+        if: always()
+        run: git branch -D gh-pages || true
+
+      - name: Store Spamoor Trace result
+        if: always()
+        uses: benchmark-action/github-action-benchmark@4bdcce38c94cec68da58d012ac24b7b1155efe8b # v1.20.7
+        with:
+          name: Spamoor Trace Benchmarks
+          tool: 'customSmallerIsBetter'
+          output-file-path: test/e2e/benchmark/spamoor_bench.json
+          auto-push: ${{ github.event_name != 'pull_request' }}
+          save-data-file: ${{ github.event_name != 'pull_request' }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          alert-threshold: '150%'
+          fail-on-alert: false
+          comment-on-alert: true
diff --git a/test/e2e/benchmark/metrics.go b/test/e2e/benchmark/metrics.go
new file mode 100644
index 0000000000..fcc51af865
--- /dev/null
+++ b/test/e2e/benchmark/metrics.go
@@ -0,0 +1,48 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"fmt"
+	"net/http"
+	"testing"
+	"time"
+
+	dto "github.com/prometheus/client_model/go"
+)
+
+// requireHTTP polls a URL until it returns a 2xx status code or the timeout expires.
+func requireHTTP(t testing.TB, url string, timeout time.Duration) {
+	t.Helper()
+	client := &http.Client{Timeout: 200 * time.Millisecond}
+	deadline := time.Now().Add(timeout)
+	var lastErr error
+	for time.Now().Before(deadline) {
+		resp, err := client.Get(url)
+		if err == nil {
+			_ = resp.Body.Close()
+			if resp.StatusCode >= 200 && resp.StatusCode < 300 {
+				return
+			}
+			lastErr = fmt.Errorf("status %d", resp.StatusCode)
+		} else {
+			lastErr = err
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+	t.Fatalf("daemon not ready at %s: %v", url, lastErr)
+}
+
+// sumCounter sums all counter values in a prometheus MetricFamily.
+func sumCounter(f *dto.MetricFamily) float64 {
+	if f == nil || f.GetType() != dto.MetricType_COUNTER {
+		return 0
+	}
+	var sum float64
+	for _, m := range f.GetMetric() {
+		if m.GetCounter() != nil && m.GetCounter().Value != nil {
+			sum += m.GetCounter().GetValue()
+		}
+	}
+	return sum
+}
diff --git a/test/e2e/benchmark/output.go b/test/e2e/benchmark/output.go
new file mode 100644
index 0000000000..0d05ebd4ec
--- /dev/null
+++ b/test/e2e/benchmark/output.go
@@ -0,0 +1,77 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"sort"
+	"testing"
+
+	e2e "github.com/evstack/ev-node/test/e2e"
+	"github.com/stretchr/testify/require"
+)
+
+// entry matches the customSmallerIsBetter format for github-action-benchmark.
+type entry struct {
+	Name  string  `json:"name"`
+	Unit  string  `json:"unit"`
+	Value float64 `json:"value"`
+}
+
+// resultWriter accumulates benchmark entries and writes them to a JSON file
+// when flush is called. Create one early in a test and defer flush so results
+// are written regardless of where the test exits.
+type resultWriter struct {
+	t       testing.TB
+	label   string
+	entries []entry
+}
+
+func newResultWriter(t testing.TB, label string) *resultWriter {
+	return &resultWriter{t: t, label: label}
+}
+
+// addSpans aggregates trace spans into per-operation avg duration entries.
+func (w *resultWriter) addSpans(spans []e2e.TraceSpan) {
+	m := e2e.AggregateSpanStats(spans)
+	if len(m) == 0 {
+		return
+	}
+
+	names := make([]string, 0, len(m))
+	for name := range m {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	for _, name := range names {
+		s := m[name]
+		avg := float64(s.Total.Microseconds()) / float64(s.Count)
+		w.entries = append(w.entries, entry{
+			Name:  fmt.Sprintf("%s - %s (avg)", w.label, name),
+			Unit:  "us",
+			Value: avg,
+		})
+	}
+}
+
+// addEntry appends a custom entry to the results.
+func (w *resultWriter) addEntry(e entry) {
+	w.entries = append(w.entries, e)
+}
+
+// flush writes accumulated entries to the path in BENCH_JSON_OUTPUT.
+// It is a no-op when the env var is unset or no entries were added.
+func (w *resultWriter) flush() {
+	outputPath := os.Getenv("BENCH_JSON_OUTPUT")
+	if outputPath == "" || len(w.entries) == 0 {
+		return
+	}
+
+	data, err := json.MarshalIndent(w.entries, "", "  ")
+	require.NoError(w.t, err, "failed to marshal benchmark JSON")
+	require.NoError(w.t, os.WriteFile(outputPath, data, 0644), "failed to write benchmark JSON to %s", outputPath)
+	w.t.Logf("wrote %d benchmark entries to %s", len(w.entries), outputPath)
+}
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
new file mode 100644
index 0000000000..1f56f26874
--- /dev/null
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -0,0 +1,114 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"time"
+
+	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
+	e2e "github.com/evstack/ev-node/test/e2e"
+)
+
+// TestSpamoorSmoke spins up reth + sequencer and a Spamoor node, starts a few
+// basic spammers, waits briefly, then validates trace spans and prints a concise
+// metrics summary.
+func (s *SpamoorSuite) TestSpamoorSmoke() {
+	t := s.T()
+	w := newResultWriter(t, "SpamoorSmoke")
+	defer w.flush()
+
+	e := s.setupEnv(config{
+		rethTag:     "pr-140",
+		serviceName: "ev-node-smoke",
+	})
+	api := e.spamoorAPI
+
+	eoatx := map[string]any{
+		"throughput":      100,
+		"total_count":     3000,
+		"max_pending":     4000,
+		"max_wallets":     300,
+		"amount":          100,
+		"random_amount":   true,
+		"random_target":   true,
+		"base_fee":        20,
+		"tip_fee":         2,
+		"refill_amount":   "1000000000000000000",
+		"refill_balance":  "500000000000000000",
+		"refill_interval": 600,
+	}
+
+	gasburner := map[string]any{
+		"throughput":        25,
+		"total_count":       2000,
+		"max_pending":       8000,
+		"max_wallets":       500,
+		"gas_units_to_burn": 3000000,
+		"base_fee":          20,
+		"tip_fee":           5,
+		"rebroadcast":       5,
+		"refill_amount":     "5000000000000000000",
+		"refill_balance":    "2000000000000000000",
+		"refill_interval":   300,
+	}
+
+	var ids []int
+	id, err := api.CreateSpammer("smoke-eoatx", spamoor.ScenarioEOATX, eoatx, true)
+	s.Require().NoError(err, "failed to create eoatx spammer")
+	ids = append(ids, id)
+	id, err = api.CreateSpammer("smoke-gasburner", spamoor.ScenarioGasBurnerTX, gasburner, true)
+	s.Require().NoError(err, "failed to create gasburner spammer")
+	ids = append(ids, id)
+
+	for _, id := range ids {
+		idToDelete := id
+		t.Cleanup(func() { _ = api.DeleteSpammer(idToDelete) })
+	}
+
+	// allow spamoor enough time to generate transaction throughput
+	// so that the expected tracing spans appear in Jaeger.
+	time.Sleep(60 * time.Second)
+
+	// fetch parsed metrics and print a concise summary.
+	metrics, err := api.GetMetrics()
+	s.Require().NoError(err, "failed to get metrics")
+	sent := sumCounter(metrics["spamoor_transactions_sent_total"])
+	fail := sumCounter(metrics["spamoor_transactions_failed_total"])
+
+	// collect traces
+	evNodeSpans := s.collectServiceTraces(e, "ev-node-smoke")
+	evRethSpans := s.collectServiceTraces(e, "ev-reth")
+	e2e.PrintTraceReport(t, "ev-node-smoke", evNodeSpans)
+	e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+
+	w.addSpans(append(evNodeSpans, evRethSpans...))
+
+	// assert expected ev-node span names
+	assertSpanNames(t, evNodeSpans, []string{
+		"BlockExecutor.ProduceBlock",
+		"BlockExecutor.ApplyBlock",
+		"BlockExecutor.CreateBlock",
+		"BlockExecutor.RetrieveBatch",
+		"Executor.ExecuteTxs",
+		"Executor.SetFinal",
+		"Engine.ForkchoiceUpdated",
+		"Engine.NewPayload",
+		"Engine.GetPayload",
+		"Eth.GetBlockByNumber",
+		"Sequencer.GetNextBatch",
+		"DASubmitter.SubmitHeaders",
+		"DASubmitter.SubmitData",
+		"DA.Submit",
+	}, "ev-node-smoke")
+
+	// assert expected ev-reth span names
+	assertSpanNames(t, evRethSpans, []string{
+		"build_payload",
+		"execute_tx",
+		"try_build",
+		"validate_transaction",
+	}, "ev-reth")
+
+	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
+	s.Require().Zero(fail, "no transactions should have failed")
+}
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
new file mode 100644
index 0000000000..a43f25890c
--- /dev/null
+++ b/test/e2e/benchmark/suite_test.go
@@ -0,0 +1,144 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+	"time"
+
+	tastoradocker "github.com/celestiaorg/tastora/framework/docker"
+	"github.com/celestiaorg/tastora/framework/docker/evstack/reth"
+	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
+	"github.com/celestiaorg/tastora/framework/docker/jaeger"
+	tastoratypes "github.com/celestiaorg/tastora/framework/types"
+	"github.com/ethereum/go-ethereum/ethclient"
+	"github.com/stretchr/testify/suite"
+	"go.uber.org/zap/zaptest"
+
+	e2e "github.com/evstack/ev-node/test/e2e"
+)
+
+// SpamoorSuite groups benchmarks that use Spamoor for load generation
+// and Jaeger for distributed tracing. Docker client and network are shared
+// across all tests in the suite.
+type SpamoorSuite struct {
+	suite.Suite
+	dockerCli tastoratypes.TastoraDockerClient
+	networkID string
+}
+
+func TestSpamoorSuite(t *testing.T) {
+	suite.Run(t, new(SpamoorSuite))
+}
+
+func (s *SpamoorSuite) SetupTest() {
+	s.dockerCli, s.networkID = tastoradocker.Setup(s.T())
+}
+
+// env holds a fully-wired environment created by setupEnv.
+type env struct {
+	jaeger     *jaeger.Node
+	evmEnv     *e2e.EVMEnv
+	sut        *e2e.SystemUnderTest
+	spamoorAPI *spamoor.API
+	ethClient  *ethclient.Client
+}
+
+// config parameterizes the per-test environment setup.
+type config struct {
+	rethTag     string
+	serviceName string
+}
+
+// setupEnv creates a Jaeger + reth + sequencer + Spamoor environment for
+// a single test. Each call spins up isolated infrastructure so tests
+// can't interfere with each other.
+func (s *SpamoorSuite) setupEnv(cfg config) *env {
+	t := s.T()
+	ctx := t.Context()
+	sut := e2e.NewSystemUnderTest(t)
+
+	// jaeger
+	jcfg := jaeger.Config{Logger: zaptest.NewLogger(t), DockerClient: s.dockerCli, DockerNetworkID: s.networkID}
+	jg, err := jaeger.New(ctx, jcfg, t.Name(), 0)
+	s.Require().NoError(err, "failed to create jaeger node")
+	t.Cleanup(func() { _ = jg.Remove(t.Context()) })
+	s.Require().NoError(jg.Start(ctx), "failed to start jaeger node")
+
+	// reth + local DA with OTLP tracing to Jaeger
+	evmEnv := e2e.SetupCommonEVMEnv(t, sut, s.dockerCli, s.networkID,
+		e2e.WithRethOpts(func(b *reth.NodeBuilder) {
+			b.WithTag(cfg.rethTag).WithEnv(
+				"OTEL_EXPORTER_OTLP_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
+				"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
+				"OTEL_EXPORTER_OTLP_PROTOCOL=http",
+				"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http",
+				"RUST_LOG=debug",
+				"OTEL_SDK_DISABLED=false",
+			)
+		}),
+	)
+
+	// sequencer with tracing
+	sequencerHome := filepath.Join(t.TempDir(), "sequencer")
+	otlpHTTP := jg.External.IngestHTTPEndpoint()
+	e2e.SetupSequencerNode(t, sut, sequencerHome, evmEnv.SequencerJWT, evmEnv.GenesisHash, evmEnv.Endpoints,
+		"--evnode.instrumentation.tracing=true",
+		"--evnode.instrumentation.tracing_endpoint", otlpHTTP,
+		"--evnode.instrumentation.tracing_sample_rate", "1.0",
+		"--evnode.instrumentation.tracing_service_name", cfg.serviceName,
+	)
+	t.Log("sequencer node is up")
+
+	// eth client
+	ethClient, err := ethclient.Dial(evmEnv.Endpoints.GetSequencerEthURL())
+	s.Require().NoError(err, "failed to dial sequencer eth endpoint")
+	t.Cleanup(func() { ethClient.Close() })
+
+	// spamoor
+	ni, err := evmEnv.RethNode.GetNetworkInfo(ctx)
+	s.Require().NoError(err, "failed to get reth network info")
+	internalRPC := "http://" + ni.Internal.RPCAddress()
+
+	spBuilder := spamoor.NewNodeBuilder(t.Name()).
+		WithDockerClient(evmEnv.RethNode.DockerClient).
+		WithDockerNetworkID(evmEnv.RethNode.NetworkID).
+		WithLogger(evmEnv.RethNode.Logger).
+		WithRPCHosts(internalRPC).
+		WithPrivateKey(e2e.TestPrivateKey)
+
+	spNode, err := spBuilder.Build(ctx)
+	s.Require().NoError(err, "failed to build spamoor node")
+	t.Cleanup(func() { _ = spNode.Remove(t.Context()) })
+	s.Require().NoError(spNode.Start(ctx), "failed to start spamoor node")
+
+	spInfo, err := spNode.GetNetworkInfo(ctx)
+	s.Require().NoError(err, "failed to get spamoor network info")
+	apiAddr := "http://127.0.0.1:" + spInfo.External.Ports.HTTP
+	requireHTTP(t, apiAddr+"/api/spammers", 30*time.Second)
+
+	return &env{
+		jaeger:     jg,
+		evmEnv:     evmEnv,
+		sut:        sut,
+		spamoorAPI: spNode.API(),
+		ethClient:  ethClient,
+	}
+}
+
+// collectServiceTraces fetches traces from Jaeger for the given service and returns the spans.
+func (s *SpamoorSuite) collectServiceTraces(e *env, serviceName string) []e2e.TraceSpan {
+	ctx, cancel := context.WithTimeout(s.T().Context(), 3*time.Minute)
+	defer cancel()
+
+	ok, err := e.jaeger.External.WaitForTraces(ctx, serviceName, 1, 2*time.Second)
+	s.Require().NoError(err, "error waiting for %s traces; UI: %s", serviceName, e.jaeger.External.QueryURL())
+	s.Require().True(ok, "expected at least one trace from %s; UI: %s", serviceName, e.jaeger.External.QueryURL())
+
+	traces, err := e.jaeger.External.Traces(ctx, serviceName, 10000)
+	s.Require().NoError(err, "failed to fetch %s traces", serviceName)
+
+	return toTraceSpans(extractSpansFromTraces(traces))
+}
diff --git a/test/e2e/benchmark/traces.go b/test/e2e/benchmark/traces.go
new file mode 100644
index 0000000000..76f160dfa2
--- /dev/null
+++ b/test/e2e/benchmark/traces.go
@@ -0,0 +1,67 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"testing"
+	"time"
+
+	e2e "github.com/evstack/ev-node/test/e2e"
+	"github.com/stretchr/testify/require"
+)
+
+// jaegerSpan holds the fields we extract from Jaeger's untyped JSON response.
+type jaegerSpan struct {
+	operationName string
+	duration      float64 // microseconds
+}
+
+func (j jaegerSpan) SpanName() string           { return j.operationName }
+func (j jaegerSpan) SpanDuration() time.Duration { return time.Duration(j.duration) * time.Microsecond }
+
+// extractSpansFromTraces walks Jaeger's []any response and pulls out span operation names and durations.
+func extractSpansFromTraces(traces []any) []jaegerSpan {
+	var out []jaegerSpan
+	for _, t := range traces {
+		traceMap, ok := t.(map[string]any)
+		if !ok {
+			continue
+		}
+		spans, ok := traceMap["spans"].([]any)
+		if !ok {
+			continue
+		}
+		for _, s := range spans {
+			spanMap, ok := s.(map[string]any)
+			if !ok {
+				continue
+			}
+			name, _ := spanMap["operationName"].(string)
+			dur, _ := spanMap["duration"].(float64)
+			if name != "" {
+				out = append(out, jaegerSpan{operationName: name, duration: dur})
+			}
+		}
+	}
+	return out
+}
+
+func toTraceSpans(spans []jaegerSpan) []e2e.TraceSpan {
+	out := make([]e2e.TraceSpan, len(spans))
+	for i, s := range spans {
+		out[i] = s
+	}
+	return out
+}
+
+// assertSpanNames verifies that all expected span names appear in the trace data.
+func assertSpanNames(t testing.TB, spans []e2e.TraceSpan, expected []string, label string) {
+	t.Helper()
+	opNames := make(map[string]struct{}, len(spans))
+	for _, span := range spans {
+		opNames[span.SpanName()] = struct{}{}
+	}
+	for _, name := range expected {
+		require.Contains(t, opNames, name, "expected span %q not found in %s traces", name, label)
+	}
+}
diff --git a/test/e2e/evm_contract_bench_test.go b/test/e2e/evm_contract_bench_test.go
index 7fd67dcc5f..006a8ea1ce 100644
--- a/test/e2e/evm_contract_bench_test.go
+++ b/test/e2e/evm_contract_bench_test.go
@@ -202,7 +202,7 @@ func (c *otlpCollector) getSpans() []*tracepb.Span {
 	return cp
 }
 
-// otlpSpanAdapter wraps an OTLP protobuf span to implement traceSpan.
+// otlpSpanAdapter wraps an OTLP protobuf span to implement TraceSpan.
 type otlpSpanAdapter struct {
 	span *tracepb.Span
 }
@@ -215,11 +215,11 @@ func (a otlpSpanAdapter) SpanDuration() time.Duration {
 func printCollectedTraceReport(b testing.TB, collector *otlpCollector) {
 	b.Helper()
 	raw := collector.getSpans()
-	spans := make([]traceSpan, len(raw))
+	spans := make([]TraceSpan, len(raw))
 	for i, s := range raw {
 		spans[i] = otlpSpanAdapter{span: s}
 	}
-	printTraceReport(b, "ev-node", spans)
+	PrintTraceReport(b, "ev-node", spans)
 }
 
 // waitForReceipt polls for a transaction receipt until it is available.
diff --git a/test/e2e/evm_force_inclusion_e2e_test.go b/test/e2e/evm_force_inclusion_e2e_test.go
index 00046e3b20..bc04eba6a7 100644
--- a/test/e2e/evm_force_inclusion_e2e_test.go
+++ b/test/e2e/evm_force_inclusion_e2e_test.go
@@ -79,7 +79,7 @@ func setupSequencerWithForceInclusion(t *testing.T, sut *SystemUnderTest, nodeHo
 
 	// Use common setup (no full node needed initially)
 	dcli, netID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dcli, netID)
+	env := SetupCommonEVMEnv(t, sut, dcli, netID)
 
 	// Create passphrase file
 	passphraseFile := createPassphraseFile(t, nodeHome)
@@ -195,7 +195,7 @@ func TestEvmFullNodeForceInclusionE2E(t *testing.T) {
 	// We manually setup sequencer here because we need the force inclusion flag,
 	// and we need to capture variables for full node setup.
 	dockerClient, networkID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
 
 	passphraseFile := createPassphraseFile(t, sequencerHome)
 	jwtSecretFile := createJWTSecretFile(t, sequencerHome, env.SequencerJWT)
@@ -288,7 +288,7 @@ func setupMaliciousSequencer(t *testing.T, sut *SystemUnderTest, nodeHome string
 
 	// Use common setup with full node support
 	dockerClient, networkID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
 	// Use env fields inline below to reduce local vars
 
 	passphraseFile := createPassphraseFile(t, nodeHome)
diff --git a/test/e2e/evm_full_node_e2e_test.go b/test/e2e/evm_full_node_e2e_test.go
index 23a3dbcf82..98d07fe815 100644
--- a/test/e2e/evm_full_node_e2e_test.go
+++ b/test/e2e/evm_full_node_e2e_test.go
@@ -215,10 +215,10 @@ func setupSequencerWithFullNode(t *testing.T, sut *SystemUnderTest, sequencerHom
 
 	// Common setup for both sequencer and full node
 	dcli, netID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
 
 	// Setup sequencer
-	setupSequencerNode(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints)
+	SetupSequencerNode(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints)
 	t.Log("Sequencer node is up")
 
 	// Get P2P address and setup full node
@@ -648,7 +648,7 @@ func setupSequencerWithFullNodeLazy(t *testing.T, sut *SystemUnderTest, sequence
 
 	// Common setup for both sequencer and full node
 	dcli, netID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
 
 	t.Logf("Generated test endpoints - Rollkit RPC: %s, P2P: %s, Full Node RPC: %s, P2P: %s, DA Port: %s",
 		env.Endpoints.RollkitRPCPort, env.Endpoints.RollkitP2PPort, env.Endpoints.FullNodeRPCPort, env.Endpoints.FullNodeP2PPort, env.Endpoints.DAPort)
@@ -1042,14 +1042,14 @@ func testSequencerFullNodeRestart(t *testing.T, initialLazyMode, restartLazyMode
 	t.Logf("Test mode: initial_lazy=%t, restart_lazy=%t", initialLazyMode, restartLazyMode)
 
 	dcli, netID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dcli, netID, WithFullNode())
 
 	// Setup sequencer based on initial mode
 	if initialLazyMode {
 		setupSequencerNodeLazy(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints)
 		t.Log("Sequencer node (lazy mode) is up")
 	} else {
-		setupSequencerNode(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints)
+		SetupSequencerNode(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints)
 		t.Log("Sequencer node is up")
 	}
 
diff --git a/test/e2e/evm_spamoor_smoke_test.go b/test/e2e/evm_spamoor_smoke_test.go
deleted file mode 100644
index ca172948a0..0000000000
--- a/test/e2e/evm_spamoor_smoke_test.go
+++ /dev/null
@@ -1,281 +0,0 @@
-//go:build evm
-
-package e2e
-
-import (
-	"context"
-	"fmt"
-	"net/http"
-	"path/filepath"
-	"testing"
-	"time"
-
-	tastoradocker "github.com/celestiaorg/tastora/framework/docker"
-	reth "github.com/celestiaorg/tastora/framework/docker/evstack/reth"
-	spamoor "github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
-	jaeger "github.com/celestiaorg/tastora/framework/docker/jaeger"
-	dto "github.com/prometheus/client_model/go"
-	"github.com/stretchr/testify/require"
-	"go.uber.org/zap/zaptest"
-)
-
-// TestSpamoorSmoke spins up reth + sequencer and a Spamoor node, starts a few
-// basic spammers, waits briefly, then prints a concise metrics summary.
-func TestSpamoorSmoke(t *testing.T) {
-	t.Parallel()
-
-	sut := NewSystemUnderTest(t)
-	// Prepare a shared docker client and network for Jaeger and reth.
-	ctx := t.Context()
-	dcli, netID := tastoradocker.Setup(t)
-	jcfg := jaeger.Config{Logger: zaptest.NewLogger(t), DockerClient: dcli, DockerNetworkID: netID}
-	jg, err := jaeger.New(ctx, jcfg, t.Name(), 0)
-	require.NoError(t, err, "failed to create jaeger node")
-	t.Cleanup(func() { _ = jg.Remove(t.Context()) })
-	require.NoError(t, jg.Start(ctx), "failed to start jaeger node")
-
-	// Bring up reth + local DA on the same docker network as Jaeger so reth can export traces.
-	env := setupCommonEVMEnv(t, sut, dcli, netID,
-		WithRethOpts(func(b *reth.NodeBuilder) {
-			b.WithEnv(
-				"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
-				"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http",
-				"RUST_LOG=info",
-				"OTEL_SDK_DISABLED=false",
-			)
-		}),
-	)
-	sequencerHome := filepath.Join(t.TempDir(), "sequencer")
-
-	// ev-node runs on the host, so use Jaeger's external OTLP/HTTP endpoint.
-	otlpHTTP := jg.External.IngestHTTPEndpoint()
-
-	// Start sequencer with tracing to Jaeger collector.
-	setupSequencerNode(t, sut, sequencerHome, env.SequencerJWT, env.GenesisHash, env.Endpoints,
-		"--evnode.instrumentation.tracing=true",
-		"--evnode.instrumentation.tracing_endpoint", otlpHTTP,
-		"--evnode.instrumentation.tracing_sample_rate", "1.0",
-		"--evnode.instrumentation.tracing_service_name", "ev-node-smoke",
-	)
-	t.Log("Sequencer node is up")
-
-	// Start Spamoor within the same Docker network, targeting reth internal RPC.
-	ni, err := env.RethNode.GetNetworkInfo(ctx)
-	require.NoError(t, err, "failed to get network info")
-
-	internalRPC := "http://" + ni.Internal.RPCAddress()
-
-	spBuilder := spamoor.NewNodeBuilder(t.Name()).
-		WithDockerClient(env.RethNode.DockerClient).
-		WithDockerNetworkID(env.RethNode.NetworkID).
-		WithLogger(env.RethNode.Logger).
-		WithRPCHosts(internalRPC).
-		WithPrivateKey(TestPrivateKey)
-
-	spNode, err := spBuilder.Build(ctx)
-	require.NoError(t, err, "failed to build sp node")
-
-	t.Cleanup(func() { _ = spNode.Remove(t.Context()) })
-	require.NoError(t, spNode.Start(ctx), "failed to start spamoor node")
-
-	// Wait for daemon readiness.
-	spInfo, err := spNode.GetNetworkInfo(ctx)
-	require.NoError(t, err, "failed to get network info")
-
-	apiAddr := "http://127.0.0.1:" + spInfo.External.Ports.HTTP
-	requireHTTP(t, apiAddr+"/api/spammers", 30*time.Second)
-	api := spNode.API()
-
-	// Basic scenarios (structs that YAML-marshal into the daemon config).
-	eoatx := map[string]any{
-		"throughput":      100,
-		"total_count":     3000,
-		"max_pending":     4000,
-		"max_wallets":     300,
-		"amount":          100,
-		"random_amount":   true,
-		"random_target":   true,
-		"base_fee":        20, // gwei
-		"tip_fee":         2,  // gwei
-		"refill_amount":   "1000000000000000000",
-		"refill_balance":  "500000000000000000",
-		"refill_interval": 600,
-	}
-
-	gasburner := map[string]any{
-		"throughput":        25,
-		"total_count":       2000,
-		"max_pending":       8000,
-		"max_wallets":       500,
-		"gas_units_to_burn": 3000000,
-		"base_fee":          20,
-		"tip_fee":           5,
-		"rebroadcast":       5,
-		"refill_amount":     "5000000000000000000",
-		"refill_balance":    "2000000000000000000",
-		"refill_interval":   300,
-	}
-
-	var ids []int
-	id, err := api.CreateSpammer("smoke-eoatx", spamoor.ScenarioEOATX, eoatx, true)
-	require.NoError(t, err, "failed to create eoatx spammer")
-	ids = append(ids, id)
-	id, err = api.CreateSpammer("smoke-gasburner", spamoor.ScenarioGasBurnerTX, gasburner, true)
-	require.NoError(t, err, "failed to create gasburner spammer")
-	ids = append(ids, id)
-
-	for _, id := range ids {
-		idToDelete := id
-		t.Cleanup(func() { _ = api.DeleteSpammer(idToDelete) })
-	}
-
-	// allow spamoor enough time to generate transaction throughput
-	// so that the expected tracing spans appear in Jaeger.
-	time.Sleep(60 * time.Second)
-
-	// Fetch parsed metrics and print a concise summary.
-	metrics, err := api.GetMetrics()
-	require.NoError(t, err, "failed to get metrics")
-	sent := sumCounter(metrics["spamoor_transactions_sent_total"])
-	fail := sumCounter(metrics["spamoor_transactions_failed_total"])
-
-	// Verify Jaeger received traces from ev-node.
-	// Service name is set above via --evnode.instrumentation.tracing_service_name "ev-node-smoke".
-	traceCtx, cancel := context.WithTimeout(ctx, 3*time.Minute)
-	defer cancel()
-	ok, err := jg.External.WaitForTraces(traceCtx, "ev-node-smoke", 1, 2*time.Second)
-	require.NoError(t, err, "error while waiting for Jaeger traces; UI: %s", jg.External.QueryURL())
-	require.True(t, ok, "expected at least one trace in Jaeger; UI: %s", jg.External.QueryURL())
-
-	// Also wait for traces from ev-reth and print a small sample.
-	ok, err = jg.External.WaitForTraces(traceCtx, "ev-reth", 1, 2*time.Second)
-	require.NoError(t, err, "error while waiting for ev-reth traces; UI: %s", jg.External.QueryURL())
-	require.True(t, ok, "expected at least one trace from ev-reth; UI: %s", jg.External.QueryURL())
-
-	// fetch traces and print reports for both services.
-	// use a large limit to fetch all traces from the test run.
-	evNodeTraces, err := jg.External.Traces(traceCtx, "ev-node-smoke", 10000)
-	require.NoError(t, err, "failed to fetch ev-node-smoke traces from Jaeger")
-	evNodeSpans := extractSpansFromTraces(evNodeTraces)
-	printTraceReport(t, "ev-node-smoke", toTraceSpans(evNodeSpans))
-
-	evRethTraces, err := jg.External.Traces(traceCtx, "ev-reth", 10000)
-	require.NoError(t, err, "failed to fetch ev-reth traces from Jaeger")
-	evRethSpans := extractSpansFromTraces(evRethTraces)
-	printTraceReport(t, "ev-reth", toTraceSpans(evRethSpans))
-
-	// assert expected ev-node span names are present.
-	// these spans reliably appear during block production with transactions flowing.
-	expectedSpans := []string{
-		"BlockExecutor.ProduceBlock",
-		"BlockExecutor.ApplyBlock",
-		"BlockExecutor.CreateBlock",
-		"BlockExecutor.RetrieveBatch",
-		"Executor.ExecuteTxs",
-		"Executor.SetFinal",
-		"Engine.ForkchoiceUpdated",
-		"Engine.NewPayload",
-		"Engine.GetPayload",
-		"Eth.GetBlockByNumber",
-		"Sequencer.GetNextBatch",
-		"DASubmitter.SubmitHeaders",
-		"DASubmitter.SubmitData",
-		"DA.Submit",
-	}
-	opNames := make(map[string]struct{}, len(evNodeSpans))
-	for _, s := range evNodeSpans {
-		opNames[s.operationName] = struct{}{}
-	}
-	for _, name := range expectedSpans {
-		require.Contains(t, opNames, name, "expected span %q not found in ev-node-smoke traces", name)
-	}
-
-	// ev-reth span names are internal to the Rust OTLP exporter and may change
-	// across versions, so we only assert that spans were collected at all.
-	// TODO: check for more specific spans once implemented.
-	require.NotEmpty(t, evRethSpans, "expected at least one span from ev-reth")
-
-	require.Greater(t, sent, float64(0), "at least one transaction should have been sent")
-	require.Zero(t, fail, "no transactions should have failed")
-}
-
-// --- helpers ---
-
-func requireHTTP(t *testing.T, url string, timeout time.Duration) {
-	t.Helper()
-	client := &http.Client{Timeout: 200 * time.Millisecond}
-	deadline := time.Now().Add(timeout)
-	var lastErr error
-	for time.Now().Before(deadline) {
-		resp, err := client.Get(url)
-		if err == nil {
-			_ = resp.Body.Close()
-			if resp.StatusCode >= 200 && resp.StatusCode < 300 {
-				return
-			}
-			lastErr = fmt.Errorf("status %d", resp.StatusCode)
-		} else {
-			lastErr = err
-		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	t.Fatalf("daemon not ready at %s: %v", url, lastErr)
-}
-
-// Metric family helpers.
-func sumCounter(f *dto.MetricFamily) float64 {
-	if f == nil || f.GetType() != dto.MetricType_COUNTER {
-		return 0
-	}
-	var sum float64
-	for _, m := range f.GetMetric() {
-		if m.GetCounter() != nil && m.GetCounter().Value != nil {
-			sum += m.GetCounter().GetValue()
-		}
-	}
-	return sum
-}
-
-// jaegerSpan holds the fields we extract from Jaeger's untyped JSON response.
-type jaegerSpan struct {
-	operationName string
-	duration      float64 // microseconds
-}
-
-func (j jaegerSpan) SpanName() string            { return j.operationName }
-func (j jaegerSpan) SpanDuration() time.Duration { return time.Duration(j.duration) * time.Microsecond }
-
-// extractSpansFromTraces walks Jaeger's []any response and pulls out span operation names and durations.
-func extractSpansFromTraces(traces []any) []jaegerSpan {
-	var out []jaegerSpan
-	for _, t := range traces {
-		traceMap, ok := t.(map[string]any)
-		if !ok {
-			continue
-		}
-		spans, ok := traceMap["spans"].([]any)
-		if !ok {
-			continue
-		}
-		for _, s := range spans {
-			spanMap, ok := s.(map[string]any)
-			if !ok {
-				continue
-			}
-			name, _ := spanMap["operationName"].(string)
-			dur, _ := spanMap["duration"].(float64)
-			if name != "" {
-				out = append(out, jaegerSpan{operationName: name, duration: dur})
-			}
-		}
-	}
-	return out
-}
-
-func toTraceSpans(spans []jaegerSpan) []traceSpan {
-	out := make([]traceSpan, len(spans))
-	for i, s := range spans {
-		out[i] = s
-	}
-	return out
-}
diff --git a/test/e2e/evm_test_common.go b/test/e2e/evm_test_common.go
index 5ddaf935b0..161ff89349 100644
--- a/test/e2e/evm_test_common.go
+++ b/test/e2e/evm_test_common.go
@@ -315,7 +315,7 @@ func getNodeP2PAddress(t testing.TB, sut *SystemUnderTest, nodeHome string, rpcP
 // - jwtSecret: JWT secret for authenticating with EVM engine
 // - genesisHash: Hash of the genesis block for chain validation
 // - endpoints: TestEndpoints struct containing unique port assignments
-func setupSequencerNode(t testing.TB, sut *SystemUnderTest, sequencerHome, jwtSecret, genesisHash string, endpoints *TestEndpoints, extraArgs ...string) {
+func SetupSequencerNode(t testing.TB, sut *SystemUnderTest, sequencerHome, jwtSecret, genesisHash string, endpoints *TestEndpoints, extraArgs ...string) {
 	t.Helper()
 
 	// Create passphrase file
@@ -531,7 +531,7 @@ func WithRethOpts(opts ...evmtest.RethNodeOpt) SetupOpt {
 
 // setupCommonEVMEnv creates and initializes ev-reth instances, while also initializing the local ev-node instance
 // managed by sut. If a full node is also required, we can use the WithFullNode() additional option.
-func setupCommonEVMEnv(t testing.TB, sut *SystemUnderTest, client tastoratypes.TastoraDockerClient, networkID string, opts ...SetupOpt) *EVMEnv {
+func SetupCommonEVMEnv(t testing.TB, sut *SystemUnderTest, client tastoratypes.TastoraDockerClient, networkID string, opts ...SetupOpt) *EVMEnv {
 	t.Helper()
 
 	// Configuration via functional options
@@ -665,9 +665,9 @@ func setupSequencerOnlyTest(t testing.TB, sut *SystemUnderTest, nodeHome string,
 	t.Helper()
 
 	// Use common setup (no full node needed)
-	env := setupCommonEVMEnv(t, sut, client, networkID)
+	env := SetupCommonEVMEnv(t, sut, client, networkID)
 	// Initialize and start sequencer node
-	setupSequencerNode(t, sut, nodeHome, env.SequencerJWT, env.GenesisHash, env.Endpoints, extraArgs...)
+	SetupSequencerNode(t, sut, nodeHome, env.SequencerJWT, env.GenesisHash, env.Endpoints, extraArgs...)
 	t.Log("Sequencer node is up")
 
 	return env.GenesisHash, env.Endpoints.GetSequencerEthURL()
@@ -849,27 +849,23 @@ func verifyNoBlockProduction(t testing.TB, client *ethclient.Client, duration ti
 	t.Logf("✅ %s maintained height %d for %v (no new blocks produced)", nodeName, initialHeight, duration)
 }
 
-// traceSpan is a common interface for span data from different sources (OTLP collector, Jaeger API).
-type traceSpan interface {
+// TraceSpan is a common interface for span data from different sources (OTLP collector, Jaeger API).
+type TraceSpan interface {
 	SpanName() string
 	SpanDuration() time.Duration
 }
 
-// printTraceReport aggregates spans by operation name and prints a timing breakdown.
-func printTraceReport(t testing.TB, label string, spans []traceSpan) {
-	t.Helper()
-	if len(spans) == 0 {
-		t.Logf("WARNING: no spans found for %s", label)
-		return
-	}
+// SpanStats holds aggregated timing statistics for a single span operation.
+type SpanStats struct {
+	Count int
+	Total time.Duration
+	Min   time.Duration
+	Max   time.Duration
+}
 
-	type stats struct {
-		count int
-		total time.Duration
-		min   time.Duration
-		max   time.Duration
-	}
-	m := make(map[string]*stats)
+// AggregateSpanStats groups spans by operation name and computes count, total, min, max.
+func AggregateSpanStats(spans []TraceSpan) map[string]*SpanStats {
+	m := make(map[string]*SpanStats)
 	for _, span := range spans {
 		d := span.SpanDuration()
 		if d <= 0 {
@@ -878,45 +874,57 @@ func printTraceReport(t testing.TB, label string, spans []traceSpan) {
 		name := span.SpanName()
 		s, ok := m[name]
 		if !ok {
-			s = &stats{min: d, max: d}
+			s = &SpanStats{Min: d, Max: d}
 			m[name] = s
 		}
-		s.count++
-		s.total += d
-		if d < s.min {
-			s.min = d
+		s.Count++
+		s.Total += d
+		if d < s.Min {
+			s.Min = d
 		}
-		if d > s.max {
-			s.max = d
+		if d > s.Max {
+			s.Max = d
 		}
 	}
+	return m
+}
+
+// PrintTraceReport aggregates spans by operation name and prints a timing breakdown.
+func PrintTraceReport(t testing.TB, label string, spans []TraceSpan) {
+	t.Helper()
+	if len(spans) == 0 {
+		t.Logf("WARNING: no spans found for %s", label)
+		return
+	}
+
+	m := AggregateSpanStats(spans)
 
 	names := make([]string, 0, len(m))
 	for name := range m {
 		names = append(names, name)
 	}
 	sort.Slice(names, func(i, j int) bool {
-		return m[names[i]].total > m[names[j]].total
+		return m[names[i]].Total > m[names[j]].Total
 	})
 
 	var overallTotal time.Duration
 	for _, s := range m {
-		overallTotal += s.total
+		overallTotal += s.Total
 	}
 
 	t.Logf("\n--- %s Trace Breakdown (%d spans) ---", label, len(spans))
 	t.Logf("%-40s %6s %12s %12s %12s %7s", "OPERATION", "COUNT", "AVG", "MIN", "MAX", "% TOTAL")
 	for _, name := range names {
 		s := m[name]
-		avg := s.total / time.Duration(s.count)
-		pct := float64(s.total) / float64(overallTotal) * 100
-		t.Logf("%-40s %6d %12s %12s %12s %6.1f%%", name, s.count, avg, s.min, s.max, pct)
+		avg := s.Total / time.Duration(s.Count)
+		pct := float64(s.Total) / float64(overallTotal) * 100
+		t.Logf("%-40s %6d %12s %12s %12s %6.1f%%", name, s.Count, avg, s.Min, s.Max, pct)
 	}
 
 	t.Logf("\n--- %s Time Distribution ---", label)
 	for _, name := range names {
 		s := m[name]
-		pct := float64(s.total) / float64(overallTotal) * 100
+		pct := float64(s.Total) / float64(overallTotal) * 100
 		bar := ""
 		for range int(pct / 2) {
 			bar += "█"
@@ -924,3 +932,4 @@ func printTraceReport(t testing.TB, label string, spans []traceSpan) {
 		t.Logf("%-40s %5.1f%% %s", name, pct, bar)
 	}
 }
+
diff --git a/test/e2e/failover_e2e_test.go b/test/e2e/failover_e2e_test.go
index 905b280cf4..26333a2dd0 100644
--- a/test/e2e/failover_e2e_test.go
+++ b/test/e2e/failover_e2e_test.go
@@ -56,7 +56,7 @@ func TestLeaseFailoverE2E(t *testing.T) {
 
 	// Get JWT secrets and setup common components first
 	dockerClient, networkID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
 	// Use a fresh reth node on the same Docker network as used by the env setup.
 	rethFn := evmtest.SetupTestRethNode(t, dockerClient, networkID)
 	jwtSecret3 := rethFn.JWTSecretHex()
@@ -257,7 +257,7 @@ func TestHASequencerRollingRestartE2E(t *testing.T) {
 
 	// Get Docker and common environment
 	dockerClient, networkID := tastoradocker.Setup(t)
-	env := setupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
+	env := SetupCommonEVMEnv(t, sut, dockerClient, networkID, WithFullNode())
 	rethFn := evmtest.SetupTestRethNode(t, dockerClient, networkID)
 	jwtSecret3 := rethFn.JWTSecretHex()
 	fnInfo, err := rethFn.GetNetworkInfo(context.Background())

From 18fc15a6a4a72c8c2f153dff9f2a749728f0010c Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 12:40:33 +0000
Subject: [PATCH 02/19] fix: correct BENCH_JSON_OUTPUT path for spamoor
 benchmark

go test sets the working directory to the package under test, so the
env var should be relative to test/e2e/benchmark/, not test/e2e/.
---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2f8fcdb1c7..bb2bd296df 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -63,7 +63,7 @@ jobs:
         run: make build-evm build-da
       - name: Run Spamoor smoke test
         run: |
-          cd test/e2e && BENCH_JSON_OUTPUT=benchmark/spamoor_bench.json go test -tags evm \
+          cd test/e2e && BENCH_JSON_OUTPUT=spamoor_bench.json go test -tags evm \
             -run='^TestSpamoorSuite$/^TestSpamoorSmoke$' -v -timeout=15m \
             --evm-binary=../../build/evm ./benchmark/
       - name: Upload benchmark results

From fccd9db21a00243d01b821ae8adb7f7ddc75c5f9 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 12:50:50 +0000
Subject: [PATCH 03/19] fix: place package pattern before test binary flags in
 benchmark CI

go test treats all arguments after an unknown flag (--evm-binary) as
test binary args, so ./benchmark/ was never recognized as a package
pattern.
---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index bb2bd296df..c4be10227b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -65,7 +65,7 @@ jobs:
         run: |
           cd test/e2e && BENCH_JSON_OUTPUT=spamoor_bench.json go test -tags evm \
             -run='^TestSpamoorSuite$/^TestSpamoorSmoke$' -v -timeout=15m \
-            --evm-binary=../../build/evm ./benchmark/
+            ./benchmark/ --evm-binary=../../build/evm
       - name: Upload benchmark results
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:

From ae525cae6fe048844f4fa5eb01180c0a534a7f5c Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 13:48:41 +0000
Subject: [PATCH 04/19] fix: adjust evm-binary path for benchmark subpackage
 working directory

go test sets the cwd to the package directory (test/e2e/benchmark/),
so the binary path needs an extra parent traversal.
---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index c4be10227b..b3a157a0b1 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -65,7 +65,7 @@ jobs:
         run: |
           cd test/e2e && BENCH_JSON_OUTPUT=spamoor_bench.json go test -tags evm \
             -run='^TestSpamoorSuite$/^TestSpamoorSmoke$' -v -timeout=15m \
-            ./benchmark/ --evm-binary=../../build/evm
+            ./benchmark/ --evm-binary=../../../build/evm
       - name: Upload benchmark results
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:

From 039eaf75c5401296236b3f56683c4e59d46ca040 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 14:30:13 +0000
Subject: [PATCH 05/19] wip: erc20 benchmark test

---
 test/e2e/benchmark/helpers.go            | 108 ++++++++++++++++
 test/e2e/benchmark/spamoor_erc20_test.go | 152 +++++++++++++++++++++++
 2 files changed, 260 insertions(+)
 create mode 100644 test/e2e/benchmark/helpers.go
 create mode 100644 test/e2e/benchmark/spamoor_erc20_test.go

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
new file mode 100644
index 0000000000..4577407d75
--- /dev/null
+++ b/test/e2e/benchmark/helpers.go
@@ -0,0 +1,108 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"context"
+	"fmt"
+	"math/big"
+	"time"
+
+	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
+	"github.com/ethereum/go-ethereum/ethclient"
+)
+
+// blockMetrics holds aggregated gas and transaction data across a range of blocks.
+type blockMetrics struct {
+	StartBlock   uint64
+	EndBlock     uint64
+	BlockCount   int
+	TotalGasUsed uint64
+	TotalTxCount int
+	GasPerBlock  []uint64
+	TxPerBlock   []int
+}
+
+// avgGasPerBlock returns the mean gas used per block.
+func (m *blockMetrics) avgGasPerBlock() float64 {
+	if m.BlockCount == 0 {
+		return 0
+	}
+	return float64(m.TotalGasUsed) / float64(m.BlockCount)
+}
+
+// avgTxPerBlock returns the mean transaction count per block.
+func (m *blockMetrics) avgTxPerBlock() float64 {
+	if m.BlockCount == 0 {
+		return 0
+	}
+	return float64(m.TotalTxCount) / float64(m.BlockCount)
+}
+
+// mgasPerSec calculates the achieved throughput in megagas per second.
+func mgasPerSec(totalGasUsed uint64, elapsed time.Duration) float64 {
+	if elapsed <= 0 {
+		return 0
+	}
+	return float64(totalGasUsed) / elapsed.Seconds() / 1e6
+}
+
+// waitForSpamoorDone polls spamoor metrics until the total sent count reaches
+// the target or the context is cancelled. It returns the final sent and failed
+// counts.
+func waitForSpamoorDone(ctx context.Context, api *spamoor.API, targetCount int, pollInterval time.Duration) (sent, failed float64, err error) {
+	ticker := time.NewTicker(pollInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return sent, failed, fmt.Errorf("timed out waiting for spamoor to send %d txs (sent %.0f): %w", targetCount, sent, ctx.Err())
+		case <-ticker.C:
+			metrics, mErr := api.GetMetrics()
+			if mErr != nil {
+				continue
+			}
+			sent = sumCounter(metrics["spamoor_transactions_sent_total"])
+			failed = sumCounter(metrics["spamoor_transactions_failed_total"])
+			if sent >= float64(targetCount) {
+				return sent, failed, nil
+			}
+		}
+	}
+}
+
+// collectBlockMetrics fetches block headers from startBlock to endBlock
+// (inclusive) and aggregates gas and transaction counts. Blocks with zero
+// transactions are skipped to avoid skewing averages with empty blocks.
+func collectBlockMetrics(ctx context.Context, client *ethclient.Client, startBlock, endBlock uint64) (*blockMetrics, error) {
+	if endBlock < startBlock {
+		return nil, fmt.Errorf("endBlock %d < startBlock %d", endBlock, startBlock)
+	}
+
+	m := &blockMetrics{
+		StartBlock: startBlock,
+		EndBlock:   endBlock,
+	}
+
+	for n := startBlock; n <= endBlock; n++ {
+		block, err := client.BlockByNumber(ctx, new(big.Int).SetUint64(n))
+		if err != nil {
+			return nil, fmt.Errorf("failed to fetch block %d: %w", n, err)
+		}
+
+		txCount := len(block.Transactions())
+		if txCount == 0 {
+			continue
+		}
+
+		gasUsed := block.GasUsed()
+		m.BlockCount++
+		m.TotalGasUsed += gasUsed
+		m.TotalTxCount += txCount
+		m.GasPerBlock = append(m.GasPerBlock, gasUsed)
+		m.TxPerBlock = append(m.TxPerBlock, txCount)
+	}
+
+	return m, nil
+}
diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
new file mode 100644
index 0000000000..daab436cac
--- /dev/null
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -0,0 +1,152 @@
+//go:build evm
+
+package benchmark
+
+import (
+	"context"
+	"time"
+
+	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
+	e2e "github.com/evstack/ev-node/test/e2e"
+)
+
+// TestERC20Throughput measures ERC-20 token transfer throughput in isolation.
+// Each tx is ~65K gas (contract call with storage reads/writes)
+//
+// Primary metric: achieved MGas/s.
+// Diagnostic metrics: per-span latency breakdown, gas/block, tx/block.
+func (s *SpamoorSuite) TestERC20Throughput() {
+	const (
+		totalCount  = 3000
+		serviceName = "ev-node-erc20"
+		waitTimeout = 3 * time.Minute
+	)
+
+	t := s.T()
+	ctx := t.Context()
+	w := newResultWriter(t, "ERC20Throughput")
+	defer w.flush()
+
+	e := s.setupEnv(config{
+		rethTag:     "pr-140",
+		serviceName: serviceName,
+	})
+
+	erc20Config := map[string]any{
+		"throughput":      100,
+		"total_count":     totalCount,
+		"max_pending":     4000,
+		"max_wallets":     300,
+		"base_fee":        20,
+		"tip_fee":         3,
+		"refill_amount":   "5000000000000000000",
+		"refill_balance":  "2000000000000000000",
+		"refill_interval": 600,
+	}
+
+	// record the starting block before generating load
+	startHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
+	s.Require().NoError(err, "failed to get start block header")
+	startBlock := startHeader.Number.Uint64()
+	loadStart := time.Now()
+
+	id, err := e.spamoorAPI.CreateSpammer("bench-erc20", spamoor.ScenarioERC20TX, erc20Config, true)
+	s.Require().NoError(err, "failed to create erc20 spammer")
+	t.Cleanup(func() { _ = e.spamoorAPI.DeleteSpammer(id) })
+
+	// wait for spamoor to finish sending all transactions
+	waitCtx, cancel := context.WithTimeout(ctx, waitTimeout)
+	defer cancel()
+	sent, failed, err := waitForSpamoorDone(waitCtx, e.spamoorAPI, totalCount, 2*time.Second)
+	s.Require().NoError(err, "spamoor did not finish in time")
+
+	// allow a short settle period for remaining txs to be included in blocks
+	time.Sleep(5 * time.Second)
+	wallClock := time.Since(loadStart)
+
+	endHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
+	s.Require().NoError(err, "failed to get end block header")
+	endBlock := endHeader.Number.Uint64()
+
+	// collect block-level gas/tx metrics
+	bm, err := collectBlockMetrics(ctx, e.ethClient, startBlock, endBlock)
+	s.Require().NoError(err, "failed to collect block metrics")
+
+	achievedMGas := mgasPerSec(bm.TotalGasUsed, wallClock)
+	achievedTPS := float64(bm.TotalTxCount) / wallClock.Seconds()
+
+	t.Logf("ERC20 throughput: %.2f MGas/s, %.1f TPS over %s (%d blocks, %d txs, %.2f avg gas/block)",
+		achievedMGas, achievedTPS, wallClock.Round(time.Millisecond),
+		bm.BlockCount, bm.TotalTxCount, bm.avgGasPerBlock())
+
+	// collect and report traces
+	evNodeSpans := s.collectServiceTraces(e, serviceName)
+	evRethSpans := s.collectServiceTraces(e, "ev-reth")
+	e2e.PrintTraceReport(t, serviceName, evNodeSpans)
+	e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+
+	// compute ev-node overhead ratio
+	spanStats := e2e.AggregateSpanStats(evNodeSpans)
+	if produceBlock, ok := spanStats["BlockExecutor.ProduceBlock"]; ok {
+		if executeTxs, ok2 := spanStats["Executor.ExecuteTxs"]; ok2 {
+			produceAvg := float64(produceBlock.Total.Microseconds()) / float64(produceBlock.Count)
+			executeAvg := float64(executeTxs.Total.Microseconds()) / float64(executeTxs.Count)
+			if produceAvg > 0 {
+				overhead := (produceAvg - executeAvg) / produceAvg * 100
+				t.Logf("ev-node overhead: %.1f%%", overhead)
+				w.addEntry(entry{
+					Name:  "ERC20Throughput - ev-node overhead",
+					Unit:  "%",
+					Value: overhead,
+				})
+			}
+		}
+	}
+
+	// write all metrics to benchmark output
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - MGas/s",
+		Unit:  "MGas/s",
+		Value: achievedMGas,
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - TPS",
+		Unit:  "tx/s",
+		Value: achievedTPS,
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - avg gas/block",
+		Unit:  "gas",
+		Value: bm.avgGasPerBlock(),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - avg tx/block",
+		Unit:  "count",
+		Value: bm.avgTxPerBlock(),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - blocks/s",
+		Unit:  "blocks/s",
+		Value: float64(bm.BlockCount) / wallClock.Seconds(),
+	})
+
+	// add per-span avg latencies
+	w.addSpans(append(evNodeSpans, evRethSpans...))
+
+	// assertions
+	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
+	s.Require().Zero(failed, "no transactions should have failed")
+
+	// assert expected ev-node spans are present
+	assertSpanNames(t, evNodeSpans, []string{
+		"BlockExecutor.ProduceBlock",
+		"BlockExecutor.ApplyBlock",
+		"Executor.ExecuteTxs",
+		"Executor.SetFinal",
+		"Engine.ForkchoiceUpdated",
+		"Engine.NewPayload",
+		"Engine.GetPayload",
+		"Sequencer.GetNextBatch",
+		"DA.Submit",
+	}, serviceName)
+}

From 85c9d2d693fa32a7195598562d3cfaeadae90e35 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Wed, 25 Feb 2026 14:45:37 +0000
Subject: [PATCH 06/19] fix: exclude benchmark subpackage from make test-e2e

The benchmark package doesn't define the --binary flag that test-e2e
passes. It has its own CI workflow so it doesn't need to run here.
---
 scripts/test.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/test.mk b/scripts/test.mk
index f9f76c1ab3..524f8a4cdb 100644
--- a/scripts/test.mk
+++ b/scripts/test.mk
@@ -24,7 +24,7 @@ test-integration:
 ## test-e2e: Running e2e tests
 test-e2e: build build-da build-evm docker-build-if-local
 	@echo "--> Running e2e tests"
-	@cd test/e2e && go test -mod=readonly -failfast -timeout=15m -tags='e2e evm' ./... --binary=../../build/testapp --evm-binary=../../build/evm
+	@cd test/e2e && go test -mod=readonly -failfast -timeout=15m -tags='e2e evm' $$(go list -tags='e2e evm' ./... | grep -v /benchmark) --binary=../../build/testapp --evm-binary=../../build/evm
 .PHONY: test-e2e
 
 ## test-integration-cover: generate code coverage report for integration tests.

From fe8d166a32c07841b3f2686853e7ea63a8a53c75 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Thu, 26 Feb 2026 13:03:06 +0000
Subject: [PATCH 07/19] fix: replace FilterLogs with header iteration and
 optimize spamoor config

collectBlockMetrics hit reth's 20K FilterLogs limit at high tx volumes.
Replace with direct header iteration over [startBlock, endBlock] and add
Phase 1 metrics: non-empty ratio, block interval p50/p99, gas/block and
tx/block p50/p99.

Optimize spamoor configuration for 100ms block time:
- --slot-duration 100ms, --startup-delay 0 on daemon
- throughput=50 per 100ms slot (500 tx/s per spammer)
- max_pending=50000 to avoid 3s block poll backpressure
- 5 staggered spammers with 50K txs each

Results: 55 MGas/s, 1414 TPS, 19.8% non-empty blocks (up from 6%).
---
 test/e2e/benchmark/helpers.go            | 159 +++++++++++++++++++----
 test/e2e/benchmark/spamoor_erc20_test.go | 106 ++++++++++++---
 test/e2e/benchmark/suite_test.go         |  44 ++++++-
 test/e2e/go.mod                          |   1 +
 test/e2e/go.sum                          |   2 -
 5 files changed, 263 insertions(+), 49 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index 4577407d75..d25b06d7b5 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -5,7 +5,9 @@ package benchmark
 import (
 	"context"
 	"fmt"
+	"math"
 	"math/big"
+	"sort"
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
@@ -14,13 +16,25 @@ import (
 
 // blockMetrics holds aggregated gas and transaction data across a range of blocks.
 type blockMetrics struct {
-	StartBlock   uint64
-	EndBlock     uint64
-	BlockCount   int
-	TotalGasUsed uint64
-	TotalTxCount int
-	GasPerBlock  []uint64
-	TxPerBlock   []int
+	StartBlock     uint64
+	EndBlock       uint64
+	BlockCount     int // non-empty blocks only
+	TotalBlockCount int // all blocks in range including empty
+	TotalGasUsed   uint64
+	TotalTxCount   int
+	GasPerBlock    []uint64        // non-empty blocks only
+	TxPerBlock     []int           // non-empty blocks only
+	BlockIntervals []time.Duration // intervals between all consecutive blocks (for drift)
+	FirstBlockTime time.Time
+	LastBlockTime  time.Time
+}
+
+// steadyStateDuration returns the time between the first and last active blocks.
+func (m *blockMetrics) steadyStateDuration() time.Duration {
+	if m.FirstBlockTime.IsZero() || m.LastBlockTime.IsZero() {
+		return 0
+	}
+	return m.LastBlockTime.Sub(m.FirstBlockTime)
 }
 
 // avgGasPerBlock returns the mean gas used per block.
@@ -39,6 +53,74 @@ func (m *blockMetrics) avgTxPerBlock() float64 {
 	return float64(m.TotalTxCount) / float64(m.BlockCount)
 }
 
+// nonEmptyRatio returns the percentage of blocks that contained transactions.
+func (m *blockMetrics) nonEmptyRatio() float64 {
+	if m.TotalBlockCount == 0 {
+		return 0
+	}
+	return float64(m.BlockCount) / float64(m.TotalBlockCount) * 100
+}
+
+// blockIntervalStats returns p50, p99, and max of block intervals.
+func (m *blockMetrics) blockIntervalStats() (p50, p99, max time.Duration) {
+	if len(m.BlockIntervals) == 0 {
+		return 0, 0, 0
+	}
+	sorted := make([]float64, len(m.BlockIntervals))
+	for i, d := range m.BlockIntervals {
+		sorted[i] = float64(d)
+	}
+	sort.Float64s(sorted)
+	return time.Duration(percentile(sorted, 0.50)),
+		time.Duration(percentile(sorted, 0.99)),
+		time.Duration(sorted[len(sorted)-1])
+}
+
+// gasPerBlockStats returns p50 and p99 of gas used per non-empty block.
+func (m *blockMetrics) gasPerBlockStats() (p50, p99 float64) {
+	if len(m.GasPerBlock) == 0 {
+		return 0, 0
+	}
+	sorted := make([]float64, len(m.GasPerBlock))
+	for i, g := range m.GasPerBlock {
+		sorted[i] = float64(g)
+	}
+	sort.Float64s(sorted)
+	return percentile(sorted, 0.50), percentile(sorted, 0.99)
+}
+
+// txPerBlockStats returns p50 and p99 of tx count per non-empty block.
+func (m *blockMetrics) txPerBlockStats() (p50, p99 float64) {
+	if len(m.TxPerBlock) == 0 {
+		return 0, 0
+	}
+	sorted := make([]float64, len(m.TxPerBlock))
+	for i, c := range m.TxPerBlock {
+		sorted[i] = float64(c)
+	}
+	sort.Float64s(sorted)
+	return percentile(sorted, 0.50), percentile(sorted, 0.99)
+}
+
+// percentile returns the p-th percentile from a pre-sorted float64 slice
+// using linear interpolation.
+func percentile(sorted []float64, p float64) float64 {
+	if len(sorted) == 0 {
+		return 0
+	}
+	if len(sorted) == 1 {
+		return sorted[0]
+	}
+	rank := p * float64(len(sorted)-1)
+	lower := int(math.Floor(rank))
+	upper := lower + 1
+	if upper >= len(sorted) {
+		return sorted[len(sorted)-1]
+	}
+	frac := rank - float64(lower)
+	return sorted[lower] + frac*(sorted[upper]-sorted[lower])
+}
+
 // mgasPerSec calculates the achieved throughput in megagas per second.
 func mgasPerSec(totalGasUsed uint64, elapsed time.Duration) float64 {
 	if elapsed <= 0 {
@@ -48,12 +130,15 @@ func mgasPerSec(totalGasUsed uint64, elapsed time.Duration) float64 {
 }
 
 // waitForSpamoorDone polls spamoor metrics until the total sent count reaches
-// the target or the context is cancelled. It returns the final sent and failed
-// counts.
-func waitForSpamoorDone(ctx context.Context, api *spamoor.API, targetCount int, pollInterval time.Duration) (sent, failed float64, err error) {
+// the target or the context is cancelled. It logs the send rate at each poll
+// interval and returns the final sent and failed counts.
+func waitForSpamoorDone(ctx context.Context, log func(string, ...any), api *spamoor.API, targetCount int, pollInterval time.Duration) (sent, failed float64, err error) {
 	ticker := time.NewTicker(pollInterval)
 	defer ticker.Stop()
 
+	start := time.Now()
+	var prevSent float64
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -65,6 +150,14 @@ func waitForSpamoorDone(ctx context.Context, api *spamoor.API, targetCount int,
 			}
 			sent = sumCounter(metrics["spamoor_transactions_sent_total"])
 			failed = sumCounter(metrics["spamoor_transactions_failed_total"])
+
+			delta := sent - prevSent
+			rate := delta / pollInterval.Seconds()
+			elapsed := time.Since(start).Round(time.Second)
+			log("spamoor progress: %.0f/%.0f sent (%.0f tx/s instant, %.0f tx/s avg, %.0f failed) [%s]",
+				sent, float64(targetCount), rate, sent/time.Since(start).Seconds(), failed, elapsed)
+			prevSent = sent
+
 			if sent >= float64(targetCount) {
 				return sent, failed, nil
 			}
@@ -72,36 +165,52 @@ func waitForSpamoorDone(ctx context.Context, api *spamoor.API, targetCount int,
 	}
 }
 
-// collectBlockMetrics fetches block headers from startBlock to endBlock
-// (inclusive) and aggregates gas and transaction counts. Blocks with zero
-// transactions are skipped to avoid skewing averages with empty blocks.
+// collectBlockMetrics iterates all headers in [startBlock, endBlock] to collect
+// gas and transaction metrics. Empty blocks are skipped for gas/tx aggregation
+// but included in block interval tracking.
 func collectBlockMetrics(ctx context.Context, client *ethclient.Client, startBlock, endBlock uint64) (*blockMetrics, error) {
 	if endBlock < startBlock {
 		return nil, fmt.Errorf("endBlock %d < startBlock %d", endBlock, startBlock)
 	}
 
-	m := &blockMetrics{
-		StartBlock: startBlock,
-		EndBlock:   endBlock,
-	}
+	m := &blockMetrics{StartBlock: startBlock, EndBlock: endBlock}
 
+	var prevBlockTime time.Time
 	for n := startBlock; n <= endBlock; n++ {
-		block, err := client.BlockByNumber(ctx, new(big.Int).SetUint64(n))
+		header, err := client.HeaderByNumber(ctx, new(big.Int).SetUint64(n))
 		if err != nil {
-			return nil, fmt.Errorf("failed to fetch block %d: %w", n, err)
+			return nil, fmt.Errorf("failed to fetch header %d: %w", n, err)
+		}
+
+		blockTime := time.Unix(int64(header.Time), 0)
+		m.TotalBlockCount++
+
+		// track intervals between all consecutive blocks
+		if !prevBlockTime.IsZero() {
+			m.BlockIntervals = append(m.BlockIntervals, blockTime.Sub(prevBlockTime))
+		}
+		prevBlockTime = blockTime
+
+		txCount, err := client.TransactionCount(ctx, header.Hash())
+		if err != nil {
+			return nil, fmt.Errorf("failed to fetch tx count for block %d: %w", n, err)
 		}
 
-		txCount := len(block.Transactions())
 		if txCount == 0 {
 			continue
 		}
 
-		gasUsed := block.GasUsed()
+		// non-empty block: aggregate gas and tx metrics
+		if m.BlockCount == 0 {
+			m.FirstBlockTime = blockTime
+		}
+		m.LastBlockTime = blockTime
+
 		m.BlockCount++
-		m.TotalGasUsed += gasUsed
-		m.TotalTxCount += txCount
-		m.GasPerBlock = append(m.GasPerBlock, gasUsed)
-		m.TxPerBlock = append(m.TxPerBlock, txCount)
+		m.TotalGasUsed += header.GasUsed
+		m.TotalTxCount += int(txCount)
+		m.GasPerBlock = append(m.GasPerBlock, header.GasUsed)
+		m.TxPerBlock = append(m.TxPerBlock, int(txCount))
 	}
 
 	return m, nil
diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index daab436cac..5cd601a670 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -4,6 +4,7 @@ package benchmark
 
 import (
 	"context"
+	"fmt"
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
@@ -17,9 +18,11 @@ import (
 // Diagnostic metrics: per-span latency breakdown, gas/block, tx/block.
 func (s *SpamoorSuite) TestERC20Throughput() {
 	const (
-		totalCount  = 3000
-		serviceName = "ev-node-erc20"
-		waitTimeout = 3 * time.Minute
+		numSpammers    = 5
+		countPerSpammer = 50000
+		totalCount      = numSpammers * countPerSpammer
+		serviceName     = "ev-node-erc20"
+		waitTimeout     = 5 * time.Minute
 	)
 
 	t := s.T()
@@ -33,10 +36,10 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	})
 
 	erc20Config := map[string]any{
-		"throughput":      100,
-		"total_count":     totalCount,
-		"max_pending":     4000,
-		"max_wallets":     300,
+		"throughput":      50, // 50 tx per 100ms slot = 500 tx/s per spammer
+		"total_count":     countPerSpammer,
+		"max_pending":     50000,
+		"max_wallets":     200,
 		"base_fee":        20,
 		"tip_fee":         3,
 		"refill_amount":   "5000000000000000000",
@@ -50,18 +53,28 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	startBlock := startHeader.Number.Uint64()
 	loadStart := time.Now()
 
-	id, err := e.spamoorAPI.CreateSpammer("bench-erc20", spamoor.ScenarioERC20TX, erc20Config, true)
-	s.Require().NoError(err, "failed to create erc20 spammer")
-	t.Cleanup(func() { _ = e.spamoorAPI.DeleteSpammer(id) })
+	// stagger spammer launches so their warm-up phases (contract deploy + wallet
+	// funding) complete at different times, producing a more continuous tx stream
+	// instead of synchronized bursts.
+	const staggerDelay = 5 * time.Second
+	for i := range numSpammers {
+		if i > 0 {
+			time.Sleep(staggerDelay)
+		}
+		name := fmt.Sprintf("bench-erc20-%d", i)
+		id, err := e.spamoorAPI.CreateSpammer(name, spamoor.ScenarioERC20TX, erc20Config, true)
+		s.Require().NoError(err, "failed to create spammer %s", name)
+		t.Cleanup(func() { _ = e.spamoorAPI.DeleteSpammer(id) })
+	}
 
 	// wait for spamoor to finish sending all transactions
 	waitCtx, cancel := context.WithTimeout(ctx, waitTimeout)
 	defer cancel()
-	sent, failed, err := waitForSpamoorDone(waitCtx, e.spamoorAPI, totalCount, 2*time.Second)
+	sent, failed, err := waitForSpamoorDone(waitCtx, t.Logf, e.spamoorAPI, totalCount, 2*time.Second)
 	s.Require().NoError(err, "spamoor did not finish in time")
 
-	// allow a short settle period for remaining txs to be included in blocks
-	time.Sleep(5 * time.Second)
+	// allow pending txs to drain from mempool into blocks
+	time.Sleep(20 * time.Second)
 	wallClock := time.Since(loadStart)
 
 	endHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
@@ -72,18 +85,34 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	bm, err := collectBlockMetrics(ctx, e.ethClient, startBlock, endBlock)
 	s.Require().NoError(err, "failed to collect block metrics")
 
-	achievedMGas := mgasPerSec(bm.TotalGasUsed, wallClock)
-	achievedTPS := float64(bm.TotalTxCount) / wallClock.Seconds()
+	// use steady-state window (first active block to last active block) for
+	// throughput calculation, excluding warm-up and cool-down periods
+	steadyState := bm.steadyStateDuration()
+	s.Require().Greater(steadyState, time.Duration(0), "expected non-zero steady-state duration")
+
+	achievedMGas := mgasPerSec(bm.TotalGasUsed, steadyState)
+	achievedTPS := float64(bm.TotalTxCount) / steadyState.Seconds()
 
-	t.Logf("ERC20 throughput: %.2f MGas/s, %.1f TPS over %s (%d blocks, %d txs, %.2f avg gas/block)",
-		achievedMGas, achievedTPS, wallClock.Round(time.Millisecond),
-		bm.BlockCount, bm.TotalTxCount, bm.avgGasPerBlock())
+	intervalP50, intervalP99, intervalMax := bm.blockIntervalStats()
+	gasP50, gasP99 := bm.gasPerBlockStats()
+	txP50, txP99 := bm.txPerBlockStats()
+
+	t.Logf("block range: %d-%d (%d total, %d non-empty, %.1f%% non-empty)",
+		startBlock, endBlock, bm.TotalBlockCount, bm.BlockCount, bm.nonEmptyRatio())
+	t.Logf("block intervals: p50=%s, p99=%s, max=%s",
+		intervalP50.Round(time.Millisecond), intervalP99.Round(time.Millisecond), intervalMax.Round(time.Millisecond))
+	t.Logf("gas/block (non-empty): avg=%.0f, p50=%.0f, p99=%.0f", bm.avgGasPerBlock(), gasP50, gasP99)
+	t.Logf("tx/block (non-empty): avg=%.1f, p50=%.0f, p99=%.0f", bm.avgTxPerBlock(), txP50, txP99)
+	t.Logf("ERC20 throughput: %.2f MGas/s, %.1f TPS over %s steady-state (%s wall clock)",
+		achievedMGas, achievedTPS, steadyState.Round(time.Millisecond), wallClock.Round(time.Millisecond))
 
 	// collect and report traces
 	evNodeSpans := s.collectServiceTraces(e, serviceName)
-	evRethSpans := s.collectServiceTraces(e, "ev-reth")
+	evRethSpans := s.tryCollectServiceTraces(e, "ev-reth")
 	e2e.PrintTraceReport(t, serviceName, evNodeSpans)
-	e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+	if len(evRethSpans) > 0 {
+		e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+	}
 
 	// compute ev-node overhead ratio
 	spanStats := e2e.AggregateSpanStats(evNodeSpans)
@@ -127,7 +156,42 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	w.addEntry(entry{
 		Name:  "ERC20Throughput - blocks/s",
 		Unit:  "blocks/s",
-		Value: float64(bm.BlockCount) / wallClock.Seconds(),
+		Value: float64(bm.BlockCount) / steadyState.Seconds(),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - non-empty block ratio",
+		Unit:  "%",
+		Value: bm.nonEmptyRatio(),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - block interval p50",
+		Unit:  "ms",
+		Value: float64(intervalP50.Milliseconds()),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - block interval p99",
+		Unit:  "ms",
+		Value: float64(intervalP99.Milliseconds()),
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - gas/block p50",
+		Unit:  "gas",
+		Value: gasP50,
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - gas/block p99",
+		Unit:  "gas",
+		Value: gasP99,
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - tx/block p50",
+		Unit:  "count",
+		Value: txP50,
+	})
+	w.addEntry(entry{
+		Name:  "ERC20Throughput - tx/block p99",
+		Unit:  "count",
+		Value: txP99,
 	})
 
 	// add per-span avg latencies
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index a43f25890c..b96cc67b3a 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -107,7 +107,8 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 		WithDockerNetworkID(evmEnv.RethNode.NetworkID).
 		WithLogger(evmEnv.RethNode.Logger).
 		WithRPCHosts(internalRPC).
-		WithPrivateKey(e2e.TestPrivateKey)
+		WithPrivateKey(e2e.TestPrivateKey).
+		WithAdditionalStartArgs("--slot-duration", "100ms", "--startup-delay", "0")
 
 	spNode, err := spBuilder.Build(ctx)
 	s.Require().NoError(err, "failed to build spamoor node")
@@ -142,3 +143,44 @@ func (s *SpamoorSuite) collectServiceTraces(e *env, serviceName string) []e2e.Tr
 
 	return toTraceSpans(extractSpansFromTraces(traces))
 }
+
+// tryCollectServiceTraces attempts to fetch all traces by requesting them in
+// batches to avoid overwhelming Jaeger with a single large response.
+// Returns nil instead of failing the test if traces are unavailable.
+func (s *SpamoorSuite) tryCollectServiceTraces(e *env, serviceName string) []e2e.TraceSpan {
+	t := s.T()
+	ctx, cancel := context.WithTimeout(t.Context(), 3*time.Minute)
+	defer cancel()
+
+	ok, err := e.jaeger.External.WaitForTraces(ctx, serviceName, 1, 2*time.Second)
+	if err != nil || !ok {
+		t.Logf("warning: could not collect %s traces (err=%v, ok=%v)", serviceName, err, ok)
+		return nil
+	}
+
+	const batchSize = 200
+	var allSpans []e2e.TraceSpan
+	for batch := 0; ; batch++ {
+		traces, err := e.jaeger.External.Traces(ctx, serviceName, batchSize)
+		if err != nil {
+			if batch == 0 {
+				t.Logf("warning: failed to fetch %s traces: %v", serviceName, err)
+				return nil
+			}
+			// got some data before failing, use what we have
+			t.Logf("warning: %s trace fetch stopped after %d batches (%d spans): %v", serviceName, batch, len(allSpans), err)
+			break
+		}
+		spans := toTraceSpans(extractSpansFromTraces(traces))
+		if len(spans) == 0 {
+			break
+		}
+		allSpans = append(allSpans, spans...)
+		if len(traces) < batchSize {
+			break // got fewer traces than requested, no more available
+		}
+	}
+
+	t.Logf("collected %d %s spans", len(allSpans), serviceName)
+	return allSpans
+}
diff --git a/test/e2e/go.mod b/test/e2e/go.mod
index 46710aa54f..f2a24355dc 100644
--- a/test/e2e/go.mod
+++ b/test/e2e/go.mod
@@ -22,6 +22,7 @@ require (
 )
 
 replace (
+	github.com/celestiaorg/tastora => /Users/chatton/checkouts/celestiaorg/tastora
 	github.com/evstack/ev-node => ../../
 	github.com/evstack/ev-node/core => ../../core
 	github.com/evstack/ev-node/execution/evm => ../../execution/evm
diff --git a/test/e2e/go.sum b/test/e2e/go.sum
index 782994eb29..0d66192173 100644
--- a/test/e2e/go.sum
+++ b/test/e2e/go.sum
@@ -145,8 +145,6 @@ github.com/celestiaorg/go-square/v3 v3.0.2 h1:eSQOgNII8inK9IhiBZ+6GADQeWbRq4HYY7
 github.com/celestiaorg/go-square/v3 v3.0.2/go.mod h1:oFReMLsSDMRs82ICFEeFQFCqNvwdsbIM1BzCcb0f7dM=
 github.com/celestiaorg/nmt v0.24.2 h1:LlpJSPOd6/Lw1Ig6HUhZuqiINHLka/ZSRTBzlNJpchg=
 github.com/celestiaorg/nmt v0.24.2/go.mod h1:vgLBpWBi8F5KLxTdXSwb7AU4NhiIQ1AQRGa+PzdcLEA=
-github.com/celestiaorg/tastora v0.16.0 h1:V4ctGcvVR8thy4ulvrHagrTfdNfuCHOTsCYoKVRQ75U=
-github.com/celestiaorg/tastora v0.16.0/go.mod h1:C867PBm6Ne6e/1JlmsRqcLeJ6RHAuMoMRCvwJzV/q8g=
 github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
 github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM=
 github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=

From 99e3e42d64d47c0caa885b85ca85ead11f859dd7 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Thu, 26 Feb 2026 15:31:07 +0000
Subject: [PATCH 08/19] fix: improve benchmark measurement window and
 reliability

- Move startBlock capture after spammer creation to exclude warm-up
- Replace 20s drain sleep with smart poll (waitForDrain)
- Add deleteAllSpammers cleanup to handle stale spamoor DB entries
- Lower trace sample rate to 10% to prevent Jaeger OOM
---
 test/e2e/benchmark/helpers.go            | 59 ++++++++++++++++++++++++
 test/e2e/benchmark/spamoor_erc20_test.go | 32 +++++++------
 test/e2e/benchmark/suite_test.go         |  4 +-
 3 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index d25b06d7b5..bfc83a9eee 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -165,6 +165,65 @@ func waitForSpamoorDone(ctx context.Context, log func(string, ...any), api *spam
 	}
 }
 
+// deleteAllSpammers removes any pre-existing spammers from the daemon.
+// This prevents stale spammers (from previous failed runs) being restored
+// from the spamoor SQLite database.
+func deleteAllSpammers(api *spamoor.API) error {
+	existing, err := api.ListSpammers()
+	if err != nil {
+		return fmt.Errorf("list spammers: %w", err)
+	}
+	for _, sp := range existing {
+		if err := api.DeleteSpammer(sp.ID); err != nil {
+			return fmt.Errorf("delete spammer %d: %w", sp.ID, err)
+		}
+	}
+	return nil
+}
+
+// waitForDrain polls the latest block until consecutiveEmpty consecutive empty
+// blocks are observed, indicating the mempool has drained.
+func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclient.Client, consecutiveEmpty int) {
+	var emptyRun int
+	var lastBlock uint64
+	ticker := time.NewTicker(200 * time.Millisecond)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			log("drain timeout after %d consecutive empty blocks (needed %d)", emptyRun, consecutiveEmpty)
+			return
+		case <-ticker.C:
+			header, err := client.HeaderByNumber(ctx, nil)
+			if err != nil {
+				continue
+			}
+			num := header.Number.Uint64()
+			if num == lastBlock {
+				continue
+			}
+
+			txCount, err := client.TransactionCount(ctx, header.Hash())
+			if err != nil {
+				continue
+			}
+
+			lastBlock = num
+			if txCount == 0 {
+				emptyRun++
+			} else {
+				emptyRun = 0
+			}
+
+			if emptyRun >= consecutiveEmpty {
+				log("mempool drained: %d consecutive empty blocks at block %d", emptyRun, num)
+				return
+			}
+		}
+	}
+}
+
 // collectBlockMetrics iterates all headers in [startBlock, endBlock] to collect
 // gas and transaction metrics. Empty blocks are skipped for gas/tx aggregation
 // but included in block interval tracking.
diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index 5cd601a670..c9ba95b375 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -18,7 +18,7 @@ import (
 // Diagnostic metrics: per-span latency breakdown, gas/block, tx/block.
 func (s *SpamoorSuite) TestERC20Throughput() {
 	const (
-		numSpammers    = 5
+		numSpammers     = 5
 		countPerSpammer = 50000
 		totalCount      = numSpammers * countPerSpammer
 		serviceName     = "ev-node-erc20"
@@ -47,34 +47,36 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 		"refill_interval": 600,
 	}
 
-	// record the starting block before generating load
-	startHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
-	s.Require().NoError(err, "failed to get start block header")
-	startBlock := startHeader.Number.Uint64()
-	loadStart := time.Now()
+	// clear any stale spammers restored from the persistent spamoor database
+	s.Require().NoError(deleteAllSpammers(e.spamoorAPI), "failed to delete stale spammers")
 
-	// stagger spammer launches so their warm-up phases (contract deploy + wallet
-	// funding) complete at different times, producing a more continuous tx stream
-	// instead of synchronized bursts.
-	const staggerDelay = 5 * time.Second
+	// launch all spammers before recording startBlock so warm-up
+	// (contract deploy + wallet funding) is excluded from the measurement window.
 	for i := range numSpammers {
-		if i > 0 {
-			time.Sleep(staggerDelay)
-		}
 		name := fmt.Sprintf("bench-erc20-%d", i)
 		id, err := e.spamoorAPI.CreateSpammer(name, spamoor.ScenarioERC20TX, erc20Config, true)
 		s.Require().NoError(err, "failed to create spammer %s", name)
 		t.Cleanup(func() { _ = e.spamoorAPI.DeleteSpammer(id) })
 	}
 
+	// record the starting block after all spammers are launched so the
+	// measurement window excludes the warm-up period.
+	startHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
+	s.Require().NoError(err, "failed to get start block header")
+	startBlock := startHeader.Number.Uint64()
+	loadStart := time.Now()
+
 	// wait for spamoor to finish sending all transactions
 	waitCtx, cancel := context.WithTimeout(ctx, waitTimeout)
 	defer cancel()
 	sent, failed, err := waitForSpamoorDone(waitCtx, t.Logf, e.spamoorAPI, totalCount, 2*time.Second)
 	s.Require().NoError(err, "spamoor did not finish in time")
 
-	// allow pending txs to drain from mempool into blocks
-	time.Sleep(20 * time.Second)
+	// wait for pending txs to drain: once we see several consecutive empty
+	// blocks, the mempool is drained and we can stop.
+	drainCtx, drainCancel := context.WithTimeout(ctx, 30*time.Second)
+	defer drainCancel()
+	waitForDrain(drainCtx, t.Logf, e.ethClient, 10)
 	wallClock := time.Since(loadStart)
 
 	endHeader, err := e.ethClient.HeaderByNumber(ctx, nil)
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index b96cc67b3a..f22d522478 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -75,6 +75,8 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 				"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
 				"OTEL_EXPORTER_OTLP_PROTOCOL=http",
 				"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http",
+				"OTEL_TRACES_SAMPLER=parentbased_traceidratio",
+				"OTEL_TRACES_SAMPLER_ARG=0.1",
 				"RUST_LOG=debug",
 				"OTEL_SDK_DISABLED=false",
 			)
@@ -87,7 +89,7 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	e2e.SetupSequencerNode(t, sut, sequencerHome, evmEnv.SequencerJWT, evmEnv.GenesisHash, evmEnv.Endpoints,
 		"--evnode.instrumentation.tracing=true",
 		"--evnode.instrumentation.tracing_endpoint", otlpHTTP,
-		"--evnode.instrumentation.tracing_sample_rate", "1.0",
+		"--evnode.instrumentation.tracing_sample_rate", "0.1",
 		"--evnode.instrumentation.tracing_service_name", cfg.serviceName,
 	)
 	t.Log("sequencer node is up")

From e4e06c5f232361ffc58545c9a0cc45f17d28e732 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 09:36:47 +0000
Subject: [PATCH 09/19] fix: address PR review feedback for benchmark suite

- make reth tag configurable via EV_RETH_TAG env var (default pr-140)
- fix OTLP config: remove duplicate env vars, use http/protobuf protocol
- use require.Eventually for host readiness polling
- rename requireHTTP to requireHostUp
- use non-fatal logging in resultWriter.flush deferred context
- fix stale doc comment (setupCommonEVMEnv -> SetupCommonEVMEnv)
- rename loop variable to avoid shadowing testing.TB convention
- add block/internal/executing/** to CI path trigger
- remove unused require import from output.go
---
 .github/workflows/benchmark.yml          |  1 +
 test/e2e/benchmark/metrics.go            | 27 +++++++++---------------
 test/e2e/benchmark/output.go             | 11 +++++++---
 test/e2e/benchmark/spamoor_smoke_test.go |  8 ++++++-
 test/e2e/benchmark/suite_test.go         |  9 ++++----
 test/e2e/benchmark/traces.go             |  4 ++--
 test/e2e/evm_test_common.go              |  2 +-
 7 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index b3a157a0b1..203ff03e22 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -13,6 +13,7 @@ permissions: {}
       - 'test/e2e/evm_contract_bench_test.go'
       - 'test/e2e/evm_test_common.go'
       - 'test/e2e/sut_helper.go'
+      - 'block/internal/executing/**'
       - '.github/workflows/benchmark.yml'
   workflow_dispatch:
 
diff --git a/test/e2e/benchmark/metrics.go b/test/e2e/benchmark/metrics.go
index fcc51af865..24ddd7506a 100644
--- a/test/e2e/benchmark/metrics.go
+++ b/test/e2e/benchmark/metrics.go
@@ -3,34 +3,27 @@
 package benchmark
 
 import (
-	"fmt"
 	"net/http"
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/require"
+
 	dto "github.com/prometheus/client_model/go"
 )
 
-// requireHTTP polls a URL until it returns a 2xx status code or the timeout expires.
-func requireHTTP(t testing.TB, url string, timeout time.Duration) {
+// requireHostUp polls a URL until it returns a 2xx status code or the timeout expires.
+func requireHostUp(t testing.TB, url string, timeout time.Duration) {
 	t.Helper()
 	client := &http.Client{Timeout: 200 * time.Millisecond}
-	deadline := time.Now().Add(timeout)
-	var lastErr error
-	for time.Now().Before(deadline) {
+	require.Eventually(t, func() bool {
 		resp, err := client.Get(url)
-		if err == nil {
-			_ = resp.Body.Close()
-			if resp.StatusCode >= 200 && resp.StatusCode < 300 {
-				return
-			}
-			lastErr = fmt.Errorf("status %d", resp.StatusCode)
-		} else {
-			lastErr = err
+		if err != nil {
+			return false
 		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	t.Fatalf("daemon not ready at %s: %v", url, lastErr)
+		_ = resp.Body.Close()
+		return resp.StatusCode >= 200 && resp.StatusCode < 300
+	}, timeout, 100*time.Millisecond, "daemon not ready at %s", url)
 }
 
 // sumCounter sums all counter values in a prometheus MetricFamily.
diff --git a/test/e2e/benchmark/output.go b/test/e2e/benchmark/output.go
index 0d05ebd4ec..39ccbe6d69 100644
--- a/test/e2e/benchmark/output.go
+++ b/test/e2e/benchmark/output.go
@@ -10,7 +10,6 @@ import (
 	"testing"
 
 	e2e "github.com/evstack/ev-node/test/e2e"
-	"github.com/stretchr/testify/require"
 )
 
 // entry matches the customSmallerIsBetter format for github-action-benchmark.
@@ -71,7 +70,13 @@ func (w *resultWriter) flush() {
 	}
 
 	data, err := json.MarshalIndent(w.entries, "", "  ")
-	require.NoError(w.t, err, "failed to marshal benchmark JSON")
-	require.NoError(w.t, os.WriteFile(outputPath, data, 0644), "failed to write benchmark JSON to %s", outputPath)
+	if err != nil {
+		w.t.Logf("WARNING: failed to marshal benchmark JSON: %v", err)
+		return
+	}
+	if err := os.WriteFile(outputPath, data, 0644); err != nil {
+		w.t.Logf("WARNING: failed to write benchmark JSON to %s: %v", outputPath, err)
+		return
+	}
 	w.t.Logf("wrote %d benchmark entries to %s", len(w.entries), outputPath)
 }
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
index 1f56f26874..fa62583736 100644
--- a/test/e2e/benchmark/spamoor_smoke_test.go
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -3,6 +3,7 @@
 package benchmark
 
 import (
+	"os"
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
@@ -17,8 +18,13 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	w := newResultWriter(t, "SpamoorSmoke")
 	defer w.flush()
 
+	// TODO: temporary hardcoded tag, will be replaced with a proper release tag
+	rethTag := os.Getenv("EV_RETH_TAG")
+	if rethTag == "" {
+		rethTag = "pr-140"
+	}
 	e := s.setupEnv(config{
-		rethTag:     "pr-140",
+		rethTag:     rethTag,
 		serviceName: "ev-node-smoke",
 	})
 	api := e.spamoorAPI
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index a43f25890c..bed6b6c19e 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -71,10 +71,11 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	evmEnv := e2e.SetupCommonEVMEnv(t, sut, s.dockerCli, s.networkID,
 		e2e.WithRethOpts(func(b *reth.NodeBuilder) {
 			b.WithTag(cfg.rethTag).WithEnv(
+				// ev-reth reads OTEL_EXPORTER_OTLP_ENDPOINT and passes it directly
+				// to with_endpoint(). opentelemetry-otlp v0.31 HTTP exporter does
+				// not auto-append /v1/traces, so the full path is required.
 				"OTEL_EXPORTER_OTLP_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
-				"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
-				"OTEL_EXPORTER_OTLP_PROTOCOL=http",
-				"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http",
+				"OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf",
 				"RUST_LOG=debug",
 				"OTEL_SDK_DISABLED=false",
 			)
@@ -117,7 +118,7 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	spInfo, err := spNode.GetNetworkInfo(ctx)
 	s.Require().NoError(err, "failed to get spamoor network info")
 	apiAddr := "http://127.0.0.1:" + spInfo.External.Ports.HTTP
-	requireHTTP(t, apiAddr+"/api/spammers", 30*time.Second)
+	requireHostUp(t, apiAddr+"/api/spammers", 30*time.Second)
 
 	return &env{
 		jaeger:     jg,
diff --git a/test/e2e/benchmark/traces.go b/test/e2e/benchmark/traces.go
index 76f160dfa2..459dd16a1a 100644
--- a/test/e2e/benchmark/traces.go
+++ b/test/e2e/benchmark/traces.go
@@ -22,8 +22,8 @@ func (j jaegerSpan) SpanDuration() time.Duration { return time.Duration(j.durati
 // extractSpansFromTraces walks Jaeger's []any response and pulls out span operation names and durations.
 func extractSpansFromTraces(traces []any) []jaegerSpan {
 	var out []jaegerSpan
-	for _, t := range traces {
-		traceMap, ok := t.(map[string]any)
+	for _, traceEntry := range traces {
+		traceMap, ok := traceEntry.(map[string]any)
 		if !ok {
 			continue
 		}
diff --git a/test/e2e/evm_test_common.go b/test/e2e/evm_test_common.go
index 161ff89349..3ec25b6633 100644
--- a/test/e2e/evm_test_common.go
+++ b/test/e2e/evm_test_common.go
@@ -529,7 +529,7 @@ func WithRethOpts(opts ...evmtest.RethNodeOpt) SetupOpt {
 	}
 }
 
-// setupCommonEVMEnv creates and initializes ev-reth instances, while also initializing the local ev-node instance
+// SetupCommonEVMEnv creates and initializes ev-reth instances, while also initializing the local ev-node instance
 // managed by sut. If a full node is also required, we can use the WithFullNode() additional option.
 func SetupCommonEVMEnv(t testing.TB, sut *SystemUnderTest, client tastoratypes.TastoraDockerClient, networkID string, opts ...SetupOpt) *EVMEnv {
 	t.Helper()

From 03b9239a074c54675fad96918fbcd21d9711856b Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 09:50:57 +0000
Subject: [PATCH 10/19] chore: specify http

---
 test/e2e/benchmark/suite_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index bed6b6c19e..34c0bc57a1 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -75,7 +75,7 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 				// to with_endpoint(). opentelemetry-otlp v0.31 HTTP exporter does
 				// not auto-append /v1/traces, so the full path is required.
 				"OTEL_EXPORTER_OTLP_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
-				"OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf",
+				"OTEL_EXPORTER_OTLP_PROTOCOL=http",
 				"RUST_LOG=debug",
 				"OTEL_SDK_DISABLED=false",
 			)

From fe3ca23cc6aee89ec481078447fb4ba0e60f2491 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 10:17:50 +0000
Subject: [PATCH 11/19] chore: filter out benchmark tests from test-e2e

---
 .just/test.just | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.just/test.just b/.just/test.just
index c0360d43fb..2b35df6e4e 100644
--- a/.just/test.just
+++ b/.just/test.just
@@ -25,7 +25,7 @@ test-integration:
 [group('test')]
 test-e2e: build build-da build-evm docker-build-if-local
     @echo "--> Running e2e tests"
-    @cd test/e2e && go test -mod=readonly -failfast -timeout=15m -tags='e2e evm' ./... --binary=../../build/testapp --evm-binary=../../build/evm
+    @cd test/e2e && go test -mod=readonly -failfast -timeout=15m -tags='e2e evm' $(go list -tags='e2e evm' ./... | grep -v /benchmark) --binary=../../build/testapp --evm-binary=../../build/evm
 
 # Run integration tests with coverage
 [group('test')]

From 054f2c62d25663528ddb8cf2ad9f772bd04adef4 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 12:51:43 +0000
Subject: [PATCH 12/19] refactor: centralize reth config and lower ERC20
 spammer count

move EV_RETH_TAG resolution and rpc connection limits into setupEnv
so all benchmark tests share the same reth configuration. lower ERC20
spammer count from 5 to 2 to reduce resource contention on local
hardware while keeping the loop for easy scaling on dedicated infra.
---
 test/e2e/benchmark/spamoor_erc20_test.go |  3 +-
 test/e2e/benchmark/spamoor_smoke_test.go |  7 -----
 test/e2e/benchmark/suite_test.go         | 36 +++++++++++++++++-------
 3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index c9ba95b375..10c6271179 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -18,7 +18,7 @@ import (
 // Diagnostic metrics: per-span latency breakdown, gas/block, tx/block.
 func (s *SpamoorSuite) TestERC20Throughput() {
 	const (
-		numSpammers     = 5
+		numSpammers     = 2
 		countPerSpammer = 50000
 		totalCount      = numSpammers * countPerSpammer
 		serviceName     = "ev-node-erc20"
@@ -31,7 +31,6 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	defer w.flush()
 
 	e := s.setupEnv(config{
-		rethTag:     "pr-140",
 		serviceName: serviceName,
 	})
 
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
index fa62583736..82c8bbf375 100644
--- a/test/e2e/benchmark/spamoor_smoke_test.go
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -3,7 +3,6 @@
 package benchmark
 
 import (
-	"os"
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
@@ -18,13 +17,7 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	w := newResultWriter(t, "SpamoorSmoke")
 	defer w.flush()
 
-	// TODO: temporary hardcoded tag, will be replaced with a proper release tag
-	rethTag := os.Getenv("EV_RETH_TAG")
-	if rethTag == "" {
-		rethTag = "pr-140"
-	}
 	e := s.setupEnv(config{
-		rethTag:     rethTag,
 		serviceName: "ev-node-smoke",
 	})
 	api := e.spamoorAPI
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index db8854b363..410f815500 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -4,6 +4,7 @@ package benchmark
 
 import (
 	"context"
+	"os"
 	"path/filepath"
 	"testing"
 	"time"
@@ -48,10 +49,19 @@ type env struct {
 
 // config parameterizes the per-test environment setup.
 type config struct {
-	rethTag     string
 	serviceName string
 }
 
+// TODO: temporary hardcoded tag, will be replaced with a proper release tag
+const defaultRethTag = "pr-140"
+
+func rethTag() string {
+	if tag := os.Getenv("EV_RETH_TAG"); tag != "" {
+		return tag
+	}
+	return defaultRethTag
+}
+
 // setupEnv creates a Jaeger + reth + sequencer + Spamoor environment for
 // a single test. Each call spins up isolated infrastructure so tests
 // can't interfere with each other.
@@ -70,15 +80,21 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	// reth + local DA with OTLP tracing to Jaeger
 	evmEnv := e2e.SetupCommonEVMEnv(t, sut, s.dockerCli, s.networkID,
 		e2e.WithRethOpts(func(b *reth.NodeBuilder) {
-			b.WithTag(cfg.rethTag).WithEnv(
-				// ev-reth reads OTEL_EXPORTER_OTLP_ENDPOINT and passes it directly
-				// to with_endpoint(). opentelemetry-otlp v0.31 HTTP exporter does
-				// not auto-append /v1/traces, so the full path is required.
-				"OTEL_EXPORTER_OTLP_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
-				"OTEL_EXPORTER_OTLP_PROTOCOL=http",
-				"RUST_LOG=debug",
-				"OTEL_SDK_DISABLED=false",
-			)
+			b.WithTag(rethTag()).
+				// increase values to facilitate spamoor.
+				WithAdditionalStartArgs(
+					"--rpc.max-connections", "5000",
+					"--rpc.max-tracing-requests", "1000",
+				).
+				WithEnv(
+					// ev-reth reads OTEL_EXPORTER_OTLP_ENDPOINT and passes it directly
+					// to with_endpoint(). opentelemetry-otlp v0.31 HTTP exporter does
+					// not auto-append /v1/traces, so the full path is required.
+					"OTEL_EXPORTER_OTLP_ENDPOINT="+jg.Internal.IngestHTTPEndpoint()+"/v1/traces",
+					"OTEL_EXPORTER_OTLP_PROTOCOL=http",
+					"RUST_LOG=debug",
+					"OTEL_SDK_DISABLED=false",
+				)
 		}),
 	)
 

From 26bb1170356edc5d4ebee55b4dce511892f4222e Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 13:11:38 +0000
Subject: [PATCH 13/19] chore: collect all traces at once

---
 test/e2e/benchmark/suite_test.go | 35 +++++++++-----------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index 410f815500..7201bde991 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -104,6 +104,7 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	e2e.SetupSequencerNode(t, sut, sequencerHome, evmEnv.SequencerJWT, evmEnv.GenesisHash, evmEnv.Endpoints,
 		"--evnode.instrumentation.tracing=true",
 		"--evnode.instrumentation.tracing_endpoint", otlpHTTP,
+		// TODO: setting this to 1 produced too many spans for the local Jaeger deployment alongside everything else.
 		"--evnode.instrumentation.tracing_sample_rate", "0.1",
 		"--evnode.instrumentation.tracing_service_name", cfg.serviceName,
 	)
@@ -161,9 +162,8 @@ func (s *SpamoorSuite) collectServiceTraces(e *env, serviceName string) []e2e.Tr
 	return toTraceSpans(extractSpansFromTraces(traces))
 }
 
-// tryCollectServiceTraces attempts to fetch all traces by requesting them in
-// batches to avoid overwhelming Jaeger with a single large response.
-// Returns nil instead of failing the test if traces are unavailable.
+// tryCollectServiceTraces fetches traces from Jaeger for the given service,
+// returning nil instead of failing the test if traces are unavailable.
 func (s *SpamoorSuite) tryCollectServiceTraces(e *env, serviceName string) []e2e.TraceSpan {
 	t := s.T()
 	ctx, cancel := context.WithTimeout(t.Context(), 3*time.Minute)
@@ -175,28 +175,13 @@ func (s *SpamoorSuite) tryCollectServiceTraces(e *env, serviceName string) []e2e
 		return nil
 	}
 
-	const batchSize = 200
-	var allSpans []e2e.TraceSpan
-	for batch := 0; ; batch++ {
-		traces, err := e.jaeger.External.Traces(ctx, serviceName, batchSize)
-		if err != nil {
-			if batch == 0 {
-				t.Logf("warning: failed to fetch %s traces: %v", serviceName, err)
-				return nil
-			}
-			t.Logf("warning: %s trace fetch stopped after %d batches (%d spans): %v", serviceName, batch, len(allSpans), err)
-			break
-		}
-		spans := toTraceSpans(extractSpansFromTraces(traces))
-		if len(spans) == 0 {
-			break
-		}
-		allSpans = append(allSpans, spans...)
-		if len(traces) < batchSize {
-			break
-		}
+	traces, err := e.jaeger.External.Traces(ctx, serviceName, 10000)
+	if err != nil {
+		t.Logf("warning: failed to fetch %s traces: %v", serviceName, err)
+		return nil
 	}
 
-	t.Logf("collected %d %s spans", len(allSpans), serviceName)
-	return allSpans
+	spans := toTraceSpans(extractSpansFromTraces(traces))
+	t.Logf("collected %d %s spans", len(spans), serviceName)
+	return spans
 }

From b88cae3a6860a02d415f3ea34a654f215fc64867 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 13:21:24 +0000
Subject: [PATCH 14/19] chore: self review

---
 test/e2e/benchmark/helpers.go            |  1 +
 test/e2e/benchmark/spamoor_smoke_test.go | 24 +++++++++++++++---------
 test/e2e/go.mod                          |  3 +--
 test/e2e/go.sum                          |  2 ++
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index bfc83a9eee..5f26a404ab 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -146,6 +146,7 @@ func waitForSpamoorDone(ctx context.Context, log func(string, ...any), api *spam
 		case <-ticker.C:
 			metrics, mErr := api.GetMetrics()
 			if mErr != nil {
+				log("failed to get spamoor metrics: %v", mErr)
 				continue
 			}
 			sent = sumCounter(metrics["spamoor_transactions_sent_total"])
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
index 82c8bbf375..8b0dbab35b 100644
--- a/test/e2e/benchmark/spamoor_smoke_test.go
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -22,6 +22,8 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	})
 	api := e.spamoorAPI
 
+	s.Require().NoError(deleteAllSpammers(api), "failed to delete stale spammers")
+
 	eoatx := map[string]any{
 		"throughput":      100,
 		"total_count":     3000,
@@ -76,9 +78,11 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 
 	// collect traces
 	evNodeSpans := s.collectServiceTraces(e, "ev-node-smoke")
-	evRethSpans := s.collectServiceTraces(e, "ev-reth")
+	evRethSpans := s.tryCollectServiceTraces(e, "ev-reth")
 	e2e.PrintTraceReport(t, "ev-node-smoke", evNodeSpans)
-	e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+	if len(evRethSpans) > 0 {
+		e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
+	}
 
 	w.addSpans(append(evNodeSpans, evRethSpans...))
 
@@ -100,13 +104,15 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 		"DA.Submit",
 	}, "ev-node-smoke")
 
-	// assert expected ev-reth span names
-	assertSpanNames(t, evRethSpans, []string{
-		"build_payload",
-		"execute_tx",
-		"try_build",
-		"validate_transaction",
-	}, "ev-reth")
+	// assert expected ev-reth span names when traces are available
+	if len(evRethSpans) > 0 {
+		assertSpanNames(t, evRethSpans, []string{
+			"build_payload",
+			"execute_tx",
+			"try_build",
+			"validate_transaction",
+		}, "ev-reth")
+	}
 
 	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
 	s.Require().Zero(fail, "no transactions should have failed")
diff --git a/test/e2e/go.mod b/test/e2e/go.mod
index f2a24355dc..784e92e5a1 100644
--- a/test/e2e/go.mod
+++ b/test/e2e/go.mod
@@ -5,7 +5,7 @@ go 1.25.6
 require (
 	cosmossdk.io/math v1.5.3
 	github.com/celestiaorg/go-square/v3 v3.0.2
-	github.com/celestiaorg/tastora v0.16.0
+	github.com/celestiaorg/tastora v0.16.1-0.20260302131806-2816c7b82bfb
 	github.com/cosmos/cosmos-sdk v0.53.6
 	github.com/cosmos/ibc-go/v8 v8.8.0
 	github.com/ethereum/go-ethereum v1.17.0
@@ -22,7 +22,6 @@ require (
 )
 
 replace (
-	github.com/celestiaorg/tastora => /Users/chatton/checkouts/celestiaorg/tastora
 	github.com/evstack/ev-node => ../../
 	github.com/evstack/ev-node/core => ../../core
 	github.com/evstack/ev-node/execution/evm => ../../execution/evm
diff --git a/test/e2e/go.sum b/test/e2e/go.sum
index 0d66192173..588dd5b985 100644
--- a/test/e2e/go.sum
+++ b/test/e2e/go.sum
@@ -145,6 +145,8 @@ github.com/celestiaorg/go-square/v3 v3.0.2 h1:eSQOgNII8inK9IhiBZ+6GADQeWbRq4HYY7
 github.com/celestiaorg/go-square/v3 v3.0.2/go.mod h1:oFReMLsSDMRs82ICFEeFQFCqNvwdsbIM1BzCcb0f7dM=
 github.com/celestiaorg/nmt v0.24.2 h1:LlpJSPOd6/Lw1Ig6HUhZuqiINHLka/ZSRTBzlNJpchg=
 github.com/celestiaorg/nmt v0.24.2/go.mod h1:vgLBpWBi8F5KLxTdXSwb7AU4NhiIQ1AQRGa+PzdcLEA=
+github.com/celestiaorg/tastora v0.16.1-0.20260302131806-2816c7b82bfb h1:rTVChmvo3+SYi0dwd5W2yz2zyuAid8cXVAeROEY5ecs=
+github.com/celestiaorg/tastora v0.16.1-0.20260302131806-2816c7b82bfb/go.mod h1:C867PBm6Ne6e/1JlmsRqcLeJ6RHAuMoMRCvwJzV/q8g=
 github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
 github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM=
 github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=

From 272ab711b7a10a9347c697439ce12e44386a4b8c Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 13:54:08 +0000
Subject: [PATCH 15/19] refactor: extract benchmark helpers to slim down ERC20
 test body

- add blockMetricsSummary with summarize(), log(), and entries() methods
- add evNodeOverhead() for computing ProduceBlock vs ExecuteTxs overhead
- add collectTraces() suite method to deduplicate trace collection pattern
- add addEntries() convenience method on resultWriter
- slim TestERC20Throughput from ~217 to ~119 lines
- reuse collectTraces in TestSpamoorSmoke
---
 test/e2e/benchmark/helpers.go            | 100 +++++++++++++++++++
 test/e2e/benchmark/output.go             |   5 +
 test/e2e/benchmark/spamoor_erc20_test.go | 118 ++---------------------
 test/e2e/benchmark/spamoor_smoke_test.go |  17 +---
 test/e2e/benchmark/suite_test.go         |  26 +++++
 5 files changed, 146 insertions(+), 120 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index 5f26a404ab..136f5daac4 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -8,10 +8,12 @@ import (
 	"math"
 	"math/big"
 	"sort"
+	"testing"
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
 	"github.com/ethereum/go-ethereum/ethclient"
+	e2e "github.com/evstack/ev-node/test/e2e"
 )
 
 // blockMetrics holds aggregated gas and transaction data across a range of blocks.
@@ -225,6 +227,104 @@ func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclie
 	}
 }
 
+// blockMetricsSummary holds the computed stats from a blockMetrics measurement window.
+type blockMetricsSummary struct {
+	SteadyState   time.Duration
+	AchievedMGas  float64
+	AchievedTPS   float64
+	IntervalP50   time.Duration
+	IntervalP99   time.Duration
+	IntervalMax   time.Duration
+	GasP50        float64
+	GasP99        float64
+	TxP50         float64
+	TxP99         float64
+	AvgGas        float64
+	AvgTx         float64
+	BlocksPerSec  float64
+	NonEmptyRatio float64
+}
+
+// summarize computes all derived stats from the raw block metrics.
+func (m *blockMetrics) summarize() *blockMetricsSummary {
+	ss := m.steadyStateDuration()
+	intervalP50, intervalP99, intervalMax := m.blockIntervalStats()
+	gasP50, gasP99 := m.gasPerBlockStats()
+	txP50, txP99 := m.txPerBlockStats()
+
+	var blocksPerSec float64
+	if ss > 0 {
+		blocksPerSec = float64(m.BlockCount) / ss.Seconds()
+	}
+
+	return &blockMetricsSummary{
+		SteadyState:   ss,
+		AchievedMGas:  mgasPerSec(m.TotalGasUsed, ss),
+		AchievedTPS:   float64(m.TotalTxCount) / ss.Seconds(),
+		IntervalP50:   intervalP50,
+		IntervalP99:   intervalP99,
+		IntervalMax:   intervalMax,
+		GasP50:        gasP50,
+		GasP99:        gasP99,
+		TxP50:         txP50,
+		TxP99:         txP99,
+		AvgGas:        m.avgGasPerBlock(),
+		AvgTx:         m.avgTxPerBlock(),
+		BlocksPerSec:  blocksPerSec,
+		NonEmptyRatio: m.nonEmptyRatio(),
+	}
+}
+
+// log prints a concise summary of the block metrics to the test log.
+func (s *blockMetricsSummary) log(t testing.TB, startBlock, endBlock uint64, totalBlocks, nonEmptyBlocks int, wallClock time.Duration) {
+	t.Logf("block range: %d-%d (%d total, %d non-empty, %.1f%% non-empty)",
+		startBlock, endBlock, totalBlocks, nonEmptyBlocks, s.NonEmptyRatio)
+	t.Logf("block intervals: p50=%s, p99=%s, max=%s",
+		s.IntervalP50.Round(time.Millisecond), s.IntervalP99.Round(time.Millisecond), s.IntervalMax.Round(time.Millisecond))
+	t.Logf("gas/block (non-empty): avg=%.0f, p50=%.0f, p99=%.0f", s.AvgGas, s.GasP50, s.GasP99)
+	t.Logf("tx/block (non-empty): avg=%.1f, p50=%.0f, p99=%.0f", s.AvgTx, s.TxP50, s.TxP99)
+	t.Logf("throughput: %.2f MGas/s, %.1f TPS over %s steady-state (%s wall clock)",
+		s.AchievedMGas, s.AchievedTPS, s.SteadyState.Round(time.Millisecond), wallClock.Round(time.Millisecond))
+}
+
+// entries returns all summary metrics as result writer entries with the given prefix.
+func (s *blockMetricsSummary) entries(prefix string) []entry {
+	return []entry{
+		{Name: prefix + " - MGas/s", Unit: "MGas/s", Value: s.AchievedMGas},
+		{Name: prefix + " - TPS", Unit: "tx/s", Value: s.AchievedTPS},
+		{Name: prefix + " - avg gas/block", Unit: "gas", Value: s.AvgGas},
+		{Name: prefix + " - avg tx/block", Unit: "count", Value: s.AvgTx},
+		{Name: prefix + " - blocks/s", Unit: "blocks/s", Value: s.BlocksPerSec},
+		{Name: prefix + " - non-empty block ratio", Unit: "%", Value: s.NonEmptyRatio},
+		{Name: prefix + " - block interval p50", Unit: "ms", Value: float64(s.IntervalP50.Milliseconds())},
+		{Name: prefix + " - block interval p99", Unit: "ms", Value: float64(s.IntervalP99.Milliseconds())},
+		{Name: prefix + " - gas/block p50", Unit: "gas", Value: s.GasP50},
+		{Name: prefix + " - gas/block p99", Unit: "gas", Value: s.GasP99},
+		{Name: prefix + " - tx/block p50", Unit: "count", Value: s.TxP50},
+		{Name: prefix + " - tx/block p99", Unit: "count", Value: s.TxP99},
+	}
+}
+
+// evNodeOverhead computes the overhead percentage of ev-node's ProduceBlock
+// over the inner ExecuteTxs span. Returns the overhead and whether it was computable.
+func evNodeOverhead(spans []e2e.TraceSpan) (float64, bool) {
+	stats := e2e.AggregateSpanStats(spans)
+	produce, ok := stats["BlockExecutor.ProduceBlock"]
+	if !ok {
+		return 0, false
+	}
+	execute, ok := stats["Executor.ExecuteTxs"]
+	if !ok {
+		return 0, false
+	}
+	produceAvg := float64(produce.Total.Microseconds()) / float64(produce.Count)
+	executeAvg := float64(execute.Total.Microseconds()) / float64(execute.Count)
+	if produceAvg <= 0 {
+		return 0, false
+	}
+	return (produceAvg - executeAvg) / produceAvg * 100, true
+}
+
 // collectBlockMetrics iterates all headers in [startBlock, endBlock] to collect
 // gas and transaction metrics. Empty blocks are skipped for gas/tx aggregation
 // but included in block interval tracking.
diff --git a/test/e2e/benchmark/output.go b/test/e2e/benchmark/output.go
index 39ccbe6d69..06c60e7646 100644
--- a/test/e2e/benchmark/output.go
+++ b/test/e2e/benchmark/output.go
@@ -61,6 +61,11 @@ func (w *resultWriter) addEntry(e entry) {
 	w.entries = append(w.entries, e)
 }
 
+// addEntries appends multiple entries to the results.
+func (w *resultWriter) addEntries(entries []entry) {
+	w.entries = append(w.entries, entries...)
+}
+
 // flush writes accumulated entries to the path in BENCH_JSON_OUTPUT.
 // It is a no-op when the env var is unset or no entries were added.
 func (w *resultWriter) flush() {
diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index 10c6271179..357c5c4f3f 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -8,7 +8,6 @@ import (
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
-	e2e "github.com/evstack/ev-node/test/e2e"
 )
 
 // TestERC20Throughput measures ERC-20 token transfer throughput in isolation.
@@ -86,124 +85,27 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	bm, err := collectBlockMetrics(ctx, e.ethClient, startBlock, endBlock)
 	s.Require().NoError(err, "failed to collect block metrics")
 
-	// use steady-state window (first active block to last active block) for
-	// throughput calculation, excluding warm-up and cool-down periods
-	steadyState := bm.steadyStateDuration()
-	s.Require().Greater(steadyState, time.Duration(0), "expected non-zero steady-state duration")
-
-	achievedMGas := mgasPerSec(bm.TotalGasUsed, steadyState)
-	achievedTPS := float64(bm.TotalTxCount) / steadyState.Seconds()
-
-	intervalP50, intervalP99, intervalMax := bm.blockIntervalStats()
-	gasP50, gasP99 := bm.gasPerBlockStats()
-	txP50, txP99 := bm.txPerBlockStats()
-
-	t.Logf("block range: %d-%d (%d total, %d non-empty, %.1f%% non-empty)",
-		startBlock, endBlock, bm.TotalBlockCount, bm.BlockCount, bm.nonEmptyRatio())
-	t.Logf("block intervals: p50=%s, p99=%s, max=%s",
-		intervalP50.Round(time.Millisecond), intervalP99.Round(time.Millisecond), intervalMax.Round(time.Millisecond))
-	t.Logf("gas/block (non-empty): avg=%.0f, p50=%.0f, p99=%.0f", bm.avgGasPerBlock(), gasP50, gasP99)
-	t.Logf("tx/block (non-empty): avg=%.1f, p50=%.0f, p99=%.0f", bm.avgTxPerBlock(), txP50, txP99)
-	t.Logf("ERC20 throughput: %.2f MGas/s, %.1f TPS over %s steady-state (%s wall clock)",
-		achievedMGas, achievedTPS, steadyState.Round(time.Millisecond), wallClock.Round(time.Millisecond))
+	summary := bm.summarize()
+	s.Require().Greater(summary.SteadyState, time.Duration(0), "expected non-zero steady-state duration")
+	summary.log(t, startBlock, endBlock, bm.TotalBlockCount, bm.BlockCount, wallClock)
 
 	// collect and report traces
-	evNodeSpans := s.collectServiceTraces(e, serviceName)
-	evRethSpans := s.tryCollectServiceTraces(e, "ev-reth")
-	e2e.PrintTraceReport(t, serviceName, evNodeSpans)
-	if len(evRethSpans) > 0 {
-		e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
-	}
+	traces := s.collectTraces(e, serviceName)
 
-	// compute ev-node overhead ratio
-	spanStats := e2e.AggregateSpanStats(evNodeSpans)
-	if produceBlock, ok := spanStats["BlockExecutor.ProduceBlock"]; ok {
-		if executeTxs, ok2 := spanStats["Executor.ExecuteTxs"]; ok2 {
-			produceAvg := float64(produceBlock.Total.Microseconds()) / float64(produceBlock.Count)
-			executeAvg := float64(executeTxs.Total.Microseconds()) / float64(executeTxs.Count)
-			if produceAvg > 0 {
-				overhead := (produceAvg - executeAvg) / produceAvg * 100
-				t.Logf("ev-node overhead: %.1f%%", overhead)
-				w.addEntry(entry{
-					Name:  "ERC20Throughput - ev-node overhead",
-					Unit:  "%",
-					Value: overhead,
-				})
-			}
-		}
+	if overhead, ok := evNodeOverhead(traces.evNode); ok {
+		t.Logf("ev-node overhead: %.1f%%", overhead)
+		w.addEntry(entry{Name: "ERC20Throughput - ev-node overhead", Unit: "%", Value: overhead})
 	}
 
-	// write all metrics to benchmark output
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - MGas/s",
-		Unit:  "MGas/s",
-		Value: achievedMGas,
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - TPS",
-		Unit:  "tx/s",
-		Value: achievedTPS,
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - avg gas/block",
-		Unit:  "gas",
-		Value: bm.avgGasPerBlock(),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - avg tx/block",
-		Unit:  "count",
-		Value: bm.avgTxPerBlock(),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - blocks/s",
-		Unit:  "blocks/s",
-		Value: float64(bm.BlockCount) / steadyState.Seconds(),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - non-empty block ratio",
-		Unit:  "%",
-		Value: bm.nonEmptyRatio(),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - block interval p50",
-		Unit:  "ms",
-		Value: float64(intervalP50.Milliseconds()),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - block interval p99",
-		Unit:  "ms",
-		Value: float64(intervalP99.Milliseconds()),
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - gas/block p50",
-		Unit:  "gas",
-		Value: gasP50,
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - gas/block p99",
-		Unit:  "gas",
-		Value: gasP99,
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - tx/block p50",
-		Unit:  "count",
-		Value: txP50,
-	})
-	w.addEntry(entry{
-		Name:  "ERC20Throughput - tx/block p99",
-		Unit:  "count",
-		Value: txP99,
-	})
-
-	// add per-span avg latencies
-	w.addSpans(append(evNodeSpans, evRethSpans...))
+	w.addEntries(summary.entries("ERC20Throughput"))
+	w.addSpans(traces.allSpans())
 
 	// assertions
 	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
 	s.Require().Zero(failed, "no transactions should have failed")
 
 	// assert expected ev-node spans are present
-	assertSpanNames(t, evNodeSpans, []string{
+	assertSpanNames(t, traces.evNode, []string{
 		"BlockExecutor.ProduceBlock",
 		"BlockExecutor.ApplyBlock",
 		"Executor.ExecuteTxs",
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
index 8b0dbab35b..39a89f961c 100644
--- a/test/e2e/benchmark/spamoor_smoke_test.go
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -6,7 +6,6 @@ import (
 	"time"
 
 	"github.com/celestiaorg/tastora/framework/docker/evstack/spamoor"
-	e2e "github.com/evstack/ev-node/test/e2e"
 )
 
 // TestSpamoorSmoke spins up reth + sequencer and a Spamoor node, starts a few
@@ -77,17 +76,11 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	fail := sumCounter(metrics["spamoor_transactions_failed_total"])
 
 	// collect traces
-	evNodeSpans := s.collectServiceTraces(e, "ev-node-smoke")
-	evRethSpans := s.tryCollectServiceTraces(e, "ev-reth")
-	e2e.PrintTraceReport(t, "ev-node-smoke", evNodeSpans)
-	if len(evRethSpans) > 0 {
-		e2e.PrintTraceReport(t, "ev-reth", evRethSpans)
-	}
-
-	w.addSpans(append(evNodeSpans, evRethSpans...))
+	traces := s.collectTraces(e, "ev-node-smoke")
+	w.addSpans(traces.allSpans())
 
 	// assert expected ev-node span names
-	assertSpanNames(t, evNodeSpans, []string{
+	assertSpanNames(t, traces.evNode, []string{
 		"BlockExecutor.ProduceBlock",
 		"BlockExecutor.ApplyBlock",
 		"BlockExecutor.CreateBlock",
@@ -105,8 +98,8 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	}, "ev-node-smoke")
 
 	// assert expected ev-reth span names when traces are available
-	if len(evRethSpans) > 0 {
-		assertSpanNames(t, evRethSpans, []string{
+	if len(traces.evReth) > 0 {
+		assertSpanNames(t, traces.evReth, []string{
 			"build_payload",
 			"execute_tx",
 			"try_build",
diff --git a/test/e2e/benchmark/suite_test.go b/test/e2e/benchmark/suite_test.go
index 7201bde991..bb9993c587 100644
--- a/test/e2e/benchmark/suite_test.go
+++ b/test/e2e/benchmark/suite_test.go
@@ -147,6 +147,32 @@ func (s *SpamoorSuite) setupEnv(cfg config) *env {
 	}
 }
 
+// traceResult holds the collected spans from ev-node and (optionally) ev-reth.
+type traceResult struct {
+	evNode []e2e.TraceSpan
+	evReth []e2e.TraceSpan
+}
+
+// allSpans returns ev-node and ev-reth spans concatenated.
+func (tr *traceResult) allSpans() []e2e.TraceSpan {
+	return append(tr.evNode, tr.evReth...)
+}
+
+// collectTraces fetches ev-node traces (required) and ev-reth traces (optional)
+// from Jaeger, then prints reports for both.
+func (s *SpamoorSuite) collectTraces(e *env, serviceName string) *traceResult {
+	t := s.T()
+	tr := &traceResult{
+		evNode: s.collectServiceTraces(e, serviceName),
+		evReth: s.tryCollectServiceTraces(e, "ev-reth"),
+	}
+	e2e.PrintTraceReport(t, serviceName, tr.evNode)
+	if len(tr.evReth) > 0 {
+		e2e.PrintTraceReport(t, "ev-reth", tr.evReth)
+	}
+	return tr
+}
+
 // collectServiceTraces fetches traces from Jaeger for the given service and returns the spans.
 func (s *SpamoorSuite) collectServiceTraces(e *env, serviceName string) []e2e.TraceSpan {
 	ctx, cancel := context.WithTimeout(s.T().Context(), 3*time.Minute)

From 676e0d146d4ad909388e5c9355f93c83ee37c88b Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 14:05:08 +0000
Subject: [PATCH 16/19] docs: add detailed documentation to benchmark helper
 methods

---
 test/e2e/benchmark/helpers.go | 123 +++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 40 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index 136f5daac4..bdeaeb05be 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -17,21 +17,26 @@ import (
 )
 
 // blockMetrics holds aggregated gas and transaction data across a range of blocks.
+// Only blocks containing at least one transaction are counted in BlockCount,
+// GasPerBlock, and TxPerBlock. All blocks (including empty) are counted in
+// TotalBlockCount and contribute to BlockIntervals.
 type blockMetrics struct {
-	StartBlock     uint64
-	EndBlock       uint64
-	BlockCount     int // non-empty blocks only
-	TotalBlockCount int // all blocks in range including empty
-	TotalGasUsed   uint64
-	TotalTxCount   int
-	GasPerBlock    []uint64        // non-empty blocks only
-	TxPerBlock     []int           // non-empty blocks only
-	BlockIntervals []time.Duration // intervals between all consecutive blocks (for drift)
-	FirstBlockTime time.Time
-	LastBlockTime  time.Time
+	StartBlock      uint64
+	EndBlock        uint64
+	BlockCount      int             // non-empty blocks only
+	TotalBlockCount int             // all blocks in range including empty
+	TotalGasUsed    uint64          // cumulative gas across non-empty blocks
+	TotalTxCount    int             // cumulative tx count across non-empty blocks
+	GasPerBlock     []uint64        // per-block gas for non-empty blocks only
+	TxPerBlock      []int           // per-block tx count for non-empty blocks only
+	BlockIntervals  []time.Duration // time between all consecutive blocks (including empty)
+	FirstBlockTime  time.Time       // timestamp of the first non-empty block
+	LastBlockTime   time.Time       // timestamp of the last non-empty block
 }
 
-// steadyStateDuration returns the time between the first and last active blocks.
+// steadyStateDuration returns the wall-clock time between the first and last
+// non-empty blocks. This excludes warm-up (empty blocks before the first tx)
+// and cool-down (empty blocks after the last tx) from throughput calculations.
 func (m *blockMetrics) steadyStateDuration() time.Duration {
 	if m.FirstBlockTime.IsZero() || m.LastBlockTime.IsZero() {
 		return 0
@@ -39,7 +44,8 @@ func (m *blockMetrics) steadyStateDuration() time.Duration {
 	return m.LastBlockTime.Sub(m.FirstBlockTime)
 }
 
-// avgGasPerBlock returns the mean gas used per block.
+// avgGasPerBlock returns TotalGasUsed / BlockCount, i.e. the mean gas used
+// per non-empty block.
 func (m *blockMetrics) avgGasPerBlock() float64 {
 	if m.BlockCount == 0 {
 		return 0
@@ -47,7 +53,8 @@ func (m *blockMetrics) avgGasPerBlock() float64 {
 	return float64(m.TotalGasUsed) / float64(m.BlockCount)
 }
 
-// avgTxPerBlock returns the mean transaction count per block.
+// avgTxPerBlock returns TotalTxCount / BlockCount, i.e. the mean transaction
+// count per non-empty block.
 func (m *blockMetrics) avgTxPerBlock() float64 {
 	if m.BlockCount == 0 {
 		return 0
@@ -55,7 +62,8 @@ func (m *blockMetrics) avgTxPerBlock() float64 {
 	return float64(m.TotalTxCount) / float64(m.BlockCount)
 }
 
-// nonEmptyRatio returns the percentage of blocks that contained transactions.
+// nonEmptyRatio returns (BlockCount / TotalBlockCount) * 100, i.e. the
+// percentage of blocks in the range that contained at least one transaction.
 func (m *blockMetrics) nonEmptyRatio() float64 {
 	if m.TotalBlockCount == 0 {
 		return 0
@@ -63,7 +71,9 @@ func (m *blockMetrics) nonEmptyRatio() float64 {
 	return float64(m.BlockCount) / float64(m.TotalBlockCount) * 100
 }
 
-// blockIntervalStats returns p50, p99, and max of block intervals.
+// blockIntervalStats computes percentile statistics over the time gaps between
+// all consecutive blocks (including empty ones). This measures block production
+// cadence and jitter rather than transaction throughput.
 func (m *blockMetrics) blockIntervalStats() (p50, p99, max time.Duration) {
 	if len(m.BlockIntervals) == 0 {
 		return 0, 0, 0
@@ -78,7 +88,8 @@ func (m *blockMetrics) blockIntervalStats() (p50, p99, max time.Duration) {
 		time.Duration(sorted[len(sorted)-1])
 }
 
-// gasPerBlockStats returns p50 and p99 of gas used per non-empty block.
+// gasPerBlockStats returns the 50th and 99th percentile of gas used across
+// non-empty blocks. Shows the distribution of per-block gas consumption.
 func (m *blockMetrics) gasPerBlockStats() (p50, p99 float64) {
 	if len(m.GasPerBlock) == 0 {
 		return 0, 0
@@ -91,7 +102,8 @@ func (m *blockMetrics) gasPerBlockStats() (p50, p99 float64) {
 	return percentile(sorted, 0.50), percentile(sorted, 0.99)
 }
 
-// txPerBlockStats returns p50 and p99 of tx count per non-empty block.
+// txPerBlockStats returns the 50th and 99th percentile of transaction counts
+// across non-empty blocks. Shows the distribution of per-block tx throughput.
 func (m *blockMetrics) txPerBlockStats() (p50, p99 float64) {
 	if len(m.TxPerBlock) == 0 {
 		return 0, 0
@@ -104,8 +116,9 @@ func (m *blockMetrics) txPerBlockStats() (p50, p99 float64) {
 	return percentile(sorted, 0.50), percentile(sorted, 0.99)
 }
 
-// percentile returns the p-th percentile from a pre-sorted float64 slice
-// using linear interpolation.
+// percentile returns the p-th percentile from a pre-sorted float64 slice using
+// linear interpolation between adjacent ranks. For example, p=0.50 returns the
+// median and p=0.99 returns the value below which 99% of observations fall.
 func percentile(sorted []float64, p float64) float64 {
 	if len(sorted) == 0 {
 		return 0
@@ -123,7 +136,8 @@ func percentile(sorted []float64, p float64) float64 {
 	return sorted[lower] + frac*(sorted[upper]-sorted[lower])
 }
 
-// mgasPerSec calculates the achieved throughput in megagas per second.
+// mgasPerSec computes totalGasUsed / elapsed / 1e6, giving throughput in
+// millions of gas units per second (MGas/s).
 func mgasPerSec(totalGasUsed uint64, elapsed time.Duration) float64 {
 	if elapsed <= 0 {
 		return 0
@@ -227,25 +241,41 @@ func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclie
 	}
 }
 
-// blockMetricsSummary holds the computed stats from a blockMetrics measurement window.
+// blockMetricsSummary holds all derived statistics from a blockMetrics measurement
+// window. Every field is computed from the raw block data by summarize().
 type blockMetricsSummary struct {
-	SteadyState   time.Duration
-	AchievedMGas  float64
-	AchievedTPS   float64
-	IntervalP50   time.Duration
-	IntervalP99   time.Duration
-	IntervalMax   time.Duration
-	GasP50        float64
-	GasP99        float64
-	TxP50         float64
-	TxP99         float64
-	AvgGas        float64
-	AvgTx         float64
-	BlocksPerSec  float64
+	// SteadyState is the wall-clock duration between the first and last non-empty
+	// blocks, used as the denominator for throughput calculations.
+	SteadyState time.Duration
+	// AchievedMGas is total gas / steady-state seconds / 1e6 (megagas per second).
+	AchievedMGas float64
+	// AchievedTPS is total tx count / steady-state seconds.
+	AchievedTPS float64
+	// IntervalP50, IntervalP99, IntervalMax are percentile and max statistics
+	// over the time between all consecutive blocks (including empty).
+	IntervalP50 time.Duration
+	IntervalP99 time.Duration
+	IntervalMax time.Duration
+	// GasP50, GasP99 are the 50th/99th percentile of gas used per non-empty block.
+	GasP50 float64
+	GasP99 float64
+	// TxP50, TxP99 are the 50th/99th percentile of tx count per non-empty block.
+	TxP50 float64
+	TxP99 float64
+	// AvgGas is the mean gas per non-empty block (TotalGasUsed / BlockCount).
+	AvgGas float64
+	// AvgTx is the mean tx count per non-empty block (TotalTxCount / BlockCount).
+	AvgTx float64
+	// BlocksPerSec is non-empty blocks / steady-state seconds.
+	BlocksPerSec float64
+	// NonEmptyRatio is (non-empty blocks / total blocks) * 100.
 	NonEmptyRatio float64
 }
 
-// summarize computes all derived stats from the raw block metrics.
+// summarize computes all derived statistics from the raw block-level data in one
+// pass. It delegates to the individual stat methods (steadyStateDuration,
+// blockIntervalStats, gasPerBlockStats, etc.) and packages the results into a
+// blockMetricsSummary for logging and result-writing.
 func (m *blockMetrics) summarize() *blockMetricsSummary {
 	ss := m.steadyStateDuration()
 	intervalP50, intervalP99, intervalMax := m.blockIntervalStats()
@@ -275,7 +305,10 @@ func (m *blockMetrics) summarize() *blockMetricsSummary {
 	}
 }
 
-// log prints a concise summary of the block metrics to the test log.
+// log prints the block range, interval stats, per-block gas/tx stats, and
+// throughput (MGas/s + TPS) to the test log. startBlock/endBlock and
+// totalBlocks/nonEmptyBlocks are passed separately because they live on the
+// raw blockMetrics, not the summary.
 func (s *blockMetricsSummary) log(t testing.TB, startBlock, endBlock uint64, totalBlocks, nonEmptyBlocks int, wallClock time.Duration) {
 	t.Logf("block range: %d-%d (%d total, %d non-empty, %.1f%% non-empty)",
 		startBlock, endBlock, totalBlocks, nonEmptyBlocks, s.NonEmptyRatio)
@@ -287,7 +320,10 @@ func (s *blockMetricsSummary) log(t testing.TB, startBlock, endBlock uint64, tot
 		s.AchievedMGas, s.AchievedTPS, s.SteadyState.Round(time.Millisecond), wallClock.Round(time.Millisecond))
 }
 
-// entries returns all summary metrics as result writer entries with the given prefix.
+// entries returns all summary metrics as result writer entries in the
+// customSmallerIsBetter format expected by github-action-benchmark. Each entry
+// is prefixed with the given label (e.g. "ERC20Throughput") so results from
+// different tests are distinguishable in the same output file.
 func (s *blockMetricsSummary) entries(prefix string) []entry {
 	return []entry{
 		{Name: prefix + " - MGas/s", Unit: "MGas/s", Value: s.AchievedMGas},
@@ -305,8 +341,15 @@ func (s *blockMetricsSummary) entries(prefix string) []entry {
 	}
 }
 
-// evNodeOverhead computes the overhead percentage of ev-node's ProduceBlock
-// over the inner ExecuteTxs span. Returns the overhead and whether it was computable.
+// evNodeOverhead computes the fraction of block production time spent outside
+// EVM execution. It looks up the average durations of BlockExecutor.ProduceBlock
+// (the outer span covering the full block lifecycle) and Executor.ExecuteTxs
+// (the inner span covering only EVM tx execution), then returns:
+//
+//	overhead% = (avgProduce - avgExecute) / avgProduce * 100
+//
+// This captures time spent on sequencing, DA submission, header construction,
+// and other ev-node orchestration work. Returns false if either span is missing.
 func evNodeOverhead(spans []e2e.TraceSpan) (float64, bool) {
 	stats := e2e.AggregateSpanStats(spans)
 	produce, ok := stats["BlockExecutor.ProduceBlock"]

From f4949a1be922e46c12d1763f537526925d92fe8b Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 14:18:00 +0000
Subject: [PATCH 17/19] ci: add ERC20 throughput benchmark job

---
 .github/workflows/benchmark.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index dea4217677..4675250d4b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -77,6 +77,29 @@ jobs:
           name: spamoor-benchmark-results
           path: test/e2e/benchmark/spamoor_bench.json
 
+  # TODO: wire up to publish results once additional tests are in place.
+  erc20-benchmark:
+    name: ERC20 Throughput Benchmark
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Set up Go
+        uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0
+        with:
+          go-version-file: ./go.mod
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+      - name: Install just
+        uses: extractions/setup-just@v3
+      - name: Build binaries
+        run: just build-evm build-da
+      - name: Run ERC20 throughput test
+        run: |
+          cd test/e2e && go test -tags evm \
+            -run='^TestSpamoorSuite$/^TestERC20Throughput$' -v -timeout=15m \
+            ./benchmark/ --evm-binary=../../../build/evm
+
   # single job to push all results to gh-pages sequentially, avoiding race conditions
   publish-benchmarks:
     name: Publish Benchmark Results

From 4c7b7e17c6c5c6eef0712df96f94627c8a19d964 Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Mon, 2 Mar 2026 14:30:14 +0000
Subject: [PATCH 18/19] chore: remove span assertions

---
 test/e2e/benchmark/spamoor_erc20_test.go | 14 ------------
 test/e2e/benchmark/spamoor_smoke_test.go | 28 ------------------------
 test/e2e/benchmark/traces.go             | 16 +-------------
 3 files changed, 1 insertion(+), 57 deletions(-)

diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index 357c5c4f3f..bba7fd156c 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -100,20 +100,6 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	w.addEntries(summary.entries("ERC20Throughput"))
 	w.addSpans(traces.allSpans())
 
-	// assertions
 	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
 	s.Require().Zero(failed, "no transactions should have failed")
-
-	// assert expected ev-node spans are present
-	assertSpanNames(t, traces.evNode, []string{
-		"BlockExecutor.ProduceBlock",
-		"BlockExecutor.ApplyBlock",
-		"Executor.ExecuteTxs",
-		"Executor.SetFinal",
-		"Engine.ForkchoiceUpdated",
-		"Engine.NewPayload",
-		"Engine.GetPayload",
-		"Sequencer.GetNextBatch",
-		"DA.Submit",
-	}, serviceName)
 }
diff --git a/test/e2e/benchmark/spamoor_smoke_test.go b/test/e2e/benchmark/spamoor_smoke_test.go
index 39a89f961c..d09057462b 100644
--- a/test/e2e/benchmark/spamoor_smoke_test.go
+++ b/test/e2e/benchmark/spamoor_smoke_test.go
@@ -79,34 +79,6 @@ func (s *SpamoorSuite) TestSpamoorSmoke() {
 	traces := s.collectTraces(e, "ev-node-smoke")
 	w.addSpans(traces.allSpans())
 
-	// assert expected ev-node span names
-	assertSpanNames(t, traces.evNode, []string{
-		"BlockExecutor.ProduceBlock",
-		"BlockExecutor.ApplyBlock",
-		"BlockExecutor.CreateBlock",
-		"BlockExecutor.RetrieveBatch",
-		"Executor.ExecuteTxs",
-		"Executor.SetFinal",
-		"Engine.ForkchoiceUpdated",
-		"Engine.NewPayload",
-		"Engine.GetPayload",
-		"Eth.GetBlockByNumber",
-		"Sequencer.GetNextBatch",
-		"DASubmitter.SubmitHeaders",
-		"DASubmitter.SubmitData",
-		"DA.Submit",
-	}, "ev-node-smoke")
-
-	// assert expected ev-reth span names when traces are available
-	if len(traces.evReth) > 0 {
-		assertSpanNames(t, traces.evReth, []string{
-			"build_payload",
-			"execute_tx",
-			"try_build",
-			"validate_transaction",
-		}, "ev-reth")
-	}
-
 	s.Require().Greater(sent, float64(0), "at least one transaction should have been sent")
 	s.Require().Zero(fail, "no transactions should have failed")
 }
diff --git a/test/e2e/benchmark/traces.go b/test/e2e/benchmark/traces.go
index 459dd16a1a..c5269a1993 100644
--- a/test/e2e/benchmark/traces.go
+++ b/test/e2e/benchmark/traces.go
@@ -3,11 +3,9 @@
 package benchmark
 
 import (
-	"testing"
 	"time"
 
 	e2e "github.com/evstack/ev-node/test/e2e"
-	"github.com/stretchr/testify/require"
 )
 
 // jaegerSpan holds the fields we extract from Jaeger's untyped JSON response.
@@ -16,7 +14,7 @@ type jaegerSpan struct {
 	duration      float64 // microseconds
 }
 
-func (j jaegerSpan) SpanName() string           { return j.operationName }
+func (j jaegerSpan) SpanName() string            { return j.operationName }
 func (j jaegerSpan) SpanDuration() time.Duration { return time.Duration(j.duration) * time.Microsecond }
 
 // extractSpansFromTraces walks Jaeger's []any response and pulls out span operation names and durations.
@@ -53,15 +51,3 @@ func toTraceSpans(spans []jaegerSpan) []e2e.TraceSpan {
 	}
 	return out
 }
-
-// assertSpanNames verifies that all expected span names appear in the trace data.
-func assertSpanNames(t testing.TB, spans []e2e.TraceSpan, expected []string, label string) {
-	t.Helper()
-	opNames := make(map[string]struct{}, len(spans))
-	for _, span := range spans {
-		opNames[span.SpanName()] = struct{}{}
-	}
-	for _, name := range expected {
-		require.Contains(t, opNames, name, "expected span %q not found in %s traces", name, label)
-	}
-}

From b14085e3a40096f3d3288708cb89198d67791a0a Mon Sep 17 00:00:00 2001
From: chatton <github.qpeyb@simplelogin.fr>
Date: Tue, 3 Mar 2026 09:37:54 +0000
Subject: [PATCH 19/19] fix: guard against drain timeout and zero-duration TPS
 division

- waitForDrain returns an error on timeout instead of silently logging
- guard AchievedTPS computation when steady-state duration is zero
---
 test/e2e/benchmark/helpers.go            | 12 ++++++------
 test/e2e/benchmark/spamoor_erc20_test.go |  4 +++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/test/e2e/benchmark/helpers.go b/test/e2e/benchmark/helpers.go
index bdeaeb05be..83d32aa92b 100644
--- a/test/e2e/benchmark/helpers.go
+++ b/test/e2e/benchmark/helpers.go
@@ -200,7 +200,7 @@ func deleteAllSpammers(api *spamoor.API) error {
 
 // waitForDrain polls the latest block until consecutiveEmpty consecutive empty
 // blocks are observed, indicating the mempool has drained.
-func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclient.Client, consecutiveEmpty int) {
+func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclient.Client, consecutiveEmpty int) error {
 	var emptyRun int
 	var lastBlock uint64
 	ticker := time.NewTicker(200 * time.Millisecond)
@@ -209,8 +209,7 @@ func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclie
 	for {
 		select {
 		case <-ctx.Done():
-			log("drain timeout after %d consecutive empty blocks (needed %d)", emptyRun, consecutiveEmpty)
-			return
+			return fmt.Errorf("drain timeout after %d consecutive empty blocks (needed %d): %w", emptyRun, consecutiveEmpty, ctx.Err())
 		case <-ticker.C:
 			header, err := client.HeaderByNumber(ctx, nil)
 			if err != nil {
@@ -235,7 +234,7 @@ func waitForDrain(ctx context.Context, log func(string, ...any), client *ethclie
 
 			if emptyRun >= consecutiveEmpty {
 				log("mempool drained: %d consecutive empty blocks at block %d", emptyRun, num)
-				return
+				return nil
 			}
 		}
 	}
@@ -282,15 +281,16 @@ func (m *blockMetrics) summarize() *blockMetricsSummary {
 	gasP50, gasP99 := m.gasPerBlockStats()
 	txP50, txP99 := m.txPerBlockStats()
 
-	var blocksPerSec float64
+	var blocksPerSec, achievedTPS float64
 	if ss > 0 {
 		blocksPerSec = float64(m.BlockCount) / ss.Seconds()
+		achievedTPS = float64(m.TotalTxCount) / ss.Seconds()
 	}
 
 	return &blockMetricsSummary{
 		SteadyState:   ss,
 		AchievedMGas:  mgasPerSec(m.TotalGasUsed, ss),
-		AchievedTPS:   float64(m.TotalTxCount) / ss.Seconds(),
+		AchievedTPS:   achievedTPS,
 		IntervalP50:   intervalP50,
 		IntervalP99:   intervalP99,
 		IntervalMax:   intervalMax,
diff --git a/test/e2e/benchmark/spamoor_erc20_test.go b/test/e2e/benchmark/spamoor_erc20_test.go
index bba7fd156c..d0d079355a 100644
--- a/test/e2e/benchmark/spamoor_erc20_test.go
+++ b/test/e2e/benchmark/spamoor_erc20_test.go
@@ -74,7 +74,9 @@ func (s *SpamoorSuite) TestERC20Throughput() {
 	// blocks, the mempool is drained and we can stop.
 	drainCtx, drainCancel := context.WithTimeout(ctx, 30*time.Second)
 	defer drainCancel()
-	waitForDrain(drainCtx, t.Logf, e.ethClient, 10)
+	if err := waitForDrain(drainCtx, t.Logf, e.ethClient, 10); err != nil {
+		t.Logf("warning: %v", err)
+	}
 	wallClock := time.Since(loadStart)
 
 	endHeader, err := e.ethClient.HeaderByNumber(ctx, nil)