From 9d13ca7130fc9473c3c38b9a774331d4bdc43547 Mon Sep 17 00:00:00 2001
From: tnm <t@ted.io>
Date: Wed, 18 Feb 2026 18:22:56 -0800
Subject: [PATCH] Update benchmark provider coverage and docs

---
 benchmarks/README.md                  |  52 ++++--
 benchmarks/benchmark.py               |  61 +++----
 benchmarks/benchmark_20x.py           |  52 +++---
 benchmarks/cold_vs_warm.py            |  49 +++---
 benchmarks/compare_providers.py       |  52 ++----
 benchmarks/comprehensive_benchmark.py |  58 +++----
 benchmarks/image_reuse.py             |  62 ++++---
 benchmarks/provider_matrix.py         | 231 ++++++++++++++++++++++++++
 benchmarks/run_all_benchmarks.py      |  11 +-
 benchmarks/simple_benchmark.py        | 161 ++++++++++--------
 10 files changed, 500 insertions(+), 289 deletions(-)
 create mode 100644 benchmarks/provider_matrix.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index c4fa105..019222c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -4,10 +4,10 @@ Comprehensive benchmark suite for comparing sandbox provider performance.
 
 ## Available Benchmarks
 
-### 🎯 comprehensive_benchmark.py (RECOMMENDED)
+### comprehensive_benchmark.py (RECOMMENDED)
 **Apples-to-apples comparison with realistic workloads**
 
-Tests all providers with diverse scenarios:
+Tests all configured providers with diverse scenarios:
 - Hello World (shell execution)
 - Prime Calculation (CPU-bound)
 - File I/O (1000 files)
@@ -15,7 +15,10 @@ Tests all providers with diverse scenarios:
 - NumPy FFT (numerical computation)
 
 **Features:**
-- Uses standardized image (`daytonaio/ai-test:0.2.3`) for Modal and Daytona
+- Uses standardized runtime hints per provider:
+  - Modal/Daytona: standardized Docker image (override with `BENCHMARK_STANDARD_IMAGE`)
+  - E2B/Hopx: configurable template IDs
+  - Sprites/Vercel: provider defaults
 - Multiple runs with statistical analysis (mean, stddev, min, max)
 - Detailed error reporting
 - Winner tracking across all tests
@@ -29,7 +32,7 @@ python benchmarks/comprehensive_benchmark.py
 
 ---
 
-### 📊 compare_providers.py
+### compare_providers.py
 **Lifecycle breakdown (create/execute/destroy)**
 
 Tests basic sandbox operations with detailed timing for each phase:
@@ -47,10 +50,10 @@ python benchmarks/compare_providers.py
 
 ---
 
-### ⚡ simple_benchmark.py
+### simple_benchmark.py
 **Quick smoke test**
 
-Fast basic test to verify providers are working.
+Fast create/exec/destroy verification across all configured providers.
 
 **Usage:**
 ```bash
@@ -59,7 +62,7 @@ python benchmarks/simple_benchmark.py
 
 ---
 
-### 🔥 benchmark_20x.py
+### benchmark_20x.py
 **Concurrent execution test**
 
 Tests 20 concurrent sandbox operations to measure parallelism and throughput.
@@ -71,7 +74,7 @@ python benchmarks/benchmark_20x.py
 
 ---
 
-### ❄️ cold_vs_warm.py
+### cold_vs_warm.py
 **Cold start analysis**
 
 Compares cold start (first run) vs warm start (subsequent runs) performance.
@@ -83,10 +86,11 @@ python benchmarks/cold_vs_warm.py
 
 ---
 
-### 🖼️ image_reuse.py
+### image_reuse.py
 **Image caching test**
 
-Tests how providers handle image reuse and caching.
+Tests how providers that support explicit image/template runtime configuration
+handle reuse and caching (currently Modal, Daytona, E2B, and Hopx).
 
 **Usage:**
 ```bash
@@ -99,9 +103,12 @@ python benchmarks/image_reuse.py
 
 All benchmarks auto-detect available providers based on environment variables:
 
+- **Daytona**: Set `DAYTONA_API_KEY`
 - **E2B**: Set `E2B_API_KEY`
+- **Sprites**: Set `SPRITES_TOKEN`, or install the `sprite` CLI and run `sprite login`
+- **Hopx**: Set `HOPX_API_KEY`
+- **Vercel**: Set `VERCEL_TOKEN` (or `VERCEL_API_TOKEN`, `VERCEL_ACCESS_TOKEN`, `VERCEL_OIDC_TOKEN`), plus `VERCEL_PROJECT_ID` and `VERCEL_TEAM_ID`
 - **Modal**: Run `modal token set` or set `MODAL_TOKEN_ID`
-- **Daytona**: Set `DAYTONA_API_KEY`
 
 ## Standard Image
 
@@ -111,18 +118,29 @@ For apples-to-apples comparison, benchmarks use comparable environments:
   - Python 3.13, numpy, requests, anthropic, cohere, beautifulsoup4, and many AI/ML packages
   - Both providers support arbitrary Docker images
 
-- **E2B**: `code-interpreter` template
+- **E2B**: `code-interpreter` template by default
   - Python, npm, Jupyter, and common ML packages (numpy, pandas, matplotlib, etc.)
   - E2B uses templates instead of Docker images
-  - Custom templates supported via `config.image` or `config.provider_config["template"]`
+  - Template resolution order: `E2B_BENCHMARK_TEMPLATE`, then `benchmarks/e2b-daytona-benchmark/e2b.toml`, then `code-interpreter`
+  - Set `E2B_BENCHMARK_TEMPLATE` to a template ID to skip the `e2b.toml` and `code-interpreter` fallbacks
+  - If you see `Template is not compatible with secured access`, set `E2B_BENCHMARK_TEMPLATE` to a secured-access compatible template ID
+
+- **Hopx**: `code-interpreter` template by default
+  - Override with `HOPX_BENCHMARK_TEMPLATE`
+
+- **Sprites/Vercel**:
+  - Benchmarks use provider defaults for runtime/image behavior
+
+Cloudflare provider benchmarks are intentionally excluded by default.
 
 ## Contributing
 
 When adding new benchmarks:
-1. Use the standardized image for Modal/Daytona
-2. Include statistical analysis (mean, stddev)
-3. Add error handling and detailed reporting
-4. Update this README
+1. Keep provider discovery centralized (see `benchmarks/provider_matrix.py`)
+2. Use standardized runtime hints for fair comparisons where possible
+3. Include statistical analysis (mean, stddev)
+4. Add error handling and detailed reporting
+5. Update this README
 
 ## License
 
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index e7faf34..52ce31c 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -14,10 +14,12 @@
 # Add parent directory to path for imports
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import (
+    benchmark_image_for_provider,
+    benchmark_runtime_label,
+    discover_benchmark_providers,
+)
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
 @dataclass
@@ -63,29 +65,16 @@ def __init__(self, iterations: int = 5):
 
     def _init_providers(self):
         """Initialize available providers."""
-        # E2B
-        if os.getenv("E2B_API_KEY"):
-            try:
-                self.providers["e2b"] = E2BProvider()
-                print("✅ E2B provider initialized")
-            except Exception as e:
-                print(f"❌ E2B provider failed: {e}")
-
-        # Daytona
-        if os.getenv("DAYTONA_API_KEY"):
-            try:
-                self.providers["daytona"] = DaytonaProvider()
-                print("✅ Daytona provider initialized")
-            except Exception as e:
-                print(f"❌ Daytona provider failed: {e}")
+        configured_providers = discover_benchmark_providers(include_cloudflare=False)
 
-        # Modal
-        if os.path.exists(os.path.expanduser("~/.modal.toml")):
+        for provider in configured_providers:
             try:
-                self.providers["modal"] = ModalProvider()
-                print("✅ Modal provider initialized")
+                provider_class = provider.load_class()
+                self.providers[provider.name] = provider_class()
+                runtime = benchmark_runtime_label(provider.name)
+                print(f"✅ {provider.display_name} provider initialized ({runtime})")
             except Exception as e:
-                print(f"❌ Modal provider failed: {e}")
+                print(f"❌ {provider.display_name} provider failed: {e}")
 
     async def benchmark_create_sandbox(self, provider_name: str) -> list[BenchmarkResult]:
         """Benchmark sandbox creation."""
@@ -93,13 +82,7 @@ async def benchmark_create_sandbox(self, provider_name: str) -> list[BenchmarkRe
         results = []
 
         for i in range(self.iterations):
-            # Use Daytona image for E2B and Daytona providers
-            image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-            if provider_name == "e2b":
-                # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-                # To build your own: cd benchmarks/e2b-daytona-benchmark && e2b template build
-                # Then update this ID with the one from benchmarks/e2b-daytona-benchmark/e2b.toml
-                image = "5x6hvr4zwye07thwhpkd"
+            image = benchmark_image_for_provider(provider_name)
 
             config = SandboxConfig(
                 labels={"benchmark": "create", "iteration": str(i)},
@@ -144,12 +127,7 @@ async def benchmark_execute_command(self, provider_name: str) -> list[BenchmarkR
         provider = self.providers[provider_name]
         results = []
 
-        # Use Daytona image for E2B and Daytona providers
-        image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-        if provider_name == "e2b":
-            # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-            # See benchmarks/e2b-daytona-benchmark/README.md for building your own
-            image = "5x6hvr4zwye07thwhpkd"
+        image = benchmark_image_for_provider(provider_name)
 
         # Create one sandbox for all iterations
         config = SandboxConfig(
@@ -212,12 +190,7 @@ async def benchmark_reuse_sandbox(self, provider_name: str) -> list[BenchmarkRes
         provider = self.providers[provider_name]
         results = []
 
-        # Use Daytona image for E2B and Daytona providers
-        image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-        if provider_name == "e2b":
-            # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-            # See benchmarks/e2b-daytona-benchmark/README.md for building your own
-            image = "5x6hvr4zwye07thwhpkd"
+        image = benchmark_image_for_provider(provider_name)
 
         labels = {"benchmark": "reuse", "session": "test123"}
         config = SandboxConfig(labels=labels, timeout_seconds=120, image=image)
@@ -478,6 +451,10 @@ async def run(self) -> dict[str, ProviderMetrics]:
         print(f"   Iterations per test: {self.iterations}")
         print(f"   Providers: {', '.join(self.providers.keys())}")
 
+        if not self.providers:
+            print("❌ No configured providers found.")
+            return {}
+
         # Run benchmarks for each provider
         all_results = []
         for provider_name in self.providers:
diff --git a/benchmarks/benchmark_20x.py b/benchmarks/benchmark_20x.py
index ce48279..cdccc66 100644
--- a/benchmarks/benchmark_20x.py
+++ b/benchmarks/benchmark_20x.py
@@ -9,16 +9,14 @@
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
-async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
+async def verify_and_benchmark(provider_name: str, display_name: str, provider_class, runs: int = 20):
     """Benchmark provider with verification."""
     print(f"\n{'='*80}")
-    print(f"4{name} - {runs} ITERATIONS")
+    print(f"{display_name} - {runs} ITERATIONS")
     print(f"{'='*80}")
 
     try:
@@ -61,9 +59,10 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
         try:
             # Create sandbox
             start = time.time()
-            config = SandboxConfig(labels={"benchmark": f"{name.lower()}_20x", "run": str(i + 1)})
-            if name == "Modal":
-                config.provider_config = {"image": "python:3.11-slim"}
+            config = SandboxConfig(labels={"benchmark": f"{provider_name}_20x", "run": str(i + 1)})
+            runtime_image = benchmark_image_for_provider(provider_name)
+            if runtime_image:
+                config.image = runtime_image
 
             sandbox = await provider.create_sandbox(config)
             create_time = (time.time() - start) * 1000
@@ -111,11 +110,11 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
         print("   Could not verify final count")
 
     if not create_times:
-        print(f"\n❌ All runs failed for {name}")
+        print(f"\n❌ All runs failed for {display_name}")
         return None
 
     # Calculate comprehensive statistics
-    print(f"\n📈 STATISTICS FOR {name} ({len(create_times)}/{runs} successful)")
+    print(f"\n📈 STATISTICS FOR {display_name} ({len(create_times)}/{runs} successful)")
     print("=" * 60)
 
     def print_detailed_stats(name, times):
@@ -147,7 +146,7 @@ def print_detailed_stats(name, times):
     print(f"  Sample IDs: {created_ids[:3] if created_ids else 'None'}")
 
     return {
-        "name": name,
+        "name": display_name,
         "runs": runs,
         "successful": len(create_times),
         "failed": failed_runs,
@@ -163,25 +162,26 @@ async def main():
     """Run 20-iteration benchmark for all providers."""
     print("🔬 COMPREHENSIVE BENCHMARK - 20 ITERATIONS PER PROVIDER")
     print("=" * 80)
-    print("This will create and destroy 60 sandboxes total.")
-    print("Estimated time: 3-5 minutes")
+    provider_specs = discover_benchmark_providers(include_cloudflare=False)
+    estimated_sandboxes = len(provider_specs) * 20
+    print(f"This will create and destroy up to {estimated_sandboxes} sandboxes total.")
+    print("Estimated time: provider-dependent")
 
     results = []
 
+    if not provider_specs:
+        print("\n❌ No configured providers found.")
+        return
+
     # Test each provider
-    for provider_class, name in [
-        (ModalProvider, "Modal"),
-        (E2BProvider, "E2B"),
-        (DaytonaProvider, "Daytona"),
-    ]:
-        if name == "E2B" and not os.getenv("E2B_API_KEY"):
-            print(f"\n⚠️ Skipping {name} - no API key")
-            continue
-        if name == "Daytona" and not os.getenv("DAYTONA_API_KEY"):
-            print(f"\n⚠️ Skipping {name} - no API key")
-            continue
-
-        result = await verify_and_benchmark(provider_class, name, runs=20)
+    for provider in provider_specs:
+        provider_class = provider.load_class()
+        result = await verify_and_benchmark(
+            provider.name,
+            provider.display_name,
+            provider_class,
+            runs=20,
+        )
         if result:
             results.append(result)
 
diff --git a/benchmarks/cold_vs_warm.py b/benchmarks/cold_vs_warm.py
index acb8191..1a44900 100644
--- a/benchmarks/cold_vs_warm.py
+++ b/benchmarks/cold_vs_warm.py
@@ -9,10 +9,11 @@
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import (
+    benchmark_image_for_provider,
+    discover_benchmark_providers,
+)
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
 async def test_cold_startup(provider, provider_name: str, config: SandboxConfig) -> dict:
@@ -162,42 +163,40 @@ async def create_execute_destroy(index: int):
     }
 
 
-async def test_provider_warmup_patterns(provider_class, provider_name: str):
+async def test_provider_warmup_patterns(provider_name: str, display_name: str, provider_class):
     """Test complete warmup patterns for a provider."""
     print(f"\n{'='*80}")
-    print(f"🔬 WARMUP ANALYSIS: {provider_name}")
+    print(f"🔬 WARMUP ANALYSIS: {display_name}")
     print(f"{'='*80}")
 
     try:
         # Initialize fresh provider
         provider = provider_class()
 
-        # Configure for provider
         config = SandboxConfig(labels={"test": "warmup"})
-        if provider_name == "Modal":
-            config.image = "python:3.11-slim"
-        elif provider_name == "Daytona":
-            config.image = "daytonaio/ai-test:0.2.3"
+        runtime_image = benchmark_image_for_provider(provider_name)
+        if runtime_image:
+            config.image = runtime_image
 
         # Test 1: Cold startup
-        cold_results = await test_cold_startup(provider, provider_name, config)
+        cold_results = await test_cold_startup(provider, display_name, config)
 
         # Small delay to ensure cold/warm separation
         await asyncio.sleep(2)
 
         # Test 2: Warm startup sequence
-        warm_results = await test_warm_startup(provider, provider_name, config, iterations=5)
+        warm_results = await test_warm_startup(provider, display_name, config, iterations=5)
 
         # Small delay
         await asyncio.sleep(1)
 
         # Test 3: Concurrent warmup
         concurrent_results = await test_concurrent_warm(
-            provider, provider_name, config, concurrency=3
+            provider, display_name, config, concurrency=3
         )
 
         # Analysis
-        print(f"\n📊 WARMUP ANALYSIS FOR {provider_name}")
+        print(f"\n📊 WARMUP ANALYSIS FOR {display_name}")
         print(f"{'='*60}")
 
         print("\nCold vs Warm Comparison:")
@@ -237,7 +236,7 @@ async def test_provider_warmup_patterns(provider_class, provider_name: str):
         print(f"  Concurrency efficiency: {concurrent_results['efficiency']:.1f}x")
 
         return {
-            "provider": provider_name,
+            "provider": display_name,
             "cold": cold_results,
             "warm": warm_results,
             "concurrent": concurrent_results,
@@ -247,7 +246,7 @@ async def test_provider_warmup_patterns(provider_class, provider_name: str):
         }
 
     except Exception as e:
-        print(f"❌ Error testing {provider_name}: {e}")
+        print(f"❌ Error testing {display_name}: {e}")
         return None
 
 
@@ -257,18 +256,16 @@ async def main():
     print("=" * 80)
     print("Testing startup patterns across providers...")
 
-    providers_to_test = [
-        (ModalProvider, "Modal"),
-        (E2BProvider, "E2B") if os.getenv("E2B_API_KEY") else None,
-        (DaytonaProvider, "Daytona") if os.getenv("DAYTONA_API_KEY") else None,
-    ]
-
-    # Filter out None values
-    providers_to_test = [p for p in providers_to_test if p is not None]
+    providers_to_test = discover_benchmark_providers(include_cloudflare=False)
 
     results = []
-    for provider_class, name in providers_to_test:
-        result = await test_provider_warmup_patterns(provider_class, name)
+    for provider in providers_to_test:
+        provider_class = provider.load_class()
+        result = await test_provider_warmup_patterns(
+            provider.name,
+            provider.display_name,
+            provider_class,
+        )
         if result:
             results.append(result)
 
diff --git a/benchmarks/compare_providers.py b/benchmarks/compare_providers.py
index 352fd86..c95f334 100644
--- a/benchmarks/compare_providers.py
+++ b/benchmarks/compare_providers.py
@@ -10,18 +10,19 @@
 # Add parent directory to path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers
 from sandboxes import SandboxConfig
 
 
-async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict | None:
+async def benchmark_provider(provider_name: str, display_name: str, provider_class, runs: int = 3) -> dict | None:
     """Benchmark a single provider."""
     try:
         provider = provider_class()
         print(f"\n{'='*60}")
-        print(f"📦 Benchmarking {name}")
+        print(f"📦 Benchmarking {display_name}")
         print(f"{'='*60}")
     except Exception as e:
-        print(f"\n❌ {name} not available: {e}")
+        print(f"\n❌ {display_name} not available: {e}")
         return None
 
     create_times = []
@@ -36,11 +37,10 @@ async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict |
         try:
             # Create sandbox
             start = time.time()
-            config = SandboxConfig(labels={"benchmark": f"{name.lower()}_run_{i}"})
-            # Use standardized image for apples-to-apples comparison
-            # daytonaio/ai-test:0.2.3 includes Python 3.13 + numpy + many AI/ML packages
-            if name in ["Modal", "Daytona"]:
-                config.image = "daytonaio/ai-test:0.2.3"
+            config = SandboxConfig(labels={"benchmark": f"{provider_name}_run_{i}"})
+            runtime_image = benchmark_image_for_provider(provider_name)
+            if runtime_image:
+                config.image = runtime_image
 
             sandbox = await provider.create_sandbox(config)
             create_time = (time.time() - start) * 1000
@@ -79,7 +79,7 @@ async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict |
         return None
 
     return {
-        "name": name,
+        "name": display_name,
         "create": {
             "mean": mean(create_times),
             "median": median(create_times),
@@ -114,35 +114,13 @@ async def main():
     print("Testing with 3 runs per provider...")
 
     results = []
+    provider_specs = discover_benchmark_providers(include_cloudflare=False)
 
-    # Test Modal (we know this works)
-    from sandboxes.providers.modal import ModalProvider
-
-    modal_result = await benchmark_provider(ModalProvider, "Modal", runs=3)
-    if modal_result:
-        results.append(modal_result)
-
-    # Try E2B if available
-    if os.getenv("E2B_API_KEY"):
-        try:
-            from sandboxes.providers.e2b import E2BProvider
-
-            e2b_result = await benchmark_provider(E2BProvider, "E2B", runs=3)
-            if e2b_result:
-                results.append(e2b_result)
-        except Exception as e:
-            print(f"\n❌ E2B error: {e}")
-
-    # Try Daytona if available
-    if os.getenv("DAYTONA_API_KEY"):
-        try:
-            from sandboxes.providers.daytona import DaytonaProvider
-
-            daytona_result = await benchmark_provider(DaytonaProvider, "Daytona", runs=3)
-            if daytona_result:
-                results.append(daytona_result)
-        except Exception as e:
-            print(f"\n❌ Daytona error: {e}")
+    for provider in provider_specs:
+        provider_class = provider.load_class()
+        result = await benchmark_provider(provider.name, provider.display_name, provider_class, runs=3)
+        if result:
+            results.append(result)
 
     # Display comparison table
     print("\n" + "=" * 80)
diff --git a/benchmarks/comprehensive_benchmark.py b/benchmarks/comprehensive_benchmark.py
index f25d592..e98dd5b 100644
--- a/benchmarks/comprehensive_benchmark.py
+++ b/benchmarks/comprehensive_benchmark.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Comprehensive benchmark for sandboxes library.
-Tests E2B, Modal, and Daytona providers with various realistic workloads.
+Tests configured providers with realistic workloads.
 
 Features:
 - Multiple test scenarios (Hello World, compute, I/O, package install)
@@ -15,7 +15,6 @@
 """
 
 import asyncio
-import os
 import sys
 import time
 from pathlib import Path
@@ -33,15 +32,17 @@
     HAS_TABULATE = False
     print("⚠️  Install tabulate for better output: pip install tabulate")
 
+from benchmarks.provider_matrix import (
+    STANDARD_IMAGE,
+    benchmark_image_for_provider,
+    benchmark_runtime_label,
+    discover_benchmark_providers,
+    e2b_benchmark_template,
+    hopx_benchmark_template,
+    provider_configuration_hints,
+)
 from sandboxes import run
 
-# Standard image for apples-to-apples comparison (Modal/Daytona)
-# This image includes Python 3.13, numpy, requests, and many AI/ML packages
-# E2B uses their "code-interpreter" template (doesn't support arbitrary Docker images)
-# code-interpreter includes Python, npm, Jupyter, numpy, pandas, matplotlib, etc.
-STANDARD_IMAGE = "daytonaio/ai-test:0.2.3"
-
-
 # Test scenarios - from simple to complex
 TESTS = {
     "hello_world": {
@@ -136,13 +137,9 @@ async def benchmark_provider(
             # Use comparable images for fair comparison
             kwargs = {"provider": provider_name}
             if use_standard_image:
-                if provider_name == "e2b":
-                    # E2B uses templates, not Docker images - use their code-interpreter template
-                    # Has Python, npm, Jupyter, and common ML packages (numpy, pandas, etc.)
-                    kwargs["image"] = "code-interpreter"
-                elif provider_name in ["modal", "daytona"]:
-                    # Modal and Daytona can use Docker Hub images
-                    kwargs["image"] = STANDARD_IMAGE
+                runtime_image = benchmark_image_for_provider(provider_name)
+                if runtime_image:
+                    kwargs["image"] = runtime_image
 
             result = await run(command, **kwargs)
             duration = (time.time() - start) * 1000  # Convert to ms
@@ -186,7 +183,10 @@ async def run_benchmarks(providers: list[str], use_standard_image: bool = True):
     print(f"Total tests: {len(TESTS)}")
     if use_standard_image:
         print(f"Modal/Daytona: {STANDARD_IMAGE}")
-        print("E2B: code-interpreter template (Python, npm, Jupyter, ML packages)")
+        if "e2b" in providers:
+            print(f"E2B: {e2b_benchmark_template()} template")
+        if "hopx" in providers:
+            print(f"Hopx: {hopx_benchmark_template()} template")
     print("=" * 80 + "\n")
 
     all_results = []
@@ -340,27 +340,17 @@ async def main():
     # Check which providers are available
     print("Checking available providers...")
 
-    providers_to_test = []
-
-    # Check for API keys
-    if os.getenv("E2B_API_KEY"):
-        providers_to_test.append("e2b")
-        print("✓ E2B configured")
-
-    if os.getenv("MODAL_TOKEN_ID") or Path.home().joinpath(".modal.toml").exists():
-        providers_to_test.append("modal")
-        print("✓ Modal configured")
+    providers = discover_benchmark_providers(include_cloudflare=False)
+    providers_to_test = [provider.name for provider in providers]
 
-    if os.getenv("DAYTONA_API_KEY"):
-        providers_to_test.append("daytona")
-        print("✓ Daytona configured")
+    for provider in providers:
+        print(f"✓ {provider.display_name} configured ({benchmark_runtime_label(provider.name)})")
 
     if not providers_to_test:
         print("\n❌ No providers configured!")
-        print("Set environment variables:")
-        print("  - E2B_API_KEY for E2B")
-        print("  - MODAL_TOKEN_ID for Modal (or run 'modal token set')")
-        print("  - DAYTONA_API_KEY for Daytona")
+        print("Configure at least one provider:")
+        for hint in provider_configuration_hints(include_cloudflare=False):
+            print(f"  {hint}")
         return
 
     # Run benchmarks
diff --git a/benchmarks/image_reuse.py b/benchmarks/image_reuse.py
index 9f0936c..e85b4a5 100644
--- a/benchmarks/image_reuse.py
+++ b/benchmarks/image_reuse.py
@@ -9,10 +9,11 @@
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import (
+    benchmark_image_for_provider,
+    discover_benchmark_providers,
+)
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
 async def test_same_image_reuse(
@@ -183,58 +184,54 @@ async def create_test_destroy(index: int):
     }
 
 
-async def test_provider_image_patterns(provider_class, provider_name: str):
+async def test_provider_image_patterns(provider_name: str, display_name: str, provider_class):
     """Test image reuse patterns for a provider."""
     print(f"\n{'='*80}")
-    print(f"🖼️  IMAGE REUSE ANALYSIS: {provider_name}")
+    print(f"🖼️  IMAGE REUSE ANALYSIS: {display_name}")
     print(f"{'='*80}")
 
     try:
         provider = provider_class()
 
         # Provider-specific image configs
-        if provider_name == "Modal":
-            primary_image = "python:3.11-slim"
+        if provider_name == "modal":
+            primary_image = benchmark_image_for_provider(provider_name)
             test_images = [
                 "python:3.11-slim",
                 "python:3.12-slim",
                 "python:3.10-slim",
                 "ubuntu:22.04",
             ]
-        elif provider_name == "E2B":
-            # E2B uses templates, not Docker images
-            primary_image = None  # Default template
-            test_images = [None]  # Only default for now
-        elif provider_name == "Daytona":
-            primary_image = "daytonaio/ai-test:0.2.3"
-            test_images = ["daytonaio/ai-test:0.2.3"]
+        elif provider_name in {"e2b", "daytona", "hopx"}:
+            primary_image = benchmark_image_for_provider(provider_name)
+            test_images = [primary_image] if primary_image else []
         else:
             return None
 
         results = {}
 
         # Test 1: Same image reuse
-        if primary_image is not None:
+        if primary_image:
             same_image_results = await test_same_image_reuse(
-                provider, provider_name, primary_image, iterations=5
+                provider, display_name, primary_image, iterations=5
             )
             results["same_image"] = same_image_results
 
             # Test 2: Concurrent same image
             concurrent_results = await test_concurrent_same_image(
-                provider, provider_name, primary_image, concurrency=3
+                provider, display_name, primary_image, concurrency=3
             )
             results["concurrent_same"] = concurrent_results
 
         # Test 3: Different images (Modal only for now)
-        if provider_name == "Modal":
+        if provider_name == "modal":
             different_images_results = await test_different_images(
-                provider, provider_name, test_images
+                provider, display_name, test_images
             )
             results["different_images"] = different_images_results
 
         # Analysis
-        print(f"\n📊 IMAGE REUSE ANALYSIS FOR {provider_name}")
+        print(f"\n📊 IMAGE REUSE ANALYSIS FOR {display_name}")
         print(f"{'='*60}")
 
         if "same_image" in results:
@@ -286,7 +283,7 @@ async def test_provider_image_patterns(provider_class, provider_name: str):
         return results
 
     except Exception as e:
-        print(f"❌ Error testing {provider_name}: {e}")
+        print(f"❌ Error testing {display_name}: {e}")
         return None
 
 
@@ -296,20 +293,21 @@ async def main():
     print("=" * 80)
     print("Testing image caching and reuse patterns...")
 
-    providers_to_test = [
-        (ModalProvider, "Modal"),
-        (E2BProvider, "E2B") if os.getenv("E2B_API_KEY") else None,
-        (DaytonaProvider, "Daytona") if os.getenv("DAYTONA_API_KEY") else None,
-    ]
-
-    # Filter out None values
-    providers_to_test = [p for p in providers_to_test if p is not None]
+    providers_to_test = discover_benchmark_providers(
+        include_cloudflare=False,
+        image_only=True,
+    )
 
     all_results = []
-    for provider_class, name in providers_to_test:
-        result = await test_provider_image_patterns(provider_class, name)
+    for provider in providers_to_test:
+        provider_class = provider.load_class()
+        result = await test_provider_image_patterns(
+            provider.name,
+            provider.display_name,
+            provider_class,
+        )
         if result:
-            all_results.append((name, result))
+            all_results.append((provider.display_name, result))
 
         # Delay between providers
         await asyncio.sleep(3)
diff --git a/benchmarks/provider_matrix.py b/benchmarks/provider_matrix.py
new file mode 100644
index 0000000..e92f18a
--- /dev/null
+++ b/benchmarks/provider_matrix.py
@@ -0,0 +1,231 @@
+"""Shared provider discovery and runtime hints for benchmark scripts."""
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+
+STANDARD_IMAGE = os.getenv("BENCHMARK_STANDARD_IMAGE", "daytonaio/ai-test:0.2.3")
+_E2B_TEMPLATE_ID_RE = re.compile(r'^template_id\s*=\s*"([^"]+)"\s*$', re.MULTILINE)
+
+
+@dataclass(frozen=True)
+class BenchmarkProvider:
+    """Benchmark provider metadata used by benchmark scripts."""
+
+    name: str  # lookup key, e.g. "daytona"; also indexes PROVIDER_CONFIGURATION_HINTS
+    display_name: str  # human-readable name for printed output
+    is_configured: Callable[[], bool]  # reports whether the provider looks configured
+    load_class: Callable[[], type]  # imports the provider class only when called
+    supports_image_benchmark: bool = True  # False drops the provider when image_only=True
+
+
+def _vercel_token() -> str | None:
+    return (
+        os.getenv("VERCEL_TOKEN")
+        or os.getenv("VERCEL_API_TOKEN")
+        or os.getenv("VERCEL_ACCESS_TOKEN")
+        or os.getenv("VERCEL_OIDC_TOKEN")
+    )
+
+
+def _has_daytona() -> bool:
+    return bool(os.getenv("DAYTONA_API_KEY"))
+
+
+def _has_e2b() -> bool:
+    return bool(os.getenv("E2B_API_KEY"))
+
+
+def _has_sprites() -> bool:
+    return bool(os.getenv("SPRITES_TOKEN") or shutil.which("sprite"))
+
+
+def _has_hopx() -> bool:
+    return bool(os.getenv("HOPX_API_KEY"))
+
+
+def _has_vercel() -> bool:
+    return bool(_vercel_token() and os.getenv("VERCEL_PROJECT_ID") and os.getenv("VERCEL_TEAM_ID"))
+
+
+def _has_modal() -> bool:
+    return bool(os.getenv("MODAL_TOKEN_ID") or Path.home().joinpath(".modal.toml").exists())
+
+
+def _has_cloudflare() -> bool:
+    return bool(os.getenv("CLOUDFLARE_SANDBOX_BASE_URL") and os.getenv("CLOUDFLARE_API_TOKEN"))
+
+
+def _load_daytona_provider():
+    from sandboxes.providers.daytona import DaytonaProvider
+
+    return DaytonaProvider
+
+
+def _load_e2b_provider():
+    from sandboxes.providers.e2b import E2BProvider
+
+    return E2BProvider
+
+
+def _load_sprites_provider():
+    from sandboxes.providers.sprites import SpritesProvider
+
+    return SpritesProvider
+
+
+def _load_hopx_provider():
+    from sandboxes.providers.hopx import HopxProvider
+
+    return HopxProvider
+
+
+def _load_vercel_provider():
+    from sandboxes.providers.vercel import VercelProvider
+
+    return VercelProvider
+
+
+def _load_modal_provider():
+    from sandboxes.providers.modal import ModalProvider
+
+    return ModalProvider
+
+
+def _load_cloudflare_provider():
+    from sandboxes.providers.cloudflare import CloudflareProvider
+
+    return CloudflareProvider
+
+
+PROVIDERS: tuple[BenchmarkProvider, ...] = (
+    BenchmarkProvider("daytona", "Daytona", _has_daytona, _load_daytona_provider),
+    BenchmarkProvider("e2b", "E2B", _has_e2b, _load_e2b_provider),
+    BenchmarkProvider(
+        "sprites",
+        "Sprites",
+        _has_sprites,
+        _load_sprites_provider,
+        supports_image_benchmark=False,
+    ),
+    BenchmarkProvider("hopx", "Hopx", _has_hopx, _load_hopx_provider),
+    BenchmarkProvider(
+        "vercel",
+        "Vercel",
+        _has_vercel,
+        _load_vercel_provider,
+        supports_image_benchmark=False,
+    ),
+    BenchmarkProvider("modal", "Modal", _has_modal, _load_modal_provider),
+    BenchmarkProvider(
+        "cloudflare",
+        "Cloudflare",
+        _has_cloudflare,
+        _load_cloudflare_provider,
+        supports_image_benchmark=False,
+    ),
+)
+
+PROVIDER_CONFIGURATION_HINTS: dict[str, str] = {
+    "daytona": "DAYTONA_API_KEY",
+    "e2b": "E2B_API_KEY",
+    "sprites": "SPRITES_TOKEN or sprite CLI login",
+    "hopx": "HOPX_API_KEY",
+    "vercel": "VERCEL_TOKEN + VERCEL_PROJECT_ID + VERCEL_TEAM_ID",
+    "modal": "~/.modal.toml or MODAL_TOKEN_ID",
+    "cloudflare": "CLOUDFLARE_SANDBOX_BASE_URL + CLOUDFLARE_API_TOKEN",
+}
+
+
+def e2b_benchmark_template() -> str:
+    """Return the E2B template ID for benchmarks: env override, repo e2b.toml, then default."""
+    configured = os.getenv("E2B_BENCHMARK_TEMPLATE")
+    if configured:
+        return configured
+
+    # Prefer repository template when available to keep benchmark runtime stable.
+    e2b_toml = Path(__file__).parent / "e2b-daytona-benchmark" / "e2b.toml"
+    try:
+        contents = e2b_toml.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        contents = ""  # missing/unreadable metadata: fall through to the default
+    match = _E2B_TEMPLATE_ID_RE.search(contents)
+    if match:
+        return match.group(1)
+
+    # Fallback for environments without repository template metadata.
+    return "code-interpreter"
+
+
+def hopx_benchmark_template() -> str:
+    """Return the Hopx template: HOPX_BENCHMARK_TEMPLATE if set, else "code-interpreter"."""
+    return os.getenv("HOPX_BENCHMARK_TEMPLATE", "code-interpreter")
+
+
+def benchmark_image_for_provider(provider_name: str) -> str | None:
+    """Return the image (Modal/Daytona) or template (E2B/Hopx) to benchmark with, else None."""
+    normalized = provider_name.lower()
+    if normalized in {"modal", "daytona"}:
+        return STANDARD_IMAGE
+    if normalized == "e2b":
+        return e2b_benchmark_template()
+    if normalized == "hopx":
+        return hopx_benchmark_template()
+    return None
+
+
+def benchmark_runtime_label(provider_name: str) -> str:
+    """Return "template=..." (E2B/Hopx), "image=...", or "provider-default runtime" for output."""
+    runtime = benchmark_image_for_provider(provider_name)
+    if runtime is None:
+        return "provider-default runtime"
+    if provider_name.lower() in {"e2b", "hopx"}:
+        return f"template={runtime}"
+    return f"image={runtime}"
+
+
+def discover_benchmark_providers(
+    *,
+    include_cloudflare: bool = False,
+    image_only: bool = False,
+) -> list[BenchmarkProvider]:
+    """Return configured providers in PROVIDERS order, after the cloudflare/image-only filters."""
+    discovered: list[BenchmarkProvider] = []
+    for provider in PROVIDERS:
+        if provider.name == "cloudflare" and not include_cloudflare:
+            continue  # Cloudflare is opt-in via include_cloudflare
+        if image_only and not provider.supports_image_benchmark:
+            continue  # skipped for image-only benchmark runs
+        if provider.is_configured():
+            discovered.append(provider)
+    return discovered
+
+
+def discover_provider_names(
+    *,
+    include_cloudflare: bool = False,
+    image_only: bool = False,
+) -> list[str]:
+    """Return the `name` of each provider from discover_benchmark_providers (same filters)."""
+    return [
+        provider.name
+        for provider in discover_benchmark_providers(
+            include_cloudflare=include_cloudflare,
+            image_only=image_only,
+        )
+    ]
+
+
+def provider_configuration_hints(*, include_cloudflare: bool = False) -> list[str]:
+    """Return one "- <display name>: <hint>" line per provider for setup messaging."""
+    hints = []
+    for provider in PROVIDERS:
+        if provider.name == "cloudflare" and not include_cloudflare:
+            continue  # Cloudflare is listed only when explicitly requested
+        hints.append(f"- {provider.display_name}: {PROVIDER_CONFIGURATION_HINTS[provider.name]}")
+    return hints
diff --git a/benchmarks/run_all_benchmarks.py b/benchmarks/run_all_benchmarks.py
index 59af011..9d05636 100644
--- a/benchmarks/run_all_benchmarks.py
+++ b/benchmarks/run_all_benchmarks.py
@@ -2,12 +2,13 @@
 """
 Meta-benchmark runner that executes all benchmarks multiple times.
 
-Runs each benchmark 10 times, aggregates results, and calculates statistics
-including p50, p95, and p99 percentiles.
+Runs each benchmark according to the per-suite `runs` configuration, aggregates
+results, and calculates statistics including p50, p95, and p99 percentiles.
 
 Outputs comprehensive results to benchmarks/results.txt
 """
 
+import os
 import subprocess
 import sys
 import time
@@ -59,6 +60,8 @@
     },
 ]
 
+BENCHMARK_SCRIPT_TIMEOUT_SECONDS = int(os.getenv("BENCHMARK_SCRIPT_TIMEOUT_SECONDS", "1800"))
+
 
 def calculate_percentiles(data: list[float]) -> dict[str, float]:
     """Calculate p50, p95, p99 percentiles."""
@@ -98,7 +101,7 @@ def run_benchmark(script: str, run_number: int) -> dict[str, Any]:
             [sys.executable, str(script_path)],
             capture_output=True,
             text=True,
-            timeout=600,  # 10 minute timeout
+            timeout=BENCHMARK_SCRIPT_TIMEOUT_SECONDS,
         )
         duration = time.time() - start
 
@@ -121,7 +124,7 @@ def run_benchmark(script: str, run_number: int) -> dict[str, Any]:
             "success": False,
             "duration_seconds": duration,
             "stdout": "",
-            "stderr": "Benchmark timed out after 10 minutes",
+            "stderr": f"Benchmark timed out after {BENCHMARK_SCRIPT_TIMEOUT_SECONDS} seconds",
             "exit_code": -1,
         }
     except Exception as e:
diff --git a/benchmarks/simple_benchmark.py b/benchmarks/simple_benchmark.py
index add3be4..806261b 100644
--- a/benchmarks/simple_benchmark.py
+++ b/benchmarks/simple_benchmark.py
@@ -1,103 +1,146 @@
 #!/usr/bin/env python
-"""Simple benchmark for Modal provider."""
+"""Quick benchmark smoke test for configured providers."""
 
 import asyncio
 import os
 import sys
 import time
-from statistics import mean, median, stdev
+from statistics import mean, median
 
 # Add parent directory to path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers
 from sandboxes import SandboxConfig
-from sandboxes.providers.modal import ModalProvider
 
 
-async def benchmark_modal(runs=5):
-    """Benchmark Modal provider operations."""
-    provider = ModalProvider()
+async def benchmark_provider(provider_name: str, display_name: str, provider_class, runs: int = 3) -> dict | None:
+    """Run a quick create/exec/destroy smoke benchmark for a provider.
+
+    A sandbox left alive by a failed run is destroyed on a best-effort basis so
+    failures do not leak provider resources.
+
+    Args:
+        provider_name: Provider key used for labels and the runtime image lookup.
+        display_name: Human-readable provider name used in printed output.
+        provider_class: Provider class; instantiated with no arguments.
+        runs: Number of create/execute/destroy iterations to attempt.
+
+    Returns:
+        Dict with the display name, completed-run count and median/mean timings
+        in milliseconds, or None if the provider cannot be instantiated or no
+        run completed.
+    """
+    try:
+        provider = provider_class()
+    except Exception as e:
+        print(f"\n❌ {display_name} initialization failed: {e}")
+        return None
 
     create_times = []
     execute_times = []
     destroy_times = []
     total_times = []
 
-    print(f"\n🔬 Running Modal Benchmark ({runs} iterations)")
+    print(f"\n🔬 Running {display_name} benchmark ({runs} iterations)")
     print("=" * 60)
 
     for i in range(runs):
-        print(f"\nRun {i+1}/{runs}:")
-
-        # Total operation time
         total_start = time.time()
+        # Track the live sandbox so a failed run can still clean it up.
+        sandbox = None
+        try:
+            config = SandboxConfig(labels={"benchmark": "simple", "provider": provider_name, "run": str(i)})
+            runtime_image = benchmark_image_for_provider(provider_name)
+            if runtime_image:
+                config.image = runtime_image
+
+            start = time.time()
+            sandbox = await provider.create_sandbox(config)
+            create_time = (time.time() - start) * 1000
+            create_times.append(create_time)
+
+            start = time.time()
+            result = await provider.execute_command(
+                sandbox.id,
+                'python3 -c \'import sys; print(sys.version.split()[0])\'',
+            )
+            execute_time = (time.time() - start) * 1000
+            execute_times.append(execute_time)
+
+            start = time.time()
+            await provider.destroy_sandbox(sandbox.id)
+            destroy_time = (time.time() - start) * 1000
+            destroy_times.append(destroy_time)
+            sandbox = None  # already destroyed; nothing left to clean up
+
+            total_time = (time.time() - total_start) * 1000
+            total_times.append(total_time)
+
+            icon = "✅" if result.success else "❌"
+            print(
+                f"Run {i+1}: {icon} Create={create_time:.0f}ms "
+                f"Execute={execute_time:.0f}ms Destroy={destroy_time:.0f}ms Total={total_time:.0f}ms"
+            )
+        except Exception as e:
+            print(f"Run {i+1}: ❌ Failed - {str(e)[:100]}")
+            if sandbox is not None:
+                # Best-effort cleanup: a failure after creation must not leak the sandbox.
+                try:
+                    await provider.destroy_sandbox(sandbox.id)
+                except Exception as cleanup_error:
+                    print(f"Run {i+1}: ❌ Cleanup failed - {str(cleanup_error)[:100]}")
 
-        # Create sandbox
-        start = time.time()
-        config = SandboxConfig(labels={"benchmark": f"run_{i}", "test": "modal_perf"})
-        sandbox = await provider.create_sandbox(config)
-        create_time = (time.time() - start) * 1000
-        create_times.append(create_time)
-        print(f"  ✅ Create: {create_time:.2f}ms - {sandbox.id}")
-
-        # Execute command
-        start = time.time()
-        result = await provider.execute_command(
-            sandbox.id,
-            'python3 -c \'import sys; print(f"Python {sys.version}"); print("Benchmark test complete")\'',
-        )
-        execute_time = (time.time() - start) * 1000
-        execute_times.append(execute_time)
-        print(f"  ✅ Execute: {execute_time:.2f}ms - Success: {result.success}")
-
-        # Destroy sandbox
-        start = time.time()
-        await provider.destroy_sandbox(sandbox.id)
-        destroy_time = (time.time() - start) * 1000
-        destroy_times.append(destroy_time)
-        print(f"  ✅ Destroy: {destroy_time:.2f}ms")
-
-        total_time = (time.time() - total_start) * 1000
-        total_times.append(total_time)
-        print(f"  ⏱️ Total: {total_time:.2f}ms")
-
-        # Small delay between runs
         if i < runs - 1:
-            await asyncio.sleep(1)
-
-    print("\n" + "=" * 60)
-    print("📊 RESULTS SUMMARY")
-    print("=" * 60)
+            await asyncio.sleep(0.2)
 
-    # Calculate statistics
-    def print_stats(name, times):
-        if len(times) > 1:
-            print(f"\n{name}:")
-            print(f"  Mean:   {mean(times):.2f}ms")
-            print(f"  Median: {median(times):.2f}ms")
-            print(f"  Min:    {min(times):.2f}ms")
-            print(f"  Max:    {max(times):.2f}ms")
-            if len(times) > 2:
-                print(f"  StdDev: {stdev(times):.2f}ms")
-        else:
-            print(f"\n{name}: {times[0]:.2f}ms")
-
-    print_stats("CREATE SANDBOX", create_times)
-    print_stats("EXECUTE COMMAND", execute_times)
-    print_stats("DESTROY SANDBOX", destroy_times)
-    print_stats("TOTAL OPERATION", total_times)
-
-    print("\n" + "=" * 60)
-    print(f"🎯 AVERAGE THROUGHPUT: {1000 / mean(total_times):.2f} ops/sec")
-    print("=" * 60)
+    if not total_times:
+        return None
 
     return {
-        "create": {"times": create_times, "mean": mean(create_times)},
-        "execute": {"times": execute_times, "mean": mean(execute_times)},
-        "destroy": {"times": destroy_times, "mean": mean(destroy_times)},
-        "total": {"times": total_times, "mean": mean(total_times)},
+        "provider": display_name,
+        "runs": len(total_times),
+        "create_median": median(create_times),
+        "execute_median": median(execute_times),
+        "destroy_median": median(destroy_times),
+        "total_mean": mean(total_times),
+        "total_median": median(total_times),
     }
 
 
+async def main():
+    """Run quick benchmark for configured providers."""
+    providers = discover_benchmark_providers(include_cloudflare=False)
+    if not providers:
+        print("❌ No configured providers found.")
+        return
+
+    results = []
+    for provider in providers:
+        provider_class = provider.load_class()
+        result = await benchmark_provider(provider.name, provider.display_name, provider_class, runs=3)
+        if result:
+            results.append(result)
+
+    if not results:
+        print("\n❌ No successful provider runs.")
+        return
+
+    print("\n" + "=" * 80)
+    print("QUICK BENCHMARK SUMMARY")
+    print("=" * 80)
+    print(f"{'Provider':<12} {'Runs':<6} {'Create':<10} {'Execute':<10} {'Destroy':<10} {'Total':<10}")
+    print("-" * 80)
+    for result in sorted(results, key=lambda r: r["total_median"]):
+        print(
+            f"{result['provider']:<12} "
+            f"{result['runs']:<6} "
+            f"{result['create_median']:<10.0f} "
+            f"{result['execute_median']:<10.0f} "
+            f"{result['destroy_median']:<10.0f} "
+            f"{result['total_median']:<10.0f}"
+        )
+
+
 if __name__ == "__main__":
-    results = asyncio.run(benchmark_modal(5))
+    asyncio.run(main())