cased · tnm · Feb 19, 2026 · Feb 19, 2026
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -4,18 +4,21 @@ Comprehensive benchmark suite for comparing sandbox provider performance.
 
 ## Available Benchmarks
 
-### 🎯 comprehensive_benchmark.py (RECOMMENDED)
+### comprehensive_benchmark.py (RECOMMENDED)
 **Apples-to-apples comparison with realistic workloads**
 
-Tests all providers with diverse scenarios:
+Tests all configured providers with diverse scenarios:
 - Hello World (shell execution)
 - Prime Calculation (CPU-bound)
 - File I/O (1000 files)
 - Package Install (pip)
 - NumPy FFT (numerical computation)
 
 **Features:**
-- Uses standardized image (`daytonaio/ai-test:0.2.3`) for Modal and Daytona
+- Uses standardized runtime hints per provider:
+  - Modal/Daytona: standardized Docker image
+  - E2B/Hopx: configurable template IDs
+  - Sprites/Vercel: provider defaults
 - Multiple runs with statistical analysis (mean, stddev, min, max)
 - Detailed error reporting
 - Winner tracking across all tests
@@ -29,7 +32,7 @@ python benchmarks/comprehensive_benchmark.py
 
 ---
 
-### 📊 compare_providers.py
+### compare_providers.py
 **Lifecycle breakdown (create/execute/destroy)**
 
 Tests basic sandbox operations with detailed timing for each phase:
@@ -47,10 +50,10 @@ python benchmarks/compare_providers.py
 
 ---
 
-### ⚡ simple_benchmark.py
+### simple_benchmark.py
 **Quick smoke test**
 
-Fast basic test to verify providers are working.
+Fast create/exec/destroy verification across all configured providers.
 
 **Usage:**
 ```bash
@@ -59,7 +62,7 @@ python benchmarks/simple_benchmark.py
 
 ---
 
-### 🔥 benchmark_20x.py
+### benchmark_20x.py
 **Concurrent execution test**
 
 Tests 20 concurrent sandbox operations to measure parallelism and throughput.
@@ -71,7 +74,7 @@ python benchmarks/benchmark_20x.py
 
 ---
 
-### ❄️ cold_vs_warm.py
+### cold_vs_warm.py
 **Cold start analysis**
 
 Compares cold start (first run) vs warm start (subsequent runs) performance.
@@ -83,10 +86,11 @@ python benchmarks/cold_vs_warm.py
 
 ---
 
-### 🖼️ image_reuse.py
+### image_reuse.py
 **Image caching test**
 
-Tests how providers handle image reuse and caching.
+Tests how providers that support explicit image/template runtime configuration
+handle reuse and caching (currently Modal, Daytona, E2B, and Hopx).
 
 **Usage:**
 ```bash
@@ -99,9 +103,12 @@ python benchmarks/image_reuse.py
 
 All benchmarks auto-detect available providers based on environment variables:
 
+- **Daytona**: Set `DAYTONA_API_KEY`
 - **E2B**: Set `E2B_API_KEY`
+- **Sprites**: Set `SPRITES_TOKEN` or run `sprite login`
+- **Hopx**: Set `HOPX_API_KEY`
+- **Vercel**: Set `VERCEL_TOKEN`, `VERCEL_PROJECT_ID`, and `VERCEL_TEAM_ID`
 - **Modal**: Run `modal token set` or set `MODAL_TOKEN_ID`
-- **Daytona**: Set `DAYTONA_API_KEY`
 
 ## Standard Image
 
@@ -111,18 +118,29 @@ For apples-to-apples comparison, benchmarks use comparable environments:
   - Python 3.13, numpy, requests, anthropic, cohere, beautifulsoup4, and many AI/ML packages
   - Both providers support arbitrary Docker images
 
-- **E2B**: `code-interpreter` template
+- **E2B**: `code-interpreter` template by default
   - Python, npm, Jupyter, and common ML packages (numpy, pandas, matplotlib, etc.)
   - E2B uses templates instead of Docker images
-  - Custom templates supported via `config.image` or `config.provider_config["template"]`
+  - Template resolution order: `E2B_BENCHMARK_TEMPLATE` (if set), then the template ID in `benchmarks/e2b-daytona-benchmark/e2b.toml`, then `code-interpreter`
+  - Override with `E2B_BENCHMARK_TEMPLATE`
+  - If you see `Template is not compatible with secured access`, set `E2B_BENCHMARK_TEMPLATE` to a secured-access compatible template ID
+
+- **Hopx**: `code-interpreter` template by default
+  - Override with `HOPX_BENCHMARK_TEMPLATE`
+
+- **Sprites/Vercel**:
+  - Benchmarks use provider defaults for runtime/image behavior
+
+Cloudflare provider benchmarks are intentionally excluded by default.
 
 ## Contributing
 
 When adding new benchmarks:
-1. Use the standardized image for Modal/Daytona
-2. Include statistical analysis (mean, stddev)
-3. Add error handling and detailed reporting
-4. Update this README
+1. Keep provider discovery centralized (see `benchmarks/provider_matrix.py`)
+2. Use standardized runtime hints for fair comparisons where possible
+3. Include statistical analysis (mean, stddev)
+4. Add error handling and detailed reporting
+5. Update this README
 
 ## License
 

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
@@ -14,10 +14,12 @@
 # Add parent directory to path for imports
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import (
+    benchmark_image_for_provider,
+    benchmark_runtime_label,
+    discover_benchmark_providers,
+)
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
 @dataclass
@@ -63,43 +65,24 @@ def __init__(self, iterations: int = 5):
 
     def _init_providers(self):
         """Initialize available providers."""
-        # E2B
-        if os.getenv("E2B_API_KEY"):
-            try:
-                self.providers["e2b"] = E2BProvider()
-                print("✅ E2B provider initialized")
-            except Exception as e:
-                print(f"❌ E2B provider failed: {e}")
-
-        # Daytona
-        if os.getenv("DAYTONA_API_KEY"):
-            try:
-                self.providers["daytona"] = DaytonaProvider()
-                print("✅ Daytona provider initialized")
-            except Exception as e:
-                print(f"❌ Daytona provider failed: {e}")
+        configured_providers = discover_benchmark_providers(include_cloudflare=False)
 
-        # Modal
-        if os.path.exists(os.path.expanduser("~/.modal.toml")):
+        for provider in configured_providers:
             try:
-                self.providers["modal"] = ModalProvider()
-                print("✅ Modal provider initialized")
+                provider_class = provider.load_class()
+                self.providers[provider.name] = provider_class()
+                runtime = benchmark_runtime_label(provider.name)
+                print(f"✅ {provider.display_name} provider initialized ({runtime})")
             except Exception as e:
-                print(f"❌ Modal provider failed: {e}")
+                print(f"❌ {provider.display_name} provider failed: {e}")
 
     async def benchmark_create_sandbox(self, provider_name: str) -> list[BenchmarkResult]:
         """Benchmark sandbox creation."""
         provider = self.providers[provider_name]
         results = []
 
         for i in range(self.iterations):
-            # Use Daytona image for E2B and Daytona providers
-            image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-            if provider_name == "e2b":
-                # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-                # To build your own: cd benchmarks/e2b-daytona-benchmark && e2b template build
-                # Then update this ID with the one from benchmarks/e2b-daytona-benchmark/e2b.toml
-                image = "5x6hvr4zwye07thwhpkd"
+            image = benchmark_image_for_provider(provider_name)
 
             config = SandboxConfig(
                 labels={"benchmark": "create", "iteration": str(i)},
@@ -144,12 +127,7 @@ async def benchmark_execute_command(self, provider_name: str) -> list[BenchmarkR
         provider = self.providers[provider_name]
         results = []
 
-        # Use Daytona image for E2B and Daytona providers
-        image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-        if provider_name == "e2b":
-            # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-            # See benchmarks/e2b-daytona-benchmark/README.md for building your own
-            image = "5x6hvr4zwye07thwhpkd"
+        image = benchmark_image_for_provider(provider_name)
 
         # Create one sandbox for all iterations
         config = SandboxConfig(
@@ -212,12 +190,7 @@ async def benchmark_reuse_sandbox(self, provider_name: str) -> list[BenchmarkRes
         provider = self.providers[provider_name]
         results = []
 
-        # Use Daytona image for E2B and Daytona providers
-        image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None
-        if provider_name == "e2b":
-            # E2B uses template ID (template built from daytonaio/ai-test:0.2.3)
-            # See benchmarks/e2b-daytona-benchmark/README.md for building your own
-            image = "5x6hvr4zwye07thwhpkd"
+        image = benchmark_image_for_provider(provider_name)
 
         labels = {"benchmark": "reuse", "session": "test123"}
         config = SandboxConfig(labels=labels, timeout_seconds=120, image=image)
@@ -478,6 +451,10 @@ async def run(self) -> dict[str, ProviderMetrics]:
         print(f"   Iterations per test: {self.iterations}")
         print(f"   Providers: {', '.join(self.providers.keys())}")
 
+        if not self.providers:
+            print("❌ No configured providers found.")
+            return {}
+
         # Run benchmarks for each provider
         all_results = []
         for provider_name in self.providers:

diff --git a/benchmarks/benchmark_20x.py b/benchmarks/benchmark_20x.py
@@ -9,16 +9,14 @@
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers
 from sandboxes import SandboxConfig
-from sandboxes.providers.daytona import DaytonaProvider
-from sandboxes.providers.e2b import E2BProvider
-from sandboxes.providers.modal import ModalProvider
 
 
-async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
+async def verify_and_benchmark(provider_name: str, display_name: str, provider_class, runs: int = 20):
     """Benchmark provider with verification."""
     print(f"\n{'='*80}")
-    print(f"4{name} - {runs} ITERATIONS")
+    print(f"{display_name} - {runs} ITERATIONS")
     print(f"{'='*80}")
 
     try:
@@ -61,9 +59,10 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
         try:
             # Create sandbox
             start = time.time()
-            config = SandboxConfig(labels={"benchmark": f"{name.lower()}_20x", "run": str(i + 1)})
-            if name == "Modal":
-                config.provider_config = {"image": "python:3.11-slim"}
+            config = SandboxConfig(labels={"benchmark": f"{provider_name}_20x", "run": str(i + 1)})
+            runtime_image = benchmark_image_for_provider(provider_name)
+            if runtime_image:
+                config.image = runtime_image
 
             sandbox = await provider.create_sandbox(config)
             create_time = (time.time() - start) * 1000
@@ -111,11 +110,11 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20):
         print("   Could not verify final count")
 
     if not create_times:
-        print(f"\n❌ All runs failed for {name}")
+        print(f"\n❌ All runs failed for {display_name}")
         return None
 
     # Calculate comprehensive statistics
-    print(f"\n📈 STATISTICS FOR {name} ({len(create_times)}/{runs} successful)")
+    print(f"\n📈 STATISTICS FOR {display_name} ({len(create_times)}/{runs} successful)")
     print("=" * 60)
 
     def print_detailed_stats(name, times):
@@ -147,7 +146,7 @@ def print_detailed_stats(name, times):
     print(f"  Sample IDs: {created_ids[:3] if created_ids else 'None'}")
 
     return {
-        "name": name,
+        "name": display_name,
         "runs": runs,
         "successful": len(create_times),
         "failed": failed_runs,
@@ -163,25 +162,26 @@ async def main():
     """Run 20-iteration benchmark for all providers."""
     print("🔬 COMPREHENSIVE BENCHMARK - 20 ITERATIONS PER PROVIDER")
     print("=" * 80)
-    print("This will create and destroy 60 sandboxes total.")
-    print("Estimated time: 3-5 minutes")
+    provider_specs = discover_benchmark_providers(include_cloudflare=False)
+    estimated_sandboxes = len(provider_specs) * 20
+    print(f"This will create and destroy up to {estimated_sandboxes} sandboxes total.")
+    print("Estimated time: provider-dependent")
 
     results = []
 
+    if not provider_specs:
+        print("\n❌ No configured providers found.")
+        return
+
     # Test each provider
-    for provider_class, name in [
-        (ModalProvider, "Modal"),
-        (E2BProvider, "E2B"),
-        (DaytonaProvider, "Daytona"),
-    ]:
-        if name == "E2B" and not os.getenv("E2B_API_KEY"):
-            print(f"\n⚠️ Skipping {name} - no API key")
-            continue
-        if name == "Daytona" and not os.getenv("DAYTONA_API_KEY"):
-            print(f"\n⚠️ Skipping {name} - no API key")
-            continue
-
-        result = await verify_and_benchmark(provider_class, name, runs=20)
+    for provider in provider_specs:
+        provider_class = provider.load_class()
+        result = await verify_and_benchmark(
+            provider.name,
+            provider.display_name,
+            provider_class,
+            runs=20,
+        )
         if result:
             results.append(result)