From 9d13ca7130fc9473c3c38b9a774331d4bdc43547 Mon Sep 17 00:00:00 2001 From: tnm Date: Wed, 18 Feb 2026 18:22:56 -0800 Subject: [PATCH] Update benchmark provider coverage and docs --- benchmarks/README.md | 52 ++++-- benchmarks/benchmark.py | 61 +++---- benchmarks/benchmark_20x.py | 52 +++--- benchmarks/cold_vs_warm.py | 49 +++--- benchmarks/compare_providers.py | 52 ++---- benchmarks/comprehensive_benchmark.py | 58 +++---- benchmarks/image_reuse.py | 62 ++++--- benchmarks/provider_matrix.py | 231 ++++++++++++++++++++++++++ benchmarks/run_all_benchmarks.py | 11 +- benchmarks/simple_benchmark.py | 161 ++++++++++-------- 10 files changed, 500 insertions(+), 289 deletions(-) create mode 100644 benchmarks/provider_matrix.py diff --git a/benchmarks/README.md b/benchmarks/README.md index c4fa105..019222c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,10 +4,10 @@ Comprehensive benchmark suite for comparing sandbox provider performance. ## Available Benchmarks -### šŸŽÆ comprehensive_benchmark.py (RECOMMENDED) +### comprehensive_benchmark.py (RECOMMENDED) **Apples-to-apples comparison with realistic workloads** -Tests all providers with diverse scenarios: +Tests all configured providers with diverse scenarios: - Hello World (shell execution) - Prime Calculation (CPU-bound) - File I/O (1000 files) @@ -15,7 +15,10 @@ Tests all providers with diverse scenarios: - NumPy FFT (numerical computation) **Features:** -- Uses standardized image (`daytonaio/ai-test:0.2.3`) for Modal and Daytona +- Uses standardized runtime hints per provider: + - Modal/Daytona: standardized Docker image + - E2B/Hopx: configurable template IDs + - Sprites/Vercel: provider defaults - Multiple runs with statistical analysis (mean, stddev, min, max) - Detailed error reporting - Winner tracking across all tests @@ -29,7 +32,7 @@ python benchmarks/comprehensive_benchmark.py --- -### šŸ“Š compare_providers.py +### compare_providers.py **Lifecycle breakdown (create/execute/destroy)** Tests basic sandbox operations with detailed timing for each phase: @@ -47,10 +50,10 @@ python benchmarks/compare_providers.py --- -### ⚔ simple_benchmark.py +### simple_benchmark.py **Quick smoke test** -Fast basic test to verify providers are working. +Fast create/exec/destroy verification across all configured providers. **Usage:** ```bash @@ -59,7 +62,7 @@ python benchmarks/simple_benchmark.py --- -### šŸ”„ benchmark_20x.py +### benchmark_20x.py **Concurrent execution test** Tests 20 concurrent sandbox operations to measure parallelism and throughput. @@ -71,7 +74,7 @@ python benchmarks/benchmark_20x.py --- -### ā„ļø cold_vs_warm.py +### cold_vs_warm.py **Cold start analysis** Compares cold start (first run) vs warm start (subsequent runs) performance. @@ -83,10 +86,11 @@ python benchmarks/cold_vs_warm.py --- -### šŸ–¼ļø image_reuse.py +### image_reuse.py **Image caching test** -Tests how providers handle image reuse and caching. +Tests how providers that support explicit image/template runtime configuration +handle reuse and caching (currently Modal, Daytona, E2B, and Hopx). **Usage:** ```bash @@ -99,9 +103,12 @@ python benchmarks/image_reuse.py All benchmarks auto-detect available providers based on environment variables: +- **Daytona**: Set `DAYTONA_API_KEY` - **E2B**: Set `E2B_API_KEY` +- **Sprites**: Set `SPRITES_TOKEN` or run `sprite login` +- **Hopx**: Set `HOPX_API_KEY` +- **Vercel**: Set `VERCEL_TOKEN`, `VERCEL_PROJECT_ID`, and `VERCEL_TEAM_ID` - **Modal**: Run `modal token set` or set `MODAL_TOKEN_ID` -- **Daytona**: Set `DAYTONA_API_KEY` ## Standard Image @@ -111,18 +118,29 @@ For apples-to-apples comparison, benchmarks use comparable environments: - Python 3.13, numpy, requests, anthropic, cohere, beautifulsoup4, and many AI/ML packages - Both providers support arbitrary Docker images -- **E2B**: `code-interpreter` template +- **E2B**: `code-interpreter` template by default - Python, npm, Jupyter, and common ML packages (numpy, pandas, matplotlib, etc.) - E2B uses templates instead of Docker images - - Custom templates supported via `config.image` or `config.provider_config["template"]` + - Benchmarks prefer `E2B_BENCHMARK_TEMPLATE`, then `benchmarks/e2b-daytona-benchmark/e2b.toml`, then `code-interpreter` + - Override with `E2B_BENCHMARK_TEMPLATE` + - If you see `Template is not compatible with secured access`, set `E2B_BENCHMARK_TEMPLATE` to a secured-access compatible template ID + +- **Hopx**: `code-interpreter` template by default + - Override with `HOPX_BENCHMARK_TEMPLATE` + +- **Sprites/Vercel**: + - Benchmarks use provider defaults for runtime/image behavior + +Cloudflare provider benchmarks are intentionally excluded by default. ## Contributing When adding new benchmarks: -1. Use the standardized image for Modal/Daytona -2. Include statistical analysis (mean, stddev) -3. Add error handling and detailed reporting -4. Update this README +1. Keep provider discovery centralized (see `benchmarks/provider_matrix.py`) +2. Use standardized runtime hints for fair comparisons where possible +3. Include statistical analysis (mean, stddev) +4. Add error handling and detailed reporting +5. Update this README ## License diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index e7faf34..52ce31c 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -14,10 +14,12 @@ # Add parent directory to path for imports sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import ( + benchmark_image_for_provider, + benchmark_runtime_label, + discover_benchmark_providers, +) from sandboxes import SandboxConfig -from sandboxes.providers.daytona import DaytonaProvider -from sandboxes.providers.e2b import E2BProvider -from sandboxes.providers.modal import ModalProvider @dataclass @@ -63,29 +65,16 @@ def __init__(self, iterations: int = 5): def _init_providers(self): """Initialize available providers.""" - # E2B - if os.getenv("E2B_API_KEY"): - try: - self.providers["e2b"] = E2BProvider() - print("āœ… E2B provider initialized") - except Exception as e: - print(f"āŒ E2B provider failed: {e}") - - # Daytona - if os.getenv("DAYTONA_API_KEY"): - try: - self.providers["daytona"] = DaytonaProvider() - print("āœ… Daytona provider initialized") - except Exception as e: - print(f"āŒ Daytona provider failed: {e}") + configured_providers = discover_benchmark_providers(include_cloudflare=False) - # Modal - if os.path.exists(os.path.expanduser("~/.modal.toml")): + for provider in configured_providers: try: - self.providers["modal"] = ModalProvider() - print("āœ… Modal provider initialized") + provider_class = provider.load_class() + self.providers[provider.name] = provider_class() + runtime = benchmark_runtime_label(provider.name) + print(f"āœ… {provider.display_name} provider initialized ({runtime})") except Exception as e: - print(f"āŒ Modal provider failed: {e}") + print(f"āŒ {provider.display_name} provider failed: {e}") async def benchmark_create_sandbox(self, provider_name: str) -> list[BenchmarkResult]: """Benchmark sandbox creation.""" @@ -93,13 +82,7 @@ async def benchmark_create_sandbox(self, provider_name: str) -> list[BenchmarkRe results = [] for i in range(self.iterations): - # Use Daytona image for E2B and Daytona providers - image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None - if provider_name == "e2b": - # E2B uses template ID (template built from daytonaio/ai-test:0.2.3) - # To build your own: cd benchmarks/e2b-daytona-benchmark && e2b template build - # Then update this ID with the one from benchmarks/e2b-daytona-benchmark/e2b.toml - image = "5x6hvr4zwye07thwhpkd" + image = benchmark_image_for_provider(provider_name) config = SandboxConfig( labels={"benchmark": "create", "iteration": str(i)}, @@ -144,12 +127,7 @@ async def benchmark_execute_command(self, provider_name: str) -> list[BenchmarkR provider = self.providers[provider_name] results = [] - # Use Daytona image for E2B and Daytona providers - image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None - if provider_name == "e2b": - # E2B uses template ID (template built from daytonaio/ai-test:0.2.3) - # See benchmarks/e2b-daytona-benchmark/README.md for building your own - image = "5x6hvr4zwye07thwhpkd" + image = benchmark_image_for_provider(provider_name) # Create one sandbox for all iterations config = SandboxConfig( @@ -212,12 +190,7 @@ async def benchmark_reuse_sandbox(self, provider_name: str) -> list[BenchmarkRes provider = self.providers[provider_name] results = [] - # Use Daytona image for E2B and Daytona providers - image = "daytonaio/ai-test:0.2.3" if provider_name in ["daytona", "e2b"] else None - if provider_name == "e2b": - # E2B uses template ID (template built from daytonaio/ai-test:0.2.3) - # See benchmarks/e2b-daytona-benchmark/README.md for building your own - image = "5x6hvr4zwye07thwhpkd" + image = benchmark_image_for_provider(provider_name) labels = {"benchmark": "reuse", "session": "test123"} config = SandboxConfig(labels=labels, timeout_seconds=120, image=image) @@ -478,6 +451,10 @@ async def run(self) -> dict[str, ProviderMetrics]: print(f" Iterations per test: {self.iterations}") print(f" Providers: {', '.join(self.providers.keys())}") + if not self.providers: + print("āŒ No configured providers found.") + return {} + # Run benchmarks for each provider all_results = [] for provider_name in self.providers: diff --git a/benchmarks/benchmark_20x.py b/benchmarks/benchmark_20x.py index ce48279..cdccc66 100644 --- a/benchmarks/benchmark_20x.py +++ b/benchmarks/benchmark_20x.py @@ -9,16 +9,14 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers from sandboxes import SandboxConfig -from sandboxes.providers.daytona import DaytonaProvider -from sandboxes.providers.e2b import E2BProvider -from sandboxes.providers.modal import ModalProvider -async def verify_and_benchmark(provider_class, name: str, runs: int = 20): +async def verify_and_benchmark(provider_name: str, display_name: str, provider_class, runs: int = 20): """Benchmark provider with verification.""" print(f"\n{'='*80}") - print(f"4{name} - {runs} ITERATIONS") + print(f"{display_name} - {runs} ITERATIONS") print(f"{'='*80}") try: @@ -61,9 +59,10 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20): try: # Create sandbox start = time.time() - config = SandboxConfig(labels={"benchmark": f"{name.lower()}_20x", "run": str(i + 1)}) - if name == "Modal": - config.provider_config = {"image": "python:3.11-slim"} + config = SandboxConfig(labels={"benchmark": f"{provider_name}_20x", "run": str(i + 1)}) + runtime_image = benchmark_image_for_provider(provider_name) + if runtime_image: + config.image = runtime_image sandbox = await provider.create_sandbox(config) create_time = (time.time() - start) * 1000 @@ -111,11 +110,11 @@ async def verify_and_benchmark(provider_class, name: str, runs: int = 20): print(" Could not verify final count") if not create_times: - print(f"\nāŒ All runs failed for {name}") + print(f"\nāŒ All runs failed for {display_name}") return None # Calculate comprehensive statistics - print(f"\nšŸ“ˆ STATISTICS FOR {name} ({len(create_times)}/{runs} successful)") + print(f"\nšŸ“ˆ STATISTICS FOR {display_name} ({len(create_times)}/{runs} successful)") print("=" * 60) def print_detailed_stats(name, times): @@ -147,7 +146,7 @@ def print_detailed_stats(name, times): print(f" Sample IDs: {created_ids[:3] if created_ids else 'None'}") return { - "name": name, + "name": display_name, "runs": runs, "successful": len(create_times), "failed": failed_runs, @@ -163,25 +162,26 @@ async def main(): """Run 20-iteration benchmark for all providers.""" print("šŸ”¬ COMPREHENSIVE BENCHMARK - 20 ITERATIONS PER PROVIDER") print("=" * 80) - print("This will create and destroy 60 sandboxes total.") - print("Estimated time: 3-5 minutes") + provider_specs = discover_benchmark_providers(include_cloudflare=False) + estimated_sandboxes = len(provider_specs) * 20 + print(f"This will create and destroy up to {estimated_sandboxes} sandboxes total.") + print("Estimated time: provider-dependent") results = [] + if not provider_specs: + print("\nāŒ No configured providers found.") + return + # Test each provider - for provider_class, name in [ - (ModalProvider, "Modal"), - (E2BProvider, "E2B"), - (DaytonaProvider, "Daytona"), - ]: - if name == "E2B" and not os.getenv("E2B_API_KEY"): - print(f"\nāš ļø Skipping {name} - no API key") - continue - if name == "Daytona" and not os.getenv("DAYTONA_API_KEY"): - print(f"\nāš ļø Skipping {name} - no API key") - continue - - result = await verify_and_benchmark(provider_class, name, runs=20) + for provider in provider_specs: + provider_class = provider.load_class() + result = await verify_and_benchmark( + provider.name, + provider.display_name, + provider_class, + runs=20, + ) if result: results.append(result) diff --git a/benchmarks/cold_vs_warm.py b/benchmarks/cold_vs_warm.py index acb8191..1a44900 100644 --- a/benchmarks/cold_vs_warm.py +++ b/benchmarks/cold_vs_warm.py @@ -9,10 +9,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import ( + benchmark_image_for_provider, + discover_benchmark_providers, +) from sandboxes import SandboxConfig -from sandboxes.providers.daytona import DaytonaProvider -from sandboxes.providers.e2b import E2BProvider -from sandboxes.providers.modal import ModalProvider async def test_cold_startup(provider, provider_name: str, config: SandboxConfig) -> dict: @@ -162,42 +163,40 @@ async def create_execute_destroy(index: int): } -async def test_provider_warmup_patterns(provider_class, provider_name: str): +async def test_provider_warmup_patterns(provider_name: str, display_name: str, provider_class): """Test complete warmup patterns for a provider.""" print(f"\n{'='*80}") - print(f"šŸ”¬ WARMUP ANALYSIS: {provider_name}") + print(f"šŸ”¬ WARMUP ANALYSIS: {display_name}") print(f"{'='*80}") try: # Initialize fresh provider provider = provider_class() - # Configure for provider config = SandboxConfig(labels={"test": "warmup"}) - if provider_name == "Modal": - config.image = "python:3.11-slim" - elif provider_name == "Daytona": - config.image = "daytonaio/ai-test:0.2.3" + runtime_image = benchmark_image_for_provider(provider_name) + if runtime_image: + config.image = runtime_image # Test 1: Cold startup - cold_results = await test_cold_startup(provider, provider_name, config) + cold_results = await test_cold_startup(provider, display_name, config) # Small delay to ensure cold/warm separation await asyncio.sleep(2) # Test 2: Warm startup sequence - warm_results = await test_warm_startup(provider, provider_name, config, iterations=5) + warm_results = await test_warm_startup(provider, display_name, config, iterations=5) # Small delay await asyncio.sleep(1) # Test 3: Concurrent warmup concurrent_results = await test_concurrent_warm( - provider, provider_name, config, concurrency=3 + provider, display_name, config, concurrency=3 ) # Analysis - print(f"\nšŸ“Š WARMUP ANALYSIS FOR {provider_name}") + print(f"\nšŸ“Š WARMUP ANALYSIS FOR {display_name}") print(f"{'='*60}") print("\nCold vs Warm Comparison:") @@ -237,7 +236,7 @@ async def test_provider_warmup_patterns(provider_class, provider_name: str): print(f" Concurrency efficiency: {concurrent_results['efficiency']:.1f}x") return { - "provider": provider_name, + "provider": display_name, "cold": cold_results, "warm": warm_results, "concurrent": concurrent_results, @@ -247,7 +246,7 @@ async def test_provider_warmup_patterns(provider_class, provider_name: str): } except Exception as e: - print(f"āŒ Error testing {provider_name}: {e}") + print(f"āŒ Error testing {display_name}: {e}") return None @@ -257,18 +256,16 @@ async def main(): print("=" * 80) print("Testing startup patterns across providers...") - providers_to_test = [ - (ModalProvider, "Modal"), - (E2BProvider, "E2B") if os.getenv("E2B_API_KEY") else None, - (DaytonaProvider, "Daytona") if os.getenv("DAYTONA_API_KEY") else None, - ] - - # Filter out None values - providers_to_test = [p for p in providers_to_test if p is not None] + providers_to_test = discover_benchmark_providers(include_cloudflare=False) results = [] - for provider_class, name in providers_to_test: - result = await test_provider_warmup_patterns(provider_class, name) + for provider in providers_to_test: + provider_class = provider.load_class() + result = await test_provider_warmup_patterns( + provider.name, + provider.display_name, + provider_class, + ) if result: results.append(result) diff --git a/benchmarks/compare_providers.py b/benchmarks/compare_providers.py index 352fd86..c95f334 100644 --- a/benchmarks/compare_providers.py +++ b/benchmarks/compare_providers.py @@ -10,18 +10,19 @@ # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers from sandboxes import SandboxConfig -async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict | None: +async def benchmark_provider(provider_name: str, display_name: str, provider_class, runs: int = 3) -> dict | None: """Benchmark a single provider.""" try: provider = provider_class() print(f"\n{'='*60}") - print(f"šŸ“¦ Benchmarking {name}") + print(f"šŸ“¦ Benchmarking {display_name}") print(f"{'='*60}") except Exception as e: - print(f"\nāŒ {name} not available: {e}") + print(f"\nāŒ {display_name} not available: {e}") return None create_times = [] @@ -36,11 +37,10 @@ async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict | try: # Create sandbox start = time.time() - config = SandboxConfig(labels={"benchmark": f"{name.lower()}_run_{i}"}) - # Use standardized image for apples-to-apples comparison - # daytonaio/ai-test:0.2.3 includes Python 3.13 + numpy + many AI/ML packages - if name in ["Modal", "Daytona"]: - config.image = "daytonaio/ai-test:0.2.3" + config = SandboxConfig(labels={"benchmark": f"{provider_name}_run_{i}"}) + runtime_image = benchmark_image_for_provider(provider_name) + if runtime_image: + config.image = runtime_image sandbox = await provider.create_sandbox(config) create_time = (time.time() - start) * 1000 @@ -79,7 +79,7 @@ async def benchmark_provider(provider_class, name: str, runs: int = 3) -> dict | return None return { - "name": name, + "name": display_name, "create": { "mean": mean(create_times), "median": median(create_times), @@ -114,35 +114,13 @@ async def main(): print("Testing with 3 runs per provider...") results = [] + provider_specs = discover_benchmark_providers(include_cloudflare=False) - # Test Modal (we know this works) - from sandboxes.providers.modal import ModalProvider - - modal_result = await benchmark_provider(ModalProvider, "Modal", runs=3) - if modal_result: - results.append(modal_result) - - # Try E2B if available - if os.getenv("E2B_API_KEY"): - try: - from sandboxes.providers.e2b import E2BProvider - - e2b_result = await benchmark_provider(E2BProvider, "E2B", runs=3) - if e2b_result: - results.append(e2b_result) - except Exception as e: - print(f"\nāŒ E2B error: {e}") - - # Try Daytona if available - if os.getenv("DAYTONA_API_KEY"): - try: - from sandboxes.providers.daytona import DaytonaProvider - - daytona_result = await benchmark_provider(DaytonaProvider, "Daytona", runs=3) - if daytona_result: - results.append(daytona_result) - except Exception as e: - print(f"\nāŒ Daytona error: {e}") + for provider in provider_specs: + provider_class = provider.load_class() + result = await benchmark_provider(provider.name, provider.display_name, provider_class, runs=3) + if result: + results.append(result) # Display comparison table print("\n" + "=" * 80) diff --git a/benchmarks/comprehensive_benchmark.py b/benchmarks/comprehensive_benchmark.py index f25d592..e98dd5b 100644 --- a/benchmarks/comprehensive_benchmark.py +++ b/benchmarks/comprehensive_benchmark.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Comprehensive benchmark for sandboxes library. -Tests E2B, Modal, and Daytona providers with various realistic workloads. +Tests configured providers with realistic workloads. Features: - Multiple test scenarios (Hello World, compute, I/O, package install) @@ -15,7 +15,6 @@ """ import asyncio -import os import sys import time from pathlib import Path @@ -33,15 +32,17 @@ HAS_TABULATE = False print("āš ļø Install tabulate for better output: pip install tabulate") +from benchmarks.provider_matrix import ( + STANDARD_IMAGE, + benchmark_image_for_provider, + benchmark_runtime_label, + discover_benchmark_providers, + e2b_benchmark_template, + hopx_benchmark_template, + provider_configuration_hints, +) from sandboxes import run -# Standard image for apples-to-apples comparison (Modal/Daytona) -# This image includes Python 3.13, numpy, requests, and many AI/ML packages -# E2B uses their "code-interpreter" template (doesn't support arbitrary Docker images) -# code-interpreter includes Python, npm, Jupyter, numpy, pandas, matplotlib, etc. -STANDARD_IMAGE = "daytonaio/ai-test:0.2.3" - - # Test scenarios - from simple to complex TESTS = { "hello_world": { @@ -136,13 +137,9 @@ async def benchmark_provider( # Use comparable images for fair comparison kwargs = {"provider": provider_name} if use_standard_image: - if provider_name == "e2b": - # E2B uses templates, not Docker images - use their code-interpreter template - # Has Python, npm, Jupyter, and common ML packages (numpy, pandas, etc.) - kwargs["image"] = "code-interpreter" - elif provider_name in ["modal", "daytona"]: - # Modal and Daytona can use Docker Hub images - kwargs["image"] = STANDARD_IMAGE + runtime_image = benchmark_image_for_provider(provider_name) + if runtime_image: + kwargs["image"] = runtime_image result = await run(command, **kwargs) duration = (time.time() - start) * 1000 # Convert to ms @@ -186,7 +183,10 @@ async def run_benchmarks(providers: list[str], use_standard_image: bool = True): print(f"Total tests: {len(TESTS)}") if use_standard_image: print(f"Modal/Daytona: {STANDARD_IMAGE}") - print("E2B: code-interpreter template (Python, npm, Jupyter, ML packages)") + if "e2b" in providers: + print(f"E2B: {e2b_benchmark_template()} template") + if "hopx" in providers: + print(f"Hopx: {hopx_benchmark_template()} template") print("=" * 80 + "\n") all_results = [] @@ -340,27 +340,17 @@ async def main(): # Check which providers are available print("Checking available providers...") - providers_to_test = [] - - # Check for API keys - if os.getenv("E2B_API_KEY"): - providers_to_test.append("e2b") - print("āœ“ E2B configured") - - if os.getenv("MODAL_TOKEN_ID") or Path.home().joinpath(".modal.toml").exists(): - providers_to_test.append("modal") - print("āœ“ Modal configured") + providers = discover_benchmark_providers(include_cloudflare=False) + providers_to_test = [provider.name for provider in providers] - if os.getenv("DAYTONA_API_KEY"): - providers_to_test.append("daytona") - print("āœ“ Daytona configured") + for provider in providers: + print(f"āœ“ {provider.display_name} configured ({benchmark_runtime_label(provider.name)})") if not providers_to_test: print("\nāŒ No providers configured!") - print("Set environment variables:") - print(" - E2B_API_KEY for E2B") - print(" - MODAL_TOKEN_ID for Modal (or run 'modal token set')") - print(" - DAYTONA_API_KEY for Daytona") + print("Configure at least one provider:") + for hint in provider_configuration_hints(include_cloudflare=False): + print(f" {hint}") return # Run benchmarks diff --git a/benchmarks/image_reuse.py b/benchmarks/image_reuse.py index 9f0936c..e85b4a5 100644 --- a/benchmarks/image_reuse.py +++ b/benchmarks/image_reuse.py @@ -9,10 +9,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import ( + benchmark_image_for_provider, + discover_benchmark_providers, +) from sandboxes import SandboxConfig -from sandboxes.providers.daytona import DaytonaProvider -from sandboxes.providers.e2b import E2BProvider -from sandboxes.providers.modal import ModalProvider async def test_same_image_reuse( @@ -183,58 +184,54 @@ async def create_test_destroy(index: int): } -async def test_provider_image_patterns(provider_class, provider_name: str): +async def test_provider_image_patterns(provider_name: str, display_name: str, provider_class): """Test image reuse patterns for a provider.""" print(f"\n{'='*80}") - print(f"šŸ–¼ļø IMAGE REUSE ANALYSIS: {provider_name}") + print(f"šŸ–¼ļø IMAGE REUSE ANALYSIS: {display_name}") print(f"{'='*80}") try: provider = provider_class() # Provider-specific image configs - if provider_name == "Modal": - primary_image = "python:3.11-slim" + if provider_name == "modal": + primary_image = benchmark_image_for_provider(provider_name) test_images = [ "python:3.11-slim", "python:3.12-slim", "python:3.10-slim", "ubuntu:22.04", ] - elif provider_name == "E2B": - # E2B uses templates, not Docker images - primary_image = None # Default template - test_images = [None] # Only default for now - elif provider_name == "Daytona": - primary_image = "daytonaio/ai-test:0.2.3" - test_images = ["daytonaio/ai-test:0.2.3"] + elif provider_name in {"e2b", "daytona", "hopx"}: + primary_image = benchmark_image_for_provider(provider_name) + test_images = [primary_image] if primary_image else [] else: return None results = {} # Test 1: Same image reuse - if primary_image is not None: + if primary_image: same_image_results = await test_same_image_reuse( - provider, provider_name, primary_image, iterations=5 + provider, display_name, primary_image, iterations=5 ) results["same_image"] = same_image_results # Test 2: Concurrent same image concurrent_results = await test_concurrent_same_image( - provider, provider_name, primary_image, concurrency=3 + provider, display_name, primary_image, concurrency=3 ) results["concurrent_same"] = concurrent_results # Test 3: Different images (Modal only for now) - if provider_name == "Modal": + if provider_name == "modal": different_images_results = await test_different_images( - provider, provider_name, test_images + provider, display_name, test_images ) results["different_images"] = different_images_results # Analysis - print(f"\nšŸ“Š IMAGE REUSE ANALYSIS FOR {provider_name}") + print(f"\nšŸ“Š IMAGE REUSE ANALYSIS FOR {display_name}") print(f"{'='*60}") if "same_image" in results: @@ -286,7 +283,7 @@ async def test_provider_image_patterns(provider_class, provider_name: str): return results except Exception as e: - print(f"āŒ Error testing {provider_name}: {e}") + print(f"āŒ Error testing {display_name}: {e}") return None @@ -296,20 +293,21 @@ async def main(): print("=" * 80) print("Testing image caching and reuse patterns...") - providers_to_test = [ - (ModalProvider, "Modal"), - (E2BProvider, "E2B") if os.getenv("E2B_API_KEY") else None, - (DaytonaProvider, "Daytona") if os.getenv("DAYTONA_API_KEY") else None, - ] - - # Filter out None values - providers_to_test = [p for p in providers_to_test if p is not None] + providers_to_test = discover_benchmark_providers( + include_cloudflare=False, + image_only=True, + ) all_results = [] - for provider_class, name in providers_to_test: - result = await test_provider_image_patterns(provider_class, name) + for provider in providers_to_test: + provider_class = provider.load_class() + result = await test_provider_image_patterns( + provider.name, + provider.display_name, + provider_class, + ) if result: - all_results.append((name, result)) + all_results.append((provider.display_name, result)) # Delay between providers await asyncio.sleep(3) diff --git a/benchmarks/provider_matrix.py b/benchmarks/provider_matrix.py new file mode 100644 index 0000000..e92f18a --- /dev/null +++ b/benchmarks/provider_matrix.py @@ -0,0 +1,231 @@ +"""Shared provider discovery and runtime hints for benchmark scripts.""" + +from __future__ import annotations + +import os +import re +import shutil +from collections.abc import Callable +from dataclasses import dataclass +from pathlib import Path + +STANDARD_IMAGE = os.getenv("BENCHMARK_STANDARD_IMAGE", "daytonaio/ai-test:0.2.3") +_E2B_TEMPLATE_ID_RE = re.compile(r'^template_id\s*=\s*"([^"]+)"\s*$', re.MULTILINE) + + +@dataclass(frozen=True) +class BenchmarkProvider: + """Benchmark provider metadata used by benchmark scripts.""" + + name: str + display_name: str + is_configured: Callable[[], bool] + load_class: Callable[[], type] + supports_image_benchmark: bool = True + + +def _vercel_token() -> str | None: + return ( + os.getenv("VERCEL_TOKEN") + or os.getenv("VERCEL_API_TOKEN") + or os.getenv("VERCEL_ACCESS_TOKEN") + or os.getenv("VERCEL_OIDC_TOKEN") + ) + + +def _has_daytona() -> bool: + return bool(os.getenv("DAYTONA_API_KEY")) + + +def _has_e2b() -> bool: + return bool(os.getenv("E2B_API_KEY")) + + +def _has_sprites() -> bool: + return bool(os.getenv("SPRITES_TOKEN") or shutil.which("sprite")) + + +def _has_hopx() -> bool: + return bool(os.getenv("HOPX_API_KEY")) + + +def _has_vercel() -> bool: + return bool(_vercel_token() and os.getenv("VERCEL_PROJECT_ID") and os.getenv("VERCEL_TEAM_ID")) + + +def _has_modal() -> bool: + return bool(os.getenv("MODAL_TOKEN_ID") or Path.home().joinpath(".modal.toml").exists()) + + +def _has_cloudflare() -> bool: + return bool(os.getenv("CLOUDFLARE_SANDBOX_BASE_URL") and os.getenv("CLOUDFLARE_API_TOKEN")) + + +def _load_daytona_provider(): + from sandboxes.providers.daytona import DaytonaProvider + + return DaytonaProvider + + +def _load_e2b_provider(): + from sandboxes.providers.e2b import E2BProvider + + return E2BProvider + + +def _load_sprites_provider(): + from sandboxes.providers.sprites import SpritesProvider + + return SpritesProvider + + +def _load_hopx_provider(): + from sandboxes.providers.hopx import HopxProvider + + return HopxProvider + + +def _load_vercel_provider(): + from sandboxes.providers.vercel import VercelProvider + + return VercelProvider + + +def _load_modal_provider(): + from sandboxes.providers.modal import ModalProvider + + return ModalProvider + + +def _load_cloudflare_provider(): + from sandboxes.providers.cloudflare import CloudflareProvider + + return CloudflareProvider + + +PROVIDERS: tuple[BenchmarkProvider, ...] = ( + BenchmarkProvider("daytona", "Daytona", _has_daytona, _load_daytona_provider), + BenchmarkProvider("e2b", "E2B", _has_e2b, _load_e2b_provider), + BenchmarkProvider( + "sprites", + "Sprites", + _has_sprites, + _load_sprites_provider, + supports_image_benchmark=False, + ), + BenchmarkProvider("hopx", "Hopx", _has_hopx, _load_hopx_provider), + BenchmarkProvider( + "vercel", + "Vercel", + _has_vercel, + _load_vercel_provider, + supports_image_benchmark=False, + ), + BenchmarkProvider("modal", "Modal", _has_modal, _load_modal_provider), + BenchmarkProvider( + "cloudflare", + "Cloudflare", + _has_cloudflare, + _load_cloudflare_provider, + supports_image_benchmark=False, + ), +) + +PROVIDER_CONFIGURATION_HINTS: dict[str, str] = { + "daytona": "DAYTONA_API_KEY", + "e2b": "E2B_API_KEY", + "sprites": "SPRITES_TOKEN or sprite CLI login", + "hopx": "HOPX_API_KEY", + "vercel": "VERCEL_TOKEN + VERCEL_PROJECT_ID + VERCEL_TEAM_ID", + "modal": "~/.modal.toml or MODAL_TOKEN_ID", + "cloudflare": "CLOUDFLARE_SANDBOX_BASE_URL + CLOUDFLARE_API_TOKEN", +} + + +def e2b_benchmark_template() -> str: + """Return E2B template used for benchmark workloads.""" + configured = os.getenv("E2B_BENCHMARK_TEMPLATE") + if configured: + return configured + + # Prefer repository template when available to keep benchmark runtime stable. + e2b_toml = Path(__file__).parent / "e2b-daytona-benchmark" / "e2b.toml" + try: + contents = e2b_toml.read_text() + match = _E2B_TEMPLATE_ID_RE.search(contents) + if match: + return match.group(1) + except OSError: + pass + + # Fallback for environments without repository template metadata. + return "code-interpreter" + + +def hopx_benchmark_template() -> str: + """Return Hopx template used for benchmark workloads.""" + return os.getenv("HOPX_BENCHMARK_TEMPLATE", "code-interpreter") + + +def benchmark_image_for_provider(provider_name: str) -> str | None: + """Return benchmark image/template hint for a provider.""" + normalized = provider_name.lower() + if normalized in {"modal", "daytona"}: + return STANDARD_IMAGE + if normalized == "e2b": + return e2b_benchmark_template() + if normalized == "hopx": + return hopx_benchmark_template() + return None + + +def benchmark_runtime_label(provider_name: str) -> str: + """Return a human-readable runtime label used in benchmark output.""" + runtime = benchmark_image_for_provider(provider_name) + if runtime is None: + return "provider-default runtime" + if provider_name.lower() in {"e2b", "hopx"}: + return f"template={runtime}" + return f"image={runtime}" + + +def discover_benchmark_providers( + *, + include_cloudflare: bool = False, + image_only: bool = False, +) -> list[BenchmarkProvider]: + """Return configured providers for benchmark runs.""" + discovered: list[BenchmarkProvider] = [] + for provider in PROVIDERS: + if provider.name == "cloudflare" and not include_cloudflare: + continue + if image_only and not provider.supports_image_benchmark: + continue + if provider.is_configured(): + discovered.append(provider) + return discovered + + +def discover_provider_names( + *, + include_cloudflare: bool = False, + image_only: bool = False, +) -> list[str]: + """Return configured benchmark provider names.""" + return [ + provider.name + for provider in discover_benchmark_providers( + include_cloudflare=include_cloudflare, + image_only=image_only, + ) + ] + + +def provider_configuration_hints(*, include_cloudflare: bool = False) -> list[str]: + """Return provider auth hints for benchmark setup messaging.""" + hints = [] + for provider in PROVIDERS: + if provider.name == "cloudflare" and not include_cloudflare: + continue + hints.append(f"- {provider.display_name}: {PROVIDER_CONFIGURATION_HINTS[provider.name]}") + return hints diff --git a/benchmarks/run_all_benchmarks.py b/benchmarks/run_all_benchmarks.py index 59af011..9d05636 100644 --- a/benchmarks/run_all_benchmarks.py +++ b/benchmarks/run_all_benchmarks.py @@ -2,12 +2,13 @@ """ Meta-benchmark runner that executes all benchmarks multiple times. -Runs each benchmark 10 times, aggregates results, and calculates statistics -including p50, p95, and p99 percentiles. +Runs each benchmark according to the per-suite `runs` configuration, aggregates +results, and calculates statistics including p50, p95, and p99 percentiles. Outputs comprehensive results to benchmarks/results.txt """ +import os import subprocess import sys import time @@ -59,6 +60,8 @@ }, ] +BENCHMARK_SCRIPT_TIMEOUT_SECONDS = int(os.getenv("BENCHMARK_SCRIPT_TIMEOUT_SECONDS", "1800")) + def calculate_percentiles(data: list[float]) -> dict[str, float]: """Calculate p50, p95, p99 percentiles.""" @@ -98,7 +101,7 @@ def run_benchmark(script: str, run_number: int) -> dict[str, Any]: [sys.executable, str(script_path)], capture_output=True, text=True, - timeout=600, # 10 minute timeout + timeout=BENCHMARK_SCRIPT_TIMEOUT_SECONDS, ) duration = time.time() - start @@ -121,7 +124,7 @@ def run_benchmark(script: str, run_number: int) -> dict[str, Any]: "success": False, "duration_seconds": duration, "stdout": "", - "stderr": "Benchmark timed out after 10 minutes", + "stderr": f"Benchmark timed out after {BENCHMARK_SCRIPT_TIMEOUT_SECONDS} seconds", "exit_code": -1, } except Exception as e: diff --git a/benchmarks/simple_benchmark.py b/benchmarks/simple_benchmark.py index add3be4..806261b 100644 --- a/benchmarks/simple_benchmark.py +++ b/benchmarks/simple_benchmark.py @@ -1,103 +1,122 @@ #!/usr/bin/env python -"""Simple benchmark for Modal provider.""" +"""Quick benchmark smoke test for configured providers.""" import asyncio import os import sys import time -from statistics import mean, median, stdev +from statistics import mean, median # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from benchmarks.provider_matrix import benchmark_image_for_provider, discover_benchmark_providers from sandboxes import SandboxConfig -from sandboxes.providers.modal import ModalProvider -async def benchmark_modal(runs=5): - """Benchmark Modal provider operations.""" - provider = ModalProvider() +async def benchmark_provider(provider_name: str, display_name: str, provider_class, runs: int = 3) -> dict | None: + """Run a quick create/exec/destroy smoke benchmark for a provider.""" + try: + provider = provider_class() + except Exception as e: + print(f"\nāŒ {display_name} initialization failed: {e}") + return None create_times = [] execute_times = [] destroy_times = [] total_times = [] - print(f"\nšŸ”¬ Running Modal Benchmark ({runs} iterations)") + print(f"\nšŸ”¬ Running {display_name} benchmark ({runs} iterations)") print("=" * 60) for i in range(runs): - print(f"\nRun {i+1}/{runs}:") - - # Total operation time total_start = time.time() + try: + config = SandboxConfig(labels={"benchmark": "simple", "provider": provider_name, "run": str(i)}) + runtime_image = benchmark_image_for_provider(provider_name) + if runtime_image: + config.image = runtime_image + + start = time.time() + sandbox = await provider.create_sandbox(config) + create_time = (time.time() - start) * 1000 + create_times.append(create_time) + + start = time.time() + result = await provider.execute_command( + sandbox.id, + 'python3 -c \'import sys; print(sys.version.split()[0])\'', + ) + execute_time = (time.time() - start) * 1000 + execute_times.append(execute_time) + + start = time.time() + await provider.destroy_sandbox(sandbox.id) + destroy_time = (time.time() - start) * 1000 + destroy_times.append(destroy_time) + + total_time = (time.time() - total_start) * 1000 + total_times.append(total_time) + + icon = "āœ…" if result.success else "āŒ" + print( + f"Run {i+1}: {icon} Create={create_time:.0f}ms " + f"Execute={execute_time:.0f}ms Destroy={destroy_time:.0f}ms Total={total_time:.0f}ms" + ) + except Exception as e: + print(f"Run {i+1}: āŒ Failed - {str(e)[:100]}") - # Create sandbox - start = time.time() - config = SandboxConfig(labels={"benchmark": f"run_{i}", "test": "modal_perf"}) - sandbox = await provider.create_sandbox(config) - create_time = (time.time() - start) * 1000 - create_times.append(create_time) - print(f" āœ… Create: {create_time:.2f}ms - {sandbox.id}") - - # Execute command - start = time.time() - result = await provider.execute_command( - sandbox.id, - 'python3 -c \'import sys; print(f"Python {sys.version}"); print("Benchmark test complete")\'', - ) - execute_time = (time.time() - start) * 1000 - execute_times.append(execute_time) - print(f" āœ… Execute: {execute_time:.2f}ms - Success: {result.success}") - - # Destroy sandbox - start = time.time() - await provider.destroy_sandbox(sandbox.id) - destroy_time = (time.time() - start) * 1000 - destroy_times.append(destroy_time) - print(f" āœ… Destroy: {destroy_time:.2f}ms") - - total_time = (time.time() - total_start) * 1000 - total_times.append(total_time) - print(f" ā±ļø Total: {total_time:.2f}ms") - - # Small delay between runs if i < runs - 1: - await asyncio.sleep(1) - - print("\n" + "=" * 60) - print("šŸ“Š RESULTS SUMMARY") - print("=" * 60) + await asyncio.sleep(0.2) - # Calculate statistics - def print_stats(name, times): - if len(times) > 1: - print(f"\n{name}:") - print(f" Mean: {mean(times):.2f}ms") - print(f" Median: {median(times):.2f}ms") - print(f" Min: {min(times):.2f}ms") - print(f" Max: {max(times):.2f}ms") - if len(times) > 2: - print(f" StdDev: {stdev(times):.2f}ms") - else: - print(f"\n{name}: {times[0]:.2f}ms") - - print_stats("CREATE SANDBOX", create_times) - print_stats("EXECUTE COMMAND", execute_times) - print_stats("DESTROY SANDBOX", destroy_times) - print_stats("TOTAL OPERATION", total_times) - - print("\n" + "=" * 60) - print(f"šŸŽÆ AVERAGE THROUGHPUT: {1000 / mean(total_times):.2f} ops/sec") - print("=" * 60) + if not total_times: + return None return { - "create": {"times": create_times, "mean": mean(create_times)}, - "execute": {"times": execute_times, "mean": mean(execute_times)}, - "destroy": {"times": destroy_times, "mean": mean(destroy_times)}, - "total": {"times": total_times, "mean": mean(total_times)}, + "provider": display_name, + "runs": len(total_times), + "create_median": median(create_times), + "execute_median": median(execute_times), + "destroy_median": median(destroy_times), + "total_mean": mean(total_times), + "total_median": median(total_times), } +async def main(): + """Run quick benchmark for configured providers.""" + providers = discover_benchmark_providers(include_cloudflare=False) + if not providers: + print("āŒ No configured providers found.") + return + + results = [] + for provider in providers: + provider_class = provider.load_class() + result = await benchmark_provider(provider.name, provider.display_name, provider_class, runs=3) + if result: + results.append(result) + + if not results: + print("\nāŒ No successful provider runs.") + return + + print("\n" + "=" * 80) + print("QUICK BENCHMARK SUMMARY") + print("=" * 80) + print(f"{'Provider':<12} {'Runs':<6} {'Create':<10} {'Execute':<10} {'Destroy':<10} {'Total':<10}") + print("-" * 80) + for result in sorted(results, key=lambda r: r["total_median"]): + print( + f"{result['provider']:<12} " + f"{result['runs']:<6} " + f"{result['create_median']:<10.0f} " + f"{result['execute_median']:<10.0f} " + f"{result['destroy_median']:<10.0f} " + f"{result['total_median']:<10.0f}" + ) + + if __name__ == "__main__": - results = asyncio.run(benchmark_modal(5)) + asyncio.run(main())