Merged
11 changes: 10 additions & 1 deletion .env.example
@@ -1,8 +1,17 @@
# Murphy Configuration
# Copy this file to .env and fill in your values

# === Required ===
# === LLM Provider (at least one required) ===
OPENAI_API_KEY=your_openai_api_key_here
# GOOGLE_API_KEY=your_google_api_key_here
# ANTHROPIC_API_KEY=your_anthropic_api_key_here
# AZURE_OPENAI_KEY=your_azure_openai_key_here
# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
# MISTRAL_API_KEY=your_mistral_api_key_here
# GROQ_API_KEY=your_groq_api_key_here
# CEREBRAS_API_KEY=your_cerebras_api_key_here
# OPENROUTER_API_KEY=your_openrouter_api_key_here
# BROWSER_USE_API_KEY=your_browser_use_api_key_here

# === Murphy REST API (optional, for murphy-api server) ===
# MURPHY_API_KEY=your_murphy_api_key_here
21 changes: 18 additions & 3 deletions CHANGELOG.md
@@ -4,12 +4,27 @@ All notable changes to Murphy will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## [Unreleased]
## [1.1.0] - 2026-04-07

### Added
- Multi-provider LLM support: `--provider` and `--model` flags for OpenAI, Google Gemini, Anthropic Claude, Azure OpenAI, Mistral, Groq, DeepSeek, Cerebras, Ollama, OpenRouter, and Browser Use
- Separate `--judge-provider` and `--judge-model` flags for using a different model for verdicts
- `provider` field in REST API request models (`/analyze`, `/generate-plan`, `/execute`, `/evaluate`)
- `--open` flag to re-open the interactive web UI for a previously completed run without re-running any tests (no browser or LLM required)
- Step-by-step execution trace view in the UI (`View trace →`) showing each agent step with goal, evaluation, actions, screenshots, memory, and reasoning
- Interactive agent path graph in the UI (`View graph →`) visualising the full decision tree with colour-coded nodes (success/failure evaluations) and labelled action edges; click any node to inspect step details
- Step-by-step execution trace view in the UI (`View trace ->`) showing each agent step with goal, evaluation, actions, screenshots, memory, and reasoning
- Interactive agent path graph in the UI (`View graph ->`) visualising the full decision tree with colour-coded nodes (success/failure evaluations) and labelled action edges; click any node to inspect step details
- Missing signals reporting in judge verdicts (UX observations that don't affect the verdict)
- Agent history saved as JSON per test in `output/agent_history/`

### Fixed
- UTF-8 surrogate encoding error (`ModelProviderError: 'utf-8' codec can't encode character`)
- Occasional infinite verify loop during test execution
- Pages URL incorrect in reporting and trace visualization
- Angry user persona double-click test failing due to unidentified element ID
- Agent not reporting missing validation indicators

### Changed
- Removed actions column from results main page in the UI

## [1.0.0] - 2026-03-05

75 changes: 66 additions & 9 deletions README.md
@@ -7,7 +7,7 @@ Built on top of [browser-use](https://github.com/browser-use/browser-use) (AI br
## Prerequisites

- Python >= 3.11
- An OpenAI API key (`OPENAI_API_KEY`) — default model is `gpt-5-mini`
- An LLM API key — default model is `gpt-5-mini` (OpenAI), but Murphy supports multiple providers (see [Model Providers](#model-providers))

## Which setup should I use?

@@ -43,10 +43,11 @@ uv run playwright install chromium
```bash
cp .env.example .env
```
Then set your key:
Then set your key (at minimum one provider):
```
OPENAI_API_KEY=sk-...
```
See [Model Providers](#model-providers) for other providers.

## Setup (Docker)

@@ -179,8 +180,10 @@ The full JSON report (`evaluation_report.json`) contains structured results, act
| `--features` | | Path to existing features markdown (skips feature discovery) |
| `--plan` | | Path to existing YAML test plan (skips planning, goes straight to execution) |
| `--max-tests` | `8` | Maximum number of test scenarios to generate |
| `--model` | `gpt-5-mini` | LLM model for agent tasks |
| `--judge-model` | `gpt-5-mini` | LLM model for judging verdicts |
| `--provider` | `openai` | LLM provider (see [Model Providers](#model-providers)) |
| `--model` | `gpt-5-mini` | LLM model name as it appears in the provider's docs |
| `--judge-provider` | *(same as `--provider`)* | LLM provider for judging verdicts |
| `--judge-model` | *(same as `--model`)* | LLM model for judging verdicts |
| `--output-dir` | `./murphy/output` | Output directory for all generated files |
| `--category` | | Site category hint (`ecommerce`, `saas`, `content`, `social`) |
| `--open` | `false` | Open the interactive UI for a previously completed run (no browser or LLM required); `--url` is not needed |
@@ -220,15 +223,69 @@ See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md#rest-api-murphy-api) for full en

---

## Model Providers

Murphy supports multiple LLM providers via [browser-use](https://github.com/browser-use/browser-use). Use `--provider` and `--model` to select any supported provider. Model names are passed exactly as they appear in the provider's documentation — no renaming needed.

```bash
# Default: OpenAI
uv run murphy --url https://example.com --model gpt-5-mini

# Google Gemini
uv run murphy --url https://example.com --provider google --model gemini-2.5-pro

# Anthropic Claude
uv run murphy --url https://example.com --provider anthropic --model claude-sonnet-4-20250514

# Azure OpenAI
uv run murphy --url https://example.com --provider azure --model gpt-4o

# Mistral
uv run murphy --url https://example.com --provider mistral --model mistral-large-latest

# Mix providers: cheap model for agent tasks, stronger model for judging
uv run murphy --url https://example.com \
--provider google --model gemini-2.5-flash \
--judge-provider openai --judge-model gpt-5-mini
```

Set the corresponding API key as an environment variable (see [Environment Variables](#environment-variables)).

| Provider | `--provider` value | Example `--model` |
|----------|-------------------|-------------------|
| OpenAI | `openai` (default) | `gpt-5-mini`, `gpt-4o`, `o3` |
| Google Gemini | `google` | `gemini-2.5-pro`, `gemini-2.5-flash` |
| Anthropic | `anthropic` | `claude-sonnet-4-20250514`, `claude-haiku-4-5-20251001` |
| Azure OpenAI | `azure` | `gpt-4o`, `gpt-4o-mini` |
| Mistral | `mistral` | `mistral-large-latest`, `mistral-small-latest` |
| Groq | `groq` | `llama3-70b-8192` |
| DeepSeek | `deepseek` | `deepseek-chat` |
| Cerebras | `cerebras` | `llama-3.3-70b` |
| Ollama | `ollama` | `llama3`, `mistral` |
| OpenRouter | `openrouter` | `meta-llama/llama-3-70b` |
| Browser Use | `bu` | `bu-latest` |

---

## Environment Variables

All variables are optional unless noted. See `.env.example` for a template.

### LLM Provider

| Variable | Description |
|----------|-------------|
| `OPENAI_API_KEY` | OpenAI API key (required) |
### LLM Providers

Set the API key for whichever provider you use (at least one is required):

| Variable | Provider |
|----------|----------|
| `OPENAI_API_KEY` | OpenAI (default) |
| `GOOGLE_API_KEY` | Google Gemini |
| `ANTHROPIC_API_KEY` | Anthropic Claude |
| `AZURE_OPENAI_KEY` | Azure OpenAI (also needs `AZURE_OPENAI_ENDPOINT`) |
| `MISTRAL_API_KEY` | Mistral |
| `GROQ_API_KEY` | Groq |
| `CEREBRAS_API_KEY` | Cerebras |
| `OPENROUTER_API_KEY` | OpenRouter |
| `BROWSER_USE_API_KEY` | Browser Use |

### REST API

1 change: 1 addition & 0 deletions docs/ARCHITECTURE.md
@@ -7,6 +7,7 @@ murphy/
├── __init__.py # Package exports and version
├── __main__.py # python -m murphy entry point
├── config.py # Shared configuration constants
├── llm.py # Multi-provider LLM factory (OpenAI, Google, Anthropic, etc.)
├── models.py # Pydantic models (TestPlan, TestResult, JudgeVerdict, etc.)
├── prompts.py # All LLM prompt text
├── evaluate.py # Backward-compatible re-exports
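The new `murphy/llm.py` factory itself is not shown in this diff. A minimal sketch of how such a provider registry could dispatch, assuming the `ChatModel` class and the registry below as hypothetical stand-ins rather than the real browser-use chat classes:

```python
from dataclasses import dataclass


@dataclass
class ChatModel:
    """Hypothetical stand-in for a browser-use chat model."""
    provider: str
    model: str


# Provider names mirror the README table.
_PROVIDERS = {
    'openai', 'google', 'anthropic', 'azure', 'mistral',
    'groq', 'deepseek', 'cerebras', 'ollama', 'openrouter', 'bu',
}


def create_llm(model: str, *, provider: str = 'openai') -> ChatModel:
    """Return a chat model for the given provider, failing fast on unknown names."""
    if provider not in _PROVIDERS:
        raise ValueError(f'Unknown provider: {provider!r}')
    # The real factory would instantiate the matching browser-use chat class here.
    return ChatModel(provider=provider, model=model)


llm = create_llm('gemini-2.5-flash', provider='google')
print(llm.provider, llm.model)  # google gemini-2.5-flash
```

Failing fast on an unknown provider name keeps the CLI error close to the flag that caused it, rather than surfacing deep inside an agent run.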
2 changes: 1 addition & 1 deletion murphy/__init__.py
@@ -1,6 +1,6 @@
"""Murphy — AI-driven website evaluation powered by browser-use."""

__version__ = '1.0.0'
__version__ = '1.1.0'

from murphy.core.analysis import analyze_website as analyze_website
from murphy.core.execution import execute_tests as execute_tests
8 changes: 4 additions & 4 deletions murphy/api/auth.py
@@ -8,12 +8,12 @@

if TYPE_CHECKING:
from browser_use.browser.session import BrowserSession
from browser_use.llm import ChatOpenAI
from browser_use.llm import BaseChatModel

logger = logging.getLogger(__name__)


async def detect_auth_required(browser_session: BrowserSession, llm: ChatOpenAI, url: str) -> bool:
async def detect_auth_required(browser_session: BrowserSession, llm: BaseChatModel, url: str) -> bool:
"""Navigate to URL and use a passive LLM call to detect if login is required."""
logger.info('\n%s', '=' * 60)
logger.info('Checking if %s requires login...', url)
@@ -59,7 +59,7 @@ async def _get_page_text(browser_session: BrowserSession) -> tuple[str, str, str
return current_url, title, body


async def _llm_classify_page(llm: ChatOpenAI, url: str, title: str, body: str, *, mode: str = 'auth_detect') -> bool:
async def _llm_classify_page(llm: BaseChatModel, url: str, title: str, body: str, *, mode: str = 'auth_detect') -> bool:
"""Use a single LLM call (no agent) to classify page content.

Returns True if the page looks like authenticated/usable content.
@@ -97,7 +97,7 @@ async def _llm_classify_page(llm: ChatOpenAI, url: str, title: str, body: str, *

async def wait_for_manual_login(
browser_session: BrowserSession,
llm: ChatOpenAI,
llm: BaseChatModel,
url: str,
*,
already_navigated: bool = False,
22 changes: 17 additions & 5 deletions murphy/api/cli.py
@@ -51,8 +51,14 @@ def main() -> int:
parser.add_argument('--features', help='Path to existing features markdown (skips analysis, goes to test generation)')
parser.add_argument('--plan', help='Path to existing YAML test plan (skips analysis + test generation)')
parser.add_argument('--max-tests', type=int, default=8, help='Max test scenarios (default: 8)')
parser.add_argument('--model', default='gpt-5-mini', help='OpenAI model for agent tasks (default: gpt-5-mini)')
parser.add_argument('--judge-model', default='gpt-5-mini', help='OpenAI model for judging verdicts (default: gpt-5-mini)')
parser.add_argument(
'--provider', default='openai', help='LLM provider (default: openai). e.g. google, anthropic, azure, mistral'
)
parser.add_argument(
'--model', default='gpt-5-mini', help='LLM model name as it appears in the provider docs (default: gpt-5-mini)'
)
parser.add_argument('--judge-provider', default=None, help='LLM provider for judging (defaults to --provider)')
parser.add_argument('--judge-model', default=None, help='LLM model for judging (defaults to --model)')
parser.add_argument('--output-dir', default='./murphy/output', help='Output directory for reports')
parser.add_argument(
'--open',
@@ -94,7 +100,6 @@ async def _async_main(args: argparse.Namespace) -> None:

from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
from browser_use.llm import ChatOpenAI
from murphy.api.auth import detect_auth_required, wait_for_manual_login
from murphy.browser.patches import apply as apply_patches
from murphy.core.analysis import analyze_website
@@ -104,6 +109,7 @@ async def _async_main(args: argparse.Namespace) -> None:
from murphy.io.features_io import read_features_markdown, write_features_markdown
from murphy.io.fixtures import ensure_dummy_fixture_files
from murphy.io.test_plan_io import load_test_plan, save_test_plan
from murphy.llm import create_llm
from murphy.models import WebsiteAnalysis

# Apply patches early (idempotent)
@@ -112,8 +118,14 @@
# Ensure dummy fixture files exist for upload testing
fixture_paths = ensure_dummy_fixture_files()

llm = ChatOpenAI(model=args.model)
judge_llm = ChatOpenAI(model=args.judge_model) if args.judge_model != args.model else None
llm = create_llm(args.model, provider=args.provider)
judge_provider = args.judge_provider or args.provider
judge_model = args.judge_model or args.model
judge_llm = (
create_llm(judge_model, provider=judge_provider)
if (judge_model != args.model or judge_provider != args.provider)
else None
)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

10 changes: 8 additions & 2 deletions murphy/api/request_models.py
@@ -23,6 +23,7 @@ class AnalyzeRequest(BaseModel):
url: str
category: str | None = None
goal: str | None = None
provider: str = 'openai'
model: str = 'gpt-5-mini'
webhook_url: str | None = None
async_mode: bool = Field(False, alias='async')
@@ -35,6 +36,7 @@ class GeneratePlanRequest(BaseModel):
analysis: Annotated[WebsiteAnalysis, BeforeValidator(_parse_json_string)]
max_tests: int = 8
goal: str | None = None
provider: str = 'openai'
model: str = 'gpt-5-mini'
webhook_url: str | None = None
async_mode: bool = Field(False, alias='async')
@@ -47,8 +49,10 @@ class ExecuteRequest(BaseModel):
test_plan: Annotated[TestPlan, BeforeValidator(_parse_json_string)] | None = None
evaluate_job_id: str | None = None
goal: str | None = None
provider: str = 'openai'
model: str = 'gpt-5-mini'
judge_model: str = 'gpt-5-mini'
judge_provider: str | None = None
judge_model: str | None = None
max_steps: int = 15
max_concurrent: int = 3
webhook_url: str | None = None
@@ -61,8 +65,10 @@ class EvaluateRequest(BaseModel):
url: str
goal: str | None = None
max_tests: int = 8
provider: str = 'openai'
model: str = 'gpt-5-mini'
judge_model: str = 'gpt-5-mini'
judge_provider: str | None = None
judge_model: str | None = None
async_mode: bool = Field(False, alias='async')
webhook_url: str | None = None

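Taken together, an `/execute` request body using the new fields might look like this (a sketch based on the model above; per the field defaults, the judge fields fall back to the agent provider and model when omitted):

```json
{
  "url": "https://example.com",
  "provider": "google",
  "model": "gemini-2.5-flash",
  "judge_provider": "openai",
  "judge_model": "gpt-5-mini",
  "max_steps": 15,
  "max_concurrent": 3,
  "async": true
}
```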
10 changes: 7 additions & 3 deletions murphy/api/rest.py
@@ -61,15 +61,17 @@ async def _core_analyze(req: AnalyzeRequest) -> dict[str, Any]:
"""Run website analysis. Returns serialized WebsiteAnalysis dict."""
from murphy.core.pipeline import run_analyze

analysis = await run_analyze(req.url, req.model, goal=req.goal)
analysis = await run_analyze(req.url, req.model, provider=req.provider, goal=req.goal)
return analysis.model_dump()


async def _core_generate_plan(req: GeneratePlanRequest) -> dict[str, Any]:
"""Generate test plan from analysis. Returns serialized TestPlan dict."""
from murphy.core.pipeline import run_generate_plan

test_plan = await run_generate_plan(req.url, req.analysis, req.model, req.max_tests, goal=req.goal)
test_plan = await run_generate_plan(
req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal
)
return test_plan.model_dump()


@@ -93,7 +95,9 @@ async def _core_execute(req: ExecuteRequest) -> dict[str, Any]:
req.url,
test_plan,
req.model,
provider=req.provider,
judge_model=req.judge_model,
judge_provider=req.judge_provider,
goal=req.goal,
max_steps=req.max_steps,
max_concurrent=req.max_concurrent,
@@ -105,7 +109,7 @@ async def _core_evaluate(req: EvaluateRequest) -> dict[str, Any]:
"""Run exploration-first evaluation: explore site → generate test plan."""
from murphy.core.pipeline import run_evaluate

test_plan = await run_evaluate(req.url, req.model, req.max_tests, goal=req.goal)
test_plan = await run_evaluate(req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal)
return test_plan.model_dump()


4 changes: 2 additions & 2 deletions murphy/core/analysis.py
@@ -4,7 +4,7 @@

from browser_use import Agent
from browser_use.browser.session import BrowserSession
from browser_use.llm import ChatOpenAI
from browser_use.llm import BaseChatModel
from murphy.models import WebsiteAnalysis
from murphy.prompts import build_analysis_prompt

@@ -13,7 +13,7 @@

async def analyze_website(
url: str,
llm: ChatOpenAI,
llm: BaseChatModel,
category: str | None = None,
goal: str | None = None,
browser_session: BrowserSession | None = None,