diff --git a/CLAUDE.md b/CLAUDE.md
index 1159923..c34ce47 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
## Status
-Foragent is at **milestone 5 shipped, v0.2 spec adopted, step 6 next**. Three capabilities are live (`fetch-page-title`, `extract-structured-data`, `post-to-site`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness pinned to `rockylhotka/rockbot-agent:0.8.5`. The governing spec is now `docs/foragent-specification.md` **v0.2** — read it before making non-trivial changes. v0.2 pivots Foragent to an agentic model: one generalist `browser-task` capability (built natively on `Microsoft.Playwright` NuGet — no MCP sidecar, no Stagehand port — see Appendix A #16) plus narrow fast-path specialists, with RockBot's `ISkillStore` + `ILongTermMemory` as the learning substrate. The v0.1 proposal document is archived at `docs/archive/foragent-spec-v0.2-proposal.md`. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md` step 4. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`.
+Foragent is at **milestone 6 shipped, step 7 next**. Four capabilities are live (`browser-task`, `fetch-page-title`, `extract-structured-data`, `post-to-site`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness pinned to `rockylhotka/rockbot-agent:0.8.5`. Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. The governing spec is `docs/foragent-specification.md` **v0.2**. Step 7 wires `ISkillStore` + `ILongTermMemory` priming; `post-to-site` is removed from the advertised skill list once `browser-task` + the learned bsky skill cover it. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`.
## Build / test
@@ -69,11 +69,13 @@ Key framework pieces Foragent uses today:
- `RockBot.A2A.IAgentTaskHandler` — the single per-agent extension point. `ForagentTaskHandler` (in `Foragent.Capabilities`) implements this and dispatches on `request.Skill`.
- `RockBot.A2A.Gateway.AddA2AHttpGateway` + `MapA2AHttpGateway` — the in-process HTTP surface. Published as NuGet in RockBot 0.8.4 (see `docs/framework-feedback.md`).
-Foragent requires an LLM (for `extract-structured-data` and future capabilities). The same `IChatClient` is registered both as a singleton (capabilities inject it directly) and via `AddRockBotChatClient` (satisfies the framework's mandatory registration). Config lives under `ForagentLlm` — separate from any rockbot-side `LLM` config so the two agents can point at different models. Program.cs fails fast at startup if `ForagentLlm:Endpoint`/`ModelId`/`ApiKey` are missing.
+Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any rockbot-side `LLM` config so the two agents can point at different models. Program.cs fails fast at startup if `ForagentLlm:Endpoint`/`ModelId`/`ApiKey` are missing. Starting step 6 the single configured model is wired via `AddRockBotTieredChatClients(low, balanced, high)` aliased to the same inner `IChatClient`; that one call registers both `IChatClient` (wrapped with `RockBotFunctionInvokingChatClient` for automatic tool invocation) and `TieredChatClientRegistry` (per spec §3.7). Don't also call `AddRockBotChatClient` — it would swap out the wrapped registration. Capabilities that want to escalate/de-escalate per request can resolve `TieredChatClientRegistry` and call `GetClient(ModelTier.Low|Balanced|High)`; none do today.
## Browser
-`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, plus `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read) for multi-step flows like login + post. The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`) and falls back to `
` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public.
+`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read) for multi-step flows like login + post, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `<body>` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public.
+
+`CreateSessionAsync(Func<Uri, bool> allowedHost, ...)` is the step-6 entry point for allowlist-scoped sessions. The factory installs a context-wide `RouteAsync("**/*", ...)` that aborts off-list document/subframe navigations before Playwright issues the request (spec §7.1). The no-argument overload accepts any host and stays available for specialists that enforce narrower rules elsewhere (e.g. `post-to-site` where the site id selects the host).
## Capabilities
@@ -84,6 +86,7 @@ Foragent requires an LLM (for `extract-structured-data` and future capabilities)
- `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it.
- `CapabilityInput.Parse` is the shared URL + description shim used by `fetch-page-title` and `extract-structured-data`. Capabilities with different input shapes (e.g. `post-to-site` needing `site` / `credentialId` / `content`) parse their own input near the capability — see `PostToSiteInput` in `PostToSiteCapability.cs`. Don't overload `CapabilityInput` for unrelated shapes.
- `post-to-site` dispatches to an `ISitePoster` keyed on `Site` (in `SitePosting/`). `BlueskySitePoster` is the only implementation today; add new sites by registering another `ISitePoster` in `AddForagentCapabilities()`. The capability never echoes exception messages from posters back to callers — they may contain credential material; operators read the full exception in logs.
+- `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). `BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). `BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability.
## Credentials
diff --git a/Directory.Packages.props b/Directory.Packages.props
index 64e171d..53db1f9 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -4,7 +4,7 @@
true
-
+
diff --git a/deploy/rockbot-seed/agent-trust.json b/deploy/rockbot-seed/agent-trust.json
index 0466fa6..10ffabe 100644
--- a/deploy/rockbot-seed/agent-trust.json
+++ b/deploy/rockbot-seed/agent-trust.json
@@ -2,7 +2,7 @@
{
"agentId": "Foragent",
"level": 4,
- "approvedSkills": ["fetch-page-title", "extract-structured-data", "post-to-site"],
+ "approvedSkills": ["browser-task", "fetch-page-title", "extract-structured-data", "post-to-site"],
"firstSeen": "2026-04-21T00:00:00+00:00",
"lastInteraction": "2026-04-21T00:00:00+00:00",
"interactionCount": 0
diff --git a/deploy/rockbot-seed/well-known-agents.json b/deploy/rockbot-seed/well-known-agents.json
index 6f6e3d4..35bcd03 100644
--- a/deploy/rockbot-seed/well-known-agents.json
+++ b/deploy/rockbot-seed/well-known-agents.json
@@ -8,6 +8,11 @@
"authHeaderName": "X-Api-Key",
"authHeaderValueBase64": "cm9ja2JvdC1jYWxscy1mb3JhZ2VudA==",
"skills": [
+ {
+ "id": "browser-task",
+ "name": "Browser Task (generalist)",
+ "description": "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. Input JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. allowedHosts is required and empty rejects. Returns a structured JSON result with status (done/failed/incomplete), summary, optional result, step count, and navigations."
+ },
{
"id": "fetch-page-title",
"name": "Fetch Page Title",
diff --git a/docker-compose.yml b/docker-compose.yml
index 0611c56..e7a5ac9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -61,7 +61,7 @@ services:
RabbitMq__VirtualHost: /
Gateway__AgentName: Foragent
Gateway__InternalAgentName: Foragent
- Gateway__Description: "Browser agent — fetch-page-title, extract-structured-data, post-to-site"
+ Gateway__Description: "Browser agent — browser-task (generalist), fetch-page-title, extract-structured-data, post-to-site"
# RockBot will call Foragent with header X-Api-Key: rockbot-calls-foragent
ApiKeys__rockbot-calls-foragent__AgentId: RockBot
ApiKeys__rockbot-calls-foragent__DisplayName: RockBot
diff --git a/docs/capabilities.md b/docs/capabilities.md
index 3aaaef2..31bd7c4 100644
--- a/docs/capabilities.md
+++ b/docs/capabilities.md
@@ -3,18 +3,84 @@
Foragent exposes browser operations as discrete A2A capabilities. Callers
invoke capabilities by name; Foragent handles the browser mechanics.
-## Planned initial capability set
+## Advertised capabilities (v0.2)
-- [ ] `fetch-page-content` — Navigate to a URL and return the page content
-- [ ] `extract-structured-data` — Extract structured data from a page using
- an LLM-assisted schema
-- [ ] `fill-form` — Fill and optionally submit an HTML form
-- [ ] `post-to-site` — Perform a multi-step posting action on a target site
-- [ ] `monitor-page` — Poll a page for a condition and notify when met
+- `browser-task` — **generalist**, spec §5.2. LLM-in-the-loop planner that
+ drives a real browser to accomplish a free-form intent. Shipped in
+ step 6.
+- `fetch-page-title` — specialist. Inherited from step 1/2.
+- `extract-structured-data` — specialist. Inherited from step 3.
+- `post-to-site` — specialist, credential-using. Inherited from step 4.
+ Scheduled for removal from the advertised list once step 7 lands
+ (`browser-task` + learned bsky skill subsume it).
+
+## `browser-task` input shape
+
+JSON in the first text part, or field-by-field metadata:
+
+```json
+{
+ "intent": "free-form description of what to accomplish",
+ "allowedHosts": ["bsky.app", "*.example.com", "*"],
+ "url": "optional absolute http(s) starting URL",
+ "credentialId": "optional broker reference",
+ "maxSteps": 60,
+ "maxSeconds": 120
+}
+```
+
+- `intent` — required. Free-form.
+- `allowedHosts` — required, non-empty (spec §7.1). An empty list is rejected.
+ Supports exact hosts, `*.domain` subdomain wildcards, and `*` for
+ unrestricted. Off-list navigations are aborted inside the browser
+ context before Playwright issues the request.
+- `url` — optional. If provided, must match the allowlist.
+- `credentialId` — optional. Resolved but not exposed to the planner in
+ step 6; reserved for a typed login tool in a later step.
+- `maxSteps` — default 60, ceiling 150. Enforced tool-side via
+ `BrowserTaskState.BudgetExhausted`; once exceeded, tools return a
+ "call done or fail" message and refuse further work.
+- `maxSeconds` — default 120, ceiling 600. Enforced via a linked
+ `CancellationTokenSource`.
+
+## `browser-task` output shape
+
+A JSON object in a single text part:
+
+```json
+{
+ "status": "done" | "failed" | "incomplete",
+ "summary": "one-sentence human-readable result",
+ "result": "optional structured result text (e.g. extracted value)",
+ "steps": 7,
+ "navigations": ["https://host/path", "..."]
+}
+```
+
+`incomplete` means the budget was exhausted before `done`/`fail` was
+called.
+
+## `browser-task` tool surface
+
+Exposed to the planner as `AIFunction`s (built with `AIFunctionFactory.Create`)
+passed to `IChatClient` via `ChatOptions.Tools` (spec Appendix A #16 — no MCP
+sidecar). Refs are Playwright aria-ref ids and are valid only within the snapshot they came from.
+
+- `snapshot()` — ref-annotated aria tree of the current page.
+- `navigate(url)` — load a URL; host must be on the allowlist.
+- `click(ref)` — click by ref.
+- `type(ref, text)` — fill by ref.
+- `wait_for(ref, timeoutSeconds?)` — wait for visibility.
+- `done(summary, result?)` — mark complete.
+- `fail(reason)` — mark failed.
## Design principles
-- Capabilities operate at the task level, not at the DOM-operation level
-- Each capability invocation gets an isolated browser context
+- Capabilities operate at the task level, not at the DOM-operation level.
+- Each capability invocation gets an isolated `BrowserContext` (spec §3.5).
+- Per-task host allowlists are mandatory (spec §7.1).
- Credential references are passed by ID; values are resolved inside
- Foragent and never cross A2A boundaries
+ Foragent and never cross A2A boundaries (spec §6.1).
+- Prohibited capabilities — account creation, financial transactions,
+ modifying security permissions — are out of scope regardless of
+ implementation ease (spec §7.3).
diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md
index 078649d..c02a69d 100644
--- a/docs/framework-feedback.md
+++ b/docs/framework-feedback.md
@@ -254,3 +254,88 @@ in `.env`.
discovery) and `GatewayOptions.Skills` (HTTP agent-card endpoint) are independent. Our
Program.cs populates both from a single `ForagentCapabilities.Skills` array — a workaround,
not a fix. The framework should treat one as authoritative and derive the other.
+
+## Step 6 — baseline `browser-task` generalist
+
+### Framework observations
+
+- **`AddRockBotTieredChatClients` obviates `AddRockBotChatClient` but this
+ is undocumented.** Calling `AddRockBotTieredChatClients(low, balanced,
+ high)` registers an `IChatClient` singleton whose factory already wraps
+ the inner client with `RockBotFunctionInvokingChatClient`, plus a
+  `TieredChatClientRegistry` singleton. Callers who previously used
+  `AddRockBotChatClient(client)` no longer need it — but that's not
+  spelled out anywhere. If both are called, the later registration
+  silently wins (standard `Microsoft.Extensions.DependencyInjection`
+  behavior), which can swap the wrapped client for an unwrapped one
+  depending on call order. Docs gap; candidate framework fix is either
+  a guard that throws or collapsing both methods into one overload.
+
+- **No per-request iteration cap surface on the function-invoking chat
+ client.** `FunctionInvokingChatClient.MaximumIterationsPerRequest` is
+ an *instance* property, and the wrapped client is built inside
+ `AddRockBotTieredChatClients` — the caller has no hook to set it per
+ `GetResponseAsync` invocation. `ChatOptions.AdditionalProperties`
+ lookup keys are not honored. `ModelBehavior.MaxToolIterationsOverride`
+ exists on the RockBot side but routes through YAML behavior config,
+ not per-call. Foragent enforces its step budget tool-side (each tool
+ checks `BrowserTaskState.BudgetExhausted`); wall-clock cancellation
+ is the real safety net. Framework candidate: either honor a standard
+ `ChatOptions.AdditionalProperties["MaximumIterationsPerRequest"]`
+ convention or expose the FICC instance via DI so consumers can
+ configure it.
+
+- **`Microsoft.Playwright` 1.50 (pinned since step 2) does not expose
+ the Ai aria-snapshot mode.** Step 6 requires ref-annotated snapshots
+ (`[ref=eN]` + `aria-ref=eN` locator resolution). That gating moved
+ from a boolean `Ref` option to `Mode = AriaSnapshotMode.Ai` sometime
+ between 1.52 and the current 1.59 C# bindings. Foragent bumped the
+ pin to 1.59.0; container base image
+ (`mcr.microsoft.com/playwright/dotnet:v1.50.0-noble`) will need the
+ matching bump in the first release that ships browser-task. Not a
+ framework-issue per se, but relevant to RockBot's "v1 Foragent" story
+ and to anyone using the framework + Playwright together.
+
+- **Aria-ref lifetime is a contract the planner must respect.** Refs are
+ valid only within the snapshot they came from. The tool surface
+ documents this in the `snapshot` description; if the framework ever
+ ships a "browser task runner" helper of its own (candidate
+ `RockBot.Browser.Planner`?), it should bake the "re-snapshot after
+ mutation" rule into a first-class contract rather than leaving it to
+ prompt text.
+
+- **`AIFunctionFactory.Create(Delegate, name:, description:, …)`
+ descriptions only surface the method-level `[Description]`.** Parameter
+ descriptions must be on parameters via `[Description]` — easy to miss
+ without the reminder. Worked as expected; noting for anyone building
+ similar tool surfaces.
+
+- **RockBot's `RockBotFunctionInvokingChatClient` auto-invokes tools end
+ to end in a single `GetResponseAsync` call.** This is exactly what the
+ planner wants; no custom loop needed. One quirk: the FICC keeps
+ iterating as long as the model emits tool calls, with no public
+ step cap (see above). Combined with aria-ref lifetimes, a model that
+ thrashes on stale refs can burn budget fast. Step 7's learning
+ substrate is the intended mitigation.
+
+### Unaided floor measurement (2026-04-22)
+
+First end-to-end benchmark against the operator's Azure AI Foundry
+Balanced model (no learned skills, no priming — the "unaided" floor the
+spec §9.1 step 6 calls for):
+
+| Scenario | Result | Wall-clock |
+|---|---|---|
+| Click-through (home → link → read destination value) | ✅ done | 5 s |
+| Form submit (fill name + textarea → submit → read confirmation) | ✅ done | 8 s |
+| Multi-page nav (index → intro → chapter-2 → read bolded answer) | ✅ done | 7 s |
+
+3 / 3 passed on first attempt. Establishes the baseline Foragent must
+not regress against once step 7 adds priming. Re-run this set whenever
+the planner prompt, tool surface, or model pin changes.
+
+### Not yet exercised
+
+- **`TieredChatClientRegistry.GetClient(ModelTier.Low/High)` is wired
+ but no capability resolves it yet.** All three tiers currently alias
+ to the same model. Tier-aware capability code lands as models diverge.
diff --git a/src/Foragent.Agent/Program.cs b/src/Foragent.Agent/Program.cs
index 19ccfa2..b6ca3f9 100644
--- a/src/Foragent.Agent/Program.cs
+++ b/src/Foragent.Agent/Program.cs
@@ -8,6 +8,7 @@
using RockBot.A2A.Gateway;
using RockBot.A2A.Gateway.Auth;
using RockBot.Host;
+using RockBot.Llm;
using RockBot.Messaging.RabbitMQ;
var builder = WebApplication.CreateBuilder(args);
@@ -36,11 +37,17 @@
builder.Services.AddRockBotRabbitMq(opts =>
builder.Configuration.GetSection("RabbitMq").Bind(opts));
-// ── Chat client — Foragent capabilities use this directly. Registered with
-// RockBot too so the framework's startup requirement is satisfied. ───────
-
-builder.Services.AddSingleton(foragentChatClient);
-builder.Services.AddRockBotChatClient(foragentChatClient);
+// ── Tiered chat clients (spec §3.7, Appendix #17). One configured model is
+//    aliased across Low/Balanced/High; capabilities that inject IChatClient
+//    receive the Balanced tier. Tier-aware capabilities may resolve
+//    TieredChatClientRegistry to escalate/de-escalate (none do today). The
+//    factory inside AddRockBotTieredChatClients already wraps with
+//    RockBotFunctionInvokingChatClient — do not also call AddRockBotChatClient;
+//    a second registration could replace the wrapped client.
+builder.Services.AddRockBotTieredChatClients(
+ lowInnerClient: foragentChatClient,
+ balancedInnerClient: foragentChatClient,
+ highInnerClient: foragentChatClient);
// ── Agent host + A2A bus subscription ───────────────────────────────────────
diff --git a/src/Foragent.Browser/IBrowserSession.cs b/src/Foragent.Browser/IBrowserSession.cs
index f8d07c8..c8318b9 100644
--- a/src/Foragent.Browser/IBrowserSession.cs
+++ b/src/Foragent.Browser/IBrowserSession.cs
@@ -33,6 +33,16 @@ public interface IBrowserSession : IAsyncDisposable
/// finished, dispose the session when the task ends.
///
Task OpenPageAsync(Uri url, CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Opens a page suited to an LLM-in-the-loop planner: exposes
+    /// ref-annotated aria snapshots (<see cref="IBrowserAgentPage.AriaSnapshotAsync"/>)
+    /// and ref-based interactions resolved via Playwright's <c>aria-ref=eN</c>
+    /// locator dialect. No initial URL is required; the planner drives
+    /// navigation through its own tool calls. Used by the
+    /// <c>browser-task</c> generalist (spec §5.2).
+    /// </summary>
+    Task<IBrowserAgentPage> OpenAgentPageAsync(CancellationToken cancellationToken = default);
}
///
@@ -74,6 +84,56 @@ Task WaitForSelectorAsync(
Task GetTextAsync(string selector, CancellationToken cancellationToken = default);
}
+/// <summary>
+/// Ref-based page surface for LLM-in-the-loop planners. Each call to
+/// <see cref="AriaSnapshotAsync"/> returns a tree annotated with
+/// <c>[ref=eN]</c> ids; <see cref="ClickByRefAsync"/>, <see cref="TypeByRefAsync"/>,
+/// and <see cref="WaitForRefAsync"/> resolve those refs via Playwright's
+/// <c>aria-ref=eN</c> locator dialect. Refs are valid only within the
+/// snapshot they came from — the planner must re-snapshot after any
+/// mutation (spec §9.1 step 6, decision D1 — no cache).
+/// </summary>
+public interface IBrowserAgentPage : IAsyncDisposable
+{
+    /// <summary>The current URL, after any redirects and client-side navigations.</summary>
+    Uri CurrentUrl { get; }
+
+    /// <summary>The current page title, or <c>null</c> if absent.</summary>
+    Task<string?> GetTitleAsync(CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Navigates to <paramref name="url"/>. The implementation must respect
+    /// the session's allowlist — off-list navigations fail with
+    /// <see cref="InvalidOperationException"/> before Playwright issues the request.
+    /// </summary>
+    Task NavigateAsync(Uri url, CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Returns a ref-annotated aria snapshot of the current page. Each
+    /// interactive element carries <c>[ref=eN]</c>; planners pass the ref
+    /// back to <see cref="ClickByRefAsync"/>/<see cref="TypeByRefAsync"/>.
+    /// </summary>
+    Task<string> AriaSnapshotAsync(CancellationToken cancellationToken = default);
+
+    /// <summary>Clicks the element identified by <paramref name="elementRef"/> (e.g. <c>e12</c>).</summary>
+    Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Fills the element identified by <paramref name="elementRef"/>. Used for
+    /// input/textarea/contenteditable. Sensitive values must not be logged.
+    /// </summary>
+    Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Waits until the element identified by <paramref name="elementRef"/> is
+    /// visible. Throws on timeout.
+    /// </summary>
+    Task WaitForRefAsync(
+        string elementRef,
+        TimeSpan? timeout = null,
+        CancellationToken cancellationToken = default);
+}
+
///
/// A compact rendering of a page suitable for LLM prompting.
///
@@ -97,4 +157,15 @@ public enum PageSnapshotSource
public interface IBrowserSessionFactory
{
Task CreateSessionAsync(CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Creates a session whose navigations and subframe loads are restricted to
+    /// hosts accepted by <paramref name="allowedHost"/>. An off-list request is
+    /// aborted inside the browser context before Playwright issues it
+    /// (spec §7.1). Passing a predicate that always returns <c>false</c>
+    /// effectively rejects all navigation.
+    /// </summary>
+    Task<IBrowserSession> CreateSessionAsync(
+        Func<Uri, bool> allowedHost,
+        CancellationToken cancellationToken = default);
}
diff --git a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs
index 5bf333e..ff7ace2 100644
--- a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs
+++ b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs
@@ -5,19 +5,53 @@ namespace Foragent.Browser;
internal sealed class PlaywrightBrowserSessionFactory(
PlaywrightBrowserHost host) : IBrowserSessionFactory
{
+    /// <summary>Creates an unrestricted session by delegating to the allowlist overload with an accept-all predicate.</summary>
+    public Task<IBrowserSession> CreateSessionAsync(CancellationToken cancellationToken = default) =>
+        CreateSessionAsync(static _ => true, cancellationToken);
+
public async Task CreateSessionAsync(
+        Func allowedHost,
CancellationToken cancellationToken = default)
{
+        ArgumentNullException.ThrowIfNull(allowedHost);
var context = await host.Browser.NewContextAsync();
-        return new PlaywrightBrowserSession(context);
+
+        // Install a context-wide route handler that aborts off-list navigations and subframe
+        // loads (spec §7.1). Only requests whose resource type is "document" or "subframe" are
+        // checked against allowedHost; all other resource loads (images, styles, ...) are continued
+        // so pages can still render. NOTE(review): Playwright's documented resource types do not list "subframe" — confirm that arm is reachable.
+        await context.RouteAsync("**/*", async route =>
+        {
+            var request = route.Request;
+            var resourceType = request.ResourceType;
+            if (resourceType is not ("document" or "subframe"))
+            {
+                await route.ContinueAsync();
+                return;
+            }
+
+            if (!Uri.TryCreate(request.Url, UriKind.Absolute, out var target) ||
+                !allowedHost(target))
+            {
+                await route.AbortAsync("accessdenied");
+                return;
+            }
+
+            await route.ContinueAsync();
+        });
+
+        return new PlaywrightBrowserSession(context, allowedHost);
}
}
-internal sealed class PlaywrightBrowserSession(IBrowserContext context) : IBrowserSession
+internal sealed class PlaywrightBrowserSession(
+ IBrowserContext context,
+ Func allowedHost) : IBrowserSession
{
public async Task FetchPageTitleAsync(Uri url, CancellationToken cancellationToken = default)
{
cancellationToken.ThrowIfCancellationRequested();
+ EnsureAllowed(url);
var page = await context.NewPageAsync();
try
{
@@ -41,6 +75,7 @@ internal sealed class PlaywrightBrowserSession(IBrowserContext context) : IBrows
public async Task CapturePageSnapshotAsync(Uri url, CancellationToken cancellationToken = default)
{
cancellationToken.ThrowIfCancellationRequested();
+ EnsureAllowed(url);
var page = await context.NewPageAsync();
try
{
@@ -76,6 +111,7 @@ public async Task CapturePageSnapshotAsync(Uri url, CancellationTo
public async Task OpenPageAsync(Uri url, CancellationToken cancellationToken = default)
{
cancellationToken.ThrowIfCancellationRequested();
+ EnsureAllowed(url);
var page = await context.NewPageAsync();
try
{
@@ -96,8 +132,21 @@ public async Task OpenPageAsync(Uri url, CancellationToken cancell
}
}
+    /// <summary>Opens a new blank page in this session's context and wraps it for ref-based planner use.</summary>
+    public async Task<IBrowserAgentPage> OpenAgentPageAsync(CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        return new PlaywrightBrowserAgentPage(await context.NewPageAsync(), allowedHost);
+    }
+
public ValueTask DisposeAsync() => new(context.CloseAsync());
+    /// <summary>Throws <see cref="InvalidOperationException"/> when the allowlist predicate rejects <paramref name="url"/>.</summary>
+    private void EnsureAllowed(Uri url)
+    {
+        if (allowedHost(url)) return;
+        throw new InvalidOperationException($"Host '{url.Host}' is not in the session's allowlist.");
+    }
}
internal sealed class PlaywrightBrowserPage(IPage page) : IBrowserPage
@@ -163,3 +212,71 @@ public Task GetUrlAsync(CancellationToken cancellationToken = default)
public ValueTask DisposeAsync() => new(page.CloseAsync());
}
+
+/// <summary>Playwright-backed <see cref="IBrowserAgentPage"/>; element refs are resolved with the <c>aria-ref=eN</c> selector.</summary>
+internal sealed class PlaywrightBrowserAgentPage(
+    IPage page,
+    Func<Uri, bool> allowedHost) : IBrowserAgentPage
+{
+    /// <summary>Current page URL; <c>about:blank</c> when the page's URL string is not an absolute URI.</summary>
+    public Uri CurrentUrl => Uri.TryCreate(page.Url, UriKind.Absolute, out var u) ? u : new Uri("about:blank");
+
+    /// <summary>Returns the page title, or <c>null</c> when it is null or empty.</summary>
+    public async Task<string?> GetTitleAsync(CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        var title = await page.TitleAsync();
+        return string.IsNullOrEmpty(title) ? null : title;
+    }
+
+    /// <summary>Checks the allowlist predicate, then navigates; throws <see cref="InvalidOperationException"/> for an off-list host or a missing/non-OK response.</summary>
+    public async Task NavigateAsync(Uri url, CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        if (!allowedHost(url))
+            throw new InvalidOperationException($"Host '{url.Host}' is not in the session's allowlist.");
+        var response = await page.GotoAsync(
+            url.ToString(), new PageGotoOptions { WaitUntil = WaitUntilState.DOMContentLoaded });
+        if (response is null || !response.Ok)
+            throw new InvalidOperationException(
+                $"Navigation to {url} returned status {response?.Status.ToString() ?? "no response"}.");
+    }
+
+    /// <summary>Returns the ref-annotated aria snapshot of <c>body</c>, or an empty string when Playwright returns <c>null</c>.</summary>
+    public async Task<string> AriaSnapshotAsync(CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        // Ref-annotated aria snapshot — Playwright's "AI" mode emits [ref=eN]
+        // identifiers that resolve via the aria-ref=eN locator dialect
+        // (spec §9.1 step 6). In the 1.59 C# bindings this is gated behind
+        // AriaSnapshotMode.Ai rather than a boolean Ref option.
+        var snapshot = await page.Locator("body").AriaSnapshotAsync(
+            new LocatorAriaSnapshotOptions { Mode = AriaSnapshotMode.Ai });
+        return snapshot ?? string.Empty;
+    }
+
+    /// <summary>Clicks the element behind <paramref name="elementRef"/>; async so cancellation faults the returned task.</summary>
+    public async Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        await page.Locator($"aria-ref={elementRef}").ClickAsync();
+    }
+
+    /// <summary>Fills the element behind <paramref name="elementRef"/> with <paramref name="text"/>; the value is not logged here.</summary>
+    public async Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        await page.Locator($"aria-ref={elementRef}").FillAsync(text);
+    }
+
+    /// <summary>Waits until the referenced element is visible; a <c>null</c> timeout is passed through so Playwright applies its own default.</summary>
+    public async Task WaitForRefAsync(string elementRef, TimeSpan? timeout = null, CancellationToken cancellationToken = default)
+    {
+        cancellationToken.ThrowIfCancellationRequested();
+        await page.Locator($"aria-ref={elementRef}").WaitForAsync(new LocatorWaitForOptions
+            { State = WaitForSelectorState.Visible, Timeout = (float?)timeout?.TotalMilliseconds });
+    }
+
+    /// <summary>Closes the underlying Playwright page.</summary>
+    public ValueTask DisposeAsync() => new(page.CloseAsync());
+}
diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs
new file mode 100644
index 0000000..55feadf
--- /dev/null
+++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs
@@ -0,0 +1,186 @@
+using System.Text;
+using System.Text.Json;
+using Foragent.Browser;
+using Foragent.Credentials;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.Logging;
+using RockBot.A2A;
+
+namespace Foragent.Capabilities.BrowserTask;
+
+/// <summary>
+/// The generalist <c>browser-task</c> capability (spec §5.2). Runs an
+/// LLM-in-the-loop planner over a small ref-based tool surface against a
+/// per-task agent page opened from a fresh browser session. This is
+/// Foragent's primary capability — specialists exist only where
+/// deterministic, programmatic callers benefit from a typed shape.
+/// </summary>
+/// <remarks>
+/// v0.2 step 6 scope: no learning substrate, no credentials injection into
+/// tools (credential id is acknowledged but unused beyond audit logging —
+/// step 7 wires <c>ISkillStore</c> + <c>ILongTermMemory</c> priming, later
+/// steps expose credentials to the planner through a typed tool).
+/// </remarks>
+public sealed class BrowserTaskCapability(
+ IBrowserSessionFactory browserFactory,
+ IChatClient chatClient,
+ ICredentialBroker credentialBroker,
+ ILogger logger) : ICapability
+{
+ /// <summary>
+ /// Skill descriptor advertised for this capability; its description
+ /// embeds the expected JSON input shape.
+ /// </summary>
+ public static AgentSkill SkillDefinition { get; } = new()
+ {
+ Id = "browser-task",
+ Name = "Browser Task (generalist)",
+ Description = "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. "
+ + "Input: JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. "
+ + "Returns a short summary plus optional structured result string."
+ };
+
+ // System message sent to the planner model on every run: lists the
+ // available tools and the rules the model is asked to follow.
+ private const string SystemPrompt = """
+ You drive a real web browser to accomplish a task on behalf of another agent.
+
+ You have these tools:
+ - snapshot() — returns a ref-annotated aria tree of the current page. ALWAYS call this first, and again after any click/type/navigate, because refs expire when the page mutates.
+ - navigate(url) — load a URL. The URL's host must be on the task's allowlist.
+ - click(ref) — click an element by its [ref=eN] id from the latest snapshot.
+ - type(ref, text) — fill an input by ref with the given text.
+ - wait_for(ref, timeoutSeconds?) — wait for an element to be visible.
+ - done(summary, result?) — call exactly once when the task is complete. After calling done, return a short final message and DO NOT emit further tool calls.
+ - fail(reason) — call if the task cannot be completed. After calling fail, stop emitting tool calls.
+
+ Rules:
+ - Stay on the allowlisted hosts the caller provided. Off-list navigations are rejected.
+ - If an element you want is not in the latest snapshot, re-snapshot rather than guessing a ref.
+ - Never enter credentials or secrets yourself — if the task needs authentication, call fail and explain.
+ - Be efficient: each tool call counts toward a step budget.
+ - When the task is done, call done() with a concise summary. If the caller asked for a specific value (e.g. a page title), include it as the result argument.
+ """;
+
+ /// <summary>Identifier of the skill this capability handles.</summary>
+ public string SkillId => SkillDefinition.Id;
+ /// <summary>The skill descriptor; same instance as <see cref="SkillDefinition"/>.</summary>
+ public AgentSkill Skill => SkillDefinition;
+
+ /// <summary>
+ /// Parses the request, optionally resolves the referenced credential,
+ /// then opens a browser session and agent page and hands the planner
+ /// loop to the injected chat client with the browser tools attached.
+ /// </summary>
+ /// <param name="request">Incoming task request; parsed by <c>BrowserTaskInput.Parse</c>.</param>
+ /// <param name="context">Supplies the caller's cancellation token via its message context.</param>
+ /// <returns>
+ /// An error result for invalid input, an unknown credential id, or an
+ /// unexpected exception; otherwise the JSON payload produced by
+ /// <c>BuildResult</c>.
+ /// </returns>
+ /// <remarks>
+ /// Cancellation of the caller's token is rethrown rather than converted
+ /// into a result.
+ /// </remarks>
+ public async Task ExecuteAsync(AgentTaskRequest request, AgentTaskContext context)
+ {
+ var ct = context.MessageContext.CancellationToken;
+ var input = BrowserTaskInput.Parse(request);
+
+ if (input.Error is not null)
+ return CapabilityResult.Error(request, input.Error);
+
+ // credentialId is accepted by the input shape but not consumed by the
+ // planner in step 6. Resolve it so we fail fast (and audit-log access)
+ // if the caller references an unknown id. A future step exposes a
+ // typed login tool that actually uses the resolved values.
+ if (!string.IsNullOrWhiteSpace(input.CredentialId))
+ {
+ try
+ {
+ _ = await credentialBroker.ResolveAsync(input.CredentialId!, ct);
+ }
+ catch (CredentialNotFoundException ex)
+ {
+ return CapabilityResult.Error(request, $"Credential '{ex.CredentialId}' is not configured.");
+ }
+ }
+
+ // Wall-clock budget: budgetCts fires after MaxSeconds; linkedCts
+ // cancels when either the budget or the caller's token does.
+ using var budgetCts = new CancellationTokenSource(TimeSpan.FromSeconds(input.MaxSeconds));
+ using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, budgetCts.Token);
+
+ var state = new BrowserTaskState();
+
+ try
+ {
+ await using var session = await browserFactory.CreateSessionAsync(input.Allowlist!.IsAllowed, linkedCts.Token);
+ await using var page = await session.OpenAgentPageAsync(linkedCts.Token);
+
+ // Optional starting URL; Parse has already checked it against the allowlist.
+ if (input.Url is not null)
+ await page.NavigateAsync(input.Url, linkedCts.Token);
+
+ state.MaxSteps = input.MaxSteps;
+ var tools = new BrowserTaskTools(page, state, input.Allowlist!, logger).BuildFunctions();
+
+ var messages = new List
+ {
+ new(ChatRole.System, SystemPrompt),
+ new(ChatRole.User, BuildUserPrompt(input))
+ };
+
+ var options = new ChatOptions
+ {
+ Tools = [.. tools],
+ ToolMode = ChatToolMode.Auto
+ // Step budget is enforced tool-side (BrowserTaskState) and
+ // wall-clock via linkedCts. The function-invoking chat client
+ // does not currently expose a per-request iteration cap that
+ // works through ChatOptions — noted in framework-feedback.
+ };
+
+ // The IChatClient we inject is the RockBot-wrapped
+ // FunctionInvokingChatClient (see Program.cs tiered registration).
+ // It runs the full model ↔ tool loop internally and returns the
+ // final assistant response when the model stops emitting tool
+ // calls or the iteration cap trips.
+ try
+ {
+ // The final assistant text is discarded; the outcome is read
+ // from the state that the done/fail tools populate.
+ _ = await chatClient.GetResponseAsync(messages, options, linkedCts.Token);
+ }
+ catch (OperationCanceledException) when (budgetCts.IsCancellationRequested && !ct.IsCancellationRequested)
+ {
+ // Budget timeout (not caller cancellation): log and fall through
+ // so whatever state was reached is still reported.
+ logger.LogInformation(
+ "browser-task budget exhausted after {Seconds}s / {Steps} step(s)",
+ input.MaxSeconds, state.Steps);
+ }
+
+ return BuildResult(request, input, state);
+ }
+ catch (OperationCanceledException) when (ct.IsCancellationRequested)
+ {
+ throw;
+ }
+ // NOTE(review): if the budget timer fires during session creation or
+ // the initial navigation (outside the inner try) and that surfaces as
+ // an OperationCanceledException, it lands here and is reported as a
+ // generic error rather than an "incomplete" payload — confirm intended.
+ catch (Exception ex)
+ {
+ logger.LogWarning(ex, "browser-task failed after {Steps} step(s)", state.Steps);
+ return CapabilityResult.Error(request, $"Browser task failed: {ex.Message}");
+ }
+ }
+
+ /// <summary>
+ /// Builds the user message for the planner: the intent, the optional
+ /// starting URL, the allowlist patterns, the step/time budget, and a
+ /// note when a credential id was supplied.
+ /// </summary>
+ private static string BuildUserPrompt(BrowserTaskInput input)
+ {
+ var sb = new StringBuilder();
+ sb.Append("Intent: ").AppendLine(input.Intent);
+ if (input.Url is not null)
+ sb.Append("Starting URL: ").AppendLine(input.Url.ToString());
+ sb.Append("Allowed hosts: ").AppendLine(string.Join(", ", input.Allowlist!.Patterns));
+ sb.Append("Step budget: ").Append(input.MaxSteps).Append(" steps / ").Append(input.MaxSeconds).AppendLine("s wall-clock.");
+ if (!string.IsNullOrWhiteSpace(input.CredentialId))
+ sb.AppendLine("A credential id was provided but is not yet exposed as a tool. If authentication is required, call fail().");
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Serializes the final task state to a JSON payload with the fields
+ /// <c>status</c>, <c>summary</c>, <c>result</c>, <c>steps</c> and
+ /// <c>navigations</c>, and wraps it in a completed result.
+ /// </summary>
+ /// <remarks>
+ /// <c>status</c> is <c>done</c> when the done flag is set, otherwise
+ /// <c>failed</c> when the failed flag is set, otherwise
+ /// <c>incomplete</c>. Done takes precedence if both flags are set.
+ /// </remarks>
+ private static AgentTaskResult BuildResult(
+ AgentTaskRequest request,
+ BrowserTaskInput input,
+ BrowserTaskState state)
+ {
+ // Structured JSON payload so callers (usually other agents) can parse
+ // success vs. failure reliably; the summary field is the primary
+ // human-readable signal.
+ var payload = new
+ {
+ status = state.IsDone ? "done" : state.IsFailed ? "failed" : "incomplete",
+ summary = state.IsDone
+ ? state.Summary
+ : state.IsFailed
+ ? state.FailureReason
+ : $"Task did not terminate within {input.MaxSteps} steps / {input.MaxSeconds}s.",
+ result = state.Result,
+ steps = state.Steps,
+ navigations = state.Navigations.Select(u => u.ToString()).ToArray()
+ };
+ return CapabilityResult.Completed(request, JsonSerializer.Serialize(payload, SerializerOptions));
+ }
+
+ // Compact (non-indented) JSON for the result payload.
+ private static readonly JsonSerializerOptions SerializerOptions = new()
+ {
+ WriteIndented = false
+ };
+}
diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs
new file mode 100644
index 0000000..cb49508
--- /dev/null
+++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs
@@ -0,0 +1,137 @@
+using System.Text.Json;
+using RockBot.A2A;
+
+namespace Foragent.Capabilities.BrowserTask;
+
+/// <summary>
+/// Parsed and validated input for the <c>browser-task</c> capability
+/// (spec §5.2).
+/// </summary>
+/// <remarks>
+/// Accepts either a JSON object in the first non-blank text part or
+/// field-by-field metadata on the message/request. For <c>intent</c>,
+/// <c>url</c>, <c>credentialId</c> and <c>allowedHosts</c>, metadata
+/// overrides JSON when both are present, and message metadata is consulted
+/// before request metadata. A text part that does not start with <c>{</c>
+/// is taken verbatim as the intent. Shape:
+/// <list type="bullet">
+/// <item><description><c>intent</c> — required. Free-form description of what to do.</description></item>
+/// <item><description><c>allowedHosts</c> — required. Array of host patterns (comma-separated when given as metadata). Empty rejects.</description></item>
+/// <item><description><c>url</c> — optional starting URL (absolute http/https, must match the allowlist).</description></item>
+/// <item><description><c>credentialId</c> — optional credential reference.</description></item>
+/// <item><description><c>maxSteps</c> — optional int (JSON only); default 60, clamped to 1..150.</description></item>
+/// <item><description><c>maxSeconds</c> — optional int (JSON only); default 120, clamped to 1..600.</description></item>
+/// </list>
+/// When parsing fails, <c>Error</c> carries the message, the reference
+/// fields are <c>null</c> and the budgets hold their defaults.
+/// </remarks>
+internal readonly record struct BrowserTaskInput(
+    string? Intent,
+    Uri? Url,
+    string? CredentialId,
+    HostAllowlist? Allowlist,
+    int MaxSteps,
+    int MaxSeconds,
+    string? Error)
+{
+    public const int DefaultMaxSteps = 60;
+    public const int CeilingMaxSteps = 150;
+    public const int DefaultMaxSeconds = 120;
+    public const int CeilingMaxSeconds = 600;
+
+    /// <summary>
+    /// Extracts and validates the task input from <paramref name="request"/>.
+    /// </summary>
+    /// <param name="request">The incoming task request (text parts plus metadata).</param>
+    /// <returns>
+    /// A populated input with <c>Error</c> set to <c>null</c> on success;
+    /// otherwise an input whose <c>Error</c> describes the problem. Invalid
+    /// input is reported through <c>Error</c>, including JSON fields of the
+    /// wrong type, rather than by throwing.
+    /// </returns>
+    public static BrowserTaskInput Parse(AgentTaskRequest request)
+    {
+        string? intent = null;
+        string? url = null;
+        string? credentialId = null;
+        List<string>? allowedHosts = null;
+        int? maxSteps = null;
+        int? maxSeconds = null;
+
+        var text = request.Message.Parts
+            .Where(p => p.Kind == "text")
+            .Select(p => p.Text)
+            .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t))
+            ?.Trim();
+
+        if (!string.IsNullOrEmpty(text) && text.StartsWith('{'))
+        {
+            try
+            {
+                using var doc = JsonDocument.Parse(text);
+                var root = doc.RootElement;
+                if (root.TryGetProperty("intent", out var i)) intent = i.GetString();
+                if (root.TryGetProperty("url", out var u)) url = u.GetString();
+                if (root.TryGetProperty("credentialId", out var c)) credentialId = c.GetString();
+                if (root.TryGetProperty("allowedHosts", out var h) && h.ValueKind == JsonValueKind.Array)
+                    allowedHosts = [.. h.EnumerateArray().Select(e => e.GetString() ?? string.Empty)];
+                if (root.TryGetProperty("maxSteps", out var ms) && ms.TryGetInt32(out var msv))
+                    maxSteps = msv;
+                if (root.TryGetProperty("maxSeconds", out var mt) && mt.TryGetInt32(out var mtv))
+                    maxSeconds = mtv;
+            }
+            catch (Exception ex) when (ex is JsonException or InvalidOperationException)
+            {
+                // JsonException: the text is not well-formed JSON.
+                // InvalidOperationException: a field has the wrong JSON type —
+                // GetString() on a non-string or TryGetInt32() on a non-number
+                // throws it. Both are caller input errors, not server faults.
+                return Fail("Input must be a JSON object with intent, allowedHosts, and optional url/credentialId/maxSteps/maxSeconds.");
+            }
+        }
+        else if (!string.IsNullOrEmpty(text))
+        {
+            // Plain text (not JSON): the whole text is the intent.
+            intent = text;
+        }
+
+        // Metadata, when present and non-blank, wins over the JSON values.
+        intent = ReadMetadata(request, "intent") ?? intent;
+        url = ReadMetadata(request, "url") ?? url;
+        credentialId = ReadMetadata(request, "credentialId") ?? credentialId;
+        var hostsCsv = ReadMetadata(request, "allowedHosts");
+        if (hostsCsv is not null)
+            allowedHosts = [.. hostsCsv.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)];
+
+        if (string.IsNullOrWhiteSpace(intent))
+            return Fail("Missing 'intent' — a natural-language description of what to do.");
+
+        if (allowedHosts is null || allowedHosts.Count == 0)
+            return Fail("Missing 'allowedHosts' — browser-task requires an explicit allowlist (spec §7.1). Use ['*'] to accept any host.");
+
+        HostAllowlist allowlist;
+        try
+        {
+            allowlist = HostAllowlist.Parse(allowedHosts);
+        }
+        catch (ArgumentException ex)
+        {
+            return Fail(ex.Message);
+        }
+
+        Uri? parsedUrl = null;
+        if (!string.IsNullOrWhiteSpace(url))
+        {
+            if (!Uri.TryCreate(url, UriKind.Absolute, out parsedUrl) ||
+                (parsedUrl.Scheme != Uri.UriSchemeHttp && parsedUrl.Scheme != Uri.UriSchemeHttps))
+                return Fail($"'url' must be an absolute http(s) URL; got '{url}'.");
+            if (!allowlist.IsAllowed(parsedUrl))
+                return Fail($"Starting URL host '{parsedUrl.Host}' is not in the allowlist.");
+        }
+
+        // Out-of-range budgets are clamped rather than rejected.
+        var steps = Clamp(maxSteps ?? DefaultMaxSteps, 1, CeilingMaxSteps);
+        var seconds = Clamp(maxSeconds ?? DefaultMaxSeconds, 1, CeilingMaxSeconds);
+
+        return new BrowserTaskInput(intent, parsedUrl, credentialId, allowlist, steps, seconds, null);
+    }
+
+    /// <summary>Creates a failed parse result carrying only <paramref name="message"/>.</summary>
+    private static BrowserTaskInput Fail(string message) =>
+        new(null, null, null, null, DefaultMaxSteps, DefaultMaxSeconds, message);
+
+    /// <summary>Restricts <paramref name="value"/> to the inclusive range [<paramref name="min"/>, <paramref name="max"/>].</summary>
+    private static int Clamp(int value, int min, int max) =>
+        value < min ? min : value > max ? max : value;
+
+    /// <summary>
+    /// Looks up <paramref name="key"/> in the message metadata first, then in
+    /// the request metadata. Blank values are treated as absent.
+    /// </summary>
+    /// <returns>The first non-blank value found, or <c>null</c>.</returns>
+    private static string? ReadMetadata(AgentTaskRequest request, string key)
+    {
+        if (request.Message.Metadata is not null
+            && request.Message.Metadata.TryGetValue(key, out var msgValue)
+            && !string.IsNullOrWhiteSpace(msgValue))
+        {
+            return msgValue;
+        }
+        if (request.Metadata is not null
+            && request.Metadata.TryGetValue(key, out var reqValue)
+            && !string.IsNullOrWhiteSpace(reqValue))
+        {
+            return reqValue;
+        }
+        return null;
+    }
+}
diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs
new file mode 100644
index 0000000..121e939
--- /dev/null
+++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs
@@ -0,0 +1,204 @@
+using System.ComponentModel;
+using Foragent.Browser;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.Logging;
+
+namespace Foragent.Capabilities.BrowserTask;
+
+/// <summary>
+/// The tool surface exposed to the planner LLM for a single browser-task
+/// run. Methods are wrapped into <see cref="AIFunction"/> instances via
+/// <see cref="AIFunctionFactory"/>. The function-invoking chat client
+/// (see <c>AddRockBotTieredChatClients</c>) invokes these between model
+/// turns; no separate planner loop is needed. Instances are per-task —
+/// they close over one <see cref="IBrowserAgentPage"/> and one
+/// <see cref="BrowserTaskState"/>.
+/// </summary>
+/// <remarks>
+/// Every page-touching tool takes a <see cref="CancellationToken"/> and
+/// forwards it to the page. <see cref="AIFunctionFactory"/> binds such
+/// parameters to the invocation's token and leaves them out of the schema
+/// shown to the model, so the capability's wall-clock budget reaches
+/// in-flight page operations.
+/// </remarks>
+internal sealed class BrowserTaskTools
+{
+    // Bounds for wait_for: a zero or negative timeout is not a meaningful
+    // wait, and an unbounded one could outlive the task's wall-clock budget.
+    private const int DefaultWaitSeconds = 10;
+    private const int MinWaitSeconds = 1;
+    private const int MaxWaitSeconds = 60;
+
+    private readonly IBrowserAgentPage _page;
+    private readonly BrowserTaskState _state;
+    private readonly HostAllowlist _allowlist;
+    private readonly ILogger _logger;
+
+    public BrowserTaskTools(
+        IBrowserAgentPage page,
+        BrowserTaskState state,
+        HostAllowlist allowlist,
+        ILogger logger)
+    {
+        _page = page;
+        _state = state;
+        _allowlist = allowlist;
+        _logger = logger;
+    }
+
+    /// <summary>
+    /// Wraps each tool method as an <see cref="AIFunction"/> with the name
+    /// and description the planner sees.
+    /// </summary>
+    public AIFunction[] BuildFunctions() =>
+    [
+        AIFunctionFactory.Create(
+            Snapshot,
+            name: "snapshot",
+            description: "Capture a ref-annotated accessibility snapshot of the current page. Returns a YAML-ish aria tree where each interactive element carries [ref=eN]; pass those refs to click/type/wait_for. Re-snapshot after every click/type/navigate — refs from a previous snapshot are invalid once the page mutates."),
+        AIFunctionFactory.Create(
+            Navigate,
+            name: "navigate",
+            description: "Navigate the current page to an absolute URL. The URL's host must be on the task's allowlist; off-list navigation is rejected before the request is issued."),
+        AIFunctionFactory.Create(
+            Click,
+            name: "click",
+            description: "Click an element by ref (e.g. 'e12'). Refs come from the most recent snapshot."),
+        AIFunctionFactory.Create(
+            Type,
+            name: "type",
+            description: "Fill an input/textarea/contenteditable identified by ref. Pass the target value; prior contents are replaced."),
+        AIFunctionFactory.Create(
+            WaitFor,
+            name: "wait_for",
+            description: "Wait until the element identified by ref is visible. Use after an action that triggers navigation or async UI update."),
+        AIFunctionFactory.Create(
+            Done,
+            name: "done",
+            description: "Mark the task complete. Pass a short summary of what was accomplished and optionally a structured result string. After calling done, stop emitting tool calls."),
+        AIFunctionFactory.Create(
+            Fail,
+            name: "fail",
+            description: "Mark the task failed with a reason explaining what went wrong. After calling fail, stop emitting tool calls.")
+    ];
+
+    // Returned instead of acting once the step budget is used up; such calls
+    // do not touch the page and do not count as further steps.
+    private const string BudgetMessage = "Step budget exhausted — call done() with whatever was achieved, or fail() with a reason. Do not call other tools.";
+
+    /// <summary>
+    /// Counts a step, then returns the current URL, title and aria snapshot
+    /// as one text block, recording the URL in the navigation history.
+    /// </summary>
+    [Description("Capture a ref-annotated aria snapshot of the current page.")]
+    private async Task<string> Snapshot(CancellationToken cancellationToken)
+    {
+        if (_state.BudgetExhausted) return BudgetMessage;
+        _state.IncrementStep();
+        var url = _page.CurrentUrl;
+        var title = await _page.GetTitleAsync(cancellationToken);
+        var snapshot = await _page.AriaSnapshotAsync(cancellationToken);
+        _state.RecordNavigation(url);
+        _logger.LogInformation("browser-task step {Step}: snapshot {Url}", _state.Steps, url);
+        return $"Url: {url}\nTitle: {title ?? "(none)"}\n\n{snapshot}";
+    }
+
+    /// <summary>
+    /// Counts a step, validates <paramref name="url"/> (absolute http/https
+    /// and on the allowlist) and, if accepted, loads it. Rejections are
+    /// returned as text for the model and still consume the step.
+    /// </summary>
+    [Description("Navigate to an absolute URL within the allowlist.")]
+    private async Task<string> Navigate(
+        [Description("Absolute http(s) URL to load. Must match an allowlist pattern.")] string url,
+        CancellationToken cancellationToken)
+    {
+        if (_state.BudgetExhausted) return BudgetMessage;
+        _state.IncrementStep();
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var target) ||
+            (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
+            return $"Rejected: '{url}' is not an absolute http(s) URL.";
+        if (!_allowlist.IsAllowed(target))
+            return $"Rejected: host '{target.Host}' is not on the allowlist.";
+        await _page.NavigateAsync(target, cancellationToken);
+        _state.RecordNavigation(target);
+        _logger.LogInformation("browser-task step {Step}: navigate {Url}", _state.Steps, target);
+        return $"Loaded {target}. Call snapshot to see the page.";
+    }
+
+    /// <summary>Counts a step and clicks the element identified by <paramref name="ref"/>.</summary>
+    [Description("Click an element by ref.")]
+    private async Task<string> Click(
+        [Description("The element ref (e.g. 'e12') from the latest snapshot.")] string @ref,
+        CancellationToken cancellationToken)
+    {
+        if (_state.BudgetExhausted) return BudgetMessage;
+        _state.IncrementStep();
+        await _page.ClickByRefAsync(@ref, cancellationToken);
+        _logger.LogInformation("browser-task step {Step}: click ref={Ref}", _state.Steps, @ref);
+        return $"Clicked {@ref}. Call snapshot to see the resulting page.";
+    }
+
+    /// <summary>
+    /// Counts a step and fills the element identified by
+    /// <paramref name="ref"/> with <paramref name="text"/>.
+    /// </summary>
+    [Description("Fill a field by ref.")]
+    private async Task<string> Type(
+        [Description("The element ref (e.g. 'e12') from the latest snapshot.")] string @ref,
+        [Description("Text to enter into the field, replacing any prior contents.")] string text,
+        CancellationToken cancellationToken)
+    {
+        if (_state.BudgetExhausted) return BudgetMessage;
+        _state.IncrementStep();
+        await _page.TypeByRefAsync(@ref, text, cancellationToken);
+        // Never log the value — may be a password or other sensitive content.
+        _logger.LogInformation("browser-task step {Step}: type ref={Ref} ({Length} chars)",
+            _state.Steps, @ref, text.Length);
+        return $"Typed into {@ref}.";
+    }
+
+    /// <summary>
+    /// Counts a step and waits for the element identified by
+    /// <paramref name="ref"/> to become visible. The requested timeout is
+    /// clamped to 1..60 seconds. A timeout is reported to the model as text
+    /// rather than thrown.
+    /// </summary>
+    [Description("Wait for an element to become visible.")]
+    private async Task<string> WaitFor(
+        [Description("The element ref to wait for.")] string @ref,
+        [Description("Timeout in seconds (1-60); default 10.")] int? timeoutSeconds = null,
+        CancellationToken cancellationToken = default)
+    {
+        if (_state.BudgetExhausted) return BudgetMessage;
+        _state.IncrementStep();
+        // Clamp model-supplied values so the wait is always finite and positive.
+        var timeout = TimeSpan.FromSeconds(
+            Math.Clamp(timeoutSeconds ?? DefaultWaitSeconds, MinWaitSeconds, MaxWaitSeconds));
+        _logger.LogInformation("browser-task step {Step}: wait_for ref={Ref} ({Seconds}s)",
+            _state.Steps, @ref, timeout.TotalSeconds);
+        try
+        {
+            await _page.WaitForRefAsync(@ref, timeout, cancellationToken);
+            return $"{@ref} visible.";
+        }
+        catch (TimeoutException)
+        {
+            return $"Timeout: {@ref} did not become visible within {timeout.TotalSeconds:0}s.";
+        }
+    }
+
+    /// <summary>
+    /// Records successful completion with <paramref name="summary"/> and an
+    /// optional <paramref name="result"/>. Not gated by, and not counted
+    /// against, the step budget.
+    /// </summary>
+    [Description("Mark the task complete.")]
+    private string Done(
+        [Description("One-sentence summary of what was accomplished.")] string summary,
+        [Description("Optional structured result text (JSON, extracted value, etc.). Omit if no result is expected.")] string? result = null)
+    {
+        _state.Completed(summary, result);
+        _logger.LogInformation("browser-task done after {Steps} step(s): {Summary}", _state.Steps, summary);
+        return "Task marked complete. Stop emitting tool calls.";
+    }
+
+    /// <summary>
+    /// Records failure with <paramref name="reason"/>. Not gated by, and not
+    /// counted against, the step budget.
+    /// </summary>
+    [Description("Mark the task failed.")]
+    private string Fail(
+        [Description("Reason the task could not complete.")] string reason)
+    {
+        _state.Failed(reason);
+        _logger.LogWarning("browser-task failed after {Steps} step(s): {Reason}", _state.Steps, reason);
+        return "Task marked failed. Stop emitting tool calls.";
+    }
+}
+
+/// <summary>
+/// Mutable bookkeeping shared by the tool surface and the capability
+/// wrapper: step counter, step ceiling, terminal outcome and the list of
+/// URLs seen. Not thread-safe — a browser-task runs one tool at a time
+/// inside the function-invoking chat client.
+/// </summary>
+internal sealed class BrowserTaskState
+{
+    /// <summary>Number of steps counted so far.</summary>
+    public int Steps { get; private set; }
+
+    /// <summary>Step ceiling; effectively unlimited until a caller sets it.</summary>
+    public int MaxSteps { get; set; } = int.MaxValue;
+
+    /// <summary>True once either <see cref="Completed"/> or <see cref="Failed"/> has been called.</summary>
+    public bool IsTerminal
+    {
+        get { return IsDone || IsFailed; }
+    }
+
+    public bool IsDone { get; private set; }
+    public bool IsFailed { get; private set; }
+    public string? Summary { get; private set; }
+    public string? Result { get; private set; }
+    public string? FailureReason { get; private set; }
+
+    /// <summary>URLs recorded in order, with consecutive repeats collapsed.</summary>
+    public List<Uri> Navigations { get; } = [];
+
+    /// <summary>True when the step counter has reached the ceiling.</summary>
+    public bool BudgetExhausted
+    {
+        get { return Steps >= MaxSteps; }
+    }
+
+    /// <summary>Adds one to the step counter.</summary>
+    public void IncrementStep()
+    {
+        Steps++;
+    }
+
+    /// <summary>
+    /// Appends <paramref name="url"/> unless it equals the most recently
+    /// recorded entry.
+    /// </summary>
+    public void RecordNavigation(Uri url)
+    {
+        var repeatsLast = Navigations.Count > 0 && Navigations[^1] == url;
+        if (repeatsLast)
+        {
+            return;
+        }
+
+        Navigations.Add(url);
+    }
+
+    /// <summary>Marks the task done and stores its summary and optional result.</summary>
+    public void Completed(string summary, string? result)
+    {
+        Summary = summary;
+        Result = result;
+        IsDone = true;
+    }
+
+    /// <summary>Marks the task failed and stores the reason.</summary>
+    public void Failed(string reason)
+    {
+        FailureReason = reason;
+        IsFailed = true;
+    }
+}
diff --git a/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs b/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs
new file mode 100644
index 0000000..831b767
--- /dev/null
+++ b/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs
@@ -0,0 +1,106 @@
+namespace Foragent.Capabilities.BrowserTask;
+
+/// <summary>
+/// Per-task allowed-hosts list (spec §7.1). A pattern is an exact host
+/// (<c>bsky.app</c>), a subdomain wildcard (<c>*.example.com</c>, which
+/// matches <c>foo.example.com</c> but not <c>example.com</c>), or the
+/// unrestricted sentinel <c>*</c>. An empty list cannot be constructed —
+/// there is no default-permissive mode.
+/// </summary>
+public sealed class HostAllowlist
+{
+    private readonly string[] _exact;
+    private readonly string[] _suffix;
+    private readonly bool _wildcardAll;
+
+    private HostAllowlist(string[] exact, string[] suffix, bool wildcardAll)
+    {
+        _exact = exact;
+        _suffix = suffix;
+        _wildcardAll = wildcardAll;
+    }
+
+    /// <summary>The accepted patterns (trimmed and lower-cased), kept for audit logging.</summary>
+    public IReadOnlyList<string> Patterns { get; private init; } = [];
+
+    /// <summary>
+    /// Builds an allowlist from <paramref name="patterns"/>. Blank entries
+    /// are skipped; every other entry is trimmed and lower-cased, then
+    /// classified:
+    /// <list type="bullet">
+    /// <item><description><c>*</c> alone — match all hosts.</description></item>
+    /// <item><description><c>*.host</c> — match any subdomain of <c>host</c> (not <c>host</c> itself).</description></item>
+    /// <item><description><c>host</c> — match the exact host (case-insensitive).</description></item>
+    /// </list>
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="patterns"/> is <c>null</c>.</exception>
+    /// <exception cref="ArgumentException">
+    /// No non-blank pattern was supplied, or a pattern uses <c>*</c> in an
+    /// unsupported position.
+    /// </exception>
+    public static HostAllowlist Parse(IEnumerable<string> patterns)
+    {
+        ArgumentNullException.ThrowIfNull(patterns);
+
+        List<string> accepted = [];
+        List<string> exactHosts = [];
+        List<string> suffixes = [];
+        var matchAll = false;
+
+        foreach (var entry in patterns)
+        {
+            if (string.IsNullOrWhiteSpace(entry))
+            {
+                continue;
+            }
+
+            var pattern = entry.Trim().ToLowerInvariant();
+            accepted.Add(pattern);
+
+            if (pattern == "*")
+            {
+                matchAll = true;
+            }
+            else if (pattern.StartsWith("*.", StringComparison.Ordinal))
+            {
+                // "*.tail": tail must be non-empty, wildcard-free and must
+                // not begin with another dot.
+                var tail = pattern[2..];
+                var wellFormed = tail.Length > 0 && !tail.Contains('*') && !tail.StartsWith('.');
+                if (!wellFormed)
+                {
+                    throw new ArgumentException(
+                        $"Invalid allowlist pattern '{entry}'. Expected '*.domain.tld'.",
+                        nameof(patterns));
+                }
+
+                // Stored with the leading dot so the bare domain never matches.
+                suffixes.Add($".{tail}");
+            }
+            else if (pattern.Contains('*'))
+            {
+                throw new ArgumentException(
+                    $"Invalid allowlist pattern '{entry}'. Only '*' or '*.host' wildcards are supported.",
+                    nameof(patterns));
+            }
+            else
+            {
+                exactHosts.Add(pattern);
+            }
+        }
+
+        if (accepted.Count == 0)
+        {
+            throw new ArgumentException(
+                "Allowlist is empty; an empty allowlist rejects all hosts (spec §7.1).",
+                nameof(patterns));
+        }
+
+        return new HostAllowlist([.. exactHosts], [.. suffixes], matchAll) { Patterns = accepted };
+    }
+
+    /// <summary>
+    /// Returns true if <paramref name="host"/> is permitted. A null or empty
+    /// host is never permitted, even under the <c>*</c> sentinel.
+    /// </summary>
+    public bool IsAllowed(string host)
+    {
+        if (string.IsNullOrEmpty(host))
+        {
+            return false;
+        }
+
+        if (_wildcardAll)
+        {
+            return true;
+        }
+
+        var candidate = host.ToLowerInvariant();
+        return Array.IndexOf(_exact, candidate) >= 0
+            || Array.Exists(_suffix, s => candidate.EndsWith(s, StringComparison.Ordinal));
+    }
+
+    /// <summary>Returns true if the host of <paramref name="url"/> is permitted.</summary>
+    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
+    public bool IsAllowed(Uri url)
+    {
+        ArgumentNullException.ThrowIfNull(url);
+        return IsAllowed(url.Host);
+    }
+}
diff --git a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs
index ce248d0..e2ea5a1 100644
--- a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs
+++ b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs
@@ -1,3 +1,4 @@
+using Foragent.Capabilities.BrowserTask;
using Foragent.Capabilities.SitePosting;
using Microsoft.Extensions.DependencyInjection;
using RockBot.A2A;
@@ -16,6 +17,7 @@ public static IServiceCollection AddForagentCapabilities(this IServiceCollection
services.AddScoped();
services.AddScoped();
services.AddScoped();
+ services.AddScoped();
services.AddScoped();
services.AddScoped();
return services;
@@ -32,6 +34,7 @@ public static class ForagentCapabilities
{
public static IReadOnlyList Skills { get; } =
[
+ BrowserTaskCapability.SkillDefinition,
FetchPageTitleCapability.SkillDefinition,
ExtractStructuredDataCapability.SkillDefinition,
PostToSiteCapability.SkillDefinition
diff --git a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs
new file mode 100644
index 0000000..902c6da
--- /dev/null
+++ b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs
@@ -0,0 +1,170 @@
+using System.Text.Json;
+using Foragent.Capabilities.BrowserTask;
+using Foragent.Credentials;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.Logging.Abstractions;
+using RockBot.A2A;
+using Xunit;
+
+namespace Foragent.Agent.Tests.BrowserTask;
+
+/// <summary>
+/// Tests for <c>BrowserTaskCapability</c> driven by a scripted chat client
+/// wrapped with function invocation and a fake browser page, so no real
+/// model or browser is involved.
+/// </summary>
+public class BrowserTaskCapabilityTests
+{
+ // Input without allowedHosts is rejected and the message names the field.
+ // NOTE(review): the result state asserted here is Completed even though
+ // the input is rejected — presumably CapabilityResult.Error reports
+ // errors that way; confirm.
+ [Fact]
+ public async Task RejectsInput_WhenAllowlistMissing()
+ {
+ var (capability, _, _) = Build(ScriptedChatClient.Text("unused"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"do stuff"}"""),
+ ctx);
+
+ Assert.Equal(AgentTaskState.Completed, result.State);
+ Assert.Contains("allowedHosts", TestContext.TextOf(result));
+ }
+
+ // Input without intent is rejected and the message names the field.
+ [Fact]
+ public async Task RejectsInput_WhenIntentMissing()
+ {
+ var (capability, _, _) = Build(ScriptedChatClient.Text("unused"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"allowedHosts":["*"]}"""),
+ ctx);
+
+ Assert.Contains("intent", TestContext.TextOf(result));
+ }
+
+ // Script: snapshot, then done(summary, result). The JSON payload must
+ // carry status/summary/result, and steps == 1 — only the snapshot call
+ // is expected to count as a step.
+ [Fact]
+ public async Task DonePayload_ReturnsStructuredJson()
+ {
+ var (capability, page, _) = Build(
+ ScriptedChatClient.ToolCall("snapshot"),
+ ScriptedChatClient.ToolCall("done", new { summary = "found the title", result = "Example Domain" }),
+ ScriptedChatClient.Text("stopping"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"read the page title","url":"https://example.com/","allowedHosts":["example.com"]}"""),
+ ctx);
+
+ Assert.Equal(AgentTaskState.Completed, result.State);
+ var text = TestContext.TextOf(result);
+ using var doc = JsonDocument.Parse(text);
+ Assert.Equal("done", doc.RootElement.GetProperty("status").GetString());
+ Assert.Equal("found the title", doc.RootElement.GetProperty("summary").GetString());
+ Assert.Equal("Example Domain", doc.RootElement.GetProperty("result").GetString());
+ Assert.Equal(1, doc.RootElement.GetProperty("steps").GetInt32());
+ Assert.Contains("snapshot", page.Actions);
+ }
+
+ // Script: fail(reason). The payload reports status "failed" and puts the
+ // reason in the summary field.
+ [Fact]
+ public async Task Fail_ReturnsFailedStatus()
+ {
+ var (capability, _, _) = Build(
+ ScriptedChatClient.ToolCall("fail", new { reason = "unreachable" }),
+ ScriptedChatClient.Text("stopping"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"try something","allowedHosts":["*"]}"""),
+ ctx);
+
+ using var doc = JsonDocument.Parse(TestContext.TextOf(result));
+ Assert.Equal("failed", doc.RootElement.GetProperty("status").GetString());
+ Assert.Equal("unreachable", doc.RootElement.GetProperty("summary").GetString());
+ }
+
+ // The model never calls done/fail and overruns maxSteps; the payload
+ // must report status "incomplete".
+ [Fact]
+ public async Task BudgetExhausted_ReturnsIncompleteStatus()
+ {
+ // Script drives the planner to snapshot repeatedly past the budget.
+ // maxSteps=2 — after 2 snapshot calls the tool returns a budget
+ // message; the model keeps calling snapshot but each call is a
+ // budget-exhausted no-op. The script runs out and returns a final
+ // text, ending the loop naturally.
+ var responses = new List();
+ for (var i = 0; i < 10; i++)
+ responses.Add(ScriptedChatClient.ToolCall("snapshot"));
+ responses.Add(ScriptedChatClient.Text("giving up"));
+
+ var (capability, _, _) = Build([.. responses]);
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"spin forever","allowedHosts":["*"],"maxSteps":2}"""),
+ ctx);
+
+ using var doc = JsonDocument.Parse(TestContext.TextOf(result));
+ Assert.Equal("incomplete", doc.RootElement.GetProperty("status").GetString());
+ }
+
+ // A starting URL whose host is not on the allowlist is rejected at
+ // input-parsing time.
+ [Fact]
+ public async Task OffAllowlistStartingUrl_IsRejected()
+ {
+ var (capability, _, _) = Build(ScriptedChatClient.Text("unused"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"go","url":"https://other.example/","allowedHosts":["bsky.app"]}"""),
+ ctx);
+
+ Assert.Contains("not in the allowlist", TestContext.TextOf(result));
+ }
+
+ // The navigate tool refuses an off-allowlist URL: the run still ends via
+ // done, and the fake page records no navigate action.
+ [Fact]
+ public async Task NavigateTool_RejectsOffAllowlistHosts()
+ {
+ var (capability, page, _) = Build(
+ ScriptedChatClient.ToolCall("navigate", new { url = "https://evil.example/phish" }),
+ ScriptedChatClient.ToolCall("done", new { summary = "model gave up" }),
+ ScriptedChatClient.Text("stopping"));
+ var (ctx, _) = TestContext.Build();
+
+ var result = await capability.ExecuteAsync(
+ TestContext.Request("browser-task",
+ """{"intent":"try","allowedHosts":["bsky.app"]}"""),
+ ctx);
+
+ using var doc = JsonDocument.Parse(TestContext.TextOf(result));
+ Assert.Equal("done", doc.RootElement.GetProperty("status").GetString());
+ // Fake page.NavigateAsync never ran because the tool rejected first.
+ Assert.DoesNotContain(page.Actions, a => a.StartsWith("navigate:"));
+ }
+
+ // Wires a capability around a fake page/session factory and a scripted
+ // chat client wrapped with UseFunctionInvocation, so scripted tool calls
+ // are dispatched to the real browser-task tools. Returns the capability
+ // together with the fakes for assertions.
+ private static (BrowserTaskCapability Capability, FakeBrowserAgentPage Page, FakeAgentBrowserSessionFactory Factory) Build(
+ params ChatResponse[] script)
+ {
+ var page = new FakeBrowserAgentPage();
+ var factory = new FakeAgentBrowserSessionFactory(page);
+
+ var scripted = new ScriptedChatClient(script);
+ var wrapped = new ChatClientBuilder(scripted)
+ .UseFunctionInvocation()
+ .Build();
+
+ var broker = new StubCredentialBroker();
+
+ var capability = new BrowserTaskCapability(
+ factory,
+ wrapped,
+ broker,
+ NullLogger.Instance);
+
+ return (capability, page, factory);
+ }
+
+ // Broker stub that knows no credentials: every lookup throws
+ // CredentialNotFoundException. None of the tests above pass a credentialId.
+ private sealed class StubCredentialBroker : ICredentialBroker
+ {
+ public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) =>
+ throw new CredentialNotFoundException(credentialId);
+ }
+}
diff --git a/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs
new file mode 100644
index 0000000..8027593
--- /dev/null
+++ b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs
@@ -0,0 +1,87 @@
+using Foragent.Browser;
+
+namespace Foragent.Agent.Tests.BrowserTask;
+/// <summary>In-memory IBrowserAgentPage for tests: does no browser work and appends a short entry to Actions for most calls.</summary>
+internal sealed class FakeBrowserAgentPage : IBrowserAgentPage
+{
+ public List Actions { get; } = []; // call log in call order: "navigate:<url>", "snapshot", "click:<ref>", "type:<ref>:<text length>", "wait:<ref>", "dispose"
+ public Uri CurrentUrl { get; set; } = new("about:blank"); // replaced by each NavigateAsync call
+ public string Snapshot { get; set; } = "- heading: stub page\n- button \"ok\" [ref=e1]"; // returned as-is by AriaSnapshotAsync
+ public string? Title { get; set; } = "stub"; // returned by GetTitleAsync
+ // Returns Title; unlike the other members, this call is not written to Actions.
+ public Task GetTitleAsync(CancellationToken cancellationToken = default) =>
+ Task.FromResult(Title);
+ // Logs the navigation and updates CurrentUrl; nothing is fetched.
+ public Task NavigateAsync(Uri url, CancellationToken cancellationToken = default)
+ {
+ Actions.Add($"navigate:{url}");
+ CurrentUrl = url;
+ return Task.CompletedTask;
+ }
+ // Logs "snapshot" and returns the canned Snapshot text.
+ public Task AriaSnapshotAsync(CancellationToken cancellationToken = default)
+ {
+ Actions.Add("snapshot");
+ return Task.FromResult(Snapshot);
+ }
+ // Logs the click against the given element ref; no element lookup happens.
+ public Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default)
+ {
+ Actions.Add($"click:{elementRef}");
+ return Task.CompletedTask;
+ }
+ // Logs the element ref and the length of the typed text.
+ public Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default)
+ {
+ Actions.Add($"type:{elementRef}:{text.Length}"); // length only — presumably to keep typed values out of the log (TODO confirm)
+ return Task.CompletedTask;
+ }
+ // Logs the wait and completes immediately; the timeout argument is ignored.
+ public Task WaitForRefAsync(string elementRef, TimeSpan? timeout = null, CancellationToken cancellationToken = default)
+ {
+ Actions.Add($"wait:{elementRef}");
+ return Task.CompletedTask;
+ }
+ // Logs "dispose" so a test can check that the page was disposed.
+ public ValueTask DisposeAsync()
+ {
+ Actions.Add("dispose");
+ return ValueTask.CompletedTask;
+ }
+}
+/// <summary>IBrowserSession fake that supports only OpenAgentPageAsync, which returns the page passed to the constructor; the URL-based operations throw NotSupportedException.</summary>
+internal sealed class FakeAgentBrowserSession(FakeBrowserAgentPage page) : IBrowserSession
+{
+ public Task FetchPageTitleAsync(Uri url, CancellationToken ct = default) =>
+ throw new NotSupportedException();
+
+ public Task CapturePageSnapshotAsync(Uri url, CancellationToken ct = default) =>
+ throw new NotSupportedException();
+
+ public Task OpenPageAsync(Uri url, CancellationToken ct = default) =>
+ throw new NotSupportedException();
+ // The only supported operation: every call returns the same shared fake page.
+ public Task OpenAgentPageAsync(CancellationToken ct = default) =>
+ Task.FromResult(page);
+
+ public ValueTask DisposeAsync() => ValueTask.CompletedTask; // no-op; the shared page is not disposed here
+}
+/// <summary>IBrowserSessionFactory fake: every session wraps the same FakeBrowserAgentPage; it counts created sessions and keeps the last allowlist delegate it was given.</summary>
+internal sealed class FakeAgentBrowserSessionFactory(FakeBrowserAgentPage page) : IBrowserSessionFactory
+{
+ public Func? CapturedAllowlist { get; private set; } // set only by the CreateSessionAsync overload that takes allowedHost
+ public int SessionsCreated { get; private set; } // incremented by both overloads
+
+ public Task CreateSessionAsync(CancellationToken ct = default)
+ {
+ SessionsCreated++;
+ return Task.FromResult(new FakeAgentBrowserSession(page));
+ }
+ // Allowlist-aware overload: stores the delegate for later assertions; the fake session itself does not apply it.
+ public Task CreateSessionAsync(Func allowedHost, CancellationToken ct = default)
+ {
+ CapturedAllowlist = allowedHost;
+ SessionsCreated++;
+ return Task.FromResult(new FakeAgentBrowserSession(page));
+ }
+}
diff --git a/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs
new file mode 100644
index 0000000..febaa73
--- /dev/null
+++ b/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs
@@ -0,0 +1,67 @@
+using Foragent.Capabilities.BrowserTask;
+using Xunit;
+
+namespace Foragent.Agent.Tests.BrowserTask;
+/// <summary>Unit tests for HostAllowlist.Parse, IsAllowed pattern matching, and the Patterns property.</summary>
+public class HostAllowlistTests
+{
+ [Fact]
+ public void ExactHost_Matches_CaseInsensitively()
+ {
+ var list = HostAllowlist.Parse(["bsky.app"]);
+ Assert.True(list.IsAllowed(new Uri("https://bsky.app/")));
+ Assert.True(list.IsAllowed(new Uri("https://BSKY.APP/"))); // NOTE(review): System.Uri may already lower-case the host, so this might not exercise the allowlist's own casing — confirm
+ Assert.False(list.IsAllowed(new Uri("https://foo.bsky.app/"))); // an exact pattern does not cover subdomains
+ Assert.False(list.IsAllowed(new Uri("https://example.com/")));
+ }
+ // "*.example.com" covers subdomains at any depth, but not example.com itself.
+ [Fact]
+ public void SubdomainWildcard_MatchesSubdomainButNotBareDomain()
+ {
+ var list = HostAllowlist.Parse(["*.example.com"]);
+ Assert.True(list.IsAllowed(new Uri("https://foo.example.com/")));
+ Assert.True(list.IsAllowed(new Uri("https://a.b.example.com/")));
+ Assert.False(list.IsAllowed(new Uri("https://example.com/")));
+ Assert.False(list.IsAllowed(new Uri("https://example.org/")));
+ }
+ // A lone "*" allows any host.
+ [Fact]
+ public void StarAloneMatchesAny()
+ {
+ var list = HostAllowlist.Parse(["*"]);
+ Assert.True(list.IsAllowed(new Uri("https://any.host.tld/")));
+ }
+ // Parse must throw for an empty list and for lists whose only entry is empty or whitespace.
+ [Fact]
+ public void EmptyListIsRejected()
+ {
+ Assert.Throws(() => HostAllowlist.Parse([]));
+ Assert.Throws(() => HostAllowlist.Parse([""]));
+ Assert.Throws(() => HostAllowlist.Parse([" "]));
+ }
+ // With several patterns, a host is allowed when any one of them matches.
+ [Fact]
+ public void MultiPatternList_OrsEntries()
+ {
+ var list = HostAllowlist.Parse(["bsky.app", "*.example.com"]);
+ Assert.True(list.IsAllowed(new Uri("https://bsky.app/")));
+ Assert.True(list.IsAllowed(new Uri("https://foo.example.com/")));
+ Assert.False(list.IsAllowed(new Uri("https://example.com/")));
+ }
+ // Patterns with a dangling, doubled, or embedded wildcard must make Parse throw.
+ [Theory]
+ [InlineData("*.")]
+ [InlineData("*.*")]
+ [InlineData("foo*bar")]
+ public void InvalidPattern_IsRejected(string pattern)
+ {
+ Assert.Throws(() => HostAllowlist.Parse([pattern]));
+ }
+ // Patterns exposes the parsed entries in input order, trimmed and lower-cased.
+ [Fact]
+ public void PreservesPatterns_ForAuditLogging()
+ {
+ var list = HostAllowlist.Parse([" bsky.app ", "*.Example.com"]);
+ Assert.Equal(["bsky.app", "*.example.com"], list.Patterns);
+ }
+}
diff --git a/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs b/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs
new file mode 100644
index 0000000..3c3aab2
--- /dev/null
+++ b/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs
@@ -0,0 +1,69 @@
+using Microsoft.Extensions.AI;
+
+namespace Foragent.Agent.Tests.BrowserTask;
+
+/// <summary>
+/// Raw IChatClient that plays a scripted sequence of assistant turns. The
+/// production IChatClient injected into BrowserTaskCapability is the
+/// RockBot-wrapped <see cref="FunctionInvokingChatClient"/>, so in tests we
+/// build the same shape via <see cref="ChatClientBuilder"/> +
+/// <c>UseFunctionInvocation()</c>.
+/// </summary>
+internal sealed class ScriptedChatClient : IChatClient
+{
+ private readonly Queue _responses; // remaining scripted turns, dequeued in order
+ // Number of GetResponseAsync calls so far, including calls made after the script ran out.
+ public int Turns { get; private set; }
+
+ public ScriptedChatClient(params ChatResponse[] responses)
+ {
+ _responses = new Queue(responses);
+ }
+ // Builds an assistant turn that contains plain text only.
+ public static ChatResponse Text(string text) =>
+ new(new ChatMessage(ChatRole.Assistant, text));
+ // Builds an assistant turn holding one function call; args is either a dictionary or an object whose public properties become the arguments.
+ public static ChatResponse ToolCall(string name, object? args = null)
+ {
+ var id = $"call_{Guid.NewGuid():N}"; // fresh call id for every scripted call
+ var dict = args is IDictionary d
+ ? (IDictionary)d
+ : ObjectToDictionary(args);
+ var call = new FunctionCallContent(id, name, dict);
+ return new ChatResponse(new ChatMessage(ChatRole.Assistant, [call]));
+ }
+ // Returns the next scripted turn; once the script is exhausted it answers with a fixed text turn instead of throwing.
+ public Task GetResponseAsync(
+ IEnumerable messages,
+ ChatOptions? options = null,
+ CancellationToken cancellationToken = default)
+ {
+ Turns++;
+ if (_responses.Count == 0)
+ return Task.FromResult(Text("(script exhausted — stopping)"));
+ return Task.FromResult(_responses.Dequeue());
+ }
+ // Streaming is not scripted: the stream is always empty. CS1998 is suppressed because the async iterator contains no await.
+#pragma warning disable CS1998
+ public async IAsyncEnumerable GetStreamingResponseAsync(
+ IEnumerable messages,
+ ChatOptions? options = null,
+ [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken = default)
+ {
+ yield break;
+ }
+#pragma warning restore CS1998
+ // No auxiliary services are exposed by this client.
+ public object? GetService(Type serviceType, object? serviceKey = null) => null;
+
+ public void Dispose() { }
+ // Copies the public properties of source (e.g. an anonymous object) into a dictionary with case-insensitive keys; null yields an empty dictionary.
+ private static Dictionary ObjectToDictionary(object? source)
+ {
+ var result = new Dictionary(StringComparer.OrdinalIgnoreCase);
+ if (source is null) return result;
+ foreach (var p in source.GetType().GetProperties())
+ result[p.Name] = p.GetValue(source);
+ return result;
+ }
+}
diff --git a/tests/Foragent.Agent.Tests/TestDoubles.cs b/tests/Foragent.Agent.Tests/TestDoubles.cs
index cab0d0c..237baad 100644
--- a/tests/Foragent.Agent.Tests/TestDoubles.cs
+++ b/tests/Foragent.Agent.Tests/TestDoubles.cs
@@ -106,6 +106,12 @@ public Task CreateSessionAsync(CancellationToken ct = default)
return Task.FromResult(new StubSession(this));
}
+ public Task CreateSessionAsync(Func allowedHost, CancellationToken ct = default)
+ { // allowedHost is ignored by this stub: it only counts the session and returns a new StubSession
+ SessionsCreated++;
+ return Task.FromResult(new StubSession(this));
+ }
+
private sealed class StubSession(StubBrowserSessionFactory owner) : IBrowserSession
{
public Task FetchPageTitleAsync(Uri url, CancellationToken ct = default) =>
@@ -117,6 +123,9 @@ public Task CapturePageSnapshotAsync(Uri url, CancellationToken ct
public Task OpenPageAsync(Uri url, CancellationToken ct = default) =>
owner.PageResponder(url, ct);
+ public Task OpenAgentPageAsync(CancellationToken ct = default) => // agent pages are not supported by this stub; it always throws
+ throw new NotSupportedException("StubBrowserSessionFactory does not expose an agent page; test BrowserTaskCapability with a dedicated fake.");
+
public ValueTask DisposeAsync()
{
owner.SessionsDisposed++;
diff --git a/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs
new file mode 100644
index 0000000..e2da5b1
--- /dev/null
+++ b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs
@@ -0,0 +1,268 @@
+using System.ClientModel;
+using System.Text.Json;
+using Foragent.Capabilities.BrowserTask;
+using Foragent.Credentials;
+using Microsoft.AspNetCore.Builder;
+using Microsoft.AspNetCore.Http;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging.Abstractions;
+using OpenAI;
+using RockBot.A2A;
+using RockBot.Host;
+using RockBot.Llm;
+using RockBot.Messaging;
+using Xunit;
+
+namespace Foragent.Browser.Tests;
+
+/// <summary>
+/// Step 6's "small curated benchmark" (spec §9.1). Three Kestrel-hosted
+/// scenarios exercise the real end-to-end <see cref="BrowserTaskCapability"/>
+/// with real Chromium + a real LLM. Skipped when <c>FORAGENT_LLM_*</c> env
+/// vars are unset, so the main test run stays free of network / API
+/// dependencies. Establishes the unaided success floor before step 7 adds
+/// learned-skill priming.
+/// </summary>
+[Collection("Playwright")]
+public class BrowserTaskIntegrationTests(TestPageServerFixture fixture)
+{
+ [SkippableFact]
+ public async Task ClickThrough_FindsDestinationText()
+ {
+ var config = LlmConfig.FromEnvironment();
+ Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark.");
+ // Arrange: local site with a home page and a /details page; SECRET-TOKEN-42 is served only from /details.
+ await using var server = await StartSiteAsync(app =>
+ {
+ app.MapGet("/", () => Results.Content("""
+ Home
+ Welcome
See details
+ """, "text/html"));
+ app.MapGet("/details", () => Results.Content("""
+ Details
+ Details
SECRET-TOKEN-42
+ """, "text/html"));
+ });
+ // %URL% in the request JSON is replaced with the server's base address; only host 127.0.0.1 is allowed.
+ var capability = BuildCapability(config!);
+ var request = Request("""
+ {"intent":"Starting from the home page, click the See details link and report the secret token shown on the destination page as the result.",
+ "url":"%URL%/",
+ "allowedHosts":["127.0.0.1"],
+ "maxSteps":20}
+ """.Replace("%URL%", server.BaseUrl));
+ // Act
+ var result = await capability.ExecuteAsync(request, BuildContext());
+ var payload = ParsePayload(result);
+ // Assert: status is "done" and the token is in result (or in summary when result is null).
+ Assert.Equal("done", payload.Status);
+ Assert.Contains("SECRET-TOKEN-42", payload.Result ?? payload.Summary ?? string.Empty);
+ }
+
+ [SkippableFact]
+ public async Task FormSubmit_ReportsConfirmationMessage()
+ {
+ var config = LlmConfig.FromEnvironment();
+ Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark.");
+ // Arrange: GET /form serves the contact page; POST /submit echoes the submitted name in its confirmation text.
+ await using var server = await StartSiteAsync(app =>
+ {
+ app.MapGet("/form", () => Results.Content("""
+ Contact
+
+ Contact us
+
+
+ """, "text/html"));
+ app.MapPost("/submit", async (HttpRequest r) =>
+ {
+ var form = await r.ReadFormAsync();
+ var name = form["name"].ToString();
+ return Results.Content($"""
+ Thanks
+ Thanks {name}, we received your message.
+ """, "text/html");
+ });
+ });
+ // %URL% in the request JSON is replaced with the server's base address; only host 127.0.0.1 is allowed.
+ var capability = BuildCapability(config!);
+ var request = Request("""
+ {"intent":"Fill the contact form with name 'Rocky' and message 'Hello from step 6', submit it, and report the confirmation headline you see next.",
+ "url":"%URL%/form",
+ "allowedHosts":["127.0.0.1"],
+ "maxSteps":20}
+ """.Replace("%URL%", server.BaseUrl));
+ // Act
+ var result = await capability.ExecuteAsync(request, BuildContext());
+ var payload = ParsePayload(result);
+ // Assert: result and summary are checked as one combined string for the echoed name and the confirmation wording.
+ Assert.Equal("done", payload.Status);
+ var combined = (payload.Result ?? string.Empty) + " " + (payload.Summary ?? string.Empty);
+ Assert.Contains("Rocky", combined);
+ Assert.Contains("received your message", combined, StringComparison.OrdinalIgnoreCase);
+ }
+
+ [SkippableFact]
+ public async Task MultiPageNav_ReadsNestedContent()
+ {
+ var config = LlmConfig.FromEnvironment();
+ Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark.");
+ // Arrange: routes /, /intro, /intro/chapter-2 (contains FORTY-TWO) and /advanced (contains no answer).
+ await using var server = await StartSiteAsync(app =>
+ {
+ app.MapGet("/", () => Results.Content("""
+ Docs index
+ Docs
+ """, "text/html"));
+ app.MapGet("/intro", () => Results.Content("""
+ Intro
+ Intro
Chapter 2
+ """, "text/html"));
+ app.MapGet("/intro/chapter-2", () => Results.Content("""
+ Chapter 2
+ Chapter 2: the widget
+ The answer you seek is FORTY-TWO.
+ """, "text/html"));
+ app.MapGet("/advanced", () => Results.Content("""
+ No answer here.
+ """, "text/html"));
+ });
+ // This scenario uses a larger step budget (maxSteps 30) than the other two (20).
+ var capability = BuildCapability(config!);
+ var request = Request("""
+ {"intent":"Starting from the docs index, navigate into Intro and then into its Chapter 2, and return the strong-emphasised answer word you find there.",
+ "url":"%URL%/",
+ "allowedHosts":["127.0.0.1"],
+ "maxSteps":30}
+ """.Replace("%URL%", server.BaseUrl));
+ // Act
+ var result = await capability.ExecuteAsync(request, BuildContext());
+ var payload = ParsePayload(result);
+ // Assert: status is "done" and the answer word appears in result or summary (concatenated).
+ Assert.Equal("done", payload.Status);
+ Assert.Contains("FORTY-TWO", (payload.Result ?? string.Empty) + (payload.Summary ?? string.Empty));
+ }
+
+ // ── helpers ──────────────────────────────────────────────────────────────
+ /// <summary>Builds the capability from fixture.Factory, an OpenAIClient pointed at config.Endpoint, and a broker that has no credentials.</summary>
+ private BrowserTaskCapability BuildCapability(LlmConfig config)
+ {
+ var openAi = new OpenAIClient(
+ new ApiKeyCredential(config.ApiKey),
+ new OpenAIClientOptions { Endpoint = new Uri(config.Endpoint) });
+ var inner = openAi.GetChatClient(config.ModelId).AsIChatClient(); // adapt the OpenAI chat client to IChatClient
+
+ // Match the production wiring — the capability sees a
+ // function-invoking IChatClient (same shape as what
+ // AddRockBotTieredChatClients installs in Program.cs).
+ var chatClient = new ChatClientBuilder(inner)
+ .UseFunctionInvocation()
+ .Build();
+
+ return new BrowserTaskCapability(
+ fixture.Factory,
+ chatClient,
+ new NoCredentialsBroker(),
+ NullLogger.Instance);
+ }
+ /// <summary>Wraps a JSON string as the single text part of a user message, addressed to the capability's skill id, with a fresh TaskId.</summary>
+ private static AgentTaskRequest Request(string json) => new()
+ {
+ TaskId = Guid.NewGuid().ToString(),
+ Skill = BrowserTaskCapability.SkillDefinition.Id,
+ Message = new AgentMessage
+ {
+ Role = "user",
+ Parts = [new AgentMessagePart { Kind = "text", Text = json }]
+ }
+ };
+ /// <summary>Builds a minimal AgentTaskContext: empty-bodied envelope, empty service provider, no cancellation, and status updates discarded.</summary>
+ private static AgentTaskContext BuildContext()
+ {
+ var envelope = MessageEnvelope.Create(
+ messageType: typeof(AgentTaskRequest).FullName!,
+ body: ReadOnlyMemory.Empty,
+ source: "test");
+ var messageContext = new MessageHandlerContext
+ {
+ Envelope = envelope,
+ Agent = new AgentIdentity("Foragent"),
+ Services = new ServiceCollection().BuildServiceProvider(), // no services are registered
+ CancellationToken = CancellationToken.None
+ };
+ return new AgentTaskContext
+ {
+ MessageContext = messageContext,
+ PublishStatus = (_, _) => Task.CompletedTask // status publications are dropped
+ };
+ }
+ /// <summary>Reads status, summary and result from the first text part of the reply (treated as "{}" when absent); JSON null maps to null and non-string values are returned as their raw JSON text.</summary>
+ private static TaskPayload ParsePayload(AgentTaskResult result)
+ {
+ var text = result.Message?.Parts.FirstOrDefault(p => p.Kind == "text")?.Text ?? "{}";
+ using var doc = JsonDocument.Parse(text);
+ var r = doc.RootElement;
+ return new TaskPayload(
+ Status: r.TryGetProperty("status", out var s) ? AsText(s) : null,
+ Summary: r.TryGetProperty("summary", out var sm) ? AsText(sm) : null,
+ Result: r.TryGetProperty("result", out var rs) ? AsText(rs) : null);
+ static string? AsText(JsonElement e) => e.ValueKind switch { JsonValueKind.Null => null, JsonValueKind.String => e.GetString(), _ => e.GetRawText() }; // GetString() throws for objects, arrays, numbers and booleans
+ }
+ /// <summary>Starts a minimal Kestrel app on 127.0.0.1 with the routes added by configure, and returns it together with its bound base URL.</summary>
+ private static async Task StartSiteAsync(Action configure)
+ {
+ var builder = WebApplication.CreateEmptyBuilder(new WebApplicationOptions());
+ builder.WebHost.UseKestrelCore();
+ builder.WebHost.UseUrls("http://127.0.0.1:0"); // port 0: the bound address is read back after start
+ builder.Services.AddRoutingCore();
+ builder.Logging.ClearProviders();
+ // Routes must be mapped before StartAsync; the first reported server address becomes BaseUrl.
+ var app = builder.Build();
+ app.UseRouting();
+ configure(app);
+ await app.StartAsync();
+ var addresses = app.Services
+ .GetRequiredService()
+ .Features.Get()!
+ .Addresses;
+ return new SiteHost(app, addresses.First().TrimEnd('/')); // trailing '/' trimmed because tests append their own path
+ }
+ /// <summary>The three fields ParsePayload reads from the capability's JSON reply; each is null when missing.</summary>
+ private sealed record TaskPayload(string? Status, string? Summary, string? Result);
+ /// <summary>Pairs the running WebApplication with its base URL; disposing the host disposes the app.</summary>
+ private sealed record SiteHost(WebApplication App, string BaseUrl) : IAsyncDisposable
+ {
+ public async ValueTask DisposeAsync() => await App.DisposeAsync();
+ }
+ /// <summary>LLM connection settings read from the FORAGENT_LLM_ENDPOINT, FORAGENT_LLM_MODEL_ID and FORAGENT_LLM_API_KEY environment variables.</summary>
+ private sealed record LlmConfig(string Endpoint, string ModelId, string ApiKey)
+ {
+ public static LlmConfig? FromEnvironment() // returns null unless all three variables are non-blank; callers skip the test on null
+ {
+ var endpoint = Environment.GetEnvironmentVariable("FORAGENT_LLM_ENDPOINT");
+ var model = Environment.GetEnvironmentVariable("FORAGENT_LLM_MODEL_ID");
+ var key = Environment.GetEnvironmentVariable("FORAGENT_LLM_API_KEY");
+ if (string.IsNullOrWhiteSpace(endpoint)
+ || string.IsNullOrWhiteSpace(model)
+ || string.IsNullOrWhiteSpace(key))
+ {
+ return null;
+ }
+ return new LlmConfig(endpoint, model, key);
+ }
+ }
+ /// <summary>Credential broker with no credentials: ResolveAsync always throws CredentialNotFoundException for the requested id.</summary>
+ private sealed class NoCredentialsBroker : ICredentialBroker
+ {
+ public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) =>
+ throw new CredentialNotFoundException(credentialId);
+ }
+}