From 32b72b9c081a267ede4826d5db218eed7858a5eb Mon Sep 17 00:00:00 2001 From: Rockford Lhotka Date: Wed, 22 Apr 2026 20:11:35 -0500 Subject: [PATCH] Step 6: baseline browser-task generalist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First v0.2 capability. LLM-in-the-loop planner over ref-annotated aria snapshots + aria-ref=eN locator resolution, built on Microsoft.Playwright 1.59 (bumped from 1.50 for AriaSnapshotMode.Ai). Registered tiered chat clients via AddRockBotTieredChatClients with one model aliased across Low/Balanced/High per spec §3.7. - HostAllowlist with *, *.domain, exact-host patterns; empty rejects. - Context-wide RouteAsync aborts off-list document/subframe navs before Playwright issues the request (spec §7.1). - [AIFunction] tools: snapshot / navigate / click / type / wait_for / done / fail. Budget enforced tool-side (default maxSteps=60, ceiling 150) + wall-clock CancellationTokenSource (default 120s, ceiling 600). - Structured JSON output: {status, summary, result, steps, navigations}. - Unit tests: 14 new (HostAllowlist, BrowserTaskCapability via ScriptedChatClient + FakeBrowserAgentPage). - Real-LLM benchmark: 3 Kestrel scenarios, all pass on first attempt against Azure AI Foundry Balanced (5/8/7s). Establishes the unaided floor before step 7 adds priming. - Smoke-tested via docker-compose HTTP gateway (agent-card lists four skills; browser-task returns "Example Domain" in 1 step; off-list URLs rejected). Framework observations captured in docs/framework-feedback.md: AddRockBotTieredChatClients subsumes AddRockBotChatClient (undocumented); no per-request iteration cap on the function-invoking chat client; Playwright aria-ref gating via AriaSnapshotMode.Ai rather than a boolean Ref option. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 9 +- Directory.Packages.props | 2 +- deploy/rockbot-seed/agent-trust.json | 2 +- deploy/rockbot-seed/well-known-agents.json | 5 + docker-compose.yml | 2 +- docs/capabilities.md | 86 +++++- docs/framework-feedback.md | 85 ++++++ src/Foragent.Agent/Program.cs | 17 +- src/Foragent.Browser/IBrowserSession.cs | 71 +++++ .../PlaywrightBrowserSessionFactory.cs | 121 +++++++- .../BrowserTask/BrowserTaskCapability.cs | 186 ++++++++++++ .../BrowserTask/BrowserTaskInput.cs | 137 +++++++++ .../BrowserTask/BrowserTaskTools.cs | 204 +++++++++++++ .../BrowserTask/HostAllowlist.cs | 106 +++++++ ...CapabilitiesServiceCollectionExtensions.cs | 3 + .../BrowserTask/BrowserTaskCapabilityTests.cs | 170 +++++++++++ .../BrowserTask/FakeBrowserAgentPage.cs | 87 ++++++ .../BrowserTask/HostAllowlistTests.cs | 67 +++++ .../BrowserTask/ScriptedChatClient.cs | 69 +++++ tests/Foragent.Agent.Tests/TestDoubles.cs | 9 + .../BrowserTaskIntegrationTests.cs | 268 ++++++++++++++++++ 21 files changed, 1683 insertions(+), 23 deletions(-) create mode 100644 src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs create mode 100644 src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs create mode 100644 src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs create mode 100644 src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs create mode 100644 tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs create mode 100644 tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs create mode 100644 tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs create mode 100644 tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs create mode 100644 tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs diff --git a/CLAUDE.md b/CLAUDE.md index 1159923..c34ce47 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Status 
-Foragent is at **milestone 5 shipped, v0.2 spec adopted, step 6 next**. Three capabilities are live (`fetch-page-title`, `extract-structured-data`, `post-to-site`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness pinned to `rockylhotka/rockbot-agent:0.8.5`. The governing spec is now `docs/foragent-specification.md` **v0.2** — read it before making non-trivial changes. v0.2 pivots Foragent to an agentic model: one generalist `browser-task` capability (built natively on `Microsoft.Playwright` NuGet — no MCP sidecar, no Stagehand port — see Appendix A #16) plus narrow fast-path specialists, with RockBot's `ISkillStore` + `ILongTermMemory` as the learning substrate. The v0.1 proposal document is archived at `docs/archive/foragent-spec-v0.2-proposal.md`. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md` step 4. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. +Foragent is at **milestone 6 shipped, step 7 next**. Four capabilities are live (`browser-task`, `fetch-page-title`, `extract-structured-data`, `post-to-site`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness pinned to `rockylhotka/rockbot-agent:0.8.5`. Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. The governing spec is `docs/foragent-specification.md` **v0.2**. Step 7 wires `ISkillStore` + `ILongTermMemory` priming; `post-to-site` is removed from the advertised skill list once `browser-task` + the learned bsky skill cover it. 
Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. ## Build / test @@ -69,11 +69,13 @@ Key framework pieces Foragent uses today: - `RockBot.A2A.IAgentTaskHandler` — the single per-agent extension point. `ForagentTaskHandler` (in `Foragent.Capabilities`) implements this and dispatches on `request.Skill`. - `RockBot.A2A.Gateway.AddA2AHttpGateway` + `MapA2AHttpGateway` — the in-process HTTP surface. Published as NuGet in RockBot 0.8.4 (see `docs/framework-feedback.md`). -Foragent requires an LLM (for `extract-structured-data` and future capabilities). The same `IChatClient` is registered both as a singleton (capabilities inject it directly) and via `AddRockBotChatClient` (satisfies the framework's mandatory registration). Config lives under `ForagentLlm` — separate from any rockbot-side `LLM` config so the two agents can point at different models. Program.cs fails fast at startup if `ForagentLlm:Endpoint`/`ModelId`/`ApiKey` are missing. +Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any rockbot-side `LLM` config so the two agents can point at different models. Program.cs fails fast at startup if `ForagentLlm:Endpoint`/`ModelId`/`ApiKey` are missing. Starting with step 6, the single configured model is wired via `AddRockBotTieredChatClients(low, balanced, high)` aliased to the same inner `IChatClient`; that one call registers both `IChatClient` (wrapped with `RockBotFunctionInvokingChatClient` for automatic tool invocation) and `TieredChatClientRegistry` (per spec §3.7). Don't also call `AddRockBotChatClient` — it would swap out the wrapped registration. Capabilities that want to escalate/de-escalate per request can resolve `TieredChatClientRegistry` and call `GetClient(ModelTier.Low|Balanced|High)`; none do today. 
## Browser -`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, plus `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read) for multi-step flows like login + post. The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`) and falls back to `<body>` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. +`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read) for multi-step flows like login + post, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `<body>` inner text when the tree is empty. 
Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. + +`CreateSessionAsync(Func allowedHost, ...)` is the step-6 entry point for allowlist-scoped sessions. The factory installs a context-wide `RouteAsync("**/*", ...)` that aborts off-list document/subframe navigations before Playwright issues the request (spec §7.1). The no-argument overload accepts any host and stays available for specialists that enforce narrower rules elsewhere (e.g. `post-to-site` where the site id selects the host). ## Capabilities @@ -84,6 +86,7 @@ Foragent requires an LLM (for `extract-structured-data` and future capabilities) - `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it. - `CapabilityInput.Parse` is the shared URL + description shim used by `fetch-page-title` and `extract-structured-data`. Capabilities with different input shapes (e.g. `post-to-site` needing `site` / `credentialId` / `content`) parse their own input near the capability — see `PostToSiteInput` in `PostToSiteCapability.cs`. Don't overload `CapabilityInput` for unrelated shapes. - `post-to-site` dispatches to an `ISitePoster` keyed on `Site` (in `SitePosting/`). `BlueskySitePoster` is the only implementation today; add new sites by registering another `ISitePoster` in `AddForagentCapabilities()`. The capability never echoes exception messages from posters back to callers — they may contain credential material; operators read the full exception in logs. +- `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). 
`BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). `BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability. 
## Credentials diff --git a/Directory.Packages.props b/Directory.Packages.props index 64e171d..53db1f9 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -4,7 +4,7 @@ true - + diff --git a/deploy/rockbot-seed/agent-trust.json b/deploy/rockbot-seed/agent-trust.json index 0466fa6..10ffabe 100644 --- a/deploy/rockbot-seed/agent-trust.json +++ b/deploy/rockbot-seed/agent-trust.json @@ -2,7 +2,7 @@ { "agentId": "Foragent", "level": 4, - "approvedSkills": ["fetch-page-title", "extract-structured-data", "post-to-site"], + "approvedSkills": ["browser-task", "fetch-page-title", "extract-structured-data", "post-to-site"], "firstSeen": "2026-04-21T00:00:00+00:00", "lastInteraction": "2026-04-21T00:00:00+00:00", "interactionCount": 0 diff --git a/deploy/rockbot-seed/well-known-agents.json b/deploy/rockbot-seed/well-known-agents.json index 6f6e3d4..35bcd03 100644 --- a/deploy/rockbot-seed/well-known-agents.json +++ b/deploy/rockbot-seed/well-known-agents.json @@ -8,6 +8,11 @@ "authHeaderName": "X-Api-Key", "authHeaderValueBase64": "cm9ja2JvdC1jYWxscy1mb3JhZ2VudA==", "skills": [ + { + "id": "browser-task", + "name": "Browser Task (generalist)", + "description": "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. Input JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. allowedHosts is required and empty rejects. Returns a structured JSON result with status (done/failed/incomplete), summary, optional result, step count, and navigations." 
+ }, { "id": "fetch-page-title", "name": "Fetch Page Title", diff --git a/docker-compose.yml b/docker-compose.yml index 0611c56..e7a5ac9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -61,7 +61,7 @@ services: RabbitMq__VirtualHost: / Gateway__AgentName: Foragent Gateway__InternalAgentName: Foragent - Gateway__Description: "Browser agent — fetch-page-title, extract-structured-data, post-to-site" + Gateway__Description: "Browser agent — browser-task (generalist), fetch-page-title, extract-structured-data, post-to-site" # RockBot will call Foragent with header X-Api-Key: rockbot-calls-foragent ApiKeys__rockbot-calls-foragent__AgentId: RockBot ApiKeys__rockbot-calls-foragent__DisplayName: RockBot diff --git a/docs/capabilities.md b/docs/capabilities.md index 3aaaef2..31bd7c4 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -3,18 +3,84 @@ Foragent exposes browser operations as discrete A2A capabilities. Callers invoke capabilities by name; Foragent handles the browser mechanics. -## Planned initial capability set +## Advertised capabilities (v0.2) -- [ ] `fetch-page-content` — Navigate to a URL and return the page content -- [ ] `extract-structured-data` — Extract structured data from a page using - an LLM-assisted schema -- [ ] `fill-form` — Fill and optionally submit an HTML form -- [ ] `post-to-site` — Perform a multi-step posting action on a target site -- [ ] `monitor-page` — Poll a page for a condition and notify when met +- `browser-task` — **generalist**, spec §5.2. LLM-in-the-loop planner that + drives a real browser to accomplish a free-form intent. Shipped in + step 6. +- `fetch-page-title` — specialist. Inherited from step 1/2. +- `extract-structured-data` — specialist. Inherited from step 3. +- `post-to-site` — specialist, credential-using. Inherited from step 4. + Scheduled for removal from the advertised list once step 7 lands + (`browser-task` + learned bsky skill subsume it). 
+ +## `browser-task` input shape + +JSON in the first text part, or field-by-field metadata: + +```json +{ + "intent": "free-form description of what to accomplish", + "allowedHosts": ["bsky.app", "*.example.com", "*"], + "url": "optional absolute http(s) starting URL", + "credentialId": "optional broker reference", + "maxSteps": 60, + "maxSeconds": 120 +} +``` + +- `intent` — required. Free-form. +- `allowedHosts` — required, non-empty (spec §7.1). An empty list rejects. + Supports exact hosts, `*.domain` subdomain wildcards, and `*` for + unrestricted. Off-list navigations are aborted inside the browser + context before Playwright issues the request. +- `url` — optional. If provided, must match the allowlist. +- `credentialId` — optional. Resolved but not exposed to the planner in + step 6; reserved for a typed login tool in a later step. +- `maxSteps` — default 60, ceiling 150. Enforced tool-side via + `BrowserTaskState.BudgetExhausted`; once exceeded, tools return a + "call done or fail" message and refuse further work. +- `maxSeconds` — default 120, ceiling 600. Enforced via a linked + `CancellationTokenSource`. + +## `browser-task` output shape + +A JSON object in a single text part: + +```json +{ + "status": "done" | "failed" | "incomplete", + "summary": "one-sentence human-readable result", + "result": "optional structured result text (e.g. extracted value)", + "steps": 7, + "navigations": ["https://host/path", "..."] +} +``` + +`incomplete` means the budget was exhausted before `done`/`fail` was +called. + +## `browser-task` tool surface + +Exposed to the planner via `[AIFunction]` wrappers over `IChatClient` +(spec Appendix A #16 — no MCP sidecar). Refs are Playwright aria-ref ids +and are valid only within the snapshot they came from. + +- `snapshot()` — ref-annotated aria tree of the current page. +- `navigate(url)` — load a URL; host must be on the allowlist. +- `click(ref)` — click by ref. +- `type(ref, text)` — fill by ref. 
+- `wait_for(ref, timeoutSeconds?)` — wait for visibility. +- `done(summary, result?)` — mark complete. +- `fail(reason)` — mark failed. ## Design principles -- Capabilities operate at the task level, not at the DOM-operation level -- Each capability invocation gets an isolated browser context +- Capabilities operate at the task level, not at the DOM-operation level. +- Each capability invocation gets an isolated `BrowserContext` (spec §3.5). +- Per-task host allowlists are mandatory (spec §7.1). - Credential references are passed by ID; values are resolved inside - Foragent and never cross A2A boundaries + Foragent and never cross A2A boundaries (spec §6.1). +- Prohibited capabilities — account creation, financial transactions, + modifying security permissions — are out of scope regardless of + implementation ease (spec §7.3). diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index 078649d..c02a69d 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -254,3 +254,88 @@ in `.env`. discovery) and `GatewayOptions.Skills` (HTTP agent-card endpoint) are independent. Our Program.cs populates both from a single `ForagentCapabilities.Skills` array — a workaround, not a fix. The framework should treat one as authoritative and derive the other. + +## Step 6 — baseline `browser-task` generalist + +### Framework observations + +- **`AddRockBotTieredChatClients` obviates `AddRockBotChatClient` but this + is undocumented.** Calling `AddRockBotTieredChatClients(low, balanced, + high)` registers an `IChatClient` singleton whose factory already wraps + the inner client with `RockBotFunctionInvokingChatClient`, plus a + `TieredChatClientRegistry` singleton. Callers who previously used + `AddRockBotChatClient(client)` don't need to call both — but that's + not spelled out anywhere. If both are called, the second registration + silently wins (standard MEDI behavior), which can swap the wrapped + client for an unwrapped one depending on order. 
Docs gap; candidate + framework fix is either a guard throw or collapsing both methods into + one overload shape. + +- **No per-request iteration cap surface on the function-invoking chat + client.** `FunctionInvokingChatClient.MaximumIterationsPerRequest` is + an *instance* property, and the wrapped client is built inside + `AddRockBotTieredChatClients` — the caller has no hook to set it per + `GetResponseAsync` invocation. `ChatOptions.AdditionalProperties` + lookup keys are not honored. `ModelBehavior.MaxToolIterationsOverride` + exists on the RockBot side but routes through YAML behavior config, + not per-call. Foragent enforces its step budget tool-side (each tool + checks `BrowserTaskState.BudgetExhausted`); wall-clock cancellation + is the real safety net. Framework candidate: either honor a standard + `ChatOptions.AdditionalProperties["MaximumIterationsPerRequest"]` + convention or expose the FICC instance via DI so consumers can + configure it. + +- **`Microsoft.Playwright` 1.50 (pinned since step 2) does not expose + the Ai aria-snapshot mode.** Step 6 requires ref-annotated snapshots + (`[ref=eN]` + `aria-ref=eN` locator resolution). That gating moved + from a boolean `Ref` option to `Mode = AriaSnapshotMode.Ai` sometime + between 1.52 and the current 1.59 C# bindings. Foragent bumped the + pin to 1.59.0; container base image + (`mcr.microsoft.com/playwright/dotnet:v1.50.0-noble`) will need the + matching bump in the first release that ships browser-task. Not a + framework-issue per se, but relevant to RockBot's "v1 Foragent" story + and to anyone using the framework + Playwright together. + +- **Aria-ref lifetime is a contract the planner must respect.** Refs are + valid only within the snapshot they came from. 
The tool surface + documents this in the `snapshot` description; if the framework ever + ships a "browser task runner" helper of its own (candidate + `RockBot.Browser.Planner`?), it should bake the "re-snapshot after + mutation" rule into a first-class contract rather than leaving it to + prompt text. + +- **`AIFunctionFactory.Create(Delegate, name:, description:, …)` + descriptions only surface the method-level `[Description]`.** Parameter + descriptions must be on parameters via `[Description]` — easy to miss + without the reminder. Worked as expected; noting for anyone building + similar tool surfaces. + +- **RockBot's `RockBotFunctionInvokingChatClient` auto-invokes tools end + to end in a single `GetResponseAsync` call.** This is exactly what the + planner wants; no custom loop needed. One quirk: the FICC keeps + iterating as long as the model emits tool calls, with no public + step cap (see above). Combined with aria-ref lifetimes, a model that + thrashes on stale refs can burn budget fast. Step 7's learning + substrate is the intended mitigation. + +### Unaided floor measurement (2026-04-22) + +First end-to-end benchmark against the operator's Azure AI Foundry +Balanced model (no learned skills, no priming — the "unaided" floor the +spec §9.1 step 6 calls for): + +| Scenario | Result | Wall-clock | +|---|---|---| +| Click-through (home → link → read destination value) | ✅ done | 5 s | +| Form submit (fill name + textarea → submit → read confirmation) | ✅ done | 8 s | +| Multi-page nav (index → intro → chapter-2 → read bolded answer) | ✅ done | 7 s | + +3 / 3 passed on first attempt. Establishes the baseline Foragent must +not regress against once step 7 adds priming. Re-run this set whenever +the planner prompt, tool surface, or model pin changes. + +### Not yet exercised + +- **`TieredChatClientRegistry.GetClient(ModelTier.Low/High)` is wired + but no capability resolves it yet.** All three tiers currently alias + to the same model. 
Tier-aware capability code lands as models diverge. diff --git a/src/Foragent.Agent/Program.cs b/src/Foragent.Agent/Program.cs index 19ccfa2..b6ca3f9 100644 --- a/src/Foragent.Agent/Program.cs +++ b/src/Foragent.Agent/Program.cs @@ -8,6 +8,7 @@ using RockBot.A2A.Gateway; using RockBot.A2A.Gateway.Auth; using RockBot.Host; +using RockBot.Llm; using RockBot.Messaging.RabbitMQ; var builder = WebApplication.CreateBuilder(args); @@ -36,11 +37,17 @@ builder.Services.AddRockBotRabbitMq(opts => builder.Configuration.GetSection("RabbitMq").Bind(opts)); -// ── Chat client — Foragent capabilities use this directly. Registered with -// RockBot too so the framework's startup requirement is satisfied. ─────── - -builder.Services.AddSingleton(foragentChatClient); -builder.Services.AddRockBotChatClient(foragentChatClient); +// ── Tiered chat clients (spec §3.7, Appendix #17). One configured model is +// aliased across Low/Balanced/High; capabilities that inject IChatClient +// receive the Balanced tier. Tier-aware capabilities (browser-task) may +// resolve TieredChatClientRegistry to escalate/de-escalate. The factory +// inside AddRockBotTieredChatClients already wraps with +// RockBotFunctionInvokingChatClient — AddRockBotChatClient is redundant +// once this is called. +builder.Services.AddRockBotTieredChatClients( + lowInnerClient: foragentChatClient, + balancedInnerClient: foragentChatClient, + highInnerClient: foragentChatClient); // ── Agent host + A2A bus subscription ─────────────────────────────────────── diff --git a/src/Foragent.Browser/IBrowserSession.cs b/src/Foragent.Browser/IBrowserSession.cs index f8d07c8..c8318b9 100644 --- a/src/Foragent.Browser/IBrowserSession.cs +++ b/src/Foragent.Browser/IBrowserSession.cs @@ -33,6 +33,16 @@ public interface IBrowserSession : IAsyncDisposable /// finished, dispose the session when the task ends. 
/// Task OpenPageAsync(Uri url, CancellationToken cancellationToken = default); + + /// + /// Opens a page suited to an LLM-in-the-loop planner: exposes + /// ref-annotated aria snapshots () + /// and ref-based interactions resolved via Playwright's aria-ref=eN + /// locator dialect. No initial URL is required; the planner drives + /// navigation through its own tool calls. Used by the + /// browser-task generalist (spec §5.2). + /// + Task OpenAgentPageAsync(CancellationToken cancellationToken = default); } /// @@ -74,6 +84,56 @@ Task WaitForSelectorAsync( Task GetTextAsync(string selector, CancellationToken cancellationToken = default); } +/// +/// Ref-based page surface for LLM-in-the-loop planners. Each call to +/// returns a tree annotated with +/// [ref=eN] ids; , , +/// and resolve those refs via Playwright's +/// aria-ref=eN locator dialect. Refs are valid only within the +/// snapshot they came from — the planner must re-snapshot after any +/// mutation (spec §9.1 step 6, decision D1 — no cache). +/// +public interface IBrowserAgentPage : IAsyncDisposable +{ + /// The current URL, after any redirects and client-side navigations. + Uri CurrentUrl { get; } + + /// The current page title, or null if absent. + Task GetTitleAsync(CancellationToken cancellationToken = default); + + /// + /// Navigates to . The implementation must respect + /// the session's allowlist — off-list navigations fail with + /// before Playwright issues the request. + /// + Task NavigateAsync(Uri url, CancellationToken cancellationToken = default); + + /// + /// Returns a ref-annotated aria snapshot of the current page. Each + /// interactive element carries [ref=eN]; planners pass the ref + /// back to /. + /// + Task AriaSnapshotAsync(CancellationToken cancellationToken = default); + + /// Clicks the element identified by (e.g. e12). + Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default); + + /// + /// Fills the element identified by . 
Used for + /// input/textarea/contenteditable. Sensitive values must not be logged. + /// + Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default); + + /// + /// Waits until the element identified by is + /// visible. Throws on timeout. + /// + Task WaitForRefAsync( + string elementRef, + TimeSpan? timeout = null, + CancellationToken cancellationToken = default); +} + /// /// A compact rendering of a page suitable for LLM prompting. /// @@ -97,4 +157,15 @@ public enum PageSnapshotSource public interface IBrowserSessionFactory { Task CreateSessionAsync(CancellationToken cancellationToken = default); + + /// + /// Creates a session whose navigations and subframe loads are restricted to + /// hosts accepted by . An off-list request is + /// aborted inside the browser context before Playwright issues it + /// (spec §7.1). Passing a predicate that always returns false + /// effectively rejects all navigation. + /// + Task CreateSessionAsync( + Func allowedHost, + CancellationToken cancellationToken = default); } diff --git a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs index 5bf333e..ff7ace2 100644 --- a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs +++ b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs @@ -5,19 +5,53 @@ namespace Foragent.Browser; internal sealed class PlaywrightBrowserSessionFactory( PlaywrightBrowserHost host) : IBrowserSessionFactory { + public Task CreateSessionAsync( + CancellationToken cancellationToken = default) => + CreateSessionAsync(static _ => true, cancellationToken); + public async Task CreateSessionAsync( + Func allowedHost, CancellationToken cancellationToken = default) { + ArgumentNullException.ThrowIfNull(allowedHost); var context = await host.Browser.NewContextAsync(); - return new PlaywrightBrowserSession(context); + + // Install a context-wide route handler that aborts off-list navigations + // and 
subframe loads before Playwright sees them (spec §7.1). This + // intercepts Navigation requests (document/subframe); resource loads + // (images, styles) pass through so pages can still render. + await context.RouteAsync("**/*", async route => + { + var request = route.Request; + var resourceType = request.ResourceType; + if (resourceType is not ("document" or "subframe")) + { + await route.ContinueAsync(); + return; + } + + if (!Uri.TryCreate(request.Url, UriKind.Absolute, out var target) || + !allowedHost(target)) + { + await route.AbortAsync("accessdenied"); + return; + } + + await route.ContinueAsync(); + }); + + return new PlaywrightBrowserSession(context, allowedHost); } } -internal sealed class PlaywrightBrowserSession(IBrowserContext context) : IBrowserSession +internal sealed class PlaywrightBrowserSession( + IBrowserContext context, + Func allowedHost) : IBrowserSession { public async Task FetchPageTitleAsync(Uri url, CancellationToken cancellationToken = default) { cancellationToken.ThrowIfCancellationRequested(); + EnsureAllowed(url); var page = await context.NewPageAsync(); try { @@ -41,6 +75,7 @@ internal sealed class PlaywrightBrowserSession(IBrowserContext context) : IBrows public async Task CapturePageSnapshotAsync(Uri url, CancellationToken cancellationToken = default) { cancellationToken.ThrowIfCancellationRequested(); + EnsureAllowed(url); var page = await context.NewPageAsync(); try { @@ -76,6 +111,7 @@ public async Task CapturePageSnapshotAsync(Uri url, CancellationTo public async Task OpenPageAsync(Uri url, CancellationToken cancellationToken = default) { cancellationToken.ThrowIfCancellationRequested(); + EnsureAllowed(url); var page = await context.NewPageAsync(); try { @@ -96,8 +132,21 @@ public async Task OpenPageAsync(Uri url, CancellationToken cancell } } + public async Task OpenAgentPageAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + var page = await 
context.NewPageAsync(); + return new PlaywrightBrowserAgentPage(page, allowedHost); + } + public ValueTask DisposeAsync() => new(context.CloseAsync()); + private void EnsureAllowed(Uri url) + { + if (!allowedHost(url)) + throw new InvalidOperationException( + $"Host '{url.Host}' is not in the session's allowlist."); + } } internal sealed class PlaywrightBrowserPage(IPage page) : IBrowserPage @@ -163,3 +212,71 @@ public Task GetUrlAsync(CancellationToken cancellationToken = default) public ValueTask DisposeAsync() => new(page.CloseAsync()); } + +internal sealed class PlaywrightBrowserAgentPage( + IPage page, + Func allowedHost) : IBrowserAgentPage +{ + public Uri CurrentUrl => Uri.TryCreate(page.Url, UriKind.Absolute, out var u) ? u : new Uri("about:blank"); + + public async Task GetTitleAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + var title = await page.TitleAsync(); + return string.IsNullOrEmpty(title) ? null : title; + } + + public async Task NavigateAsync(Uri url, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + if (!allowedHost(url)) + throw new InvalidOperationException( + $"Host '{url.Host}' is not in the session's allowlist."); + var response = await page.GotoAsync(url.ToString(), new PageGotoOptions + { + WaitUntil = WaitUntilState.DOMContentLoaded + }); + if (response is null || !response.Ok) + throw new InvalidOperationException( + $"Navigation to {url} returned status {response?.Status.ToString() ?? "no response"}."); + } + + public async Task AriaSnapshotAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + // Ref-annotated aria snapshot — Playwright's "AI" mode emits [ref=eN] + // identifiers that resolve via the aria-ref=eN locator dialect + // (spec §9.1 step 6). In the 1.59 C# bindings this is gated behind + // AriaSnapshotMode.Ai rather than a boolean Ref option. 
+ var snapshot = await page.Locator("body").AriaSnapshotAsync( + new LocatorAriaSnapshotOptions { Mode = AriaSnapshotMode.Ai }); + return snapshot ?? string.Empty; + } + + public Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + return page.Locator($"aria-ref={elementRef}").ClickAsync(); + } + + public Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + return page.Locator($"aria-ref={elementRef}").FillAsync(text); + } + + public async Task WaitForRefAsync( + string elementRef, + TimeSpan? timeout = null, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + await page.Locator($"aria-ref={elementRef}").WaitForAsync(new LocatorWaitForOptions + { + State = WaitForSelectorState.Visible, + Timeout = timeout is null ? null : (float)timeout.Value.TotalMilliseconds + }); + } + + public ValueTask DisposeAsync() => new(page.CloseAsync()); +} diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs new file mode 100644 index 0000000..55feadf --- /dev/null +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskCapability.cs @@ -0,0 +1,186 @@ +using System.Text; +using System.Text.Json; +using Foragent.Browser; +using Foragent.Credentials; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; +using RockBot.A2A; + +namespace Foragent.Capabilities.BrowserTask; + +/// +/// The generalist browser-task capability (spec §5.2). Runs an +/// LLM-in-the-loop planner over a small ref-based tool surface against a +/// per-task . This is Foragent's primary +/// capability — specialists exist only where deterministic, programmatic +/// callers benefit from a typed shape. 
+/// +/// v0.2 step 6 scope: no learning substrate, no credentials injection into +/// tools (credential id is acknowledged but unused beyond audit logging — +/// step 7 wires ISkillStore + ILongTermMemory priming, later +/// steps expose credentials to the planner through a typed tool). +/// +public sealed class BrowserTaskCapability( + IBrowserSessionFactory browserFactory, + IChatClient chatClient, + ICredentialBroker credentialBroker, + ILogger logger) : ICapability +{ + public static AgentSkill SkillDefinition { get; } = new() + { + Id = "browser-task", + Name = "Browser Task (generalist)", + Description = "Drive a browser with an LLM-in-the-loop planner to accomplish a free-form intent. " + + "Input: JSON {\"intent\":\"...\",\"allowedHosts\":[\"host\",\"*.host\",\"*\"],\"url\":\"optional start\",\"credentialId\":\"optional\",\"maxSteps\":60,\"maxSeconds\":120}. " + + "Returns a short summary plus optional structured result string." + }; + + private const string SystemPrompt = """ + You drive a real web browser to accomplish a task on behalf of another agent. + + You have these tools: + - snapshot() — returns a ref-annotated aria tree of the current page. ALWAYS call this first, and again after any click/type/navigate, because refs expire when the page mutates. + - navigate(url) — load a URL. The URL's host must be on the task's allowlist. + - click(ref) — click an element by its [ref=eN] id from the latest snapshot. + - type(ref, text) — fill an input by ref with the given text. + - wait_for(ref, timeoutSeconds?) — wait for an element to be visible. + - done(summary, result?) — call exactly once when the task is complete. After calling done, return a short final message and DO NOT emit further tool calls. + - fail(reason) — call if the task cannot be completed. After calling fail, stop emitting tool calls. + + Rules: + - Stay on the allowlisted hosts the caller provided. Off-list navigations are rejected. 
+ - If an element you want is not in the latest snapshot, re-snapshot rather than guessing a ref. + - Never enter credentials or secrets yourself — if the task needs authentication, call fail and explain. + - Be efficient: each tool call counts toward a step budget. + - When the task is done, call done() with a concise summary. If the caller asked for a specific value (e.g. a page title), include it as the result argument. + """; + + public string SkillId => SkillDefinition.Id; + public AgentSkill Skill => SkillDefinition; + + public async Task ExecuteAsync(AgentTaskRequest request, AgentTaskContext context) + { + var ct = context.MessageContext.CancellationToken; + var input = BrowserTaskInput.Parse(request); + + if (input.Error is not null) + return CapabilityResult.Error(request, input.Error); + + // credentialId is accepted by the input shape but not consumed by the + // planner in step 6. Resolve it so we fail fast (and audit-log access) + // if the caller references an unknown id. A future step exposes a + // typed login tool that actually uses the resolved values. 
+ if (!string.IsNullOrWhiteSpace(input.CredentialId)) + { + try + { + _ = await credentialBroker.ResolveAsync(input.CredentialId!, ct); + } + catch (CredentialNotFoundException ex) + { + return CapabilityResult.Error(request, $"Credential '{ex.CredentialId}' is not configured."); + } + } + + using var budgetCts = new CancellationTokenSource(TimeSpan.FromSeconds(input.MaxSeconds)); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, budgetCts.Token); + + var state = new BrowserTaskState(); + + try + { + await using var session = await browserFactory.CreateSessionAsync(input.Allowlist!.IsAllowed, linkedCts.Token); + await using var page = await session.OpenAgentPageAsync(linkedCts.Token); + + if (input.Url is not null) + await page.NavigateAsync(input.Url, linkedCts.Token); + + state.MaxSteps = input.MaxSteps; + var tools = new BrowserTaskTools(page, state, input.Allowlist!, logger).BuildFunctions(); + + var messages = new List + { + new(ChatRole.System, SystemPrompt), + new(ChatRole.User, BuildUserPrompt(input)) + }; + + var options = new ChatOptions + { + Tools = [.. tools], + ToolMode = ChatToolMode.Auto + // Step budget is enforced tool-side (BrowserTaskState) and + // wall-clock via linkedCts. The function-invoking chat client + // does not currently expose a per-request iteration cap that + // works through ChatOptions — noted in framework-feedback. + }; + + // The IChatClient we inject is the RockBot-wrapped + // FunctionInvokingChatClient (see Program.cs tiered registration). + // It runs the full model ↔ tool loop internally and returns the + // final assistant response when the model stops emitting tool + // calls or the iteration cap trips. 
+ try + { + _ = await chatClient.GetResponseAsync(messages, options, linkedCts.Token); + } + catch (OperationCanceledException) when (budgetCts.IsCancellationRequested && !ct.IsCancellationRequested) + { + logger.LogInformation( + "browser-task budget exhausted after {Seconds}s / {Steps} step(s)", + input.MaxSeconds, state.Steps); + } + + return BuildResult(request, input, state); + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + logger.LogWarning(ex, "browser-task failed after {Steps} step(s)", state.Steps); + return CapabilityResult.Error(request, $"Browser task failed: {ex.Message}"); + } + } + + private static string BuildUserPrompt(BrowserTaskInput input) + { + var sb = new StringBuilder(); + sb.Append("Intent: ").AppendLine(input.Intent); + if (input.Url is not null) + sb.Append("Starting URL: ").AppendLine(input.Url.ToString()); + sb.Append("Allowed hosts: ").AppendLine(string.Join(", ", input.Allowlist!.Patterns)); + sb.Append("Step budget: ").Append(input.MaxSteps).Append(" steps / ").Append(input.MaxSeconds).AppendLine("s wall-clock."); + if (!string.IsNullOrWhiteSpace(input.CredentialId)) + sb.AppendLine("A credential id was provided but is not yet exposed as a tool. If authentication is required, call fail()."); + return sb.ToString(); + } + + private static AgentTaskResult BuildResult( + AgentTaskRequest request, + BrowserTaskInput input, + BrowserTaskState state) + { + // Structured JSON payload so callers (usually other agents) can parse + // success vs. failure reliably; the summary field is the primary + // human-readable signal. + var payload = new + { + status = state.IsDone ? "done" : state.IsFailed ? "failed" : "incomplete", + summary = state.IsDone + ? state.Summary + : state.IsFailed + ? 
state.FailureReason + : $"Task did not terminate within {input.MaxSteps} steps / {input.MaxSeconds}s.", + result = state.Result, + steps = state.Steps, + navigations = state.Navigations.Select(u => u.ToString()).ToArray() + }; + return CapabilityResult.Completed(request, JsonSerializer.Serialize(payload, SerializerOptions)); + } + + private static readonly JsonSerializerOptions SerializerOptions = new() + { + WriteIndented = false + }; +} diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs new file mode 100644 index 0000000..cb49508 --- /dev/null +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskInput.cs @@ -0,0 +1,137 @@ +using System.Text.Json; +using RockBot.A2A; + +namespace Foragent.Capabilities.BrowserTask; + +/// +/// Parses the browser-task input shape (spec §5.2). +/// +/// Accepts either a JSON object in the first text part or field-by-field +/// metadata on the message/request. Metadata overrides JSON when both are +/// present. Shape: +/// +/// intent — required. Free-form description of what to do. +/// allowedHosts — required. Array of host patterns. Empty rejects. +/// url — optional starting URL (string). +/// credentialId — optional credential reference. +/// maxSteps — optional int; default 60, max 150. +/// maxSeconds — optional int; default 120, max 600. +/// +/// +internal readonly record struct BrowserTaskInput( + string? Intent, + Uri? Url, + string? CredentialId, + HostAllowlist? Allowlist, + int MaxSteps, + int MaxSeconds, + string? Error) +{ + public const int DefaultMaxSteps = 60; + public const int CeilingMaxSteps = 150; + public const int DefaultMaxSeconds = 120; + public const int CeilingMaxSeconds = 600; + + public static BrowserTaskInput Parse(AgentTaskRequest request) + { + string? intent = null; + string? url = null; + string? credentialId = null; + List? allowedHosts = null; + int? maxSteps = null; + int? 
maxSeconds = null; + + var text = request.Message.Parts + .Where(p => p.Kind == "text") + .Select(p => p.Text) + .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) + ?.Trim(); + + if (!string.IsNullOrEmpty(text) && text.StartsWith('{')) + { + try + { + using var doc = JsonDocument.Parse(text); + var root = doc.RootElement; + if (root.TryGetProperty("intent", out var i)) intent = i.GetString(); + if (root.TryGetProperty("url", out var u)) url = u.GetString(); + if (root.TryGetProperty("credentialId", out var c)) credentialId = c.GetString(); + if (root.TryGetProperty("allowedHosts", out var h) && h.ValueKind == JsonValueKind.Array) + allowedHosts = [.. h.EnumerateArray().Select(e => e.GetString() ?? string.Empty)]; + if (root.TryGetProperty("maxSteps", out var ms) && ms.TryGetInt32(out var msv)) + maxSteps = msv; + if (root.TryGetProperty("maxSeconds", out var mt) && mt.TryGetInt32(out var mtv)) + maxSeconds = mtv; + } + catch (JsonException) + { + return Fail("Input must be a JSON object with intent, allowedHosts, and optional url/credentialId/maxSteps/maxSeconds."); + } + } + else if (!string.IsNullOrEmpty(text)) + { + intent = text; + } + + intent = ReadMetadata(request, "intent") ?? intent; + url = ReadMetadata(request, "url") ?? url; + credentialId = ReadMetadata(request, "credentialId") ?? credentialId; + var hostsCsv = ReadMetadata(request, "allowedHosts"); + if (hostsCsv is not null) + allowedHosts = [.. hostsCsv.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)]; + + if (string.IsNullOrWhiteSpace(intent)) + return Fail("Missing 'intent' — a natural-language description of what to do."); + + if (allowedHosts is null || allowedHosts.Count == 0) + return Fail("Missing 'allowedHosts' — browser-task requires an explicit allowlist (spec §7.1). 
Use ['*'] to accept any host."); + + HostAllowlist allowlist; + try + { + allowlist = HostAllowlist.Parse(allowedHosts); + } + catch (ArgumentException ex) + { + return Fail(ex.Message); + } + + Uri? parsedUrl = null; + if (!string.IsNullOrWhiteSpace(url)) + { + if (!Uri.TryCreate(url, UriKind.Absolute, out parsedUrl) || + (parsedUrl.Scheme != Uri.UriSchemeHttp && parsedUrl.Scheme != Uri.UriSchemeHttps)) + return Fail($"'url' must be an absolute http(s) URL; got '{url}'."); + if (!allowlist.IsAllowed(parsedUrl)) + return Fail($"Starting URL host '{parsedUrl.Host}' is not in the allowlist."); + } + + var steps = Clamp(maxSteps ?? DefaultMaxSteps, 1, CeilingMaxSteps); + var seconds = Clamp(maxSeconds ?? DefaultMaxSeconds, 1, CeilingMaxSeconds); + + return new BrowserTaskInput(intent, parsedUrl, credentialId, allowlist, steps, seconds, null); + } + + private static BrowserTaskInput Fail(string message) => + new(null, null, null, null, DefaultMaxSteps, DefaultMaxSeconds, message); + + private static int Clamp(int value, int min, int max) => + value < min ? min : value > max ? max : value; + + private static string? 
ReadMetadata(AgentTaskRequest request, string key) + { + if (request.Message.Metadata is not null + && request.Message.Metadata.TryGetValue(key, out var msgValue) + && !string.IsNullOrWhiteSpace(msgValue)) + { + return msgValue; + } + if (request.Metadata is not null + && request.Metadata.TryGetValue(key, out var reqValue) + && !string.IsNullOrWhiteSpace(reqValue)) + { + return reqValue; + } + return null; + } +} diff --git a/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs b/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs new file mode 100644 index 0000000..121e939 --- /dev/null +++ b/src/Foragent.Capabilities/BrowserTask/BrowserTaskTools.cs @@ -0,0 +1,204 @@ +using System.ComponentModel; +using Foragent.Browser; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; + +namespace Foragent.Capabilities.BrowserTask; + +/// +/// The tool surface exposed to the planner LLM for a single browser-task +/// run. Methods are wrapped into instances via +/// . +/// The that wraps Foragent's +/// (see AddRockBotTieredChatClients) +/// invokes these between model turns; no separate planner loop is needed. +/// Instances are per-task — they close over one +/// and one . +/// +internal sealed class BrowserTaskTools +{ + private readonly IBrowserAgentPage _page; + private readonly BrowserTaskState _state; + private readonly HostAllowlist _allowlist; + private readonly ILogger _logger; + + public BrowserTaskTools( + IBrowserAgentPage page, + BrowserTaskState state, + HostAllowlist allowlist, + ILogger logger) + { + _page = page; + _state = state; + _allowlist = allowlist; + _logger = logger; + } + + public AIFunction[] BuildFunctions() => + [ + AIFunctionFactory.Create( + Snapshot, + name: "snapshot", + description: "Capture a ref-annotated accessibility snapshot of the current page. Returns a YAML-ish aria tree where each interactive element carries [ref=eN]; pass those refs to click/type/wait_for. 
Re-snapshot after every click/type/navigate — refs from a previous snapshot are invalid once the page mutates."), + AIFunctionFactory.Create( + Navigate, + name: "navigate", + description: "Navigate the current page to an absolute URL. The URL's host must be on the task's allowlist; off-list navigation is rejected before the request is issued."), + AIFunctionFactory.Create( + Click, + name: "click", + description: "Click an element by ref (e.g. 'e12'). Refs come from the most recent snapshot."), + AIFunctionFactory.Create( + Type, + name: "type", + description: "Fill an input/textarea/contenteditable identified by ref. Pass the target value; prior contents are replaced."), + AIFunctionFactory.Create( + WaitFor, + name: "wait_for", + description: "Wait until the element identified by ref is visible. Use after an action that triggers navigation or async UI update."), + AIFunctionFactory.Create( + Done, + name: "done", + description: "Mark the task complete. Pass a short summary of what was accomplished and optionally a structured result string. After calling done, stop emitting tool calls."), + AIFunctionFactory.Create( + Fail, + name: "fail", + description: "Mark the task failed with a reason explaining what went wrong. After calling fail, stop emitting tool calls.") + ]; + + private const string BudgetMessage = "Step budget exhausted — call done() with whatever was achieved, or fail() with a reason. Do not call other tools."; + + [Description("Capture a ref-annotated aria snapshot of the current page.")] + private async Task Snapshot() + { + if (_state.BudgetExhausted) return BudgetMessage; + _state.IncrementStep(); + var url = _page.CurrentUrl; + var title = await _page.GetTitleAsync(); + var snapshot = await _page.AriaSnapshotAsync(); + _state.RecordNavigation(url); + _logger.LogInformation("browser-task step {Step}: snapshot {Url}", _state.Steps, url); + return $"Url: {url}\nTitle: {title ?? 
"(none)"}\n\n{snapshot}"; + } + + [Description("Navigate to an absolute URL within the allowlist.")] + private async Task Navigate( + [Description("Absolute http(s) URL to load. Must match an allowlist pattern.")] string url) + { + if (_state.BudgetExhausted) return BudgetMessage; + _state.IncrementStep(); + if (!Uri.TryCreate(url, UriKind.Absolute, out var target) || + (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps)) + return $"Rejected: '{url}' is not an absolute http(s) URL."; + if (!_allowlist.IsAllowed(target)) + return $"Rejected: host '{target.Host}' is not on the allowlist."; + await _page.NavigateAsync(target); + _state.RecordNavigation(target); + _logger.LogInformation("browser-task step {Step}: navigate {Url}", _state.Steps, target); + return $"Loaded {target}. Call snapshot to see the page."; + } + + [Description("Click an element by ref.")] + private async Task Click( + [Description("The element ref (e.g. 'e12') from the latest snapshot.")] string @ref) + { + if (_state.BudgetExhausted) return BudgetMessage; + _state.IncrementStep(); + await _page.ClickByRefAsync(@ref); + _logger.LogInformation("browser-task step {Step}: click ref={Ref}", _state.Steps, @ref); + return $"Clicked {@ref}. Call snapshot to see the resulting page."; + } + + [Description("Fill a field by ref.")] + private async Task Type( + [Description("The element ref (e.g. 'e12') from the latest snapshot.")] string @ref, + [Description("Text to enter into the field, replacing any prior contents.")] string text) + { + if (_state.BudgetExhausted) return BudgetMessage; + _state.IncrementStep(); + await _page.TypeByRefAsync(@ref, text); + // Never log the value — may be a password or other sensitive content. 
+ _logger.LogInformation("browser-task step {Step}: type ref={Ref} ({Length} chars)", + _state.Steps, @ref, text.Length); + return $"Typed into {@ref}."; + } + + [Description("Wait for an element to become visible.")] + private async Task WaitFor( + [Description("The element ref to wait for.")] string @ref, + [Description("Timeout in seconds; default 10.")] int? timeoutSeconds = null) + { + if (_state.BudgetExhausted) return BudgetMessage; + _state.IncrementStep(); + var timeout = TimeSpan.FromSeconds(timeoutSeconds ?? 10); + try + { + await _page.WaitForRefAsync(@ref, timeout); + return $"{@ref} visible."; + } + catch (TimeoutException) + { + return $"Timeout: {@ref} did not become visible within {timeout.TotalSeconds:0}s."; + } + } + + [Description("Mark the task complete.")] + private string Done( + [Description("One-sentence summary of what was accomplished.")] string summary, + [Description("Optional structured result text (JSON, extracted value, etc.). Omit if no result is expected.")] string? result = null) + { + _state.Completed(summary, result); + _logger.LogInformation("browser-task done after {Steps} step(s): {Summary}", _state.Steps, summary); + return "Task marked complete. Stop emitting tool calls."; + } + + [Description("Mark the task failed.")] + private string Fail( + [Description("Reason the task could not complete.")] string reason) + { + _state.Failed(reason); + _logger.LogWarning("browser-task failed after {Steps} step(s): {Reason}", _state.Steps, reason); + return "Task marked failed. Stop emitting tool calls."; + } +} + +/// +/// Shared state between the tool surface and the capability wrapper. Not +/// thread-safe — a browser-task runs one tool at a time inside the +/// function-invoking chat client. 
+/// +internal sealed class BrowserTaskState +{ + public int Steps { get; private set; } + public int MaxSteps { get; set; } = int.MaxValue; + public bool IsTerminal => IsDone || IsFailed; + public bool IsDone { get; private set; } + public bool IsFailed { get; private set; } + public string? Summary { get; private set; } + public string? Result { get; private set; } + public string? FailureReason { get; private set; } + public List Navigations { get; } = []; + + public bool BudgetExhausted => Steps >= MaxSteps; + + public void IncrementStep() => Steps++; + + public void RecordNavigation(Uri url) + { + if (Navigations.Count == 0 || Navigations[^1] != url) + Navigations.Add(url); + } + + public void Completed(string summary, string? result) + { + IsDone = true; + Summary = summary; + Result = result; + } + + public void Failed(string reason) + { + IsFailed = true; + FailureReason = reason; + } +} diff --git a/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs b/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs new file mode 100644 index 0000000..831b767 --- /dev/null +++ b/src/Foragent.Capabilities/BrowserTask/HostAllowlist.cs @@ -0,0 +1,106 @@ +namespace Foragent.Capabilities.BrowserTask; + +/// +/// Parses and evaluates the per-task allowed-hosts list from spec §7.1. +/// Supports exact hosts (bsky.app), subdomain wildcards +/// (*.example.com matches foo.example.com but not +/// example.com), and the unrestricted sentinel (*). Empty +/// input is rejected at construction — there is no default-permissive mode. +/// +public sealed class HostAllowlist +{ + private readonly string[] _exact; + private readonly string[] _suffix; + private readonly bool _wildcardAll; + + private HostAllowlist(string[] exact, string[] suffix, bool wildcardAll) + { + _exact = exact; + _suffix = suffix; + _wildcardAll = wildcardAll; + } + + /// The raw patterns, preserved for audit logging. + public IReadOnlyList Patterns { get; private init; } = []; + + /// + /// Parses . 
Throws + /// when the list is empty or contains an invalid pattern. Per-pattern + /// rules: + /// + /// * alone — match all hosts. + /// *.host — match any subdomain of host (not host itself). + /// host — match the exact host (case-insensitive). + /// + /// + public static HostAllowlist Parse(IEnumerable patterns) + { + ArgumentNullException.ThrowIfNull(patterns); + + var raw = new List(); + var exact = new List(); + var suffix = new List(); + var wildcardAll = false; + + foreach (var entry in patterns) + { + if (string.IsNullOrWhiteSpace(entry)) + continue; + var pattern = entry.Trim().ToLowerInvariant(); + raw.Add(pattern); + + if (pattern == "*") + { + wildcardAll = true; + continue; + } + + if (pattern.StartsWith("*.", StringComparison.Ordinal)) + { + var tail = pattern[2..]; + if (string.IsNullOrEmpty(tail) || tail.Contains('*') || tail.StartsWith('.')) + throw new ArgumentException( + $"Invalid allowlist pattern '{entry}'. Expected '*.domain.tld'.", + nameof(patterns)); + suffix.Add("." + tail); + continue; + } + + if (pattern.Contains('*')) + throw new ArgumentException( + $"Invalid allowlist pattern '{entry}'. Only '*' or '*.host' wildcards are supported.", + nameof(patterns)); + + exact.Add(pattern); + } + + if (raw.Count == 0) + throw new ArgumentException( + "Allowlist is empty; an empty allowlist rejects all hosts (spec §7.1).", + nameof(patterns)); + + return new HostAllowlist([.. exact], [.. suffix], wildcardAll) { Patterns = raw }; + } + + /// Returns true if is permitted. + public bool IsAllowed(string host) + { + if (string.IsNullOrEmpty(host)) + return false; + if (_wildcardAll) + return true; + var normalized = host.ToLowerInvariant(); + foreach (var e in _exact) + if (e == normalized) return true; + foreach (var s in _suffix) + if (normalized.EndsWith(s, StringComparison.Ordinal)) return true; + return false; + } + + /// Returns true if 's host is permitted. 
+ public bool IsAllowed(Uri url) + { + ArgumentNullException.ThrowIfNull(url); + return IsAllowed(url.Host); + } +} diff --git a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs index ce248d0..e2ea5a1 100644 --- a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs +++ b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs @@ -1,3 +1,4 @@ +using Foragent.Capabilities.BrowserTask; using Foragent.Capabilities.SitePosting; using Microsoft.Extensions.DependencyInjection; using RockBot.A2A; @@ -16,6 +17,7 @@ public static IServiceCollection AddForagentCapabilities(this IServiceCollection services.AddScoped(); services.AddScoped(); services.AddScoped(); + services.AddScoped(); services.AddScoped(); services.AddScoped(); return services; @@ -32,6 +34,7 @@ public static class ForagentCapabilities { public static IReadOnlyList Skills { get; } = [ + BrowserTaskCapability.SkillDefinition, FetchPageTitleCapability.SkillDefinition, ExtractStructuredDataCapability.SkillDefinition, PostToSiteCapability.SkillDefinition diff --git a/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs new file mode 100644 index 0000000..902c6da --- /dev/null +++ b/tests/Foragent.Agent.Tests/BrowserTask/BrowserTaskCapabilityTests.cs @@ -0,0 +1,170 @@ +using System.Text.Json; +using Foragent.Capabilities.BrowserTask; +using Foragent.Credentials; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging.Abstractions; +using RockBot.A2A; +using Xunit; + +namespace Foragent.Agent.Tests.BrowserTask; + +public class BrowserTaskCapabilityTests +{ + [Fact] + public async Task RejectsInput_WhenAllowlistMissing() + { + var (capability, _, _) = Build(ScriptedChatClient.Text("unused")); + var (ctx, _) = TestContext.Build(); + + var result = await 
capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"do stuff"}"""), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + Assert.Contains("allowedHosts", TestContext.TextOf(result)); + } + + [Fact] + public async Task RejectsInput_WhenIntentMissing() + { + var (capability, _, _) = Build(ScriptedChatClient.Text("unused")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"allowedHosts":["*"]}"""), + ctx); + + Assert.Contains("intent", TestContext.TextOf(result)); + } + + [Fact] + public async Task DonePayload_ReturnsStructuredJson() + { + var (capability, page, _) = Build( + ScriptedChatClient.ToolCall("snapshot"), + ScriptedChatClient.ToolCall("done", new { summary = "found the title", result = "Example Domain" }), + ScriptedChatClient.Text("stopping")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"read the page title","url":"https://example.com/","allowedHosts":["example.com"]}"""), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + var text = TestContext.TextOf(result); + using var doc = JsonDocument.Parse(text); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal("found the title", doc.RootElement.GetProperty("summary").GetString()); + Assert.Equal("Example Domain", doc.RootElement.GetProperty("result").GetString()); + Assert.Equal(1, doc.RootElement.GetProperty("steps").GetInt32()); + Assert.Contains("snapshot", page.Actions); + } + + [Fact] + public async Task Fail_ReturnsFailedStatus() + { + var (capability, _, _) = Build( + ScriptedChatClient.ToolCall("fail", new { reason = "unreachable" }), + ScriptedChatClient.Text("stopping")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"try 
something","allowedHosts":["*"]}"""), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("failed", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal("unreachable", doc.RootElement.GetProperty("summary").GetString()); + } + + [Fact] + public async Task BudgetExhausted_ReturnsIncompleteStatus() + { + // Script drives the planner to snapshot repeatedly past the budget. + // maxSteps=2 — after 2 snapshot calls the tool returns a budget + // message; the model keeps calling snapshot but each call is a + // budget-exhausted no-op. The script runs out and returns a final + // text, ending the loop naturally. + var responses = new List(); + for (var i = 0; i < 10; i++) + responses.Add(ScriptedChatClient.ToolCall("snapshot")); + responses.Add(ScriptedChatClient.Text("giving up")); + + var (capability, _, _) = Build([.. responses]); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"spin forever","allowedHosts":["*"],"maxSteps":2}"""), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("incomplete", doc.RootElement.GetProperty("status").GetString()); + } + + [Fact] + public async Task OffAllowlistStartingUrl_IsRejected() + { + var (capability, _, _) = Build(ScriptedChatClient.Text("unused")); + var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"go","url":"https://other.example/","allowedHosts":["bsky.app"]}"""), + ctx); + + Assert.Contains("not in the allowlist", TestContext.TextOf(result)); + } + + [Fact] + public async Task NavigateTool_RejectsOffAllowlistHosts() + { + var (capability, page, _) = Build( + ScriptedChatClient.ToolCall("navigate", new { url = "https://evil.example/phish" }), + ScriptedChatClient.ToolCall("done", new { summary = "model gave up" }), + ScriptedChatClient.Text("stopping")); + 
var (ctx, _) = TestContext.Build(); + + var result = await capability.ExecuteAsync( + TestContext.Request("browser-task", + """{"intent":"try","allowedHosts":["bsky.app"]}"""), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + // Fake page.NavigateAsync never ran because the tool rejected first. + Assert.DoesNotContain(page.Actions, a => a.StartsWith("navigate:")); + } + + private static (BrowserTaskCapability Capability, FakeBrowserAgentPage Page, FakeAgentBrowserSessionFactory Factory) Build( + params ChatResponse[] script) + { + var page = new FakeBrowserAgentPage(); + var factory = new FakeAgentBrowserSessionFactory(page); + + var scripted = new ScriptedChatClient(script); + var wrapped = new ChatClientBuilder(scripted) + .UseFunctionInvocation() + .Build(); + + var broker = new StubCredentialBroker(); + + var capability = new BrowserTaskCapability( + factory, + wrapped, + broker, + NullLogger.Instance); + + return (capability, page, factory); + } + + private sealed class StubCredentialBroker : ICredentialBroker + { + public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) => + throw new CredentialNotFoundException(credentialId); + } +} diff --git a/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs new file mode 100644 index 0000000..8027593 --- /dev/null +++ b/tests/Foragent.Agent.Tests/BrowserTask/FakeBrowserAgentPage.cs @@ -0,0 +1,87 @@ +using Foragent.Browser; + +namespace Foragent.Agent.Tests.BrowserTask; + +internal sealed class FakeBrowserAgentPage : IBrowserAgentPage +{ + public List Actions { get; } = []; + public Uri CurrentUrl { get; set; } = new("about:blank"); + public string Snapshot { get; set; } = "- heading: stub page\n- button \"ok\" [ref=e1]"; + public string? 
Title { get; set; } = "stub"; + + public Task GetTitleAsync(CancellationToken cancellationToken = default) => + Task.FromResult(Title); + + public Task NavigateAsync(Uri url, CancellationToken cancellationToken = default) + { + Actions.Add($"navigate:{url}"); + CurrentUrl = url; + return Task.CompletedTask; + } + + public Task AriaSnapshotAsync(CancellationToken cancellationToken = default) + { + Actions.Add("snapshot"); + return Task.FromResult(Snapshot); + } + + public Task ClickByRefAsync(string elementRef, CancellationToken cancellationToken = default) + { + Actions.Add($"click:{elementRef}"); + return Task.CompletedTask; + } + + public Task TypeByRefAsync(string elementRef, string text, CancellationToken cancellationToken = default) + { + Actions.Add($"type:{elementRef}:{text.Length}"); + return Task.CompletedTask; + } + + public Task WaitForRefAsync(string elementRef, TimeSpan? timeout = null, CancellationToken cancellationToken = default) + { + Actions.Add($"wait:{elementRef}"); + return Task.CompletedTask; + } + + public ValueTask DisposeAsync() + { + Actions.Add("dispose"); + return ValueTask.CompletedTask; + } +} + +internal sealed class FakeAgentBrowserSession(FakeBrowserAgentPage page) : IBrowserSession +{ + public Task FetchPageTitleAsync(Uri url, CancellationToken ct = default) => + throw new NotSupportedException(); + + public Task CapturePageSnapshotAsync(Uri url, CancellationToken ct = default) => + throw new NotSupportedException(); + + public Task OpenPageAsync(Uri url, CancellationToken ct = default) => + throw new NotSupportedException(); + + public Task OpenAgentPageAsync(CancellationToken ct = default) => + Task.FromResult(page); + + public ValueTask DisposeAsync() => ValueTask.CompletedTask; +} + +internal sealed class FakeAgentBrowserSessionFactory(FakeBrowserAgentPage page) : IBrowserSessionFactory +{ + public Func? 
CapturedAllowlist { get; private set; } + public int SessionsCreated { get; private set; } + + public Task CreateSessionAsync(CancellationToken ct = default) + { + SessionsCreated++; + return Task.FromResult(new FakeAgentBrowserSession(page)); + } + + public Task CreateSessionAsync(Func allowedHost, CancellationToken ct = default) + { + CapturedAllowlist = allowedHost; + SessionsCreated++; + return Task.FromResult(new FakeAgentBrowserSession(page)); + } +} diff --git a/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs b/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs new file mode 100644 index 0000000..febaa73 --- /dev/null +++ b/tests/Foragent.Agent.Tests/BrowserTask/HostAllowlistTests.cs @@ -0,0 +1,67 @@ +using Foragent.Capabilities.BrowserTask; +using Xunit; + +namespace Foragent.Agent.Tests.BrowserTask; + +public class HostAllowlistTests +{ + [Fact] + public void ExactHost_Matches_CaseInsensitively() + { + var list = HostAllowlist.Parse(["bsky.app"]); + Assert.True(list.IsAllowed(new Uri("https://bsky.app/"))); + Assert.True(list.IsAllowed(new Uri("https://BSKY.APP/"))); + Assert.False(list.IsAllowed(new Uri("https://foo.bsky.app/"))); + Assert.False(list.IsAllowed(new Uri("https://example.com/"))); + } + + [Fact] + public void SubdomainWildcard_MatchesSubdomainButNotBareDomain() + { + var list = HostAllowlist.Parse(["*.example.com"]); + Assert.True(list.IsAllowed(new Uri("https://foo.example.com/"))); + Assert.True(list.IsAllowed(new Uri("https://a.b.example.com/"))); + Assert.False(list.IsAllowed(new Uri("https://example.com/"))); + Assert.False(list.IsAllowed(new Uri("https://example.org/"))); + } + + [Fact] + public void StarAloneMatchesAny() + { + var list = HostAllowlist.Parse(["*"]); + Assert.True(list.IsAllowed(new Uri("https://any.host.tld/"))); + } + + [Fact] + public void EmptyListIsRejected() + { + Assert.Throws(() => HostAllowlist.Parse([])); + Assert.Throws(() => HostAllowlist.Parse([""])); + Assert.Throws(() => 
HostAllowlist.Parse([" "])); + } + + [Fact] + public void MultiPatternList_OrsEntries() + { + var list = HostAllowlist.Parse(["bsky.app", "*.example.com"]); + Assert.True(list.IsAllowed(new Uri("https://bsky.app/"))); + Assert.True(list.IsAllowed(new Uri("https://foo.example.com/"))); + Assert.False(list.IsAllowed(new Uri("https://example.com/"))); + } + + [Theory] + [InlineData("*.")] + [InlineData("*.*")] + [InlineData("foo*bar")] + public void InvalidPattern_IsRejected(string pattern) + { + Assert.Throws(() => HostAllowlist.Parse([pattern])); + } + + [Fact] + public void PreservesPatterns_ForAuditLogging() + { + var list = HostAllowlist.Parse([" bsky.app ", "*.Example.com"]); + Assert.Equal(["bsky.app", "*.example.com"], list.Patterns); + } +} diff --git a/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs b/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs new file mode 100644 index 0000000..3c3aab2 --- /dev/null +++ b/tests/Foragent.Agent.Tests/BrowserTask/ScriptedChatClient.cs @@ -0,0 +1,69 @@ +using Microsoft.Extensions.AI; + +namespace Foragent.Agent.Tests.BrowserTask; + +/// +/// Raw IChatClient that plays a scripted sequence of assistant turns. The +/// production IChatClient injected into BrowserTaskCapability is the +/// RockBot-wrapped , so in tests we +/// build the same shape via + +/// . +/// +internal sealed class ScriptedChatClient : IChatClient +{ + private readonly Queue _responses; + + public int Turns { get; private set; } + + public ScriptedChatClient(params ChatResponse[] responses) + { + _responses = new Queue(responses); + } + + public static ChatResponse Text(string text) => + new(new ChatMessage(ChatRole.Assistant, text)); + + public static ChatResponse ToolCall(string name, object? args = null) + { + var id = $"call_{Guid.NewGuid():N}"; + var dict = args is IDictionary d + ? 
(IDictionary)d + : ObjectToDictionary(args); + var call = new FunctionCallContent(id, name, dict); + return new ChatResponse(new ChatMessage(ChatRole.Assistant, [call])); + } + + public Task GetResponseAsync( + IEnumerable messages, + ChatOptions? options = null, + CancellationToken cancellationToken = default) + { + Turns++; + if (_responses.Count == 0) + return Task.FromResult(Text("(script exhausted — stopping)")); + return Task.FromResult(_responses.Dequeue()); + } + +#pragma warning disable CS1998 + public async IAsyncEnumerable GetStreamingResponseAsync( + IEnumerable messages, + ChatOptions? options = null, + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken = default) + { + yield break; + } +#pragma warning restore CS1998 + + public object? GetService(Type serviceType, object? serviceKey = null) => null; + + public void Dispose() { } + + private static Dictionary ObjectToDictionary(object? source) + { + var result = new Dictionary(StringComparer.OrdinalIgnoreCase); + if (source is null) return result; + foreach (var p in source.GetType().GetProperties()) + result[p.Name] = p.GetValue(source); + return result; + } +} diff --git a/tests/Foragent.Agent.Tests/TestDoubles.cs b/tests/Foragent.Agent.Tests/TestDoubles.cs index cab0d0c..237baad 100644 --- a/tests/Foragent.Agent.Tests/TestDoubles.cs +++ b/tests/Foragent.Agent.Tests/TestDoubles.cs @@ -106,6 +106,12 @@ public Task CreateSessionAsync(CancellationToken ct = default) return Task.FromResult(new StubSession(this)); } + public Task CreateSessionAsync(Func allowedHost, CancellationToken ct = default) + { + SessionsCreated++; + return Task.FromResult(new StubSession(this)); + } + private sealed class StubSession(StubBrowserSessionFactory owner) : IBrowserSession { public Task FetchPageTitleAsync(Uri url, CancellationToken ct = default) => @@ -117,6 +123,9 @@ public Task CapturePageSnapshotAsync(Uri url, CancellationToken ct public Task OpenPageAsync(Uri url, 
CancellationToken ct = default) => owner.PageResponder(url, ct); + public Task OpenAgentPageAsync(CancellationToken ct = default) => + throw new NotSupportedException("StubBrowserSessionFactory does not expose an agent page; test BrowserTaskCapability with a dedicated fake."); + public ValueTask DisposeAsync() { owner.SessionsDisposed++; diff --git a/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs new file mode 100644 index 0000000..e2da5b1 --- /dev/null +++ b/tests/Foragent.Browser.Tests/BrowserTaskIntegrationTests.cs @@ -0,0 +1,268 @@ +using System.ClientModel; +using System.Text.Json; +using Foragent.Capabilities.BrowserTask; +using Foragent.Credentials; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Http; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using OpenAI; +using RockBot.A2A; +using RockBot.Host; +using RockBot.Llm; +using RockBot.Messaging; +using Xunit; + +namespace Foragent.Browser.Tests; + +/// +/// Step 6's "small curated benchmark" (spec §9.1). Three Kestrel-hosted +/// scenarios exercise the real end-to-end +/// with real Chromium + a real LLM. Skipped when FORAGENT_LLM_* env +/// vars are unset, so the main test run stays free of network / API +/// dependencies. Establishes the unaided success floor before step 7 adds +/// learned-skill priming. +/// +[Collection("Playwright")] +public class BrowserTaskIntegrationTests(TestPageServerFixture fixture) +{ + [SkippableFact] + public async Task ClickThrough_FindsDestinationText() + { + var config = LlmConfig.FromEnvironment(); + Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark."); + + await using var server = await StartSiteAsync(app => + { + app.MapGet("/", () => Results.Content(""" + Home +

Welcome

See details

+ """, "text/html")); + app.MapGet("/details", () => Results.Content(""" + Details +

Details

SECRET-TOKEN-42

+ """, "text/html")); + }); + + var capability = BuildCapability(config!); + var request = Request(""" + {"intent":"Starting from the home page, click the See details link and report the secret token shown on the destination page as the result.", + "url":"%URL%/", + "allowedHosts":["127.0.0.1"], + "maxSteps":20} + """.Replace("%URL%", server.BaseUrl)); + + var result = await capability.ExecuteAsync(request, BuildContext()); + var payload = ParsePayload(result); + + Assert.Equal("done", payload.Status); + Assert.Contains("SECRET-TOKEN-42", payload.Result ?? payload.Summary ?? string.Empty); + } + + [SkippableFact] + public async Task FormSubmit_ReportsConfirmationMessage() + { + var config = LlmConfig.FromEnvironment(); + Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark."); + + await using var server = await StartSiteAsync(app => + { + app.MapGet("/form", () => Results.Content(""" + Contact + +

Contact us

+
+ + + +
+ + """, "text/html")); + app.MapPost("/submit", async (HttpRequest r) => + { + var form = await r.ReadFormAsync(); + var name = form["name"].ToString(); + return Results.Content($""" + Thanks +

Thanks {name}, we received your message.

+ """, "text/html"); + }); + }); + + var capability = BuildCapability(config!); + var request = Request(""" + {"intent":"Fill the contact form with name 'Rocky' and message 'Hello from step 6', submit it, and report the confirmation headline you see next.", + "url":"%URL%/form", + "allowedHosts":["127.0.0.1"], + "maxSteps":20} + """.Replace("%URL%", server.BaseUrl)); + + var result = await capability.ExecuteAsync(request, BuildContext()); + var payload = ParsePayload(result); + + Assert.Equal("done", payload.Status); + var combined = (payload.Result ?? string.Empty) + " " + (payload.Summary ?? string.Empty); + Assert.Contains("Rocky", combined); + Assert.Contains("received your message", combined, StringComparison.OrdinalIgnoreCase); + } + + [SkippableFact] + public async Task MultiPageNav_ReadsNestedContent() + { + var config = LlmConfig.FromEnvironment(); + Skip.If(config is null, "FORAGENT_LLM_* env vars not set — skipping browser-task benchmark."); + + await using var server = await StartSiteAsync(app => + { + app.MapGet("/", () => Results.Content(""" + Docs index +

Docs

+ """, "text/html")); + app.MapGet("/intro", () => Results.Content(""" + Intro +

Intro

Chapter 2

+ """, "text/html")); + app.MapGet("/intro/chapter-2", () => Results.Content(""" + Chapter 2 +

Chapter 2: the widget

+

The answer you seek is FORTY-TWO.

+ """, "text/html")); + app.MapGet("/advanced", () => Results.Content(""" + No answer here. + """, "text/html")); + }); + + var capability = BuildCapability(config!); + var request = Request(""" + {"intent":"Starting from the docs index, navigate into Intro and then into its Chapter 2, and return the strong-emphasised answer word you find there.", + "url":"%URL%/", + "allowedHosts":["127.0.0.1"], + "maxSteps":30} + """.Replace("%URL%", server.BaseUrl)); + + var result = await capability.ExecuteAsync(request, BuildContext()); + var payload = ParsePayload(result); + + Assert.Equal("done", payload.Status); + Assert.Contains("FORTY-TWO", (payload.Result ?? string.Empty) + (payload.Summary ?? string.Empty)); + } + + // ── helpers ────────────────────────────────────────────────────────────── + + private BrowserTaskCapability BuildCapability(LlmConfig config) + { + var openAi = new OpenAIClient( + new ApiKeyCredential(config.ApiKey), + new OpenAIClientOptions { Endpoint = new Uri(config.Endpoint) }); + var inner = openAi.GetChatClient(config.ModelId).AsIChatClient(); + + // Match the production wiring — the capability sees a + // function-invoking IChatClient (same shape as what + // AddRockBotTieredChatClients installs in Program.cs). 
+ var chatClient = new ChatClientBuilder(inner) + .UseFunctionInvocation() + .Build(); + + return new BrowserTaskCapability( + fixture.Factory, + chatClient, + new NoCredentialsBroker(), + NullLogger.Instance); + } + + private static AgentTaskRequest Request(string json) => new() + { + TaskId = Guid.NewGuid().ToString(), + Skill = BrowserTaskCapability.SkillDefinition.Id, + Message = new AgentMessage + { + Role = "user", + Parts = [new AgentMessagePart { Kind = "text", Text = json }] + } + }; + + private static AgentTaskContext BuildContext() + { + var envelope = MessageEnvelope.Create( + messageType: typeof(AgentTaskRequest).FullName!, + body: ReadOnlyMemory.Empty, + source: "test"); + var messageContext = new MessageHandlerContext + { + Envelope = envelope, + Agent = new AgentIdentity("Foragent"), + Services = new ServiceCollection().BuildServiceProvider(), + CancellationToken = CancellationToken.None + }; + return new AgentTaskContext + { + MessageContext = messageContext, + PublishStatus = (_, _) => Task.CompletedTask + }; + } + + private static TaskPayload ParsePayload(AgentTaskResult result) + { + var text = result.Message?.Parts.FirstOrDefault(p => p.Kind == "text")?.Text ?? "{}"; + using var doc = JsonDocument.Parse(text); + var r = doc.RootElement; + return new TaskPayload( + Status: r.TryGetProperty("status", out var s) ? s.GetString() : null, + Summary: r.TryGetProperty("summary", out var sm) ? sm.GetString() : null, + Result: r.TryGetProperty("result", out var rs) && rs.ValueKind != JsonValueKind.Null + ? 
rs.GetString() : null); + } + + private static async Task StartSiteAsync(Action configure) + { + var builder = WebApplication.CreateEmptyBuilder(new WebApplicationOptions()); + builder.WebHost.UseKestrelCore(); + builder.WebHost.UseUrls("http://127.0.0.1:0"); + builder.Services.AddRoutingCore(); + builder.Logging.ClearProviders(); + + var app = builder.Build(); + app.UseRouting(); + configure(app); + await app.StartAsync(); + var addresses = app.Services + .GetRequiredService() + .Features.Get()! + .Addresses; + return new SiteHost(app, addresses.First().TrimEnd('/')); + } + + private sealed record TaskPayload(string? Status, string? Summary, string? Result); + + private sealed record SiteHost(WebApplication App, string BaseUrl) : IAsyncDisposable + { + public async ValueTask DisposeAsync() => await App.DisposeAsync(); + } + + private sealed record LlmConfig(string Endpoint, string ModelId, string ApiKey) + { + public static LlmConfig? FromEnvironment() + { + var endpoint = Environment.GetEnvironmentVariable("FORAGENT_LLM_ENDPOINT"); + var model = Environment.GetEnvironmentVariable("FORAGENT_LLM_MODEL_ID"); + var key = Environment.GetEnvironmentVariable("FORAGENT_LLM_API_KEY"); + if (string.IsNullOrWhiteSpace(endpoint) + || string.IsNullOrWhiteSpace(model) + || string.IsNullOrWhiteSpace(key)) + { + return null; + } + return new LlmConfig(endpoint, model, key); + } + } + + private sealed class NoCredentialsBroker : ICredentialBroker + { + public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) => + throw new CredentialNotFoundException(credentialId); + } +}