diff --git a/CLAUDE.md b/CLAUDE.md index ffb3e72..bf6d232 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Status -Foragent is at **milestone 7.5 shipped, step 8 next**. Three capabilities are advertised (`browser-task`, `fetch-page-title`, `extract-structured-data`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness pinned to `rockylhotka/rockbot-agent:0.8.5`. Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. The step-6 unaided benchmark (3/3) still passes after the priming wiring. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. 
Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. +Foragent is at **milestone 8 shipped, step 9 next**. Five capabilities are advertised (`browser-task`, `learn-form-schema`, `execute-form-batch`, `fetch-page-title`, `extract-structured-data`); the A2A loop is wired end-to-end against RockBot via the `docker-compose.yml` harness (image pinned to `rockylhotka/rockbot-agent:0.9.11`, which brings the structured-data `invoke_agent` surface from PR #291 so RockBot can consume Foragent's FormSchema JSON natively rather than as prose). RockBot framework refs are on `0.9.*` (bumped from `0.8.*` for step 8's multi-file skill API). Step 6 shipped the generalist `browser-task` planner (LLM-in-the-loop over ref-annotated aria snapshots + `aria-ref=eN` locator resolution, built on `Microsoft.Playwright` 1.59 — bumped from 1.50 for the Ai aria-snapshot mode; see Appendix A #16). Tiered chat clients are wired via `AddRockBotTieredChatClients` with one model aliased across Low/Balanced/High per spec §3.7. Step 7 wired the learning substrate: `ISkillStore` + `ILongTermMemory` via `WithSkills()` + `WithLongTermMemory()`, `BrowserTaskPriming` injects retrieved skill + memory content into the planner prompt, successful tasks write a learned skill at `sites/{host}/learned/{slug}`, and `BskySeedSkillService` seeds `sites/bsky.app/login` on first start (idempotent — only writes when absent). Step 8 added `learn-form-schema` + `execute-form-batch` (spec §5.5 multi-phase flow): learn returns a typed `FormSchema` and persists it as a `Skill` plus a `SkillResourceType.JsonSchema` resource at `sites/{host}/forms/{slug}`; execute resolves schemas via `ISkillStore.GetResourceAsync("schema.json")` (or accepts them inline), submits each row, and streams per-row progress via `AgentTaskContext.PublishStatus`. Default mode is **abort-on-first** (spec open-question #8 resolution); callers opt into `"continue"` for known-messy batches. 
Spec open-question #6 (typed JSON alongside markdown skills) is resolved by the RockBot 0.9 multi-file skill API — no parallel Foragent-local store was needed. Embeddings are optional and configured separately under `ForagentEmbeddings` so they can live on a different Azure Foundry subscription than the chat model; missing embeddings downgrade retrieval to BM25-only with a single startup warning. `post-to-site` has been removed from both the advertised skill list and the codebase (greenfield deletion — `browser-task` + the learned bsky skill cover the use case). The governing spec is `docs/foragent-specification.md` **v0.2**. Storage-state persistence, 2FA input-required flow, k8s-secrets broker, and per-tenant credential namespaces remain deferred — tracked in `docs/framework-feedback.md`. Framework-level observations from each milestone are captured in `docs/framework-feedback.md`. ## Build / test @@ -68,13 +68,13 @@ Key framework pieces Foragent uses today: - `RockBot.Host.AddRockBotHost` + `AgentHostBuilder.AddA2A` — bus-side agent registration. Subscribes to `agent.task.{agentName}` on RabbitMQ. - `RockBot.A2A.IAgentTaskHandler` — the single per-agent extension point. `ForagentTaskHandler` (in `Foragent.Capabilities`) implements this and dispatches on `request.Skill`. - `RockBot.A2A.Gateway.AddA2AHttpGateway` + `MapA2AHttpGateway` — the in-process HTTP surface. Published as NuGet in RockBot 0.8.4 (see `docs/framework-feedback.md`). -- `RockBot.Host.AgentMemoryExtensions.WithSkills` / `WithLongTermMemory` — file-backed `ISkillStore` + `ILongTermMemory` (step 7). `ISkillStore.SearchAsync` takes an explicit `float[]? queryEmbedding`; callers compute the embedding. `Skill` record is lean (`Name, Summary, Content, CreatedAt, UpdatedAt?, LastUsedAt?, SeeAlso`) — no tags or importance field. +- `RockBot.Host.AgentMemoryExtensions.WithSkills` / `WithLongTermMemory` — file-backed `ISkillStore` + `ILongTermMemory` (step 7). 
`ISkillStore.SearchAsync` takes an explicit `float[]? queryEmbedding`; callers compute the embedding. `Skill` record is lean (`Name, Summary, Content, CreatedAt, UpdatedAt?, LastUsedAt?, SeeAlso, Manifest?`). Multi-file skills (RockBot 0.9.x, step 8): `ISkillStore.SaveAsync(skill, resources)` writes a bundle atomically, `GetResourceAsync(skillName, filename)` reads a resource's text. `Skill.Manifest` is an `IReadOnlyList?` describing attached files (`SkillResourceType.JsonSchema`, `Python`, `Wisp`, `Markdown`, `Text`, `Other`). **A plain `SaveAsync(skill)` preserves the existing manifest** (framework commit 2db3775 fix #1), so dream-loop skill-optimize rewriting markdown content won't drop Foragent's attached schemas. Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any rockbot-side `LLM` config so the two agents can point at different models. Program.cs fails fast at startup if `ForagentLlm:Endpoint`/`ModelId`/`ApiKey` are missing. Starting step 6 the single configured model is wired via `AddRockBotTieredChatClients(low, balanced, high)` aliased to the same inner `IChatClient`; that one call registers both `IChatClient` (wrapped with `RockBotFunctionInvokingChatClient` for automatic tool invocation) and `TieredChatClientRegistry` (per spec §3.7). Don't also call `AddRockBotChatClient` — it would swap out the wrapped registration. Capabilities that want to escalate/de-escalate per request can resolve `TieredChatClientRegistry` and call `GetClient(ModelTier.Low|Balanced|High)`; none do today. ## Browser -`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). 
`IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read) for multi-step flows like login + post, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. +`Foragent.Browser` wraps Playwright. `AddForagentBrowser()` in `Foragent.Agent/Program.cs` registers `PlaywrightBrowserHost` (`IHostedService` owning one shared Chromium per process) and `IBrowserSessionFactory` (hands out a fresh `IBrowserContext` per A2A task — isolation guarantee from spec §3.5). `IBrowserSession` exposes `FetchPageTitleAsync` / `CapturePageSnapshotAsync` for one-shot reads, `OpenPageAsync` → `IBrowserPage` (navigate / fill / click / wait / read / `SelectOptionAsync` / `SetCheckedAsync` / `ScanFormAsync`) for multi-step flows like login + post and form submission, and `OpenAgentPageAsync` → `IBrowserAgentPage` for LLM-in-the-loop planners (ref-annotated aria snapshots + `aria-ref=eN` locator resolution). `ScanFormAsync` (added in step 8) runs a single JS pass in the page to collect inputs/selects/textareas with labels, validation attributes, and select/radio options — returns a typed `FormScan` that the form capabilities lift into `FormSchema`. 
The snapshot uses Chromium's aria-snapshot (via `Locator.AriaSnapshotAsync`; `Mode = AriaSnapshotMode.Ai` gets the ref-annotated form) and falls back to `<body>` inner text when the tree is empty. Selectors passed to `IBrowserPage` use Playwright's string-selector dialect (CSS + `role=role[name="..."]`); **regex is not accepted in string form**, use exact attribute matches. `Foragent.Browser` has `InternalsVisibleTo("Foragent.Browser.Tests")` so tests drive the real `PlaywrightBrowserSessionFactory` without promoting its implementation types to public. `CreateSessionAsync(Func allowedHost, ...)` is the step-6 entry point for allowlist-scoped sessions. The factory installs a context-wide `RouteAsync("**/*", ...)` that aborts off-list document/subframe navigations before Playwright issues the request (spec §7.1). The no-argument overload accepts any host and stays available for specialists that enforce narrower rules elsewhere. @@ -87,6 +87,7 @@ Foragent requires an LLM. Config lives under `ForagentLlm` — separate from any - `ForagentCapabilities.Skills` (static array) is the single source of truth for advertised skills — both the bus-side `AgentCard.Skills` and the HTTP gateway's `opts.Skills` read from it. - `CapabilityInput.Parse` is the shared URL + description shim used by `fetch-page-title` and `extract-structured-data`. Capabilities with different input shapes parse their own input near the capability (e.g. `BrowserTaskInput` in `BrowserTask/`). Don't overload `CapabilityInput` for unrelated shapes. - `browser-task` (in `BrowserTask/`) is the generalist planner (spec §5.2). `BrowserTaskInput` parses intent + mandatory `allowedHosts` + optional `url` / `credentialId` / `maxSteps` (default 60, ceiling 150) / `maxSeconds` (default 120, ceiling 600). 
`BrowserTaskTools` wraps `snapshot` / `navigate` / `click` / `type` / `wait_for` / `done` / `fail` as `AIFunction`s via `AIFunctionFactory.Create` and passes them in `ChatOptions.Tools`; the RockBot-wrapped function-invoking `IChatClient` runs the full model ↔ tool loop inside one `GetResponseAsync` call. Budget is enforced tool-side (each tool checks `BrowserTaskState.BudgetExhausted`) because Microsoft.Extensions.AI does not surface per-request iteration caps through `ChatOptions`; wall-clock is a linked `CancellationTokenSource`. **Never log tool arguments verbatim** — `type` carries user-supplied values that may be sensitive (log length only). Refs from a snapshot are valid only until the next mutating call; the system prompt and tool descriptions both state this, but don't code anything that assumes cross-snapshot ref stability. +- `learn-form-schema` and `execute-form-batch` (both in `Forms/`) are the step-8 phase-1 / phase-3 pair (spec §5.5). `FormSchema` / `FormField` are the wire contract — stable JSON shape, stored via `FormSchema.SerializerOptions`. `LearnFormSchemaCapability` navigates, calls `IBrowserPage.ScanFormAsync`, maps the raw scan to `FormSchema` via the deterministic `FormSchemaMapper` (pure — no LLM), then runs `FormSchemaEnricher` for one LLM turn to infer `dependsOn` and a note (skipped when there are no select/radio fields). The schema is persisted as a `Skill` bundle at `sites/{host}/forms/{slug}` with a `SkillResourceType.JsonSchema` resource named `schema.json`. Only the enricher can add `dependsOn` / `notes` — structural fields (type, selector, required, options) come only from the DOM scan, so the LLM cannot fabricate fields or rewrite selectors. `ExecuteFormBatchCapability` accepts `schemaRef` (resolves via `ISkillStore.GetResourceAsync(name, "schema.json")`) or an inline `schema`, and submits each row with `FillAsync` / `SelectOptionAsync` / `SetCheckedAsync` per field type. 
Per-row progress is published via `AgentTaskContext.PublishStatus(new AgentTaskStatusUpdate { State = Working, Message = … })`. Default `mode` is `"abort-on-first"` (spec open-question #8 resolution: a failed submit on a mutating form usually indicates a real problem, so continuing would just generate more bad submissions); callers opt into `"continue"` for known-messy batches. Success detection: an optional `successIndicator` CSS selector is the preferred signal; the fallback is URL change after submit, which fails (correctly) for forms that submit in place. File uploads and multi-step wizards are **out of scope** for v0.2 — `ScanFormAsync` explicitly skips `type=file`, and there's no flow control for wizards. ## Learning substrate (step 7) @@ -103,7 +104,7 @@ On successful completion (`state.IsDone`), `BrowserTaskCapability.TryWriteLearne `BskySeedSkillService` (IHostedService) seeds `sites/bsky.app/login` on first start by calling `ISkillStore.GetAsync` and only writing if absent — docker volume recreation reseeds cleanly; operator edits to the skill through other channels are preserved. -Skill naming follows spec §5.6: `sites/{host}/{intent}` for human-authored primers, `sites/{host}/learned/{slug}` for agent-generated. `Skill.SeeAlso` cross-references related skills to surface clusters rather than single entries. **Note:** `Skill` (from `RockBot.Host 0.8.5`) does not carry tags, metadata, or importance — the `agent-learned` distinction is encoded in the name prefix only. The dream loop (below) keeps the distinction from mattering at retrieval time: skills get improved, merged, and deduped across origins on a daily cadence. +Skill naming follows spec §5.6: `sites/{host}/{intent}` for human-authored primers, `sites/{host}/learned/{slug}` for agent-generated, `sites/{host}/forms/{slug}` for learned form schemas (step 8). `Skill.SeeAlso` cross-references related skills to surface clusters rather than single entries. 
**Note:** `Skill` (from `RockBot.Host 0.9.x`) carries `Manifest: IReadOnlyList?` for multi-file bundles but still doesn't carry tags, metadata, or importance — the `agent-learned` vs `human-authored` vs `form-schema` distinction is encoded in the name prefix only. The dream loop (below) keeps the distinction from mattering at retrieval time: skills get improved, merged, and deduped across origins on a daily cadence; `SaveAsync(skill)` with null resources preserves the existing manifest so dream-loop markdown edits don't orphan attached resource files. ## Dream loop (step 7.5) @@ -132,3 +133,4 @@ Credential ids are free-form via user-secrets/appsettings (slashes are fine — - Contributions are not yet open externally (`CONTRIBUTING.md`). Internal changes are fine. - APIs are explicitly unstable — the README warns "APIs will change without notice." Backwards-compat shims and deprecation paths are not expected at this stage; prefer clean replacements. +- **Version scheme.** `Directory.Build.props` sets `<Version>` for every assembly/package in the solution. Format: `{spec-major}.{spec-minor}.0-alpha.{step}` while the spec is pre-1.0 — e.g. `0.2.0-alpha.8` = implementing spec v0.2, step 8 has shipped. Bump the `alpha.N` segment when a step's PR merges to `main`; bump the spec major/minor when a new spec revision is adopted. Drop `-alpha.N` only when the spec version is declared stable and Foragent actually ships NuGet/Docker artifacts (spec §10.3 — "published once the project actually works end-to-end. No placeholder packages."). No git tags yet; add them when the first artifact is published. 
diff --git a/Directory.Build.props b/Directory.Build.props index c097140..74c6bcf 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -6,7 +6,7 @@ enable true true - 0.1.0-alpha + 0.2.0-alpha.8 Marimer LLC Marimer LLC Copyright (c) Marimer LLC diff --git a/Directory.Packages.props b/Directory.Packages.props index 53db1f9..bfe8260 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -17,14 +17,15 @@ - - - - - - - - + + + + + + + + + diff --git a/docker-compose.yml b/docker-compose.yml index bd170e4..4c8a7dc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,8 +5,11 @@ # - foragent — this project; exposes HTTP A2A on port 5210 # - rockbot-init — seeds /data/agent with RockBot profile + well-known-agents.json # pointing at foragent -# - rockbot — rockylhotka/rockbot-agent:0.8.5, configured to know Foragent -# as an A2A peer it can delegate tasks to +# - rockbot — rockylhotka/rockbot-agent:0.9.11, configured to know Foragent +# as an A2A peer it can delegate tasks to. 0.9.11 brings +# the structured-data invoke_agent surface (PR #291) so +# RockBot can consume Foragent's FormSchema JSON results +# natively, not as text. # - blazor — rockylhotka/rockbot-blazor:latest, web UI for chatting with # the rockbot agent. Open http://localhost:8080 to test. 
# @@ -61,7 +64,7 @@ services: RabbitMq__VirtualHost: / Gateway__AgentName: Foragent Gateway__InternalAgentName: Foragent - Gateway__Description: "Browser agent — browser-task (generalist), fetch-page-title, extract-structured-data" + Gateway__Description: "Browser agent — browser-task (generalist), learn-form-schema, execute-form-batch, fetch-page-title, extract-structured-data" # RockBot will call Foragent with header X-Api-Key: rockbot-calls-foragent ApiKeys__rockbot-calls-foragent__AgentId: RockBot ApiKeys__rockbot-calls-foragent__DisplayName: RockBot @@ -97,7 +100,7 @@ services: - foragent-data:/data/foragent rockbot-init: - image: rockylhotka/rockbot-agent:0.8.5 + image: rockylhotka/rockbot-agent:0.9.11 user: root entrypoint: ["/bin/sh", "-c"] command: @@ -133,7 +136,7 @@ services: - ./deploy/rockbot-seed:/seed:ro rockbot: - image: rockylhotka/rockbot-agent:0.8.5 + image: rockylhotka/rockbot-agent:0.9.11 depends_on: rockbot-init: condition: service_completed_successfully diff --git a/docs/framework-feedback.md b/docs/framework-feedback.md index 08915c8..0481e9c 100644 --- a/docs/framework-feedback.md +++ b/docs/framework-feedback.md @@ -484,3 +484,92 @@ docker-compose: First live dream pass against a non-empty skill store will be observed after enough `browser-task` runs accumulate — probably step 8 or when the operator turns the harness on for a sustained session. + +## Step 8 — `learn-form-schema` + `execute-form-batch` + +### Framework observations + +- **Multi-file skill API (RockBot 0.9) closes spec open-question #6 + cleanly.** Step 8 needed typed JSON schemas alongside the existing + markdown-shaped skills; we'd sketched three options (fenced JSON in + the skill body, a parallel Foragent-local typed store, or an upstream + framework extension). The upstream extension had already landed in + `rockbot` main — `Skill.Manifest: IReadOnlyList?` plus + `ISkillStore.SaveAsync(skill, resources)` and + `GetResourceAsync(skillName, filename)`. 
`SkillResourceType.JsonSchema` + is literally the enum value this use case needed. Foragent consumed it + directly: `LearnFormSchemaCapability` writes the skill bundle, + `ExecuteFormBatchCapability` reads `schema.json` back, no parallel + store. The "framework is the substrate" discipline from spec §8 + actually paid off here — we'd have thrown away a Foragent-local store + one step later when the framework landed this. + +- **`SaveAsync(skill)` preserving the manifest is the important bit.** + Per commit 2db3775 fix #1, a plain + `ISkillStore.SaveAsync(skill)` call preserves the existing + `Manifest` when the incoming skill doesn't carry one. That means the + daily dream loop's `skill-optimize` subtype (which rewrites + markdown content) can't accidentally orphan resource files, and + Foragent's `learn-form-schema` can update a skill's prose primer + without re-writing the schema resource. Without this property, the + dream loop would silently delete Foragent's typed schemas over time. + Worth documenting prominently in RockBot's multi-file-skill guide — + future framework consumers will trip on "my resources disappeared" + otherwise. + +- **`AgentTaskContext.PublishStatus` works unchanged for per-row + streaming.** Step 8's `execute-form-batch` publishes + `AgentTaskStatusUpdate { State = Working, Message = …per-row text… }` + between row submissions. The surface from 0.8.x is still right for + this, and matches how `RockBot.ResearchAgent` uses it for its + iterative research loop. Nothing to change; noting so the next + step-N capability that wants streaming knows the shape is stable. + +- **Credential broker still doesn't know about storage-state or + per-tenant scoping.** `learn-form-schema` and `execute-form-batch` + both accept `credentialId` but only resolve-and-discard it for + audit / fail-fast; the actual authenticated-form flow (storage-state + reuse from a prior `browser-task` login) is still the step-4 deferred + item. 
Not new — noting because step 8's capabilities would naturally + use this if it existed. When storage-state lands, both form + capabilities grow `storageStateCredentialId` support in one pass. + +- **Foragent-local `FakeSkillStore` still a 40-line hand-rolled + double.** Step 8 adds more surface area (`SaveAsync(skill, resources)` + + `GetResourceAsync`) and the fake needs to match `FileSkillStore`'s + manifest-preservation behavior to be a faithful substitute. Still + noting from step 7: a `RockBot.Host.Testing` package with in-memory + implementations would let Foragent delete both `FakeSkillStore` in + `Foragent.Agent.Tests` and `InMemorySkillStore` in + `Foragent.Browser.Tests`, and would surface any future-proof gaps + in the fakes once a framework change lands. + +### Spec resolutions + +- **Open-question #6 (structured artifacts in `ISkillStore`): + resolved upstream.** No Foragent-local typed store. `schema.json` + resources under `SkillResourceType.JsonSchema`, consumed via + `GetResourceAsync`. Step 8 ships the reference pattern. +- **Open-question #8 (batch retry/failure semantics): resolved as + abort-on-first default, caller-opt-in continue.** Rationale: forms + mutate, so a row failure is likely a schema or session issue where + continuing would generate more bad submissions, not recover. Per-row + status stays in the final result regardless. Deciding abort-by-default + aligns with how human operators would handle a failed row during a + paper-form batch. + +### Verification + +- Unit tests in `Foragent.Agent.Tests/Forms/` — 14 tests covering input + validation, schema round-trip through `SkillResource`, abort-on-first + vs continue semantics, `successIndicator` path, and required-field + validation. Run time ~3s. 
/// <summary>
/// A deterministic rendering of an HTML form. Produced by
/// <see cref="IBrowserPage.ScanFormAsync"/>; the learn-form-schema
/// capability lifts this into the wire-level <c>FormSchema</c> with optional
/// LLM enrichment (dropdown dependencies, validation hints).
/// </summary>
/// <param name="Url">The URL the scan was taken on (after redirects).</param>
/// <param name="FormSelector">A CSS selector that reaches the scanned form — either the one the caller passed in, or a generated one based on the form's id/name.</param>
/// <param name="SubmitSelector">Selector for the form's submit control, or null if none was detected.</param>
/// <param name="Fields">Fields detected in document order. Radio groups appear once per group name.</param>
public sealed record FormScan(
    Uri Url,
    string FormSelector,
    string? SubmitSelector,
    IReadOnlyList<FormScanField> Fields);

    // ---- IBrowserPage members added for form support ----

    /// <summary>
    /// Selects the option with <paramref name="value"/> in the &lt;select&gt;
    /// matched by <paramref name="selector"/>. Throws if the option is absent.
    /// </summary>
    /// <param name="selector">Selector of the &lt;select&gt; element.</param>
    /// <param name="value">The option to select.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task SelectOptionAsync(string selector, string value, CancellationToken cancellationToken = default);

    /// <summary>
    /// Sets the checked state of a checkbox or radio input matched by
    /// <paramref name="selector"/>. Unlike <see cref="ClickAsync"/>, this is
    /// idempotent — calling with <c>true</c> when the box is already checked
    /// is a no-op rather than a toggle.
    /// </summary>
    /// <param name="selector">Selector of the checkbox or radio input.</param>
    /// <param name="checked_">The desired checked state.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task SetCheckedAsync(string selector, bool checked_, CancellationToken cancellationToken = default);

    /// <summary>
    /// Scans the first &lt;form&gt; matching <paramref name="formSelector"/>
    /// (or the first form on the page when <paramref name="formSelector"/> is
    /// <c>null</c>) and returns a structured description of its inputs, selects,
    /// textareas, labels, validation attributes, and submit button. Produces no
    /// LLM output — purely deterministic DOM reading — so callers can use it as
    /// the skeleton for a typed <c>FormSchema</c>. Radio groups are collapsed to
    /// a single field per name with all options enumerated.
    /// </summary>
    /// <param name="formSelector">CSS selector of the form to scan, or <c>null</c> for the first form on the page.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The scan, or <c>null</c> when no form is found.</returns>
    Task<FormScan?> ScanFormAsync(string? formSelector = null, CancellationToken cancellationToken = default);
+/// A CSS selector the capability can use to drive the field; null when neither name nor id is present. +public sealed record FormScanField( + string Tag, + string? InputType, + string? Name, + string? Id, + string? Label, + bool Required, + string? Pattern, + string? Min, + string? Max, + int? MaxLength, + IReadOnlyList? Options, + string? Selector); + +/// An option entry for a <select> or radio group. +public sealed record FormScanOption(string Value, string? Label); + /// /// A compact rendering of a page suitable for LLM prompting. /// diff --git a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs index ff7ace2..7dcfb30 100644 --- a/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs +++ b/src/Foragent.Browser/PlaywrightBrowserSessionFactory.cs @@ -1,3 +1,4 @@ +using System.Text.Json; using Microsoft.Playwright; namespace Foragent.Browser; @@ -175,6 +176,18 @@ public Task ClickAsync(string selector, CancellationToken cancellationToken = de return page.ClickAsync(selector); } + public async Task SelectOptionAsync(string selector, string value, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + await page.Locator(selector).SelectOptionAsync(value); + } + + public Task SetCheckedAsync(string selector, bool checked_, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + return page.Locator(selector).SetCheckedAsync(checked_); + } + public async Task WaitForSelectorAsync( string selector, TimeSpan? timeout = null, @@ -210,6 +223,146 @@ public Task GetUrlAsync(CancellationToken cancellationToken = default) return await locator.First.InnerTextAsync(); } + public async Task ScanFormAsync( + string? 
formSelector = null, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + // All the DOM walking happens inside the page — avoids N round-trips + // to read each attribute. The JS returns a JSON-serializable shape + // that mirrors FormScan/FormScanField. + var raw = await page.EvaluateAsync(FormScanScript, formSelector); + if (raw is null || raw.Value.ValueKind != JsonValueKind.Object) + return null; + + var root = raw.Value; + var url = new Uri(root.GetProperty("url").GetString()!); + var selector = root.GetProperty("formSelector").GetString()!; + var submit = root.TryGetProperty("submitSelector", out var s) && s.ValueKind == JsonValueKind.String + ? s.GetString() + : null; + + var fields = new List(); + foreach (var f in root.GetProperty("fields").EnumerateArray()) + { + List? options = null; + if (f.TryGetProperty("options", out var o) && o.ValueKind == JsonValueKind.Array) + { + options = []; + foreach (var item in o.EnumerateArray()) + { + options.Add(new FormScanOption( + item.GetProperty("value").GetString() ?? string.Empty, + item.TryGetProperty("label", out var lbl) && lbl.ValueKind == JsonValueKind.String + ? lbl.GetString() + : null)); + } + } + + fields.Add(new FormScanField( + Tag: f.GetProperty("tag").GetString()!, + InputType: StringOrNull(f, "type"), + Name: StringOrNull(f, "name"), + Id: StringOrNull(f, "id"), + Label: StringOrNull(f, "label"), + Required: f.TryGetProperty("required", out var req) && req.ValueKind == JsonValueKind.True, + Pattern: StringOrNull(f, "pattern"), + Min: StringOrNull(f, "min"), + Max: StringOrNull(f, "max"), + MaxLength: f.TryGetProperty("maxLength", out var ml) && ml.ValueKind == JsonValueKind.Number ? ml.GetInt32() : null, + Options: options, + Selector: StringOrNull(f, "selector"))); + } + + return new FormScan(url, selector, submit, fields); + } + + private static string? 
StringOrNull(JsonElement parent, string name) => + parent.TryGetProperty(name, out var v) && v.ValueKind == JsonValueKind.String + ? v.GetString() + : null; + + // JS walker: resolves labels, collapses radio groups by name, emits a + // canonical selector per field. Kept in one place so it's easy to update + // when we discover a form shape the scan misses. + private const string FormScanScript = """ + (formSelectorOrNull) => { + const q = (root, sel) => root.querySelector(sel); + const form = formSelectorOrNull ? q(document, formSelectorOrNull) : q(document, 'form'); + if (!form) return null; + const labelFor = (el) => { + if (el.id) { + const lab = document.querySelector(`label[for="${CSS.escape(el.id)}"]`); + if (lab) return lab.innerText.trim(); + } + const wrap = el.closest('label'); + if (wrap) return wrap.innerText.trim(); + const aria = el.getAttribute('aria-label'); + if (aria) return aria.trim(); + const placeholder = el.getAttribute('placeholder'); + if (placeholder) return placeholder.trim(); + return null; + }; + const emitSelector = (el) => { + const tag = el.tagName.toLowerCase(); + const type = (el.type || '').toLowerCase(); + if (tag === 'input' && type === 'radio') { + return el.name ? 
`input[type="radio"][name=${JSON.stringify(el.name)}]` : null; + } + if (el.name) return `${tag}[name=${JSON.stringify(el.name)}]`; + if (el.id) return `#${CSS.escape(el.id)}`; + return null; + }; + const seenRadios = new Set(); + const fields = []; + const controls = form.querySelectorAll('input, select, textarea'); + for (const el of controls) { + const tag = el.tagName.toLowerCase(); + const type = (el.type || '').toLowerCase(); + if (tag === 'input' && (type === 'submit' || type === 'button' || type === 'reset' || type === 'image' || type === 'file')) continue; + let options = null; + if (tag === 'select') { + options = Array.from(el.options).map(o => ({ value: o.value, label: (o.text || '').trim() })); + } else if (tag === 'input' && type === 'radio') { + if (!el.name || seenRadios.has(el.name)) continue; + seenRadios.add(el.name); + const group = form.querySelectorAll(`input[type="radio"][name="${CSS.escape(el.name)}"]`); + options = Array.from(group).map(r => ({ value: r.value, label: labelFor(r) || r.value })); + } + const ml = el.maxLength; + fields.push({ + tag, + type: type || null, + name: el.name || null, + id: el.id || null, + label: labelFor(el), + required: !!el.required, + pattern: el.getAttribute('pattern'), + min: el.getAttribute('min'), + max: el.getAttribute('max'), + maxLength: (typeof ml === 'number' && ml > 0) ? 
ml : null, + options, + selector: emitSelector(el), + }); + } + let submitSelector = null; + const submit = form.querySelector('button[type="submit"], input[type="submit"], button:not([type])'); + if (submit) { + if (submit.id) submitSelector = `#${CSS.escape(submit.id)}`; + else if (submit.getAttribute('name')) submitSelector = `${submit.tagName.toLowerCase()}[name=${JSON.stringify(submit.getAttribute('name'))}]`; + else if (submit.tagName.toLowerCase() === 'button') submitSelector = submit.hasAttribute('type') ? 'button[type="submit"]' : 'button:not([type])'; // a type-less <button> is matched above via :not([type]); [type="submit"] would never select it + else submitSelector = 'input[type="submit"]'; + } + let formSelector = formSelectorOrNull; + if (!formSelector) { + if (form.id) formSelector = `#${CSS.escape(form.id)}`; + else if (form.getAttribute('name')) formSelector = `form[name=${JSON.stringify(form.getAttribute('name'))}]`; + else formSelector = 'form'; + } + return { url: location.href, formSelector, submitSelector, fields }; + } + """; + public ValueTask DisposeAsync() => new(page.CloseAsync()); } diff --git a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs index ff6de1c..169de19 100644 --- a/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs +++ b/src/Foragent.Capabilities/ForagentCapabilitiesServiceCollectionExtensions.cs @@ -1,4 +1,5 @@ using Foragent.Capabilities.BrowserTask; +using Foragent.Capabilities.Forms; using Microsoft.Extensions.DependencyInjection; using RockBot.A2A; @@ -16,7 +17,10 @@ public static IServiceCollection AddForagentCapabilities(this IServiceCollection services.AddScoped(); services.AddScoped(); services.AddScoped(); + services.AddScoped(); + services.AddScoped(); services.AddScoped(); + services.AddScoped(); services.AddScoped(); return services; } @@ -33,6 +37,8 @@ public static class ForagentCapabilities public static IReadOnlyList Skills { get; } = [ BrowserTaskCapability.SkillDefinition, + LearnFormSchemaCapability.SkillDefinition, + 
ExecuteFormBatchCapability.SkillDefinition, FetchPageTitleCapability.SkillDefinition, ExtractStructuredDataCapability.SkillDefinition ]; diff --git a/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs b/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs new file mode 100644 index 0000000..3b83aee --- /dev/null +++ b/src/Foragent.Capabilities/Forms/ExecuteFormBatchCapability.cs @@ -0,0 +1,324 @@ +using System.Text.Json; +using Foragent.Browser; +using Foragent.Credentials; +using Microsoft.Extensions.Logging; +using RockBot.A2A; +using RockBot.Host; + +namespace Foragent.Capabilities.Forms; + +/// +/// Phase-3 of the learn → review → execute flow (spec §5.5). Submits a batch +/// of rows against a previously-learned , either +/// inline or fetched from by name. Streams per-row +/// s while running. +/// +/// Default mode is abort-on-first (spec open-question #8 resolution): +/// a row failure halts the batch, since continuing past a failure on a +/// mutating form typically generates bad data rather than surfacing the +/// real problem. Callers opt into continue when row-level data +/// quality is known-messy and partial success is desired. +/// +public sealed class ExecuteFormBatchCapability( + IBrowserSessionFactory browserFactory, + ICredentialBroker credentialBroker, + ISkillStore skillStore, + ILogger logger) : ICapability +{ + public static AgentSkill SkillDefinition { get; } = new() + { + Id = "execute-form-batch", + Name = "Execute Form Batch", + Description = "Submit a batch of rows against a learned form schema. " + + "Input: JSON {\"schemaRef\":\"sites/host/forms/name\" OR \"schema\":{...FormSchema...},\"rows\":[{field:value,...}],\"allowedHosts\":[\"host\"],\"credentialId\":\"optional\",\"mode\":\"abort-on-first\"|\"continue\",\"successIndicator\":\"optional CSS selector\"}. " + + "Streams per-row progress. Default mode aborts on first failure." 
+ }; + + public string SkillId => SkillDefinition.Id; + public AgentSkill Skill => SkillDefinition; + + public async Task ExecuteAsync(AgentTaskRequest request, AgentTaskContext context) + { + var ct = context.MessageContext.CancellationToken; + var input = ExecuteFormBatchInput.Parse(request); + if (input.Error is not null) + return CapabilityResult.Error(request, input.Error); + + if (!string.IsNullOrWhiteSpace(input.CredentialId)) + { + try + { + _ = await credentialBroker.ResolveAsync(input.CredentialId!, ct); + } + catch (CredentialNotFoundException ex) + { + return CapabilityResult.Error(request, $"Credential '{ex.CredentialId}' is not configured."); + } + } + + FormSchema schema; + try + { + schema = await ResolveSchemaAsync(input); + } + catch (Exception ex) + { + return CapabilityResult.Error(request, ex.Message); + } + + if (!Uri.TryCreate(schema.Url, UriKind.Absolute, out var formUrl)) + return CapabilityResult.Error(request, $"Schema URL '{schema.Url}' is not a valid absolute URL."); + if (!input.Allowlist!.IsAllowed(formUrl)) + return CapabilityResult.Error(request, $"Schema URL host '{formUrl.Host}' is not in the allowlist."); + if (string.IsNullOrWhiteSpace(schema.SubmitSelector)) + return CapabilityResult.Error(request, "Schema does not specify a submit selector — cannot submit without it."); + + var results = new List(input.Rows!.Count); + var submittedAny = false; + + try + { + await using var session = await browserFactory.CreateSessionAsync(input.Allowlist.IsAllowed, ct); + + for (var i = 0; i < input.Rows.Count; i++) + { + ct.ThrowIfCancellationRequested(); + var row = input.Rows[i]; + + await PublishProgressAsync(context, request, $"Submitting row {i + 1} of {input.Rows.Count}...", ct); + + var rowResult = await SubmitRowAsync(session, schema, row, input.SuccessIndicator, i, ct); + results.Add(rowResult); + if (rowResult.Status == RowStatus.Success) + submittedAny = true; + + if (rowResult.Status != RowStatus.Success && input.Mode == 
ExecuteFormBatchMode.AbortOnFirst) + { + logger.LogInformation( + "Aborting batch after row {Index} failure: {Reason}", i, rowResult.Error); + break; + } + } + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + logger.LogWarning(ex, "execute-form-batch failed outside the row loop after {Count} row(s)", results.Count); + return CapabilityResult.Error(request, $"Batch failed: {ex.Message}"); + } + + var successCount = results.Count(r => r.Status == RowStatus.Success); + var failureCount = results.Count(r => r.Status != RowStatus.Success); + var pending = input.Rows.Count - results.Count; + + string status; + if (failureCount == 0 && pending == 0) + status = "done"; + else if (submittedAny) + status = "partial"; + else + status = "failed"; + + var payload = new + { + status, + mode = input.Mode == ExecuteFormBatchMode.Continue ? "continue" : "abort-on-first", + successCount, + failureCount, + pending, + rows = results.Select(r => new + { + index = r.Index, + status = r.Status switch + { + RowStatus.Success => "success", + RowStatus.ValidationError => "validation-error", + _ => "failed" + }, + error = r.Error + }).ToArray() + }; + return CapabilityResult.Completed( + request, + JsonSerializer.Serialize(payload, FormSchema.SerializerOptions)); + } + + private async Task ResolveSchemaAsync(ExecuteFormBatchInput input) + { + if (input.InlineSchema is not null) + return input.InlineSchema; + + var content = await skillStore.GetResourceAsync(input.SchemaRef!, "schema.json"); + if (content is null) + throw new InvalidOperationException( + $"No schema.json resource found for skill '{input.SchemaRef}'. 
Run learn-form-schema first."); + + try + { + return FormSchema.Deserialize(content); + } + catch (Exception ex) when (ex is JsonException or InvalidOperationException) + { + throw new InvalidOperationException( + $"Skill '{input.SchemaRef}' has a schema.json resource that isn't a valid FormSchema: {ex.Message}"); + } + } + + private async Task SubmitRowAsync( + IBrowserSession session, + FormSchema schema, + IReadOnlyDictionary row, + string? externalSuccessIndicator, + int index, + CancellationToken ct) + { + // Pre-validate required fields so we fail before spending a network + // round-trip on a row that could never succeed. + var missing = schema.Fields + .Where(f => f.Required && f.Type != FormFieldType.Hidden) + .Where(f => !row.TryGetValue(f.Name, out var v) || string.IsNullOrEmpty(v)) + .Select(f => f.Name) + .ToList(); + if (missing.Count > 0) + { + return new RowResult(index, RowStatus.ValidationError, + $"Missing required field(s): {string.Join(", ", missing)}."); + } + + var formUri = new Uri(schema.Url); + IBrowserPage? page = null; + try + { + page = await session.OpenPageAsync(formUri, ct); + var formSelector = string.IsNullOrWhiteSpace(schema.FormSelector) ? 
"form" : schema.FormSelector!; + try + { + await page.WaitForSelectorAsync(formSelector, TimeSpan.FromSeconds(15), ct); + } + catch (TimeoutException) + { + return new RowResult(index, RowStatus.Failed, $"Form '{formSelector}' did not appear within 15s."); + } + + foreach (var field in schema.Fields) + { + if (field.Type == FormFieldType.Hidden) + continue; + if (!row.TryGetValue(field.Name, out var value)) + continue; + + try + { + await SetFieldAsync(page, field, value, ct); + } + catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested) + { + // A requested cancellation must propagate to the caller, not be recorded as a row failure. + return new RowResult(index, RowStatus.Failed, $"Failed to fill '{field.Name}': {ex.Message}"); + } + } + + var urlBefore = await page.GetUrlAsync(ct); + try + { + await page.ClickAsync(schema.SubmitSelector!, ct); + } + catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested) + { + // Same rule for the click: only non-cancellation failures become row results. + return new RowResult(index, RowStatus.Failed, $"Failed to click submit '{schema.SubmitSelector}': {ex.Message}"); + } + + var indicator = externalSuccessIndicator ?? schema.SuccessIndicator; + if (!string.IsNullOrWhiteSpace(indicator)) + { + try + { + await page.WaitForSelectorAsync(indicator, TimeSpan.FromSeconds(15), ct); + return new RowResult(index, RowStatus.Success, null); + } + catch (TimeoutException) + { + return new RowResult(index, RowStatus.Failed, + $"Success indicator '{indicator}' did not appear within 15s."); + } + } + + // No explicit success signal — fall back to URL change as a weak + // heuristic. Forms that submit in place without navigation will + // need a successIndicator; the error mentions it. 
+ await Task.Delay(500, ct); + var urlAfter = await page.GetUrlAsync(ct); + if (urlAfter != urlBefore) + return new RowResult(index, RowStatus.Success, null); + + return new RowResult(index, RowStatus.Failed, + "URL did not change after submit and no successIndicator was provided; cannot confirm success."); + } + finally + { + if (page is not null) + await page.DisposeAsync(); + } + } + + private static async Task SetFieldAsync( + IBrowserPage page, + FormField field, + string value, + CancellationToken ct) + { + switch (field.Type) + { + case FormFieldType.Select: + await page.SelectOptionAsync(field.Selector, value, ct); + break; + case FormFieldType.Checkbox: + await page.SetCheckedAsync(field.Selector, ParseBool(value), ct); + break; + case FormFieldType.Radio: + // The schema's radio selector matches the whole group; append a + // value attribute to pick one option. Quote it as a CSS string: + // System.Text.Json would emit \uXXXX escapes (for ', ", non-ASCII) + // that CSS does not understand, so such values never matched. + var quoted = value.Replace("\\", "\\\\").Replace("\"", "\\\""); + await page.SetCheckedAsync($"{field.Selector}[value=\"{quoted}\"]", true, ct); + break; + default: + await page.FillAsync(field.Selector, value, ct); + break; + } + } + + private static bool ParseBool(string value) => + value.Equals("true", StringComparison.OrdinalIgnoreCase) + || value == "1" + || value.Equals("on", StringComparison.OrdinalIgnoreCase) + || value.Equals("yes", StringComparison.OrdinalIgnoreCase); + + private static Task PublishProgressAsync( + AgentTaskContext context, AgentTaskRequest request, string message, CancellationToken ct) => + context.PublishStatus(new AgentTaskStatusUpdate + { + TaskId = request.TaskId, + ContextId = request.ContextId, + State = AgentTaskState.Working, + Message = new AgentMessage + { + Role = "agent", + Parts = [new AgentMessagePart { Kind = "text", Text = message }] + } + }, ct); + + private enum RowStatus + { + Success, + ValidationError, + Failed + } + + private sealed record RowResult(int Index, RowStatus Status, string? 
Error); +} diff --git a/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs new file mode 100644 index 0000000..82d9fdd --- /dev/null +++ b/src/Foragent.Capabilities/Forms/ExecuteFormBatchInput.cs @@ -0,0 +1,153 @@ +using System.Text.Json; +using Foragent.Capabilities.BrowserTask; +using RockBot.A2A; + +namespace Foragent.Capabilities.Forms; + +/// +/// Parses input for execute-form-batch (spec §5.2, phase-3 of §5.5). +/// +/// Shape (JSON in the first text part — metadata pass-through doesn't fit +/// multi-row shapes): +/// +/// schemaRef — optional. Skill name produced by learn-form-schema. +/// schema — optional. Inline JSON. +/// rows — required, non-empty. Each row is an object keyed by field name. +/// allowedHosts — required. Host allowlist (spec §7.1). +/// credentialId — optional. Resolved and discarded (reserved for future auth). +/// mode — optional. abort-on-first (default, spec open-question #8 resolution) or continue. +/// successIndicator — optional CSS selector that signals successful submission. +/// +/// Exactly one of schemaRef and schema must be present. +/// +internal readonly record struct ExecuteFormBatchInput( + string? SchemaRef, + FormSchema? InlineSchema, + IReadOnlyList>? Rows, + HostAllowlist? Allowlist, + string? CredentialId, + ExecuteFormBatchMode Mode, + string? SuccessIndicator, + string? Error) +{ + public static ExecuteFormBatchInput Parse(AgentTaskRequest request) + { + var text = request.Message.Parts + .Where(p => p.Kind == "text") + .Select(p => p.Text) + .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t)) + ?.Trim(); + + if (string.IsNullOrEmpty(text) || !text.StartsWith('{')) + return Fail("Input must be a JSON object with rows, allowedHosts, and either schemaRef or schema."); + + string? schemaRef; + FormSchema? inlineSchema = null; + List>? rows = null; + List? allowedHosts = null; + string? credentialId; + string? modeRaw; + string? 
successIndicator; + + try + { + using var doc = JsonDocument.Parse(text); + var root = doc.RootElement; + schemaRef = root.TryGetProperty("schemaRef", out var sr) ? sr.GetString() : null; + if (root.TryGetProperty("schema", out var s) && s.ValueKind == JsonValueKind.Object) + { + try + { + inlineSchema = JsonSerializer.Deserialize(s.GetRawText(), FormSchema.SerializerOptions); + } + catch (JsonException ex) + { + return Fail($"Inline 'schema' is not a valid FormSchema: {ex.Message}"); + } + } + credentialId = root.TryGetProperty("credentialId", out var c) ? c.GetString() : null; + modeRaw = root.TryGetProperty("mode", out var m) ? m.GetString() : null; + successIndicator = root.TryGetProperty("successIndicator", out var si) ? si.GetString() : null; + + if (root.TryGetProperty("rows", out var r) && r.ValueKind == JsonValueKind.Array) + { + rows = []; + var index = 0; + foreach (var item in r.EnumerateArray()) + { + if (item.ValueKind != JsonValueKind.Object) + return Fail($"Row {index} is not an object."); + var dict = new Dictionary(StringComparer.Ordinal); + foreach (var prop in item.EnumerateObject()) + { + dict[prop.Name] = prop.Value.ValueKind switch + { + JsonValueKind.String => prop.Value.GetString() ?? string.Empty, + JsonValueKind.Number => prop.Value.ToString(), + JsonValueKind.True => "true", + JsonValueKind.False => "false", + JsonValueKind.Null => string.Empty, + _ => prop.Value.GetRawText() + }; + } + rows.Add(dict); + index++; + } + } + + if (root.TryGetProperty("allowedHosts", out var h) && h.ValueKind == JsonValueKind.Array) + allowedHosts = [.. h.EnumerateArray().Select(e => e.GetString() ?? 
string.Empty)]; + } + catch (Exception ex) when (ex is JsonException or InvalidOperationException) // GetString() throws InvalidOperationException for a non-string value such as "mode": 5 + { + return Fail($"Input JSON is malformed: {ex.Message}"); + } + + if (schemaRef is null && inlineSchema is null) + return Fail("Provide either 'schemaRef' (a skill name) or inline 'schema'."); + if (schemaRef is not null && inlineSchema is not null) + return Fail("Provide only one of 'schemaRef' or 'schema', not both."); + if (rows is null || rows.Count == 0) + return Fail("Missing 'rows' — must be a non-empty array of objects."); + if (allowedHosts is null || allowedHosts.Count == 0) + return Fail("Missing 'allowedHosts' — execute-form-batch requires an explicit allowlist (spec §7.1)."); + + HostAllowlist allowlist; + try + { + allowlist = HostAllowlist.Parse(allowedHosts); + } + catch (ArgumentException ex) + { + return Fail(ex.Message); + } + + ExecuteFormBatchMode mode = ExecuteFormBatchMode.AbortOnFirst; + if (modeRaw is not null) + { + mode = modeRaw.Trim().ToLowerInvariant() switch + { + "abort-on-first" or "" or null => ExecuteFormBatchMode.AbortOnFirst, + "continue" => ExecuteFormBatchMode.Continue, + _ => ExecuteFormBatchMode.Unknown + }; + if (mode == ExecuteFormBatchMode.Unknown) + return Fail($"Unknown 'mode' '{modeRaw}'. 
Valid values: 'abort-on-first' (default), 'continue'."); + } + + if (inlineSchema is not null && !(Uri.TryCreate(inlineSchema.Url, UriKind.Absolute, out var inlineUrl) && allowlist.IsAllowed(inlineUrl))) // TryCreate: a missing, relative or malformed URL is rejected here instead of throwing from new Uri(...) + return Fail($"Inline schema URL '{inlineSchema.Url}' is not in the allowlist."); + + return new ExecuteFormBatchInput(schemaRef, inlineSchema, rows, allowlist, credentialId, mode, successIndicator, null); + } + + private static ExecuteFormBatchInput Fail(string message) => + new(null, null, null, null, null, ExecuteFormBatchMode.AbortOnFirst, null, message); +} + +internal enum ExecuteFormBatchMode +{ + AbortOnFirst, + Continue, + Unknown +} diff --git a/src/Foragent.Capabilities/Forms/FormSchema.cs b/src/Foragent.Capabilities/Forms/FormSchema.cs new file mode 100644 index 0000000..adb0fbd --- /dev/null +++ b/src/Foragent.Capabilities/Forms/FormSchema.cs @@ -0,0 +1,93 @@ +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Foragent.Capabilities.Forms; + +/// +/// Typed schema for a web form produced by learn-form-schema (spec §5.2) +/// and consumed by execute-form-batch. Persisted as a +/// resource attached to +/// a named sites/{host}/forms/{slug}. +/// +/// This is the on-the-wire shape — callers exchange it as JSON and reference +/// it by skill name, not by process-local object identity. +/// +public sealed record FormSchema( + [property: JsonPropertyName("version")] int Version, + [property: JsonPropertyName("url")] string Url, + [property: JsonPropertyName("formSelector")] string? FormSelector, + [property: JsonPropertyName("submitSelector")] string? SubmitSelector, + [property: JsonPropertyName("successIndicator")] string? SuccessIndicator, + [property: JsonPropertyName("fields")] IReadOnlyList Fields, + [property: JsonPropertyName("notes")] string? Notes = null) +{ + public const int CurrentVersion = 1; + + /// + /// Stable used for every persisted and + /// wire-transmitted schema. Never rebuild; the string shape is part of the + /// skill resource format. 
+ /// + public static JsonSerializerOptions SerializerOptions { get; } = new() + { + WriteIndented = true, + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + Converters = { new JsonStringEnumConverter(JsonNamingPolicy.CamelCase) } + }; + + public string Serialize() => JsonSerializer.Serialize(this, SerializerOptions); + + public static FormSchema Deserialize(string json) => + JsonSerializer.Deserialize(json, SerializerOptions) + ?? throw new InvalidOperationException("Form schema JSON was null."); +} + +/// +/// One field in a . is a +/// Playwright-dialect CSS selector — the concrete hook execute-form-batch +/// uses to reach the input. is the JSON key callers put +/// values under when they build rows for a batch. +/// +public sealed record FormField( + [property: JsonPropertyName("name")] string Name, + [property: JsonPropertyName("type")] FormFieldType Type, + [property: JsonPropertyName("selector")] string Selector, + [property: JsonPropertyName("label")] string? Label = null, + [property: JsonPropertyName("required")] bool Required = false, + [property: JsonPropertyName("options")] IReadOnlyList? Options = null, + [property: JsonPropertyName("pattern")] string? Pattern = null, + [property: JsonPropertyName("min")] string? Min = null, + [property: JsonPropertyName("max")] string? Max = null, + [property: JsonPropertyName("maxLength")] int? MaxLength = null, + [property: JsonPropertyName("dependsOn")] IReadOnlyList? DependsOn = null); + +/// +/// An option entry for a or +/// field. is the string a +/// caller supplies in a row payload; is the human text +/// rendered on the page. +/// +public sealed record FormFieldOption( + [property: JsonPropertyName("value")] string Value, + [property: JsonPropertyName("label")] string? 
Label = null); + +public enum FormFieldType +{ + Text, + Email, + Password, + Number, + Date, + DateTime, + Time, + Url, + Tel, + Search, + Color, + Hidden, + TextArea, + Select, + Radio, + Checkbox +} diff --git a/src/Foragent.Capabilities/Forms/FormSchemaEnricher.cs b/src/Foragent.Capabilities/Forms/FormSchemaEnricher.cs new file mode 100644 index 0000000..fb767a1 --- /dev/null +++ b/src/Foragent.Capabilities/Forms/FormSchemaEnricher.cs @@ -0,0 +1,158 @@ +using System.Text.Json; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; + +namespace Foragent.Capabilities.Forms; + +/// +/// Runs one LLM turn over a deterministic to infer +/// field dependencies and add a short human-readable note. Fail-soft: any +/// exception or unparseable output leaves the schema unchanged. +/// +/// The deterministic mapper () owns all +/// structural fields — type, selector, required, options, validation +/// attributes. The enricher can only add and +/// set . This division keeps the LLM unable to +/// invent fields or change how the batch capability reaches them. +/// +public sealed class FormSchemaEnricher( + IChatClient chatClient, + ILogger logger) +{ + private const string SystemPrompt = """ + You are reviewing a deterministic schema just extracted from a web form. + + Your job is narrow: + - Identify any dependency relationships between fields — e.g. the options of a 'state' select are populated only after a 'country' select picks a value. + - Write one short note about the form a future planner would value (no more than 40 words). + + STRICT RULES: + - Do NOT add, remove, or rename fields. + - Do NOT change field types, selectors, required flags, or option lists. + - Do NOT fabricate dependencies that are not suggested by field names, labels, or the existence of multiple selects in the same form. + - Use EXACT field names from the input when emitting dependsOn. 
+ + Respond with valid JSON only, no prose, no code fence: + { + "notes": "short sentence or empty string", + "dependsOn": { + "fieldA": ["fieldB"], + "fieldC": ["fieldD", "fieldE"] + } + } + Omit fields from dependsOn when there is no dependency. An empty object is fine. + """; + + public async Task EnrichAsync(FormSchema schema, CancellationToken ct) + { + ArgumentNullException.ThrowIfNull(schema); + + // Pure-text forms with no selects/radios rarely have inter-field + // dependencies; skip the LLM turn to save tokens. + if (!schema.Fields.Any(f => f.Type is FormFieldType.Select or FormFieldType.Radio)) + return schema; + + try + { + var messages = new List + { + new(ChatRole.System, SystemPrompt), + new(ChatRole.User, BuildUserPrompt(schema)) + }; + // No tools — pure summarisation turn. + var response = await chatClient.GetResponseAsync(messages, new ChatOptions { Tools = [] }, ct); + var text = response.Text?.Trim(); + if (string.IsNullOrWhiteSpace(text)) + return schema; + + var json = ExtractJsonObject(text); + if (json is null) + return schema; + + using var doc = JsonDocument.Parse(json); + return Merge(schema, doc.RootElement); + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + logger.LogDebug(ex, "Form-schema enrichment failed; using deterministic schema as-is."); + return schema; + } + } + + private static string BuildUserPrompt(FormSchema schema) + { + // Feed only the shape the enricher is allowed to touch: names, types, + // labels, options. Selectors and URLs are noise for this turn. + var trimmed = new + { + url = schema.Url, + fields = schema.Fields.Select(f => new + { + name = f.Name, + type = f.Type.ToString(), + label = f.Label, + required = f.Required, + options = f.Options?.Select(o => o.Label ?? o.Value).ToArray() + }).ToArray() + }; + return JsonSerializer.Serialize(trimmed, FormSchema.SerializerOptions); + } + + private static string? 
ExtractJsonObject(string text) + { + var start = text.IndexOf('{'); + var end = text.LastIndexOf('}'); + if (start < 0 || end <= start) + return null; + return text[start..(end + 1)]; + } + + private static FormSchema Merge(FormSchema schema, JsonElement root) + { + string? notes = schema.Notes; + if (root.TryGetProperty("notes", out var notesElement) && + notesElement.ValueKind == JsonValueKind.String) + { + var raw = notesElement.GetString(); + if (!string.IsNullOrWhiteSpace(raw)) + notes = raw.Trim(); + } + + var dependencies = new Dictionary>(StringComparer.Ordinal); + if (root.TryGetProperty("dependsOn", out var dependsRoot) && + dependsRoot.ValueKind == JsonValueKind.Object) + { + var fieldNames = schema.Fields.Select(f => f.Name).ToHashSet(StringComparer.Ordinal); + foreach (var entry in dependsRoot.EnumerateObject()) + { + if (!fieldNames.Contains(entry.Name)) + continue; + if (entry.Value.ValueKind != JsonValueKind.Array) + continue; + var upstream = entry.Value.EnumerateArray() + .Where(v => v.ValueKind == JsonValueKind.String) + .Select(v => v.GetString()!) + .Where(n => fieldNames.Contains(n) && n != entry.Name) + .Distinct(StringComparer.Ordinal) + .ToArray(); + if (upstream.Length > 0) + dependencies[entry.Name] = upstream; + } + } + + if (notes == schema.Notes && dependencies.Count == 0) + return schema; + + var mergedFields = schema.Fields + .Select(f => dependencies.TryGetValue(f.Name, out var deps) + ? 
f with { DependsOn = deps } + : f) + .ToArray(); + + return schema with { Fields = mergedFields, Notes = notes }; + } +} diff --git a/src/Foragent.Capabilities/Forms/FormSchemaMapper.cs b/src/Foragent.Capabilities/Forms/FormSchemaMapper.cs new file mode 100644 index 0000000..838fac0 --- /dev/null +++ b/src/Foragent.Capabilities/Forms/FormSchemaMapper.cs @@ -0,0 +1,109 @@ +using Foragent.Browser; + +namespace Foragent.Capabilities.Forms; + +/// +/// Pure mapping from the browser-layer (raw HTML +/// attributes) to the wire-level (typed fields the +/// capability contract exposes). No LLM, no I/O — the floor that +/// learn-form-schema can always produce even when enrichment fails. +/// +internal static class FormSchemaMapper +{ + public static FormSchema Map(FormScan scan) + { + ArgumentNullException.ThrowIfNull(scan); + + var fields = new List(scan.Fields.Count); + var used = new HashSet(StringComparer.Ordinal); + int unnamedIndex = 0; + + foreach (var raw in scan.Fields) + { + if (raw.Selector is null) + continue; // no deterministic way to reach the field; skip rather than invent + + var name = raw.Name; + if (string.IsNullOrEmpty(name)) + name = raw.Id; + if (string.IsNullOrEmpty(name)) + name = $"field{++unnamedIndex}"; + name = Deduplicate(name!, used); + + var type = InferType(raw); + IReadOnlyList? 
options = null; + if (raw.Options is not null && raw.Options.Count > 0) + options = raw.Options.Select(o => new FormFieldOption(o.Value, o.Label)).ToArray(); + + fields.Add(new FormField( + Name: name, + Type: type, + Selector: raw.Selector, + Label: Normalize(raw.Label), + Required: raw.Required, + Options: options, + Pattern: raw.Pattern, + Min: raw.Min, + Max: raw.Max, + MaxLength: raw.MaxLength, + DependsOn: null)); + } + + return new FormSchema( + Version: FormSchema.CurrentVersion, + Url: scan.Url.ToString(), + FormSelector: scan.FormSelector, + SubmitSelector: scan.SubmitSelector, + SuccessIndicator: null, + Fields: fields, + Notes: null); + } + + private static string Deduplicate(string name, HashSet used) + { + if (used.Add(name)) + return name; + for (var i = 2; ; i++) + { + var candidate = $"{name}_{i}"; + if (used.Add(candidate)) + return candidate; + } + } + + private static FormFieldType InferType(FormScanField raw) => + raw.Tag switch + { + "textarea" => FormFieldType.TextArea, + "select" => FormFieldType.Select, + "input" => raw.InputType switch + { + "email" => FormFieldType.Email, + "password" => FormFieldType.Password, + "number" => FormFieldType.Number, + "date" => FormFieldType.Date, + "datetime-local" or "datetime" => FormFieldType.DateTime, + "time" => FormFieldType.Time, + "url" => FormFieldType.Url, + "tel" => FormFieldType.Tel, + "search" => FormFieldType.Search, + "color" => FormFieldType.Color, + "hidden" => FormFieldType.Hidden, + "checkbox" => FormFieldType.Checkbox, + "radio" => FormFieldType.Radio, + _ => FormFieldType.Text + }, + _ => FormFieldType.Text + }; + + private static string? Normalize(string? label) + { + if (string.IsNullOrWhiteSpace(label)) + return null; + var trimmed = label.Trim(); + // Labels often pick up trailing punctuation or whitespace-heavy markup. + while (trimmed.EndsWith(':') || trimmed.EndsWith('*')) + trimmed = trimmed[..^1].TrimEnd(); + return string.IsNullOrWhiteSpace(trimmed) ? 
null : trimmed; + } +} diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs new file mode 100644 index 0000000..83769cd --- /dev/null +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaCapability.cs @@ -0,0 +1,237 @@ +using System.Text; +using Foragent.Browser; +using Foragent.Credentials; +using Microsoft.Extensions.Logging; +using RockBot.A2A; +using RockBot.Host; + +namespace Foragent.Capabilities.Forms; + +/// +/// Phase-1 of the learn → review → execute flow (spec §5.5). Navigates to a +/// form, scans it deterministically, runs one LLM pass to infer dependencies +/// and compose a short note, persists the schema alongside a markdown primer +/// as a + +/// resource, and returns the schema inline for caller review. +/// +/// Scope cuts for step 8 (spec §9.1 step 8): file uploads and multi-step +/// wizards are out of scope. A single static form per call. +/// +public sealed class LearnFormSchemaCapability( + IBrowserSessionFactory browserFactory, + ICredentialBroker credentialBroker, + FormSchemaEnricher enricher, + ISkillStore skillStore, + ILogger logger) : ICapability +{ + public static AgentSkill SkillDefinition { get; } = new() + { + Id = "learn-form-schema", + Name = "Learn Form Schema", + Description = "Navigate to a web form, extract its structure (fields, types, options, validation), and persist it as a reusable skill. " + + "Input: JSON {\"url\":\"https://...\",\"allowedHosts\":[\"host\"],\"formSelector\":\"optional\",\"credentialId\":\"optional\",\"skillName\":\"optional override\",\"intent\":\"optional prose\"}. " + + "Returns the typed form schema plus the skill name it was persisted under." 
};

    /// <summary>Identifier of the advertised skill (taken from <see cref="SkillDefinition"/>).</summary>
    public string SkillId => SkillDefinition.Id;

    /// <summary>The advertised skill descriptor.</summary>
    public AgentSkill Skill => SkillDefinition;

    /// <summary>
    /// Parses the request, scans the target form, enriches the resulting schema,
    /// persists it as a skill, and returns the schema inline.
    /// </summary>
    /// <param name="request">Task request carrying the JSON input described on <see cref="SkillDefinition"/>.</param>
    /// <param name="context">Task context; its message cancellation token is flowed to every awaited call that accepts one.</param>
    /// <returns>
    /// A completed result whose text is <c>{ status, skillName, schema }</c>, or an
    /// error result when the input is invalid, the credential is unknown, the form
    /// cannot be found, or the scan throws.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Rethrown when the task's token is cancelled during the scan or while persisting.
    /// </exception>
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskRequest request, AgentTaskContext context)
    {
        var ct = context.MessageContext.CancellationToken;
        var input = LearnFormSchemaInput.Parse(request);
        if (input.Error is not null)
            return CapabilityResult.Error(request, input.Error);

        // credentialId is accepted for future use (authenticated forms); resolve
        // to fail fast and audit-log access if supplied. Not consumed by the
        // scan itself — the caller should pre-authenticate via browser-task and
        // rely on storage-state (spec §6.5, deferred) to reach the form.
        if (!string.IsNullOrWhiteSpace(input.CredentialId))
        {
            try
            {
                _ = await credentialBroker.ResolveAsync(input.CredentialId!, ct);
            }
            catch (CredentialNotFoundException ex)
            {
                return CapabilityResult.Error(request, $"Credential '{ex.CredentialId}' is not configured.");
            }
        }

        FormSchema schema;
        try
        {
            await using var session = await browserFactory.CreateSessionAsync(input.Allowlist!.IsAllowed, ct);
            await using var page = await session.OpenPageAsync(input.Url!, ct);

            // The wait uses the effective selector; the scan receives the caller's
            // raw value so a missing selector stays "not specified" for the scanner.
            var selector = string.IsNullOrWhiteSpace(input.FormSelector) ? "form" : input.FormSelector!;
            try
            {
                await page.WaitForSelectorAsync(selector, TimeSpan.FromSeconds(15), ct);
            }
            catch (TimeoutException)
            {
                return CapabilityResult.Error(request, $"No form matching '{selector}' appeared within 15s.");
            }

            var scan = await page.ScanFormAsync(input.FormSelector, ct);
            if (scan is null)
                return CapabilityResult.Error(request, $"No form matching '{selector}' was found on {input.Url}.");
            if (scan.Fields.Count == 0)
                return CapabilityResult.Error(request, $"The form at {input.Url} had no recognizable input fields.");

            var deterministic = FormSchemaMapper.Map(scan);
            schema = await enricher.EnrichAsync(deterministic, ct);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            throw;
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "learn-form-schema failed for {Url}", input.Url);
            return CapabilityResult.Error(request, $"Form scan failed: {ex.Message}");
        }

        // A blank override (e.g. "skillName": "" in the JSON body) is treated as
        // "not supplied" so the skill is never stored under an empty name.
        var skillName = string.IsNullOrWhiteSpace(input.SkillName)
            ? DeriveSkillName(input.Url!, input.Intent)
            : input.SkillName!;
        try
        {
            await PersistAsync(skillName, schema, input.Intent, ct);
            logger.LogInformation("Persisted form schema '{SkillName}' ({FieldCount} fields).", skillName, schema.Fields.Count);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Cancellation is not a persistence failure: propagate it exactly like
            // the scan phase does instead of reporting a completed task.
            throw;
        }
        catch (Exception ex)
        {
            // Persistence failure shouldn't fail the task — the schema still
            // goes back inline and the caller can retry. Log for the operator.
            logger.LogWarning(ex, "Failed to persist skill '{SkillName}'; returning schema inline anyway.", skillName);
        }

        var payload = new
        {
            status = "done",
            skillName,
            schema
        };
        return CapabilityResult.Completed(
            request,
            System.Text.Json.JsonSerializer.Serialize(payload, FormSchema.SerializerOptions));
    }

    private async Task PersistAsync(string skillName, FormSchema schema, string?
intent, CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();
        var existing = await skillStore.GetAsync(skillName);
        var now = DateTimeOffset.UtcNow;

        // Re-learning an existing skill keeps its original creation time and
        // see-also links; UpdatedAt is only set when a prior version existed.
        var skill = new Skill(
            Name: skillName,
            Summary: BuildSummary(schema, intent),
            Content: BuildPrimer(schema, intent),
            CreatedAt: existing?.CreatedAt ?? now,
            UpdatedAt: existing is null ? null : now,
            LastUsedAt: now,
            SeeAlso: existing?.SeeAlso,
            Manifest: null);

        var resource = new SkillResourceInput(
            Filename: "schema.json",
            Type: SkillResourceType.JsonSchema,
            Description: "Typed form schema — fields, types, options, validation, dependencies.",
            Content: schema.Serialize());

        await skillStore.SaveAsync(skill, [resource]);
    }

    /// <summary>
    /// Builds the one-line skill summary: the caller's intent (or
    /// <c>Form on {host}</c> when no intent was given) followed by the field count.
    /// </summary>
    /// <param name="schema">Schema whose URL host and field count are reported.</param>
    /// <param name="intent">Optional caller-supplied description; trimmed and capped at 120 chars.</param>
    private static string BuildSummary(FormSchema schema, string? intent)
    {
        const int MaxPrefixLength = 120;

        var host = new Uri(schema.Url).Host;
        var prefix = string.IsNullOrWhiteSpace(intent) ? $"Form on {host}" : intent!.Trim();
        if (prefix.Length > MaxPrefixLength)
        {
            // Never cut between the two halves of a surrogate pair; a lone high
            // surrogate would leave the persisted summary as malformed UTF-16.
            var cut = char.IsHighSurrogate(prefix[MaxPrefixLength - 1])
                ? MaxPrefixLength - 1
                : MaxPrefixLength;
            prefix = prefix[..cut];
        }
        return $"{prefix} — {schema.Fields.Count} fields.";
    }

    /// <summary>
    /// Renders the markdown primer stored as the skill content: a title with the
    /// form URL, the optional intent, the form/submit selectors, one bullet per
    /// field, the schema notes when present, and a pointer to <c>schema.json</c>.
    /// </summary>
    /// <param name="schema">Schema to describe.</param>
    /// <param name="intent">Optional caller-supplied description, emitted verbatim (trimmed) under the title.</param>
    private static string BuildPrimer(FormSchema schema, string? intent)
    {
        var sb = new StringBuilder();
        sb.Append("# Form: ").AppendLine(schema.Url);
        sb.AppendLine();
        if (!string.IsNullOrWhiteSpace(intent))
        {
            sb.AppendLine(intent!.Trim());
            sb.AppendLine();
        }
        sb.Append("Form selector: `").Append(schema.FormSelector ?? "form").AppendLine("`");
        if (!string.IsNullOrWhiteSpace(schema.SubmitSelector))
            sb.Append("Submit selector: `").Append(schema.SubmitSelector).AppendLine("`");
        sb.AppendLine();
        sb.AppendLine("## Fields");
        sb.AppendLine();
        foreach (var f in schema.Fields)
        {
            // One bullet per field: name, type, required flag, selector, then
            // whichever of label / options / dependencies the field carries.
            sb.Append("- **").Append(f.Name).Append("** (`").Append(f.Type).Append("`");
            if (f.Required) sb.Append(", required");
            sb.Append(") — selector `").Append(f.Selector).Append('`');
            if (!string.IsNullOrWhiteSpace(f.Label))
                sb.Append(" — label: ").Append(f.Label);
            if (f.Options is { Count: > 0 })
                sb.Append(" — options: ").Append(string.Join(", ", f.Options.Select(o => o.Label ?? o.Value)));
            if (f.DependsOn is { Count: > 0 })
                sb.Append(" — depends on: ").Append(string.Join(", ", f.DependsOn));
            sb.AppendLine();
        }

        if (!string.IsNullOrWhiteSpace(schema.Notes))
        {
            sb.AppendLine();
            sb.AppendLine("## Notes");
            sb.AppendLine();
            sb.AppendLine(schema.Notes);
        }

        sb.AppendLine();
        sb.AppendLine("The typed schema lives in the `schema.json` resource attached to this skill — that is the file `execute-form-batch` consumes.");
        return sb.ToString();
    }

    private static string DeriveSkillName(Uri url, string? intent)
    {
        var host = url.Host.ToLowerInvariant();
        var slug = string.IsNullOrWhiteSpace(intent)
            ?
SlugFromPath(url.AbsolutePath) + : Slugify(intent!); + if (string.IsNullOrEmpty(slug)) + slug = "form"; + return $"sites/{host}/forms/{slug}"; + } + + private static string SlugFromPath(string path) + { + var trimmed = path.Trim('/'); + if (string.IsNullOrEmpty(trimmed)) + return "root"; + return Slugify(trimmed); + } + + private static string Slugify(string text) + { + var sb = new StringBuilder(capacity: Math.Min(text.Length, 48)); + var lastDash = true; + foreach (var ch in text.ToLowerInvariant()) + { + if (char.IsLetterOrDigit(ch)) + { + sb.Append(ch); + lastDash = false; + } + else if (!lastDash && sb.Length < 48) + { + sb.Append('-'); + lastDash = true; + } + if (sb.Length >= 48) break; + } + return sb.ToString().Trim('-'); + } +} diff --git a/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs new file mode 100644 index 0000000..1a487ec --- /dev/null +++ b/src/Foragent.Capabilities/Forms/LearnFormSchemaInput.cs @@ -0,0 +1,119 @@ +using System.Text.Json; +using Foragent.Capabilities.BrowserTask; +using RockBot.A2A; + +namespace Foragent.Capabilities.Forms; + +/// +/// Parses input for the learn-form-schema capability (spec §5.2, +/// phase-1 of the learn→review→execute pattern in §5.5). +/// +/// Shape (JSON in the first text part, field-by-field metadata overrides): +/// +/// url — required. Absolute http(s) URL of the page hosting the form. +/// allowedHosts — required. Host allowlist (spec §7.1). Empty rejects. +/// formSelector — optional CSS selector for the form. Defaults to the first <form>. +/// credentialId — optional; resolved and discarded (reserved for future auth). +/// skillName — optional override for the generated skill name; defaults to sites/{host}/forms/{slug}. +/// intent — optional free-form description stored in the skill primer. +/// +/// +internal readonly record struct LearnFormSchemaInput( + Uri? Url, + HostAllowlist? Allowlist, + string? FormSelector, + string? 
CredentialId,
    string? SkillName,
    string? Intent,
    string? Error)
{
    /// <summary>
    /// Builds the input from the first non-blank text part of the request (when
    /// it starts with <c>{</c>, parsed as a JSON object) and then applies
    /// per-key metadata overrides. Never throws for malformed input: every
    /// problem is reported through <see cref="Error"/>.
    /// </summary>
    /// <param name="request">Incoming task request.</param>
    /// <returns>
    /// A populated input with <see cref="Error"/> <c>null</c>, or an input whose
    /// only non-null member is <see cref="Error"/>.
    /// </returns>
    public static LearnFormSchemaInput Parse(AgentTaskRequest request)
    {
        string? url = null;
        string? formSelector = null;
        string? credentialId = null;
        string? skillName = null;
        string? intent = null;
        List<string>? allowedHosts = null;

        var text = request.Message.Parts
            .Where(p => p.Kind == "text")
            .Select(p => p.Text)
            .FirstOrDefault(t => !string.IsNullOrWhiteSpace(t))
            ?.Trim();

        if (!string.IsNullOrEmpty(text) && text.StartsWith('{'))
        {
            // First wrong-typed property seen; reported after parsing so the
            // caller gets an error result instead of an unhandled exception.
            string? typeError = null;
            try
            {
                using var doc = JsonDocument.Parse(text);
                var root = doc.RootElement;

                // JsonElement.GetString() throws InvalidOperationException for
                // anything other than String/Null, so check the kind first.
                string? ReadString(string property)
                {
                    if (!root.TryGetProperty(property, out var value) || value.ValueKind == JsonValueKind.Null)
                        return null;
                    if (value.ValueKind != JsonValueKind.String)
                    {
                        typeError ??= $"'{property}' must be a JSON string.";
                        return null;
                    }
                    return value.GetString();
                }

                url = ReadString("url");
                formSelector = ReadString("formSelector");
                credentialId = ReadString("credentialId");
                skillName = ReadString("skillName");
                intent = ReadString("intent");

                if (root.TryGetProperty("allowedHosts", out var h) && h.ValueKind == JsonValueKind.Array)
                {
                    allowedHosts = [];
                    foreach (var entry in h.EnumerateArray())
                    {
                        switch (entry.ValueKind)
                        {
                            case JsonValueKind.String:
                                allowedHosts.Add(entry.GetString() ?? string.Empty);
                                break;
                            case JsonValueKind.Null:
                                // Null entries have always been mapped to an empty host.
                                allowedHosts.Add(string.Empty);
                                break;
                            default:
                                typeError ??= "'allowedHosts' entries must be JSON strings.";
                                break;
                        }
                    }
                }
            }
            catch (JsonException)
            {
                return Fail("Input must be a JSON object with at least 'url' and 'allowedHosts'.");
            }

            if (typeError is not null)
                return Fail(typeError);
        }

        // Metadata wins over the JSON body, key by key.
        url = ReadMetadata(request, "url") ?? url;
        formSelector = ReadMetadata(request, "formSelector") ?? formSelector;
        credentialId = ReadMetadata(request, "credentialId") ?? credentialId;
        skillName = ReadMetadata(request, "skillName") ?? skillName;
        intent = ReadMetadata(request, "intent") ?? intent;
        var hostsCsv = ReadMetadata(request, "allowedHosts");
        if (hostsCsv is not null)
            allowedHosts = [.. hostsCsv.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)];

        if (string.IsNullOrWhiteSpace(url))
            return Fail("Missing 'url' — the page hosting the form.");

        if (allowedHosts is null || allowedHosts.Count == 0)
            return Fail("Missing 'allowedHosts' — learn-form-schema requires an explicit allowlist (spec §7.1).");

        if (!Uri.TryCreate(url, UriKind.Absolute, out var parsedUrl) ||
            (parsedUrl.Scheme != Uri.UriSchemeHttp && parsedUrl.Scheme != Uri.UriSchemeHttps))
            return Fail($"'url' must be an absolute http(s) URL; got '{url}'.");

        HostAllowlist allowlist;
        try
        {
            allowlist = HostAllowlist.Parse(allowedHosts);
        }
        catch (ArgumentException ex)
        {
            return Fail(ex.Message);
        }

        if (!allowlist.IsAllowed(parsedUrl))
            return Fail($"URL host '{parsedUrl.Host}' is not in the allowlist.");

        return new LearnFormSchemaInput(parsedUrl, allowlist, formSelector, credentialId, skillName, intent, null);
    }

    /// <summary>Creates an input that carries only an error message.</summary>
    private static LearnFormSchemaInput Fail(string message) =>
        new(null, null, null, null, null, null, message);

    private static string?
ReadMetadata(AgentTaskRequest request, string key) + { + if (request.Message.Metadata is not null + && request.Message.Metadata.TryGetValue(key, out var msgValue) + && !string.IsNullOrWhiteSpace(msgValue)) + { + return msgValue; + } + if (request.Metadata is not null + && request.Metadata.TryGetValue(key, out var reqValue) + && !string.IsNullOrWhiteSpace(reqValue)) + { + return reqValue; + } + return null; + } +} diff --git a/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs b/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs new file mode 100644 index 0000000..828f773 --- /dev/null +++ b/tests/Foragent.Agent.Tests/Forms/ExecuteFormBatchCapabilityTests.cs @@ -0,0 +1,310 @@ +using System.Text.Json; +using Foragent.Browser; +using Foragent.Capabilities.Forms; +using Foragent.Credentials; +using Microsoft.Extensions.Logging.Abstractions; +using RockBot.A2A; +using RockBot.Host; +using Xunit; + +namespace Foragent.Agent.Tests.Forms; + +public class ExecuteFormBatchCapabilityTests +{ + [Fact] + public async Task RejectsInput_WhenBothSchemaAndSchemaRefAreMissing() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + """{"rows":[{"email":"a"}],"allowedHosts":["example.com"]}"""), + ctx); + + Assert.Contains("schemaRef", TestContext.TextOf(result)); + } + + [Fact] + public async Task RejectsInput_WhenRowsMissing() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + """{"schemaRef":"sites/example.com/forms/x","allowedHosts":["example.com"]}"""), + ctx); + + Assert.Contains("rows", TestContext.TextOf(result)); + } + + [Fact] + public async Task ResolvesSchemaFromRef_AndSubmitsRows() + { + var (cap, factory, skills) = Build(); + await SeedSchemaAsync(skills, "sites/example.com/forms/contact", SampleSchema()); + var page = 
SubmittingPage(); + factory.PageResponder = (_, _) => + { + // Each row opens a fresh page — reset the URL so the URL-change + // heuristic fires per row rather than only on the first. + page.CurrentUrl = new Uri("https://example.com/form"); + return Task.FromResult(page); + }; + + var (ctx, capture) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + """ + { + "schemaRef": "sites/example.com/forms/contact", + "allowedHosts": ["example.com"], + "rows": [ + {"email":"a@example.com","message":"hi"}, + {"email":"b@example.com","message":"hello"} + ] + } + """), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal(2, doc.RootElement.GetProperty("successCount").GetInt32()); + Assert.Equal(0, doc.RootElement.GetProperty("failureCount").GetInt32()); + // One progress update per row (at minimum). + Assert.True(capture.Statuses.Count >= 2); + } + + [Fact] + public async Task AbortOnFirst_StopsAtFailedRow() + { + var (cap, factory, _) = Build(); + var page = new StubBrowserPage + { + CurrentUrl = new Uri("https://example.com/form") + // no OnClick → URL never changes → rows fail (URL-change fallback) + }; + factory.PageResponder = (_, _) => Task.FromResult(page); + + var inline = SampleSchema(); + var inlineJson = JsonSerializer.Serialize(inline, FormSchema.SerializerOptions); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + $$""" + { + "schema": {{inlineJson}}, + "allowedHosts": ["example.com"], + "rows": [ + {"email":"a@example.com","message":"1"}, + {"email":"b@example.com","message":"2"}, + {"email":"c@example.com","message":"3"} + ] + } + """), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("failed", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal(1, 
doc.RootElement.GetProperty("rows").GetArrayLength()); + Assert.Equal(2, doc.RootElement.GetProperty("pending").GetInt32()); + } + + [Fact] + public async Task ContinueMode_RunsAllRowsEvenWhenSomeFail() + { + var (cap, factory, _) = Build(); + // First row fails (URL unchanged, no OnClick); then OnClick toggles so + // subsequent rows succeed via URL change. + var attempts = 0; + var page = new StubBrowserPage { CurrentUrl = new Uri("https://example.com/form") }; + page.OnClick = _ => + { + attempts++; + if (attempts >= 2) + page.CurrentUrl = new Uri($"https://example.com/thanks/{attempts}"); + }; + factory.PageResponder = (_, _) => + { + // Fresh session page per row — reset URL. + page.CurrentUrl = new Uri("https://example.com/form"); + return Task.FromResult(page); + }; + + var inline = SampleSchema(); + var inlineJson = JsonSerializer.Serialize(inline, FormSchema.SerializerOptions); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + $$""" + { + "schema": {{inlineJson}}, + "allowedHosts": ["example.com"], + "mode": "continue", + "rows": [ + {"email":"a@example.com","message":"1"}, + {"email":"b@example.com","message":"2"}, + {"email":"c@example.com","message":"3"} + ] + } + """), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("partial", doc.RootElement.GetProperty("status").GetString()); + Assert.Equal(3, doc.RootElement.GetProperty("rows").GetArrayLength()); + Assert.Equal(0, doc.RootElement.GetProperty("pending").GetInt32()); + Assert.Equal(1, doc.RootElement.GetProperty("failureCount").GetInt32()); + } + + [Fact] + public async Task ValidationFails_WhenRequiredFieldMissing() + { + var (cap, factory, _) = Build(); + var page = SubmittingPage(); + factory.PageResponder = (_, _) => Task.FromResult(page); + + var inline = SampleSchema(); + var inlineJson = JsonSerializer.Serialize(inline, FormSchema.SerializerOptions); + var (ctx, _) = 
TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + $$""" + { + "schema": {{inlineJson}}, + "allowedHosts": ["example.com"], + "rows": [ {"message":"missing email"} ] + } + """), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + var firstRow = doc.RootElement.GetProperty("rows")[0]; + Assert.Equal("validation-error", firstRow.GetProperty("status").GetString()); + Assert.Contains("email", firstRow.GetProperty("error").GetString()!); + // Validation should have short-circuited before any navigation. + Assert.DoesNotContain(page.Actions, a => a.StartsWith("navigate:")); + } + + [Fact] + public async Task UsesSuccessIndicator_WhenProvided() + { + var (cap, factory, _) = Build(); + var page = new StubBrowserPage { CurrentUrl = new Uri("https://example.com/form") }; + page.OnClick = _ => { /* don't change URL; rely on successIndicator */ }; + factory.PageResponder = (_, _) => Task.FromResult(page); + + var inline = SampleSchema(); + var inlineJson = JsonSerializer.Serialize(inline, FormSchema.SerializerOptions); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + $$""" + { + "schema": {{inlineJson}}, + "allowedHosts": ["example.com"], + "successIndicator": ".thanks", + "rows": [ {"email":"a@example.com","message":"hi"} ] + } + """), + ctx); + + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + Assert.Contains("wait:.thanks", page.Actions); + } + + [Fact] + public async Task MissingSchemaResource_ReportsError() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("execute-form-batch", + """ + { + "schemaRef": "sites/example.com/forms/absent", + "allowedHosts": ["example.com"], + "rows": [ {"email":"a@example.com"} ] + } + """), + ctx); + + 
Assert.Contains("No schema.json resource", TestContext.TextOf(result)); + } + + private static async Task SeedSchemaAsync(FakeSkillStore store, string name, FormSchema schema) + { + var skill = new Skill( + Name: name, + Summary: "test", + Content: "seeded", + CreatedAt: DateTimeOffset.UtcNow); + var resource = new SkillResourceInput( + Filename: "schema.json", + Type: SkillResourceType.JsonSchema, + Description: "test", + Content: schema.Serialize()); + await store.SaveAsync(skill, [resource]); + } + + private static FormSchema SampleSchema() => new( + Version: FormSchema.CurrentVersion, + Url: "https://example.com/form", + FormSelector: "#contact", + SubmitSelector: "#submit", + SuccessIndicator: null, + Fields: + [ + new FormField( + Name: "email", + Type: FormFieldType.Email, + Selector: "input[name=\"email\"]", + Label: "Email", + Required: true), + new FormField( + Name: "message", + Type: FormFieldType.TextArea, + Selector: "textarea[name=\"message\"]", + Label: "Message", + Required: false) + ], + Notes: null); + + private static StubBrowserPage SubmittingPage() + { + var page = new StubBrowserPage { CurrentUrl = new Uri("https://example.com/form") }; + page.OnClick = sel => + { + if (sel == "#submit") + page.CurrentUrl = new Uri("https://example.com/thanks"); + }; + return page; + } + + private static (ExecuteFormBatchCapability Capability, StubBrowserSessionFactory Factory, FakeSkillStore Skills) Build() + { + var factory = new StubBrowserSessionFactory(); + var skills = new FakeSkillStore(); + var broker = new ThrowingCredentialBroker(); + var cap = new ExecuteFormBatchCapability( + factory, + broker, + skills, + NullLogger.Instance); + return (cap, factory, skills); + } + + private sealed class ThrowingCredentialBroker : ICredentialBroker + { + public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) => + throw new CredentialNotFoundException(credentialId); + } +} diff --git 
a/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs new file mode 100644 index 0000000..489b798 --- /dev/null +++ b/tests/Foragent.Agent.Tests/Forms/LearnFormSchemaCapabilityTests.cs @@ -0,0 +1,158 @@ +using System.Text.Json; +using Foragent.Browser; +using Foragent.Capabilities.Forms; +using Foragent.Credentials; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging.Abstractions; +using RockBot.A2A; +using RockBot.Host; +using Xunit; + +namespace Foragent.Agent.Tests.Forms; + +public class LearnFormSchemaCapabilityTests +{ + [Fact] + public async Task RejectsInput_WhenUrlMissing() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("learn-form-schema", """{"allowedHosts":["example.com"]}"""), + ctx); + + Assert.Contains("url", TestContext.TextOf(result)); + } + + [Fact] + public async Task RejectsInput_WhenAllowlistMissing() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("learn-form-schema", """{"url":"https://example.com/form"}"""), + ctx); + + Assert.Contains("allowedHosts", TestContext.TextOf(result)); + } + + [Fact] + public async Task RejectsInput_WhenUrlOffAllowlist() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("learn-form-schema", + """{"url":"https://evil.example/","allowedHosts":["example.com"]}"""), + ctx); + + Assert.Contains("not in the allowlist", TestContext.TextOf(result)); + } + + [Fact] + public async Task ReturnsSchema_AndPersistsSkillWithResource() + { + var scan = SampleScan(); + var (cap, factory, skills) = Build(); + var page = new StubBrowserPage { FormScan = scan }; + factory.PageResponder = (_, _) => Task.FromResult(page); + var (ctx, _) = TestContext.Build(); + + var result = await 
cap.ExecuteAsync( + TestContext.Request("learn-form-schema", + """{"url":"https://example.com/contact","allowedHosts":["example.com"]}"""), + ctx); + + Assert.Equal(AgentTaskState.Completed, result.State); + using var doc = JsonDocument.Parse(TestContext.TextOf(result)); + Assert.Equal("done", doc.RootElement.GetProperty("status").GetString()); + var skillName = doc.RootElement.GetProperty("skillName").GetString()!; + Assert.StartsWith("sites/example.com/forms/", skillName); + + var saved = skills.Saved[skillName]; + Assert.NotNull(saved.Manifest); + Assert.Contains(saved.Manifest!, r => r.Filename == "schema.json" && r.Type == SkillResourceType.JsonSchema); + + // Resource content round-trips as a FormSchema. + var resourceJson = skills.Resources[skillName]["schema.json"]; + var roundtrip = FormSchema.Deserialize(resourceJson); + Assert.Equal(2, roundtrip.Fields.Count); + Assert.Contains(roundtrip.Fields, f => f.Name == "email" && f.Type == FormFieldType.Email && f.Required); + Assert.Contains(roundtrip.Fields, f => f.Name == "message" && f.Type == FormFieldType.TextArea); + } + + [Fact] + public async Task ReturnsError_WhenScanFindsNoForm() + { + var (cap, factory, _) = Build(); + var page = new StubBrowserPage { FormScan = null }; + factory.PageResponder = (_, _) => Task.FromResult(page); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("learn-form-schema", + """{"url":"https://example.com/noform","allowedHosts":["example.com"]}"""), + ctx); + + Assert.Contains("No form", TestContext.TextOf(result)); + } + + [Fact] + public async Task MissingCredential_IsReportedWithoutLeakingId() + { + var (cap, _, _) = Build(); + var (ctx, _) = TestContext.Build(); + + var result = await cap.ExecuteAsync( + TestContext.Request("learn-form-schema", + """{"url":"https://example.com/","allowedHosts":["example.com"],"credentialId":"not-there"}"""), + ctx); + + Assert.Contains("not-there", TestContext.TextOf(result)); + 
Assert.Contains("not configured", TestContext.TextOf(result)); + } + + private static FormScan SampleScan() => new( + Url: new Uri("https://example.com/contact"), + FormSelector: "#contact-form", + SubmitSelector: "#submit", + Fields: + [ + new FormScanField( + Tag: "input", InputType: "email", Name: "email", Id: "email-input", + Label: "Email", Required: true, Pattern: null, Min: null, Max: null, + MaxLength: 120, Options: null, Selector: "input[name=\"email\"]"), + new FormScanField( + Tag: "textarea", InputType: null, Name: "message", Id: "msg", + Label: "Message:", Required: false, Pattern: null, Min: null, Max: null, + MaxLength: 1000, Options: null, Selector: "textarea[name=\"message\"]") + ]); + + private static (LearnFormSchemaCapability Capability, StubBrowserSessionFactory Factory, FakeSkillStore Skills) Build() + { + var factory = new StubBrowserSessionFactory(); + var skills = new FakeSkillStore(); + // Enricher doesn't run for forms without Select/Radio fields, so an + // always-empty chat client is safe; it would only be reached for + // sample forms that include a select. 
+ var chat = new StubChatClient((_, _) => Task.FromResult(new ChatResponse([]))); + var enricher = new FormSchemaEnricher(chat, NullLogger.Instance); + var broker = new ThrowingCredentialBroker(); + var cap = new LearnFormSchemaCapability( + factory, + broker, + enricher, + skills, + NullLogger.Instance); + return (cap, factory, skills); + } + + private sealed class ThrowingCredentialBroker : ICredentialBroker + { + public Task ResolveAsync(string credentialId, CancellationToken cancellationToken = default) => + throw new CredentialNotFoundException(credentialId); + } +} diff --git a/tests/Foragent.Agent.Tests/TestDoubles.cs b/tests/Foragent.Agent.Tests/TestDoubles.cs index a5fb91b..67c7d2f 100644 --- a/tests/Foragent.Agent.Tests/TestDoubles.cs +++ b/tests/Foragent.Agent.Tests/TestDoubles.cs @@ -138,6 +138,8 @@ internal sealed class StubBrowserPage : IBrowserPage { public List Actions { get; } = []; public Uri CurrentUrl { get; set; } = new("https://stub.example/"); + public HashSet TimeoutSelectors { get; } = new(StringComparer.Ordinal); + public Action? OnClick { get; set; } public Task NavigateAsync(Uri url, CancellationToken ct = default) { @@ -157,12 +159,15 @@ public Task FillAsync(string selector, string value, CancellationToken ct = defa public Task ClickAsync(string selector, CancellationToken ct = default) { Actions.Add($"click:{selector}"); + OnClick?.Invoke(selector); return Task.CompletedTask; } public Task WaitForSelectorAsync(string selector, TimeSpan? timeout = null, CancellationToken ct = default) { Actions.Add($"wait:{selector}"); + if (TimeoutSelectors.Contains(selector)) + throw new TimeoutException($"Stub: '{selector}' marked as a timeout."); return Task.CompletedTask; } @@ -171,6 +176,26 @@ public Task WaitForSelectorAsync(string selector, TimeSpan? timeout = null, Canc public Task GetTextAsync(string selector, CancellationToken ct = default) => Task.FromResult(null); + public FormScan? 
FormScan { get; set; } + + public Task ScanFormAsync(string? formSelector = null, CancellationToken ct = default) + { + Actions.Add($"scan:{formSelector ?? ""}"); + return Task.FromResult(FormScan); + } + + public Task SelectOptionAsync(string selector, string value, CancellationToken ct = default) + { + Actions.Add($"select:{selector}:{value}"); + return Task.CompletedTask; + } + + public Task SetCheckedAsync(string selector, bool checked_, CancellationToken ct = default) + { + Actions.Add($"checked:{selector}:{checked_}"); + return Task.CompletedTask; + } + public ValueTask DisposeAsync() => ValueTask.CompletedTask; } @@ -182,23 +207,48 @@ public Task WaitForSelectorAsync(string selector, TimeSpan? timeout = null, Canc internal sealed class FakeSkillStore : ISkillStore { public Dictionary Saved { get; } = new(StringComparer.Ordinal); + public Dictionary> Resources { get; } = new(StringComparer.Ordinal); public List<(string Query, int MaxResults)> Searches { get; } = []; public Task SaveAsync(Skill skill) { + // Match FileSkillStore's behavior: SaveAsync(skill) alone preserves + // the existing manifest (see rockbot commit 2db3775 fix #1). + if (skill.Manifest is null && Saved.TryGetValue(skill.Name, out var prior) && prior.Manifest is not null) + skill = skill with { Manifest = prior.Manifest }; Saved[skill.Name] = skill; return Task.CompletedTask; } + public Task SaveAsync(Skill skill, IReadOnlyList? resources) + { + if (resources is null || resources.Count == 0) + return SaveAsync(skill); + + var manifest = resources.Select(r => new SkillResource(r.Filename, r.Type, r.Description)).ToList(); + var bundled = skill with { Manifest = manifest }; + Saved[skill.Name] = bundled; + Resources[skill.Name] = resources.ToDictionary(r => r.Filename, r => r.Content, StringComparer.Ordinal); + return Task.CompletedTask; + } + public Task GetAsync(string name) => Task.FromResult(Saved.TryGetValue(name, out var skill) ? 
skill : null); + public Task GetResourceAsync(string skillName, string filename) + { + if (Resources.TryGetValue(skillName, out var bundle) && bundle.TryGetValue(filename, out var content)) + return Task.FromResult(content); + return Task.FromResult(null); + } + public Task> ListAsync() => Task.FromResult>([.. Saved.Values]); public Task DeleteAsync(string name) { Saved.Remove(name); + Resources.Remove(name); return Task.CompletedTask; } diff --git a/tests/Foragent.Browser.Tests/FormCapabilitiesIntegrationTests.cs b/tests/Foragent.Browser.Tests/FormCapabilitiesIntegrationTests.cs new file mode 100644 index 0000000..b6e2877 --- /dev/null +++ b/tests/Foragent.Browser.Tests/FormCapabilitiesIntegrationTests.cs @@ -0,0 +1,257 @@ +using System.Text.Json; +using Foragent.Capabilities.Forms; +using Foragent.Credentials; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Http; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using RockBot.A2A; +using RockBot.Host; +using RockBot.Messaging; +using Xunit; + +namespace Foragent.Browser.Tests; + +/// +/// Step 8 end-to-end: drive learn-form-schema + execute-form-batch +/// against a real Kestrel-hosted form with real Chromium. No LLM required — +/// the form has no select/radio fields, so the +/// short-circuits without calling a chat client. +/// +[Collection("Playwright")] +public class FormCapabilitiesIntegrationTests(TestPageServerFixture fixture) +{ + [Fact] + public async Task LearnThenExecute_SubmitsRowsAgainstRealForm() + { + var submissions = new List(); + await using var server = await StartSiteAsync(app => + { + app.MapGet("/contact", () => Results.Content(""" + Contact + +

Contact us

+
+ + + + + +
+ + """, "text/html")); + app.MapPost("/submit", async (HttpContext ctx) => + { + var form = await ctx.Request.ReadFormAsync(); + submissions.Add($"{form["email"]}|{form["message"]}"); + return Results.Content(""" + Thanks +

Message received.

+ """, "text/html"); + }); + }); + + var skills = new InMemorySkillStore(); + var learn = BuildLearn(skills); + + var learnResult = await learn.ExecuteAsync( + FormRequest("learn-form-schema", $$""" + { + "url": "{{server.BaseUrl}}/contact", + "allowedHosts": ["127.0.0.1"] + } + """), + BuildContext()); + + Assert.Equal(AgentTaskState.Completed, learnResult.State); + using var learnDoc = JsonDocument.Parse(TextOf(learnResult)); + Assert.Equal("done", learnDoc.RootElement.GetProperty("status").GetString()); + var skillName = learnDoc.RootElement.GetProperty("skillName").GetString()!; + Assert.True(skills.HasResource(skillName, "schema.json")); + + // Schema round-trips to the shape execute-form-batch expects. + var resourceJson = skills.GetResource(skillName, "schema.json")!; + var schema = FormSchema.Deserialize(resourceJson); + Assert.Contains(schema.Fields, f => f.Name == "email" && f.Type == FormFieldType.Email && f.Required); + Assert.Contains(schema.Fields, f => f.Name == "message" && f.Type == FormFieldType.TextArea); + Assert.Equal("#submit", schema.SubmitSelector); + + var execute = BuildExecute(skills); + var executeResult = await execute.ExecuteAsync( + FormRequest("execute-form-batch", $$""" + { + "schemaRef": "{{skillName}}", + "allowedHosts": ["127.0.0.1"], + "successIndicator": ".thanks", + "rows": [ + {"email":"a@example.com","message":"hello"}, + {"email":"b@example.com","message":"world"} + ] + } + """), + BuildContext()); + + Assert.Equal(AgentTaskState.Completed, executeResult.State); + using var execDoc = JsonDocument.Parse(TextOf(executeResult)); + Assert.Equal("done", execDoc.RootElement.GetProperty("status").GetString()); + Assert.Equal(2, execDoc.RootElement.GetProperty("successCount").GetInt32()); + Assert.Equal(2, submissions.Count); + Assert.Equal("a@example.com|hello", submissions[0]); + Assert.Equal("b@example.com|world", submissions[1]); + } + + private LearnFormSchemaCapability BuildLearn(InMemorySkillStore skills) + { + var enricher 
= new FormSchemaEnricher( + new UnreachableChatClient(), NullLogger.Instance); /* UnreachableChatClient throws from GetResponseAsync, so enrichment in this test is expected to complete without an LLM call. */ + return new LearnFormSchemaCapability( + fixture.Factory, + new NoCredentialBroker(), + enricher, + skills, + NullLogger.Instance); + } + + /* Builds the ExecuteFormBatchCapability under test from fixture.Factory, a NoCredentialBroker, the supplied skill store and NullLogger.Instance. */ private ExecuteFormBatchCapability BuildExecute(InMemorySkillStore skills) => + new( + fixture.Factory, + new NoCredentialBroker(), + skills, + NullLogger.Instance); + + /* Wraps a JSON payload in an AgentTaskRequest for the named skill: a fresh GUID as TaskId and one user-role message with a single text part holding json. */ private static AgentTaskRequest FormRequest(string skill, string json) => new() + { + TaskId = Guid.NewGuid().ToString(), + Skill = skill, + Message = new AgentMessage + { + Role = "user", + Parts = [new AgentMessagePart { Kind = "text", Text = json }] + } + }; + + /* Creates an AgentTaskContext around an empty-bodied envelope whose source is test, an empty service provider, CancellationToken.None, and a PublishStatus callback that discards every update. */ private static AgentTaskContext BuildContext() + { + var envelope = MessageEnvelope.Create( + messageType: typeof(AgentTaskRequest).FullName!, + body: ReadOnlyMemory.Empty, + source: "test"); + return new AgentTaskContext + { + MessageContext = new MessageHandlerContext + { + Envelope = envelope, + Agent = new AgentIdentity("Foragent"), + Services = new ServiceCollection().BuildServiceProvider(), + CancellationToken = CancellationToken.None + }, + PublishStatus = (_, _) => Task.CompletedTask + }; + } + + /* Returns the Text of the first text-kind part of the result message, or string.Empty when the message, that part, or its Text is null. */ private static string TextOf(AgentTaskResult result) => + result.Message?.Parts.FirstOrDefault(p => p.Kind == "text")?.Text ?? string.Empty; + + /* Starts a Kestrel host bound to 127.0.0.1 on port 0 (presumably an OS-assigned free port; confirm), with routing added and logging providers cleared, runs the supplied configure callback on the app, and returns a SiteHost built from the first server address. */ private static async Task StartSiteAsync(Action configure) + { + var builder = WebApplication.CreateEmptyBuilder(new WebApplicationOptions()); + builder.WebHost.UseKestrelCore(); + builder.WebHost.UseUrls("http://127.0.0.1:0"); + builder.Services.AddRoutingCore(); + builder.Logging.ClearProviders(); + var app = builder.Build(); + app.UseRouting(); + configure(app); + await app.StartAsync(); + var addresses = app.Services + .GetRequiredService() + .Features.Get()! 
+ .Addresses; /* The first address has any trailing slash trimmed before it becomes BaseUrl. */ + return new SiteHost(app, addresses.First().TrimEnd('/')); + } + + /* Pairs the running WebApplication with its base URL; disposing the record disposes the app. */ private sealed record SiteHost(WebApplication App, string BaseUrl) : IAsyncDisposable + { + public async ValueTask DisposeAsync() => await App.DisposeAsync(); + } + + /* Credential broker stub: ResolveAsync always throws CredentialNotFoundException for the requested id. */ private sealed class NoCredentialBroker : ICredentialBroker + { + public Task ResolveAsync(string id, CancellationToken ct = default) => + throw new CredentialNotFoundException(id); + } + + /* Dictionary-backed ISkillStore for tests: skills and their resource bundles are keyed by skill name using ordinal comparison. */ private sealed class InMemorySkillStore : ISkillStore + { + private readonly Dictionary _skills = new(StringComparer.Ordinal); + private readonly Dictionary> _resources = new(StringComparer.Ordinal); + + public Task SaveAsync(Skill skill) + { + _skills[skill.Name] = skill; + return Task.CompletedTask; + } + + /* With null or empty resources the skill is stored unchanged and _resources is not touched; otherwise the skill is stored with a Manifest built from the resources and the bundle for that name is replaced. NOTE(review): the empty path leaves an older bundle for the same name in place - confirm that matches the real store. */ public Task SaveAsync(Skill skill, IReadOnlyList? resources) + { + if (resources is null || resources.Count == 0) + { + _skills[skill.Name] = skill; + return Task.CompletedTask; + } + var manifest = resources.Select(r => new SkillResource(r.Filename, r.Type, r.Description)).ToList(); + _skills[skill.Name] = skill with { Manifest = manifest }; + _resources[skill.Name] = resources.ToDictionary(r => r.Filename, r => r.Content, StringComparer.Ordinal); + return Task.CompletedTask; + } + + /* Returns the stored content for the skill and filename, or null when either is unknown. */ public Task GetResourceAsync(string skillName, string filename) => + Task.FromResult(_resources.TryGetValue(skillName, out var bundle) && bundle.TryGetValue(filename, out var c) ? c : null); + + public Task GetAsync(string name) => + Task.FromResult(_skills.TryGetValue(name, out var s) ? s : null); + + public Task> ListAsync() => + Task.FromResult>([.. _skills.Values]); + + /* Removes both the skill and its resource bundle. */ public Task DeleteAsync(string name) + { + _skills.Remove(name); + _resources.Remove(name); + return Task.CompletedTask; + } + + /* Search is not implemented in this fake: the arguments are ignored and the result is always empty. */ public Task> SearchAsync( + string query, int maxResults, CancellationToken cancellationToken = default, float[]? 
queryEmbedding = null) => + Task.FromResult>([]); + + /* Synchronous lookups over the stored resource bundles; the test calls these directly in its assertions. HasResource reports whether the skill has a resource with that filename. */ public bool HasResource(string skillName, string filename) => + _resources.TryGetValue(skillName, out var bundle) && bundle.ContainsKey(filename); + + /* Returns the stored content, or null when the skill or filename is unknown. */ public string? GetResource(string skillName, string filename) => + _resources.TryGetValue(skillName, out var bundle) && bundle.TryGetValue(filename, out var c) ? c : null; + } + + /* IChatClient stub: GetResponseAsync always throws InvalidOperationException, the streaming method yields nothing, GetService returns null and Dispose does nothing. */ private sealed class UnreachableChatClient : IChatClient + { + public Task GetResponseAsync( + IEnumerable chatMessages, + ChatOptions? options = null, + CancellationToken cancellationToken = default) => + throw new InvalidOperationException( + "The enricher should not call an LLM for forms without select/radio fields."); + +#pragma warning disable CS1998 + public async IAsyncEnumerable GetStreamingResponseAsync( + IEnumerable chatMessages, + ChatOptions? options = null, + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken = default) + { + yield break; + } +#pragma warning restore CS1998 + public object? GetService(Type serviceType, object? serviceKey = null) => null; + public void Dispose() { } + } +}