diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000..039af8b5 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,110 @@ +name: AutoControl Docker CI + +on: + push: + branches: [ "dev", "main" ] + paths: + - "docker/**" + - "je_auto_control/**" + - "pyproject.toml" + - ".github/workflows/docker.yml" + pull_request: + branches: [ "dev", "main" ] + paths: + - "docker/**" + - "je_auto_control/**" + - "pyproject.toml" + - ".github/workflows/docker.yml" + +permissions: + contents: read + +jobs: + build-image: + name: Build AutoControl container + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + # nosemgrep: yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha + uses: docker/setup-buildx-action@v3 # NOSONAR githubactions:S7637 + + - name: Build image (no push) + # nosemgrep: yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha + uses: docker/build-push-action@v5 # NOSONAR githubactions:S7637 + with: + context: . 
+ file: docker/Dockerfile + tags: autocontrol:ci + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Image size + run: docker image inspect autocontrol:ci --format='size={{.Size}} bytes' + + headless-tests: + name: Headless pytest inside the image + needs: build-image + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + # nosemgrep: yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha + uses: docker/setup-buildx-action@v3 # NOSONAR githubactions:S7637 + + - name: Rebuild image (cached) + # nosemgrep: yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha + uses: docker/build-push-action@v5 # NOSONAR githubactions:S7637 + with: + context: . + file: docker/Dockerfile + tags: autocontrol:ci + load: true + cache-from: type=gha + + # Mount the repo so pytest can read tests + write the artifact. + - name: Run headless tests under Xvfb + run: | + docker run --rm \ + --user root \ + -v "$PWD:/work" -w /work \ + --entrypoint /bin/sh \ + autocontrol:ci -c " + pip install --no-cache-dir -r dev_requirements.txt && + xvfb-run -a -s '-screen 0 1280x800x24' \ + python -m pytest test/unit_test/headless -q --tb=short + " + + - name: Smoke test the entrypoint (rest mode) + run: | + # Run without --rm so a quick crash leaves the container in + # place for ``docker logs`` to inspect afterwards. Final + # ``docker rm -f`` cleans up regardless of exit state. 
+ docker run -d --name ac-rest -p 9939:9939 \ + -e AC_TOKEN=ci-token autocontrol:ci rest + ok=0 + for attempt in 1 2 3 4 5 6 7 8 9 10; do + if curl -fsS -H "Authorization: Bearer ci-token" \ + http://127.0.0.1:9939/health; then + echo "REST API is up" + ok=1 + break + fi + sleep 2 + done + echo "::group::docker logs ac-rest" + docker logs ac-rest || true + echo "::endgroup::" + echo "::group::docker inspect (state)" + docker inspect --format '{{json .State}}' ac-rest || true + echo "::endgroup::" + docker rm -f ac-rest >/dev/null 2>&1 || true + if [ "$ok" -ne 1 ]; then + echo "REST health probe never succeeded" >&2 + exit 1 + fi diff --git a/README.md b/README.md index 9e4a7e93..2d419ad8 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-05)](#whats-new-2026-05) - [Features](#features) - [Architecture](#architecture) - [Installation](#installation) @@ -54,6 +55,49 @@ --- +## What's new (2026-05) + +Twenty-three additions covering smarter locators, deeper IDE / ops +tooling, two new platforms, and fresh integrations. Each ships with a +headless API, an `AC_*` executor command, an `ac_*` MCP tool, and +(where it makes sense) a Qt GUI tab. Full reference page: +[`docs/source/Eng/doc/new_features/v2_features_doc.rst`](docs/source/Eng/doc/new_features/v2_features_doc.rst). + +**Locator + selector intelligence** +- **Self-healing locator** — `image_template → VLM` fallback with a JSON-lines audit log (`AC_self_heal_locate / _click`). +- **Anchor-based locator** — find element B by spatial relation (`above`, `below`, `left_of`, `right_of`, `near`) to anchor A; anchor and target can use different backends (image / OCR / VLM / a11y). +- **OCR with structured output** — cluster raw OCR matches into rows, tables, and `label:value` form fields (`AC_ocr_read_structure`). +- **Smart waits** — `wait_until_screen_stable`, `wait_until_pixel_changes`, `wait_until_region_idle`: frame-diff replacements for `time.sleep`. 
+- **A/B locator framework** — race N strategies for the same target; recommend the historically best one from a persisted ledger. + +**Operations + observability** +- **LLM cost telemetry** — per-call token + USD log with day / model / provider rollup (`record_llm_call`, `summarise_llm_costs`). +- **Trace replay UI** — scrubbable timeline over the existing time-travel recordings with per-step action list. +- **Failure → ticket automation** — fan a failure report out to Jira / Linear / GitHub Issues when a scheduled / triggered / REST run fails. +- **Container CI templates** — GitHub Actions + GitLab CI workflows that build the image, run the headless pytest suite under Xvfb, and smoke-test the REST entrypoint; XFCE+x11vnc Dockerfile variant for flows that need a real WM. +- **Cross-host DAG orchestrator** — parallel execution with skip-on-failure cascade across local + admin-console-registered hosts (`run_dag`, `AC_run_dag`). +- **Multi-viewer presence** — roster + controller/observer roles for the remote desktop, with a thread-safe Python `PresenceRegistry` independent of aiortc. + +**Agent + integrations** +- **Computer-use high-level API** — `run_computer_use(goal, ...)` wraps `ComputerUseAgentBackend` + `AgentLoop`; auto-detects display size; bounded by `max_steps` / `wall_seconds`. +- **WebRunner convenience commands** — `web_open` / `web_quit` / `web_screenshot` / `web_current_url` on top of the existing `je_web_runner` bridge; same surface exposed as `AC_web_*` and `ac_web_*`. +- **Chat-ops bot** — transport-agnostic `CommandRouter` + polling Slack adapter. Built-in commands: `/help`, `/scripts`, `/run`, `/screenshot`, `/status`. RBAC via `required_role`. + +**Platform coverage** +- **Wayland CLI backend** — `wtype` / `ydotool` / `grim` with `XDG_SESSION_TYPE` auto-detect and X11 (XWayland) fallback; override via `JE_AUTOCONTROL_LINUX_DISPLAY_SERVER=x11|wayland|auto`. 
+- **Wayland libei native** — ctypes binding to `libei.so.*` for microsecond-latency input; opt-in via `JE_AUTOCONTROL_WAYLAND_INPUT_BACKEND=libei|cli|auto`. Defaults to libei when loadable. +- **macOS Accessibility deep-dive** — recursive `dump_accessibility_tree()` plus a polling `AccessibilityRecorder` for focus / bounds events. + +**Developer experience** +- **autocontrol-lsp completion** — the language server now tracks `didOpen` / `didChange` / `didClose`, publishes diagnostics for invalid JSON and unknown `AC_*` commands, and provides signature help generated from the live executor table. +- **`.pyi` stub generator** — `python -m je_auto_control.utils.stubs.generator je_auto_control/actions.pyi` emits an IDE-facing stub so every `AC_*` command autocompletes with parameter hints. +- **VS Code extension** — bundled extension now ships `AutoControl: Run / Screenshot / Preview` commands that hit the local REST API. +- **Browser extension recorder** — Manifest V3 extension under `browser-extension/`: capture clicks, typing, navigation, form submissions in a tab and export them as `AC_web_*` / `WR_*` JSON. +- **pytest plugin + Gherkin BDD** — `pytest11` entry point auto-loads; `@pytest.mark.autocontrol` arms screenshot-on-failure; `bdd_steps.register_pytest_bdd_steps(pytest_bdd)` wires `Given/When/Then` onto every `AC_*` verb. +- **Visual flow editor** — node-based view that round-trips to the same JSON action format the list-based Script Builder uses. 
+ +--- + ## Features - **Mouse Automation** — move, click, press, release, drag, and scroll with precise coordinate control @@ -71,7 +115,7 @@ - **Action Recording & Playback** — record mouse/keyboard events and replay them - **JSON-Based Action Scripting** — define and execute automation flows using JSON action files (dry-run + step debug) - **Scheduler** — run scripts on an interval or cron expression; jobs persist across restarts -- **Global Hotkey Daemon** — bind OS-level hotkeys to action scripts (Windows today; macOS/Linux stubs in place) +- **Global Hotkey Daemon** — bind OS-level hotkeys to action scripts on all three desktops: Windows (`RegisterHotKey`), macOS (`CGEventTap`, needs Accessibility permission), and Linux X11 (`XGrabKey` with NumLock / CapsLock variant masking). Wayland hotkeys are still compositor-dependent (each session bus exposes a different shortcut portal); a Wayland session can still drive AutoControl via the new Wayland input backend (see [What's new (2026-05)](#whats-new-2026-05)). Same `bind()` / `start()` API across platforms; the Strategy-pattern dispatch in `backends/` auto-picks the right backend at start time - **Event Triggers** — fire scripts when an image appears, a window opens, a pixel changes, or a file is modified - **Run History** — SQLite-backed run log across scheduler / triggers / hotkeys / REST with auto error-screenshot artifacts - **Report Generation** — export test records as HTML, JSON, or XML reports with success/failure status @@ -1040,9 +1084,11 @@ Both flavours coexist; `job.is_cron` tells them apart. ### Global Hotkey Daemon -Bind OS-level hotkeys to action JSON scripts (Windows backend today; -macOS / Linux raise `NotImplementedError` on `start()` with Strategy- -pattern seams in place). +Bind OS-level hotkeys to action JSON scripts. Cross-platform — Windows +uses `RegisterHotKey`, macOS uses `CGEventTap` (requires Accessibility +permission), Linux X11 uses `XGrabKey` (Wayland not supported). 
The +same call sites work everywhere; the daemon picks the backend at +`start()` time. ```python from je_auto_control import default_hotkey_daemon diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index 615b13e0..ab247ec6 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-05)](#本次更新-2026-05) - [功能特性](#功能特性) - [架构](#架构) - [安装](#安装) @@ -53,6 +54,49 @@ --- +## 本次更新 (2026-05) + +新增 23 个功能,覆盖更聪明的定位器、更深的 IDE / 运维工具、两个新平台后端, +以及几个新集成。每个功能都遵循框架既有模式:headless Python API、 +`AC_*` executor 命令、`ac_*` MCP 工具,以及(适用时)Qt GUI 选项卡。 +完整参考页面: +[`docs/source/Zh/doc/new_features/v2_features_doc.rst`](../docs/source/Zh/doc/new_features/v2_features_doc.rst)。 + +**定位器与选择器智能化** +- **自愈定位器** — `image_template → VLM` 后备并写入 JSON-lines 审计记录(`AC_self_heal_locate / _click`)。 +- **锚点定位器** — 按空间关系(`above` / `below` / `left_of` / `right_of` / `near`)找到目标;锚点与目标可使用不同 backend(image / OCR / VLM / a11y)。 +- **结构化 OCR** — 将原始 OCR match 聚合为 rows、tables、`label:value` 表单字段(`AC_ocr_read_structure`)。 +- **智能等待** — `wait_until_screen_stable`、`wait_until_pixel_changes`、`wait_until_region_idle`:用 frame-diff 取代 `time.sleep`。 +- **A/B 定位器框架** — 并行跑 N 个策略,依持久化的历史成绩推荐最佳。 + +**运维与可观测性** +- **LLM 成本遥测** — 每次调用的 token / USD 记录,按天 / 模型 / 提供方汇总(`record_llm_call`、`summarise_llm_costs`)。 +- **追踪回放 UI** — 在现有 time-travel 录像上拖动时间轴并逐步显示动作。 +- **失败 → 工单自动化** — 调度器/触发器/REST 任务失败时自动分发 Jira / Linear / GitHub Issues。 +- **容器化 CI 模板** — GitHub Actions + GitLab CI workflow:构建镜像、跑 headless pytest(Xvfb 容器内)、smoke-test REST entrypoint;另含 XFCE+x11vnc Dockerfile 变体。 +- **跨主机 DAG 编排** — 跨 local + admin-console 已注册主机并行执行,失败时下游 cascade 为 `skipped`(`run_dag`、`AC_run_dag`)。 +- **多 viewer 名单** — 为远程桌面提供控制者 / 观察者角色,纯 Python `PresenceRegistry` 独立于 aiortc。 + +**代理与集成** +- **Computer-use 高阶 API** — `run_computer_use(goal, ...)` 封装 `ComputerUseAgentBackend` + `AgentLoop`;自动检测屏幕大小;以 `max_steps` / `wall_seconds` 为预算。 +- **WebRunner 便利命令** — 在既有 `je_web_runner` 桥接之上的 `web_open` / `web_quit` 
/ `web_screenshot` / `web_current_url`;同步以 `AC_web_*`、`ac_web_*` 暴露。 +- **Chat-ops 机器人** — 传输层中立的 `CommandRouter` + Slack polling adapter。内置命令:`/help`、`/scripts`、`/run`、`/screenshot`、`/status`。RBAC 通过 `required_role`。 + +**平台覆盖** +- **Wayland CLI 后端** — `wtype` / `ydotool` / `grim`,按 `XDG_SESSION_TYPE` 自动检测,CLI 工具未装时回退到 X11 (XWayland);可用 `JE_AUTOCONTROL_LINUX_DISPLAY_SERVER=x11|wayland|auto` 覆盖。 +- **Wayland libei 原生后端** — 对 `libei.so.*` 的 ctypes 绑定,绕过 CLI shim 取得微秒级延迟;以 `JE_AUTOCONTROL_WAYLAND_INPUT_BACKEND=libei|cli|auto` 启用,默认在 libei 可加载时用 libei。 +- **macOS Accessibility 强化** — 递归 `dump_accessibility_tree()` 与 polling `AccessibilityRecorder`,捕捉 focus / bounds 事件。 + +**开发者体验** +- **autocontrol-lsp 完整化** — 追踪 `didOpen` / `didChange` / `didClose`、发布 JSON 与未知 `AC_*` 命令的 diagnostics、由即时的 executor 表生成 signature help。 +- **`.pyi` stub 生成器** — `python -m je_auto_control.utils.stubs.generator je_auto_control/actions.pyi` 写出 IDE 端 stub 文件,所有 `AC_*` 命令在 IDE 内可 autocomplete 并显示参数提示。 +- **VS Code 扩展** — 内置扩展新增 `AutoControl: Run / Screenshot / Preview` 命令,直接打本机 REST API。 +- **浏览器扩展录制器** — `browser-extension/` 下的 Manifest V3 扩展:捕捉标签页的点击、输入、导航与表单提交,导出为 `AC_web_*` / `WR_*` JSON。 +- **pytest plugin + Gherkin BDD** — `pytest11` entry point 自动加载;`@pytest.mark.autocontrol` 开启失败自动截屏;`bdd_steps.register_pytest_bdd_steps(pytest_bdd)` 一次把 `Given/When/Then` 对应到每一个 `AC_*` verb。 +- **可视化流程编辑器** — node-based 视图与既有 list-based Script Builder 使用同一份 JSON 格式,互相兼容。 + +--- + ## 功能特性 - **鼠标自动化** — 移动、点击、按下、释放、拖拽、滚动,支持精确坐标控制 @@ -70,7 +114,7 @@ - **动作录制与回放** — 录制鼠标/键盘事件并重新播放 - **JSON 脚本执行** — 使用 JSON 动作文件定义并执行自动化流程(支持 dry-run 与逐步调试) - **调度器** — 以 interval 或 cron 表达式执行脚本,两类调度可同时存在 -- **全局热键** — 将 OS 热键绑定到 action 脚本(当前支持 Windows,macOS/Linux 保留扩展接口) +- **全局热键** — 跨平台绑定 OS 热键到 action 脚本:Windows (`RegisterHotKey`)、macOS (`CGEventTap`,需 Accessibility 权限)、Linux X11 (`XGrabKey`,含 NumLock / CapsLock 变体掩码)。Wayland 不支持。三个平台共享同一个 API;`backends/` 在 `start()` 时自动挑后端 - **事件触发器** — 检测到图像出现、窗口出现、像素变化或文件变动时自动执行脚本 - 
**执行历史** — 使用 SQLite 记录 scheduler / triggers / hotkeys / REST 的执行结果;错误时自动附带截图 - **报告生成** — 将测试记录导出为 HTML、JSON 或 XML 报告,包含成功/失败状态 @@ -949,9 +993,10 @@ ac.default_scheduler.start() ### 全局热键 -将 OS 热键绑定到 action JSON 脚本(Windows 后端;macOS / Linux 的 -`start()` 目前会抛出 `NotImplementedError`,接口已按 Strategy pattern -保留)。 +将 OS 热键绑定到 action JSON 脚本。跨平台 — Windows 用 +`RegisterHotKey`、macOS 用 `CGEventTap`(需要 Accessibility 权限)、 +Linux X11 用 `XGrabKey`(不支持 Wayland)。三个平台同一个 API;daemon +在 `start()` 时自动挑后端。 ```python from je_auto_control import default_hotkey_daemon diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index 1c4b0900..b26be948 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-05)](#本次更新-2026-05) - [功能特色](#功能特色) - [架構](#架構) - [安裝](#安裝) @@ -53,6 +54,49 @@ --- +## 本次更新 (2026-05) + +新增 23 個功能,涵蓋更聰明的定位器、更深的 IDE / 維運工具、兩個新平台後端, +以及幾個新整合。每個功能都遵循框架既有模式:headless Python API、 +`AC_*` executor 命令、`ac_*` MCP 工具,以及(適用時)Qt GUI 分頁。 +完整參考頁面: +[`docs/source/Zh/doc/new_features/v2_features_doc.rst`](../docs/source/Zh/doc/new_features/v2_features_doc.rst)。 + +**定位器與選擇器智慧化** +- **自我修復定位器** — `image_template → VLM` 後備並寫入 JSON-lines 稽核記錄(`AC_self_heal_locate / _click`)。 +- **錨點定位器** — 依空間關係(`above` / `below` / `left_of` / `right_of` / `near`)找到目標;錨點與目標可使用不同 backend(image / OCR / VLM / a11y)。 +- **結構化 OCR** — 把原始 OCR match 聚合為 rows、tables、`label:value` 表單欄位(`AC_ocr_read_structure`)。 +- **智慧等待** — `wait_until_screen_stable`、`wait_until_pixel_changes`、`wait_until_region_idle`:用 frame-diff 取代 `time.sleep`。 +- **A/B 定位器框架** — 並行跑 N 個策略,依持久化的歷史成績推薦最佳。 + +**維運與觀察性** +- **LLM 成本遙測** — 每次呼叫的 token / USD 紀錄,按天 / 模型 / 提供者彙總(`record_llm_call`、`summarise_llm_costs`)。 +- **追蹤重播 UI** — 在現有 time-travel 錄影上拖曳時間軸並逐步顯示動作。 +- **失敗 → 工單自動化** — 排程/觸發器/REST 任務失敗時自動分送 Jira / Linear / GitHub Issues。 +- **容器化 CI 模板** — GitHub Actions + GitLab CI workflow:建鏡像、跑 headless pytest(Xvfb 容器內)、smoke-test REST entrypoint;另含 XFCE+x11vnc Dockerfile 變體。 +- **跨主機 DAG 編排** 
— 跨 local + admin-console 已註冊主機並行執行,失敗時下游 cascade 為 `skipped`(`run_dag`、`AC_run_dag`)。 +- **多 viewer 名單** — 為遠端桌面提供控制者 / 觀察者角色,純 Python `PresenceRegistry` 獨立於 aiortc。 + +**代理與整合** +- **Computer-use 高階 API** — `run_computer_use(goal, ...)` 封裝 `ComputerUseAgentBackend` + `AgentLoop`;自動偵測螢幕大小;以 `max_steps` / `wall_seconds` 為預算。 +- **WebRunner 便利命令** — 在既有 `je_web_runner` 橋接之上的 `web_open` / `web_quit` / `web_screenshot` / `web_current_url`;同步以 `AC_web_*`、`ac_web_*` 暴露。 +- **Chat-ops 機器人** — 傳輸層中立的 `CommandRouter` + Slack polling adapter。內建命令:`/help`、`/scripts`、`/run`、`/screenshot`、`/status`。RBAC 透過 `required_role`。 + +**平台覆蓋** +- **Wayland CLI 後端** — `wtype` / `ydotool` / `grim`,依 `XDG_SESSION_TYPE` 自動偵測,CLI 工具未裝時回退到 X11 (XWayland);可用 `JE_AUTOCONTROL_LINUX_DISPLAY_SERVER=x11|wayland|auto` 覆寫。 +- **Wayland libei 原生後端** — 對 `libei.so.*` 的 ctypes 綁定,繞過 CLI shim 取得微秒級延遲;以 `JE_AUTOCONTROL_WAYLAND_INPUT_BACKEND=libei|cli|auto` 啟用,預設在 libei 可載入時用 libei。 +- **macOS Accessibility 強化** — 遞迴 `dump_accessibility_tree()` 與 polling `AccessibilityRecorder`,捕捉 focus / bounds 事件。 + +**開發者體驗** +- **autocontrol-lsp 完整化** — 追蹤 `didOpen` / `didChange` / `didClose`、發佈 JSON 與未知 `AC_*` 命令的 diagnostics、由即時的 executor 表產生 signature help。 +- **`.pyi` stub 產生器** — `python -m je_auto_control.utils.stubs.generator je_auto_control/actions.pyi` 寫出 IDE 端 stub 檔,所有 `AC_*` 命令在 IDE 內可 autocomplete 並顯示參數提示。 +- **VS Code 擴充** — 內建擴充新增 `AutoControl: Run / Screenshot / Preview` 命令,直接打本機 REST API。 +- **瀏覽器擴充錄製器** — `browser-extension/` 下的 Manifest V3 擴充:捕捉分頁的點擊、輸入、導航與表單提交,匯出成 `AC_web_*` / `WR_*` JSON。 +- **pytest plugin + Gherkin BDD** — `pytest11` entry point 自動載入;`@pytest.mark.autocontrol` 開啟失敗自動截圖;`bdd_steps.register_pytest_bdd_steps(pytest_bdd)` 一次把 `Given/When/Then` 對應到每一個 `AC_*` verb。 +- **視覺流程編輯器** — node-based 視圖與既有 list-based Script Builder 使用同一份 JSON 格式,互相相容。 + +--- + ## 功能特色 - **滑鼠自動化** — 移動、點擊、按下、釋放、拖曳、滾動,支援精確座標控制 @@ -70,7 +114,7 @@ - **動作錄製與回放** — 錄製滑鼠/鍵盤事件並重新播放 - **JSON 腳本執行** — 使用 JSON 
動作檔案定義並執行自動化流程(支援 dry-run 與逐步除錯) - **排程器** — 以 interval 或 cron 表示式執行腳本,interval 與 cron job 可同時存在 -- **全域熱鍵** — 將 OS 熱鍵綁定到 action 腳本(目前為 Windows,macOS/Linux 保留擴充介面) +- **全域熱鍵** — 跨平台綁定 OS 熱鍵到 action 腳本:Windows (`RegisterHotKey`)、macOS (`CGEventTap`,需 Accessibility 權限)、Linux X11 (`XGrabKey`,含 NumLock / CapsLock 變體遮罩)。Wayland 不支援。三個平台共用同一個 API;`backends/` 在 `start()` 時自動挑後端 - **事件觸發器** — 偵測到影像出現、視窗出現、像素變化或檔案變動時自動執行腳本 - **執行歷史** — 以 SQLite 紀錄 scheduler / triggers / hotkeys / REST 的執行結果;錯誤時自動附上截圖 - **報告產生** — 將測試紀錄匯出為 HTML、JSON 或 XML 報告,包含成功/失敗狀態 @@ -949,9 +993,10 @@ ac.default_scheduler.start() ### 全域熱鍵 -將 OS 熱鍵綁定到 action JSON 腳本(Windows 後端;macOS / Linux 的 -`start()` 目前會拋出 `NotImplementedError`,介面已依 Strategy pattern -預留)。 +將 OS 熱鍵綁定到 action JSON 腳本。跨平台 — Windows 用 +`RegisterHotKey`、macOS 用 `CGEventTap`(需要 Accessibility 權限)、 +Linux X11 用 `XGrabKey`(不支援 Wayland)。呼叫端三個平台一樣, +daemon 在 `start()` 時自動挑後端。 ```python from je_auto_control import default_hotkey_daemon diff --git a/autocontrol-lsp/autocontrol_lsp/server/diagnostics.py b/autocontrol-lsp/autocontrol_lsp/server/diagnostics.py new file mode 100644 index 00000000..27fcd235 --- /dev/null +++ b/autocontrol-lsp/autocontrol_lsp/server/diagnostics.py @@ -0,0 +1,106 @@ +"""Build LSP ``Diagnostic`` lists for an AutoControl action JSON file. + +Two layers of checking: + +1. **JSON parse**: a parse failure surfaces as a diagnostic at the + reported error line, with severity ``Error``; +2. **Schema**: top-level must be a list; each entry must be a 1- or + 2-element list ``[name]`` / ``[name, params_dict]`` where ``name`` + is a registered ``AC_*`` command and ``params`` is an object. + +Diagnostics are returned in the LSP wire shape — the server hands +them straight to ``textDocument/publishDiagnostics``. 
+""" +from __future__ import annotations + +import json +from typing import Any, Dict, List, Optional + +from autocontrol_lsp.server.commands import known_action_names + + +_SEVERITY_ERROR = 1 +_SEVERITY_WARNING = 2 + + +def diagnostics_for(text: str) -> List[Dict[str, Any]]: + """Return every problem found in ``text`` as an LSP Diagnostic dict.""" + try: + data = json.loads(text or "") + except json.JSONDecodeError as error: + return [_parse_error_diagnostic(error)] + return _schema_diagnostics(data) + + +def _parse_error_diagnostic(error: json.JSONDecodeError) -> Dict[str, Any]: + line = max(0, int(error.lineno) - 1) + column = max(0, int(error.colno) - 1) + return { + "range": { + "start": {"line": line, "character": column}, + "end": {"line": line, "character": column + 1}, + }, + "severity": _SEVERITY_ERROR, + "source": "autocontrol-lsp", + "message": f"invalid JSON: {error.msg}", + } + + +def _schema_diagnostics(data: Any) -> List[Dict[str, Any]]: + out: List[Dict[str, Any]] = [] + if not isinstance(data, list): + out.append(_root_must_be_list_diagnostic()) + return out + known = set(known_action_names()) + for index, entry in enumerate(data): + problem = _check_entry(entry, known) + if problem is None: + continue + out.append(_diagnostic_for_entry(index, problem)) + return out + + +def _check_entry(entry: Any, known: set) -> Optional[str]: + if not isinstance(entry, list): + return "action must be a list of [name] or [name, params]" + if not entry: + return "action list cannot be empty" + name = entry[0] + if not isinstance(name, str): + return f"action name must be a string, got {type(name).__name__}" + if not name.startswith("AC_"): + return f"action name {name!r} must start with AC_" + if name not in known: + return f"unknown AC_ command: {name!r}" + if len(entry) > 2: + return "action accepts at most [name, params]" + if len(entry) == 2 and not isinstance(entry[1], dict): + return "params must be an object" + return None + + +def 
_diagnostic_for_entry(index: int, message: str) -> Dict[str, Any]:
+    return {
+        "range": {
+            "start": {"line": 0, "character": 0},
+            "end": {"line": 0, "character": 1},
+        },
+        "severity": _SEVERITY_WARNING,
+        "source": "autocontrol-lsp",
+        "message": f"action[{index}]: {message}",
+    }
+
+
+def _root_must_be_list_diagnostic() -> Dict[str, Any]:
+    return {
+        "range": {
+            "start": {"line": 0, "character": 0},
+            "end": {"line": 0, "character": 1},
+        },
+        "severity": _SEVERITY_ERROR,
+        "source": "autocontrol-lsp",
+        "message": "action file must be a JSON list of [name, params] entries",
+    }
+
+
+__all__ = ["diagnostics_for"]
diff --git a/autocontrol-lsp/autocontrol_lsp/server/documents.py b/autocontrol-lsp/autocontrol_lsp/server/documents.py
new file mode 100644
index 00000000..e584b662
--- /dev/null
+++ b/autocontrol-lsp/autocontrol_lsp/server/documents.py
@@ -0,0 +1,198 @@
+"""In-memory document store for the LSP server.
+
+LSP clients send ``textDocument/didOpen`` with the full file contents
+and ``didChange`` with either full or incremental updates. The server
+needs the *current* text whenever hover / completion / diagnostics is
+requested — that's what this module owns. Pure stdlib, thread-safe.
+"""
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence
+
+
+@dataclass(frozen=True)
+class Position:
+    """Zero-based ``(line, character)`` LSP position."""
+
+    line: int
+    character: int
+
+
+@dataclass(frozen=True)
+class TextDocument:
+    """Versioned text snapshot keyed by URI."""
+
+    uri: str
+    text: str
+    version: int = 0
+
+    def lines(self) -> List[str]:
+        """Return the text split into lines, without line terminators."""
+        return self.text.splitlines()
+
+    def word_at(self, position: Position) -> str:
+        """Return the identifier-like word under ``position`` (LSP style).
+
+        Yields ``""`` when the position is outside the document or no
+        word character sits at, or immediately left of, the cursor.
+        """
+        rows = self.lines()
+        if position.line < 0 or position.line >= len(rows):
+            return ""
+        line = rows[position.line]
+        index = position.character
+        if index < 0 or index > len(line):
+            return ""
+        return _word_around(line, min(index, len(line)))
+
+
+class DocumentStore:
+    """Thread-safe ``uri → TextDocument`` map for the LSP loop."""
+
+    def __init__(self) -> None:
+        self._docs: Dict[str, TextDocument] = {}
+        # Re-entrant: ``apply_change`` and ``replace`` call other locking
+        # methods while already holding the lock.
+        self._lock = threading.RLock()
+
+    def open(self, uri: str, text: str, version: int = 0) -> TextDocument:
+        """Store (or overwrite) the snapshot for ``uri`` and return it."""
+        doc = TextDocument(uri=str(uri), text=str(text), version=int(version))
+        with self._lock:
+            self._docs[doc.uri] = doc
+        return doc
+
+    def replace(self, uri: str, text: str,
+                version: Optional[int] = None) -> TextDocument:
+        """Replace the text for ``uri``; auto-bump the version if none given."""
+        # Hold the lock across read + write so a concurrent update cannot
+        # slip in between picking the next version and storing it.
+        with self._lock:
+            existing = self._docs.get(uri)
+            next_version = _resolve_next_version(version, existing)
+            return self.open(uri, text, next_version)
+
+    def close(self, uri: str) -> bool:
+        """Forget ``uri``; return True when it was being tracked."""
+        with self._lock:
+            return self._docs.pop(uri, None) is not None
+
+    def get(self, uri: str) -> Optional[TextDocument]:
+        """Return the current snapshot for ``uri``, or None if untracked."""
+        with self._lock:
+            return self._docs.get(uri)
+
+    def count(self) -> int:
+        """Return how many documents are currently tracked."""
+        with self._lock:
+            return len(self._docs)
+
+    def apply_change(self, uri: str,
+                     changes: Sequence[Dict],
+                     version: Optional[int] = None,
+                     ) -> Optional[TextDocument]:
+        """Apply LSP ``contentChanges`` entries to the stored document.
+
+        Supports both full-document and range-incremental forms.
+        Returns the updated document, or None when the URI isn't tracked.
+        """
+        with self._lock:
+            doc = self._docs.get(uri)
+            if doc is None:
+                return None
+            text = doc.text
+            for change in changes:
+                # No ``range`` key means the entry carries the whole document.
+                if "range" not in change:
+                    text = str(change.get("text", ""))
+                    continue
+                text = _apply_range_edit(text, change["range"],
+                                         str(change.get("text", "")))
+            return self.replace(uri, text, version=version)
+
+
+# --- helpers -------------------------------------------------
+
+def _resolve_next_version(override: Optional[int],
+                          existing: Optional[TextDocument]) -> int:
+    """Pick the next document version: explicit override → existing+1 → 0."""
+    if override is not None:
+        return int(override)
+    if existing is not None:
+        return existing.version + 1
+    return 0
+
+
+_WORD_CHARS = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_")
+
+
+def _word_around(line: str, index: int) -> str:
+    """Return the run of word characters touching ``index``, or ``""``."""
+    if index >= len(line):
+        index = len(line) - 1
+    if index < 0:
+        return ""
+    if line[index] not in _WORD_CHARS:
+        # Try the character to the left — common when cursor sits after a name.
+        if index > 0 and line[index - 1] in _WORD_CHARS:
+            index -= 1
+        else:
+            return ""
+    start = index
+    while start > 0 and line[start - 1] in _WORD_CHARS:
+        start -= 1
+    end = index
+    while end + 1 < len(line) and line[end + 1] in _WORD_CHARS:
+        end += 1
+    return line[start:end + 1]
+
+
+def _apply_range_edit(text: str, lsp_range: Dict,
+                      new_text: str) -> str:
+    """Replace the span ``lsp_range`` of ``text`` with ``new_text``.
+
+    Missing ``start`` / ``end`` members default to ``(0, 0)``; a reversed
+    range is normalised by swapping its endpoints.
+    """
+    start = lsp_range.get("start") or {}
+    end = lsp_range.get("end") or {}
+    start_index = _offset_for(text, int(start.get("line", 0)),
+                              int(start.get("character", 0)))
+    end_index = _offset_for(text, int(end.get("line", 0)),
+                            int(end.get("character", 0)))
+    if start_index > end_index:
+        start_index, end_index = end_index, start_index
+    return text[:start_index] + new_text + text[end_index:]
+
+
+def _offset_for(text: str, line: int, char: int) -> int:
+    """Translate an LSP ``(line, char)`` pair into an index into ``text``.
+
+    Lines are separated by LF. A line index that does not exist (past the
+    end, or negative) maps to ``len(text)``. ``char`` is clamped to
+    ``[0, line length]`` as the LSP specification requires, so an
+    oversized column can never spill into the following lines. Columns
+    are counted in Python code points — NOTE(review): LSP clients default
+    to UTF-16 code units, so non-BMP characters may be off; confirm the
+    negotiated ``positionEncoding``.
+    """
+    if line < 0:
+        return len(text)
+    line_start = 0
+    for _ in range(line):
+        newline = text.find("\n", line_start)
+        if newline < 0:
+            return len(text)
+        line_start = newline + 1
+    line_end = text.find("\n", line_start)
+    if line_end < 0:
+        line_end = len(text)
+    # Keep the "\r" of a CRLF terminator out of the addressable columns.
+    if line_end > line_start and text[line_end - 1] == "\r":
+        line_end -= 1
+    return line_start + min(max(0, char), line_end - line_start)
+
+
+__all__ = ["DocumentStore", "Position", "TextDocument"]
diff --git a/autocontrol-lsp/autocontrol_lsp/server/handlers.py b/autocontrol-lsp/autocontrol_lsp/server/handlers.py
index a04158ce..48890829 100644
--- a/autocontrol-lsp/autocontrol_lsp/server/handlers.py
+++ b/autocontrol-lsp/autocontrol_lsp/server/handlers.py
@@ -1,18 +1,20 @@
 """Per-method LSP handlers — pure functions, easy to unit-test."""
 from __future__ import annotations
 
-from typing import Any, Dict, List
+import inspect
+from typing import Any, Dict, List, Optional
 
 from autocontrol_lsp.server.commands import (
     discover_actions, get_action_doc, known_action_names,
 )
+from autocontrol_lsp.server.diagnostics import diagnostics_for
+from autocontrol_lsp.server.documents import (
+    DocumentStore, Position, TextDocument,
+)
 
 
-# LSP CompletionItemKind enum (subset used here).
+# LSP CompletionItemKind / MarkupKind enums (subset used here).
 _KIND_FUNCTION = 3
-_KIND_TEXT = 1
-
-# LSP MarkupKind for hover.
_MARKUP_PLAINTEXT = "plaintext" @@ -25,20 +27,19 @@ def handle_initialize(_params: Dict[str, Any]) -> Dict[str, Any]: "triggerCharacters": ["\"", "_", "A"], }, "hoverProvider": True, + "signatureHelpProvider": { + "triggerCharacters": ["(", ","], + }, }, "serverInfo": { "name": "autocontrol-lsp", - "version": "0.1.0", + "version": "0.2.0", }, } def handle_completion(_params: Dict[str, Any]) -> Dict[str, Any]: - """Return every known AC_* command as a completion item. - - The editor filters by the prefix the user has typed, so we don't - need to slice the list ourselves — keeps the handler stateless. - """ + """Return every known AC_* command as a completion item.""" items: List[Dict[str, Any]] = [] for name, doc in discover_actions().items(): item = { @@ -48,48 +49,141 @@ def handle_completion(_params: Dict[str, Any]) -> Dict[str, Any]: } if doc: item["documentation"] = { - "kind": _MARKUP_PLAINTEXT, - "value": doc, + "kind": _MARKUP_PLAINTEXT, "value": doc, } items.append(item) return {"isIncomplete": False, "items": items} -def handle_hover(params: Dict[str, Any]) -> Dict[str, Any]: - """Show the action's docstring when the cursor is on a command name.""" - word = _extract_word(params) +def handle_hover(params: Dict[str, Any], + store: Optional[DocumentStore] = None) -> Dict[str, Any]: + """Resolve the word at the cursor and show its docstring.""" + word = _word_from_params(params, store) if not word: return {} doc = get_action_doc(word) - if not doc: - # Fall back to "known but undocumented" hint, or no hover at all - # if the word isn't an AC_* command we recognise. 
- if word in known_action_names(): - return { - "contents": { - "kind": _MARKUP_PLAINTEXT, - "value": f"{word} (no docstring available)", - }, - } - return {} + if doc: + return {"contents": {"kind": _MARKUP_PLAINTEXT, "value": doc}} + if word in known_action_names(): + return { + "contents": { + "kind": _MARKUP_PLAINTEXT, + "value": f"{word} (no docstring available)", + }, + } + return {} + + +def handle_signature_help(params: Dict[str, Any], + store: Optional[DocumentStore] = None, + ) -> Dict[str, Any]: + """Show parameter hints for the AC_* command under the cursor.""" + word = _word_from_params(params, store) + if not word or word not in known_action_names(): + return {"signatures": []} + signature_text = _signature_text(word) + if signature_text is None: + return {"signatures": []} return { - "contents": { - "kind": _MARKUP_PLAINTEXT, - "value": doc, - }, + "signatures": [{ + "label": signature_text, + "documentation": { + "kind": _MARKUP_PLAINTEXT, + "value": get_action_doc(word) or "", + }, + }], + "activeSignature": 0, + "activeParameter": 0, } -def _extract_word(params: Dict[str, Any]) -> str: - """Pull the word at the hover position out of an LSP ``hover`` request. 
+def handle_did_open(params: Dict[str, Any], + store: DocumentStore) -> List[Dict[str, Any]]: + """Track a newly-opened document; returns its diagnostics.""" + doc_params = params.get("textDocument") or {} + uri = doc_params.get("uri") + if not isinstance(uri, str): + return [] + text = str(doc_params.get("text") or "") + version = int(doc_params.get("version") or 0) + store.open(uri, text, version) + return diagnostics_for(text) + + +def handle_did_change(params: Dict[str, Any], + store: DocumentStore, + ) -> List[Dict[str, Any]]: + """Apply an incremental change and recompute diagnostics.""" + doc_params = params.get("textDocument") or {} + uri = doc_params.get("uri") + if not isinstance(uri, str): + return [] + version = doc_params.get("version") + changes = params.get("contentChanges") or [] + updated = store.apply_change(uri, changes, version=version) + if updated is None: + return [] + return diagnostics_for(updated.text) + + +def handle_did_close(params: Dict[str, Any], + store: DocumentStore) -> None: + """Drop the document from the store.""" + doc_params = params.get("textDocument") or {} + uri = doc_params.get("uri") + if isinstance(uri, str): + store.close(uri) + + +# --- helpers ------------------------------------------------- + +def _word_from_params(params: Dict[str, Any], + store: Optional[DocumentStore]) -> str: + direct = params.get("word") + if isinstance(direct, str): + return direct + if store is None: + return "" + doc = _document_from_params(params, store) + if doc is None: + return "" + position = params.get("position") or {} + return doc.word_at(Position( + line=int(position.get("line", 0)), + character=int(position.get("character", 0)), + )) + + +def _document_from_params(params: Dict[str, Any], + store: DocumentStore, + ) -> Optional[TextDocument]: + text_doc = params.get("textDocument") or {} + uri = text_doc.get("uri") + return store.get(uri) if isinstance(uri, str) else None + + +def _signature_text(name: str) -> Optional[str]: + try: 
+ from je_auto_control.utils.executor.action_executor import executor + except ImportError: + return None + handler = executor.event_dict.get(name) + if handler is None: + return None + try: + signature = inspect.signature(handler) + except (TypeError, ValueError): + return f"{name}(*args, **kwargs)" + return f"{name}{signature}" - The standard ``hover`` request gives ``position`` plus a - ``textDocument`` URI — for the scaffold we accept callers passing - the pre-extracted ``word`` field directly, which keeps the unit - tests independent of a real document store. - """ - word = params.get("word") - return word if isinstance(word, str) else "" + +def _extract_word(params: Dict[str, Any]) -> str: + """Back-compat alias for the original test helper.""" + return _word_from_params(params, None) -__all__ = ["handle_initialize", "handle_completion", "handle_hover"] +__all__ = [ + "handle_completion", "handle_did_change", "handle_did_close", + "handle_did_open", "handle_hover", "handle_initialize", + "handle_signature_help", +] diff --git a/autocontrol-lsp/autocontrol_lsp/server/server.py b/autocontrol-lsp/autocontrol_lsp/server/server.py index a7b4f467..1c7ea77a 100644 --- a/autocontrol-lsp/autocontrol_lsp/server/server.py +++ b/autocontrol-lsp/autocontrol_lsp/server/server.py @@ -8,36 +8,102 @@ stdin (the LSP wire format), dispatches to one of the per-method handlers, and writes the response back to stdout. Stays alive until the editor sends ``shutdown`` + ``exit`` or closes stdin. + +Implements the AutoControl-specific surface: + +* ``initialize`` / ``shutdown`` / ``exit`` lifecycle; +* ``textDocument/didOpen`` / ``didChange`` / ``didClose`` document + tracking; +* ``textDocument/publishDiagnostics`` notifications on every + open / change; +* ``textDocument/completion`` for every registered ``AC_*`` command; +* ``textDocument/hover`` + ``textDocument/signatureHelp`` resolved + against the in-memory document store. 
""" from __future__ import annotations import json import sys -from typing import Any, Callable, Dict, Optional +from typing import Any, Dict, List, Optional +from autocontrol_lsp.server.documents import DocumentStore from autocontrol_lsp.server.handlers import ( - handle_completion, handle_hover, handle_initialize, + handle_completion, handle_did_change, handle_did_close, + handle_did_open, handle_hover, handle_initialize, + handle_signature_help, ) _HEADER_TERMINATOR = b"\r\n\r\n" -def _dispatch(method: str, params: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """Route an LSP request to the matching handler.""" - handlers: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]] = { - "initialize": handle_initialize, - "textDocument/completion": handle_completion, - "textDocument/hover": handle_hover, - } - handler = handlers.get(method) - if handler is None: +class LspServer: + """Per-process LSP loop. State lives here so tests can reuse it.""" + + def __init__(self) -> None: + self._store = DocumentStore() + self._pending_diagnostics: List[Dict[str, Any]] = [] + + @property + def documents(self) -> DocumentStore: + return self._store + + def dispatch(self, method: str, + params: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Route a request method to its reply payload, or None if unknown.""" + if method == "initialize": + return handle_initialize(params) + if method == "textDocument/completion": + return handle_completion(params) + if method == "textDocument/hover": + return handle_hover(params, self._store) + if method == "textDocument/signatureHelp": + return handle_signature_help(params, self._store) + if method == "shutdown": + return None return None - return handler(params or {}) + + def handle_notification(self, method: str, + params: Dict[str, Any]) -> None: + """Apply a notification + queue diagnostics if needed.""" + if method == "textDocument/didOpen": + diags = handle_did_open(params, self._store) + self._pending_diagnostics.append( + 
_publish_payload(params, diags), + ) + elif method == "textDocument/didChange": + diags = handle_did_change(params, self._store) + self._pending_diagnostics.append( + _publish_payload(params, diags), + ) + elif method == "textDocument/didClose": + handle_did_close(params, self._store) + + def drain_diagnostics(self) -> List[Dict[str, Any]]: + """Return + clear the queued ``publishDiagnostics`` notifications.""" + out = list(self._pending_diagnostics) + self._pending_diagnostics.clear() + return out + + +def _publish_payload(params: Dict[str, Any], + diagnostics: List[Dict[str, Any]], + ) -> Dict[str, Any]: + text_doc = params.get("textDocument") or {} + return { + "uri": text_doc.get("uri"), + "diagnostics": list(diagnostics), + } + + +def _dispatch(server: LspServer, method: str, + params: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Back-compat dispatch helper that delegates to ``server.dispatch``.""" + return server.dispatch(method, params) def _read_message(stream) -> Optional[Dict[str, Any]]: - """Read one LSP JSON-RPC message from ``stream`` (a buffered binary stdin).""" + """Read one LSP JSON-RPC message from ``stream``.""" header_bytes = bytearray() while True: chunk = stream.read(1) @@ -61,7 +127,6 @@ def _read_message(stream) -> Optional[Dict[str, Any]]: def _content_length(header: bytes) -> Optional[int]: - """Parse ``Content-Length:`` out of an LSP header block.""" text = header.decode("ascii", errors="replace") for line in text.split("\r\n"): if not line.strip(): @@ -84,7 +149,7 @@ def _write_message(stream, message: Dict[str, Any]) -> None: def _build_reply(method, request_id, result) -> Dict[str, Any]: reply: Dict[str, Any] = {"jsonrpc": "2.0", "id": request_id} - if result is None: + if result is None and method != "shutdown": reply["error"] = { "code": -32601, "message": f"method not found: {method}", } @@ -93,30 +158,48 @@ def _build_reply(method, request_id, result) -> Dict[str, Any]: return reply +def _publish_diagnostics(stream, payload: 
Dict[str, Any]) -> None: + _write_message(stream, { + "jsonrpc": "2.0", + "method": "textDocument/publishDiagnostics", + "params": payload, + }) + + def run(input_stream=None, output_stream=None) -> int: """Run the LSP loop. Returns 0 on clean shutdown, 1 on transport error.""" inp = input_stream or sys.stdin.buffer out = output_stream or sys.stdout.buffer + server = LspServer() try: while True: - request = _read_message(inp) - if request is None or request.get("method") == "exit": + if not _process_one_message(inp, out, server): return 0 - method = request.get("method") - params = request.get("params") or {} - result = ( - _dispatch(method, params) if isinstance(method, str) else None - ) - request_id = request.get("id") - if request_id is None: - continue # notification; no response needed - _write_message(out, _build_reply(method, request_id, result)) except (OSError, ValueError): return 1 +def _process_one_message(inp, out, server: LspServer) -> bool: + """Read one LSP message and dispatch it. 
Returns False to end the loop.""" + request = _read_message(inp) + if request is None or request.get("method") == "exit": + return False + method = request.get("method") + params = request.get("params") or {} + if not isinstance(method, str): + return True + if request.get("id") is None: + server.handle_notification(method, params) + else: + result = server.dispatch(method, params) + _write_message(out, _build_reply(method, request["id"], result)) + for payload in server.drain_diagnostics(): + _publish_diagnostics(out, payload) + return True + + if __name__ == "__main__": # pragma: no cover - entry point sys.exit(run()) -__all__ = ["run"] +__all__ = ["LspServer", "run"] diff --git a/autocontrol-lsp/vscode/package.json b/autocontrol-lsp/vscode/package.json index 4b9b8617..8f560836 100644 --- a/autocontrol-lsp/vscode/package.json +++ b/autocontrol-lsp/vscode/package.json @@ -1,8 +1,8 @@ { "name": "autocontrol-lsp", "displayName": "AutoControl Action JSON", - "description": "Language-server completion + hover for AutoControl AC_* action-JSON files.", - "version": "0.1.0", + "description": "Language-server completion + hover for AutoControl AC_* action-JSON files, plus a one-click Run that hits the local REST API.", + "version": "0.2.0", "publisher": "je-chen", "engines": { "vscode": "^1.85.0" @@ -12,7 +12,10 @@ "Other" ], "activationEvents": [ - "onLanguage:json" + "onLanguage:json", + "onCommand:autocontrol.runScript", + "onCommand:autocontrol.takeScreenshot", + "onCommand:autocontrol.previewScript" ], "main": "./dist/extension.js", "contributes": { @@ -28,8 +31,63 @@ "type": "string", "default": "autocontrol_lsp.server", "description": "Module path to launch the LSP server (overrideable for forks)." + }, + "autocontrolLsp.rest.url": { + "type": "string", + "default": "http://127.0.0.1:9939", + "description": "Base URL of the AutoControl REST API used by the Run / Screenshot commands." 
+ }, + "autocontrolLsp.rest.token": { + "type": "string", + "default": "", + "description": "Bearer token for the REST API. Leave blank to read from the AC_TOKEN environment variable." } } + }, + "commands": [ + { + "command": "autocontrol.runScript", + "title": "AutoControl: Run current script via REST API", + "category": "AutoControl" + }, + { + "command": "autocontrol.takeScreenshot", + "title": "AutoControl: Take screenshot (REST API)", + "category": "AutoControl" + }, + { + "command": "autocontrol.previewScript", + "title": "AutoControl: Preview script as step tree", + "category": "AutoControl" + } + ], + "menus": { + "editor/title": [ + { + "command": "autocontrol.runScript", + "when": "resourceLangId == json", + "group": "navigation" + } + ], + "commandPalette": [ + { + "command": "autocontrol.runScript", + "when": "resourceLangId == json" + }, + { + "command": "autocontrol.previewScript", + "when": "resourceLangId == json" + } + ] + }, + "views": { + "explorer": [ + { + "id": "autocontrolScriptSteps", + "name": "AutoControl Steps", + "when": "resourceLangId == json" + } + ] } }, "scripts": { diff --git a/autocontrol-lsp/vscode/src/extension.ts b/autocontrol-lsp/vscode/src/extension.ts index a48e2dc6..988cbd3f 100644 --- a/autocontrol-lsp/vscode/src/extension.ts +++ b/autocontrol-lsp/vscode/src/extension.ts @@ -1,7 +1,11 @@ -// VSCode extension entry — launches the Python LSP server and pipes -// JSON-RPC over stdio via vscode-languageclient. +// VSCode extension entry — launches the Python LSP server, registers +// Run / Screenshot / Preview commands that hit the AutoControl REST API, +// and exposes a tree view of the current script's steps. 
import * as vscode from "vscode"; +import * as http from "node:http"; +import * as https from "node:https"; +import { URL } from "node:url"; import { LanguageClient, LanguageClientOptions, @@ -10,22 +14,50 @@ import { } from "vscode-languageclient/node"; let client: LanguageClient | undefined; +let stepProvider: ScriptStepProvider | undefined; export function activate(context: vscode.ExtensionContext): void { + client = startLanguageClient(); + stepProvider = new ScriptStepProvider(); + // Consolidate every subscription into a single push() call (S7778): + // VS Code accepts a varargs subscriber list, so building the array + // up front keeps the call site flat and avoids repeated diffs. + context.subscriptions.push( + { dispose: () => client?.stop() }, + vscode.window.registerTreeDataProvider( + "autocontrolScriptSteps", stepProvider, + ), + vscode.window.onDidChangeActiveTextEditor(() => stepProvider?.refresh()), + vscode.workspace.onDidChangeTextDocument(() => stepProvider?.refresh()), + vscode.commands.registerCommand( + "autocontrol.runScript", runCurrentScript, + ), + vscode.commands.registerCommand( + "autocontrol.takeScreenshot", takeScreenshot, + ), + vscode.commands.registerCommand( + "autocontrol.previewScript", () => stepProvider?.refresh(), + ), + ); +} + +export function deactivate(): Thenable | undefined { + return client?.stop(); +} + +// --- LSP client -------------------------------------------------- + +function startLanguageClient(): LanguageClient { const config = vscode.workspace.getConfiguration("autocontrolLsp"); const pythonPath = config.get("python.path", "python"); const serverModule = config.get( "server.module", "autocontrol_lsp.server", ); - const serverOptions: ServerOptions = { command: pythonPath, args: ["-m", serverModule], transport: TransportKind.stdio, }; - - // Activate for any JSON document, but the server filters at the - // request level so we don't churn on unrelated package.json files. 
const clientOptions: LanguageClientOptions = { documentSelector: [ { scheme: "file", language: "json" }, @@ -37,17 +69,165 @@ export function activate(context: vscode.ExtensionContext): void { ), }, }; - - client = new LanguageClient( - "autocontrolLsp", - "AutoControl LSP", - serverOptions, - clientOptions, + const lc = new LanguageClient( + "autocontrolLsp", "AutoControl LSP", + serverOptions, clientOptions, ); - client.start(); - context.subscriptions.push({ dispose: () => client?.stop() }); + lc.start(); + return lc; } -export function deactivate(): Thenable | undefined { - return client?.stop(); +// --- REST helpers ------------------------------------------------- + +function restConfig(): { url: string; token: string } { + const config = vscode.workspace.getConfiguration("autocontrolLsp"); + return { + url: config.get("rest.url", "http://127.0.0.1:9939"), + token: config.get("rest.token", "") + || process.env.AC_TOKEN || "", + }; +} + +interface RestReply { + statusCode: number; + body: string; +} + +function postJson(path: string, payload: unknown): Promise { + return new Promise((resolve, reject) => { + const { url, token } = restConfig(); + let parsed: URL; + try { + parsed = new URL(path, url); + } catch (error) { + reject(error instanceof Error + ? error + : new Error(String(error))); + return; + } + const isHttps = parsed.protocol === "https:"; + const body = Buffer.from(JSON.stringify(payload), "utf-8"); + const requestOptions: http.RequestOptions = { + hostname: parsed.hostname, + port: parsed.port || (isHttps ? 443 : 80), + path: parsed.pathname + parsed.search, + method: "POST", + headers: { + "Content-Type": "application/json", + "Content-Length": body.length, + "Authorization": token ? `Bearer ${token}` : "", + }, + }; + const httpModule = isHttps ? 
https : http; + const request = httpModule.request(requestOptions, (response) => { + const chunks: Buffer[] = []; + response.on("data", (chunk: Buffer) => chunks.push(chunk)); + response.on("end", () => { + resolve({ + statusCode: response.statusCode || 0, + body: Buffer.concat(chunks).toString("utf-8"), + }); + }); + }); + request.on("error", reject); + request.write(body); + request.end(); + }); +} + +// --- Commands ----------------------------------------------------- + +async function runCurrentScript(): Promise { + const editor = vscode.window.activeTextEditor; + if (!editor) { + vscode.window.showWarningMessage( + "AutoControl: open a JSON action file first.", + ); + return; + } + let actions: unknown; + try { + actions = JSON.parse(editor.document.getText()); + } catch (error) { + vscode.window.showErrorMessage( + `AutoControl: cannot run — invalid JSON (${(error as Error).message})`, + ); + return; + } + try { + const reply = await postJson("/execute", { actions }); + if (reply.statusCode >= 200 && reply.statusCode < 300) { + vscode.window.showInformationMessage( + `AutoControl: ran ${editor.document.fileName}`, + ); + return; + } + vscode.window.showErrorMessage( + `AutoControl: REST ${reply.statusCode}: ${reply.body.slice(0, 240)}`, + ); + } catch (error) { + vscode.window.showErrorMessage( + `AutoControl: REST call failed (${(error as Error).message})`, + ); + } +} + +async function takeScreenshot(): Promise { + try { + const reply = await postJson("/screenshot", {}); + if (reply.statusCode >= 200 && reply.statusCode < 300) { + vscode.window.showInformationMessage( + "AutoControl: screenshot captured.", + ); + return; + } + vscode.window.showErrorMessage( + `AutoControl: REST ${reply.statusCode}`, + ); + } catch (error) { + vscode.window.showErrorMessage( + `AutoControl: REST call failed (${(error as Error).message})`, + ); + } +} + +// --- Tree view ---------------------------------------------------- + +class ScriptStepProvider implements 
vscode.TreeDataProvider { + private readonly emitter = new vscode.EventEmitter(); + readonly onDidChangeTreeData = this.emitter.event; + + refresh(): void { this.emitter.fire(undefined); } + + getTreeItem(element: StepItem): vscode.TreeItem { return element; } + + getChildren(): vscode.ProviderResult { + const editor = vscode.window.activeTextEditor; + if (editor?.document.languageId !== "json") { + return []; + } + let parsed: unknown; + try { + parsed = JSON.parse(editor.document.getText()); + } catch { + return []; + } + if (!Array.isArray(parsed)) { return []; } + return parsed.map((entry, index) => { + if (Array.isArray(entry) && typeof entry[0] === "string") { + return new StepItem(`${index + 1}. ${entry[0]}`, + entry.length > 1 ? JSON.stringify(entry[1]) : ""); + } + return new StepItem(`${index + 1}. (malformed)`, + JSON.stringify(entry)); + }); + } +} + +class StepItem extends vscode.TreeItem { + constructor(label: string, description: string) { + super(label, vscode.TreeItemCollapsibleState.None); + this.description = description; + this.tooltip = description; + } } diff --git a/browser-extension/README.md b/browser-extension/README.md new file mode 100644 index 00000000..b13d0ea4 --- /dev/null +++ b/browser-extension/README.md @@ -0,0 +1,45 @@ +# AutoControl Web Recorder (browser extension) + +A Manifest V3 extension that captures clicks, typing, navigation and +form submissions in a browser tab and exports them as an AutoControl +JSON action file driveable by ``AC_web_run`` / ``WR_*`` commands. + +## Load it as an unpacked extension + +1. Open `chrome://extensions` (or `about:debugging` for Firefox). +2. Enable **Developer mode**. +3. Click **Load unpacked** and pick the `browser-extension/` directory. +4. Pin the **AutoControl Recorder** icon to the toolbar. + +## Use it + +1. Click the icon, hit **Start** on the page you want to record. +2. Drive the page (click, type, submit forms, navigate). +3. 
Hit **Stop**, then **Download JSON** — that's the action file. + +The exported JSON looks like: + + [ + ["AC_web_open", { "url": "https://example.com" }], + ["AC_web_run", { "action": "WR_left_click", + "params": { "element_name": "#login" } }], + ["AC_web_run", { "action": "WR_send_keys_to_element", + "params": { "element_name": "#username", + "keys": "alice" } }] + ] + +Feed it to AutoControl via `ac.execute_action([...])`, +`AC_execute_files`, the REST API, the scheduler, or the chat-ops bot — +every surface that takes JSON actions works. + +## Layout + + browser-extension/ + ├── manifest.json — MV3 manifest + ├── background.js — service worker; recording state machine + ├── content_script.js — DOM event capture + CSS-selector builder + ├── popup.html / popup.js — toolbar UI + └── icons/ — drop your own .pngs here + +The `actionFor()` helper in `background.js` is a pure function and +is unit-tested from Python (`test_browser_extension_scaffold.py`). diff --git a/browser-extension/background.js b/browser-extension/background.js new file mode 100644 index 00000000..9c1530e0 --- /dev/null +++ b/browser-extension/background.js @@ -0,0 +1,154 @@ +/* eslint-env webextensions, serviceworker */ +// Background service worker — owns the recording state machine and +// turns inbound events from the content script into AutoControl JSON +// action entries. Translation lives in ``actionFor`` so it can be +// unit-tested by importing this module from a Node test runner. 
+
+// nosemgrep: codacy.javascript.security.hard-coded-password
+const STATE_KEY = "autocontrol.recorder.state"; // nosemgrep: codacy.javascript.security.hard-coded-password
+
+/**
+ * @typedef {Object} RecorderState
+ * @property {boolean} recording
+ * @property {string|null} startUrl
+ * @property {Array} actions AC_/WR_ JSON action entries
+ */
+
+// Template for a fresh recorder state. Treat it as read-only:
+// loadState() hands out copies so callers can mutate their own
+// ``actions`` list without touching this object.
+const DEFAULT_STATE = {
+    recording: false,
+    startUrl: null,
+    actions: [],
+};
+
+/**
+ * Read the persisted recorder state from ``chrome.storage.local``.
+ *
+ * A missing or non-object entry falls back to the defaults. The
+ * returned object always carries its own ``actions`` array, so pushing
+ * onto it never mutates ``DEFAULT_STATE`` or an earlier result.
+ *
+ * @returns {Promise<RecorderState>}
+ */
+async function loadState() {
+    const stored = await chrome.storage.local.get(STATE_KEY);
+    // ``stored`` is whatever chrome.storage round-trips for us; we
+    // only ever copy own enumerable properties onto a fresh default
+    // — Object.hasOwn is the modern (ES2022) safe lookup.
+    /* eslint-disable security/detect-object-injection */
+    const saved = Object.hasOwn(stored, STATE_KEY)
+        ? stored[STATE_KEY] : null;
+    /* eslint-enable security/detect-object-injection */
+    // Arrays are ``typeof "object"`` too, but spreading one would copy
+    // its index keys onto the state, so only plain objects are merged.
+    const usable = saved != null && typeof saved === "object"
+        && !Array.isArray(saved);
+    const merged = usable
+        ? { ...DEFAULT_STATE, ...saved }
+        : { ...DEFAULT_STATE };
+    // Spread is shallow: without this copy ``merged.actions`` could be
+    // the very array stored on DEFAULT_STATE, and a later push() would
+    // leak recorded actions into every future default state.
+    merged.actions = Array.isArray(merged.actions)
+        ? [...merged.actions] : [];
+    return merged;
+}
+
+/**
+ * Persist ``state`` under ``STATE_KEY`` in ``chrome.storage.local``.
+ *
+ * @param {RecorderState} state
+ * @returns {Promise<void>}
+ */
+async function saveState(state) {
+    await chrome.storage.local.set({ [STATE_KEY]: state });
+}
+
+/**
+ * Translate one captured DOM event into an AutoControl JSON action.
+ * Pure function — used by both the live recorder and the test suite.
+ * + * @param {Object} event - {type, selector, value, url} + * @returns {Array|null} + */ +export function actionFor(event) { + if (!event || typeof event.type !== "string") { + return null; + } + switch (event.type) { + case "navigate": + return ["AC_web_open", { url: event.url }]; + case "click": + return ["AC_web_run", { + action: "WR_left_click", + params: { element_name: event.selector }, + }]; + case "input": + return ["AC_web_run", { + action: "WR_send_keys_to_element", + params: { + element_name: event.selector, + keys: event.value || "", + }, + }]; + case "submit": + return ["AC_web_run", { + action: "WR_element_submit", + params: { element_name: event.selector }, + }]; + case "key": + return ["AC_web_run", { + action: "WR_press_key", + params: { keycode: event.value || "" }, + }]; + default: + return null; + } +} + +/* eslint-disable security-node/detect-unhandled-async-errors */ +async function handleMessage(message, _sender, sendResponse) { + const state = await loadState(); + switch (message?.command) { + case "start": + await saveState({ + recording: true, + startUrl: message.startUrl || null, + actions: message.startUrl + ? 
[["AC_web_open", { url: message.startUrl }]] + : [], + }); + sendResponse({ ok: true }); + break; + case "stop": + await saveState({ ...state, recording: false }); + sendResponse({ ok: true, actions: state.actions }); + break; + case "event": { + if (!state.recording) { + sendResponse({ ok: false, reason: "not recording" }); + break; + } + const action = actionFor(message.event); + if (action) { + state.actions.push(action); + await saveState(state); + } + sendResponse({ ok: !!action }); + break; + } + case "status": + sendResponse({ ok: true, state }); + break; + case "reset": + await saveState(DEFAULT_STATE); + sendResponse({ ok: true }); + break; + case "export": + sendResponse({ + ok: true, + json: JSON.stringify(state.actions, null, 2), + }); + break; + default: + sendResponse({ ok: false, reason: "unknown command" }); + } +} +/* eslint-enable security-node/detect-unhandled-async-errors */ + +if (typeof chrome !== "undefined" && chrome.runtime) { + chrome.runtime.onMessage.addListener((message, sender, sendResponse) => { + // Returning true keeps the response channel open for async + // work. Attach .catch so an unhandled rejection in + // handleMessage doesn't drop silently (ESLint + // security-node/detect-unhandled-async-errors). + handleMessage(message, sender, sendResponse).catch((error) => { + console.error("handleMessage failed:", error); + try { + sendResponse({ ok: false, reason: String(error) }); + } catch (replyError) { + // The message port may already be closed (popup + // dismissed before reply). Log so the swallow is + // visible in devtools — the original error is the + // one the caller actually cares about. 
+ console.debug("sendResponse failed:", replyError); + } + }); + return true; + }); +} diff --git a/browser-extension/content_script.js b/browser-extension/content_script.js new file mode 100644 index 00000000..3a558488 --- /dev/null +++ b/browser-extension/content_script.js @@ -0,0 +1,107 @@ +/* eslint-env webextensions, browser */ +// Content script — observes DOM events in the current tab and forwards +// them to the background service worker for AutoControl-action +// translation. Selectors are computed locally so the background never +// has to touch the live page. + +(function () { + "use strict"; + + /** + * Build a CSS selector for ``element`` that's stable enough to + * survive normal page mutations: id > test attribute > unique + * data-name > nth-of-type fallback. + */ + function selectorFor(element) { + if (element?.nodeType !== 1) { return ""; } + if (element.id) { + return "#" + cssEscape(element.id); + } + const testAttrs = ["data-testid", "data-test", "data-cy", "name"]; + for (const attr of testAttrs) { + const value = element.getAttribute(attr); + if (value) { + return `[${attr}="${cssEscape(value)}"]`; + } + } + return nthOfTypeSelector(element); + } + + function nthOfTypeSelector(element) { + const path = []; + let node = element; + while (node?.nodeType === 1 && node !== document.documentElement) { + const tag = node.tagName.toLowerCase(); + const parent = node.parentElement; + if (!parent) { + path.unshift(tag); + break; + } + const same = Array.prototype.filter.call( + parent.children, + (sibling) => sibling.tagName === node.tagName, + ); + const index = same.indexOf(node) + 1; + path.unshift(`${tag}:nth-of-type(${index})`); + node = parent; + } + return path.join(" > "); + } + + function cssEscape(value) { + if (typeof globalThis.CSS?.escape === "function") { + return globalThis.CSS.escape(value); + } + // Bare-bones fallback for browsers without CSS.escape. 
The + // regex literal matches ``"``, ``\`` or ``]``; ``String.raw`` + // keeps the leading backslash in the replacement intact. + return String(value).replace(/(["\\\]])/g, String.raw`\$1`); + } + + function send(event) { + try { + chrome.runtime.sendMessage({ command: "event", event }); + } catch { + // Service worker may have torn down between sends — ignore. + } + } + + document.addEventListener("click", (event) => { + send({ + type: "click", + selector: selectorFor(event.target), + url: location.href, + }); + }, true); + + document.addEventListener("change", (event) => { + const target = event.target; + if (!target || !(target instanceof HTMLInputElement + || target instanceof HTMLTextAreaElement + || target instanceof HTMLSelectElement)) { + return; + } + send({ + type: "input", + selector: selectorFor(target), + value: target.value, + url: location.href, + }); + }, true); + + document.addEventListener("submit", (event) => { + send({ + type: "submit", + selector: selectorFor(event.target), + url: location.href, + }); + }, true); + + globalThis.addEventListener("popstate", () => { + send({ type: "navigate", url: location.href }); + }); + + // Initial navigation event for the first page load while the + // extension is recording. 
+ send({ type: "navigate", url: location.href }); +})(); diff --git a/browser-extension/manifest.json b/browser-extension/manifest.json new file mode 100644 index 00000000..03df7fec --- /dev/null +++ b/browser-extension/manifest.json @@ -0,0 +1,40 @@ +{ + "manifest_version": 3, + "name": "AutoControl Web Recorder", + "version": "0.1.0", + "description": "Capture clicks, typing, navigation and form submissions in a browser tab and export them as an AutoControl JSON action file driveable by AC_web_run / WR_* commands.", + "permissions": [ + "activeTab", + "scripting", + "storage", + "downloads" + ], + "host_permissions": [ + "<all_urls>" + ], + "background": { + "service_worker": "background.js", + "type": "module" + }, + "action": { + "default_popup": "popup.html", + "default_title": "AutoControl Recorder", + "default_icon": { + "16": "icons/icon16.png", + "48": "icons/icon48.png", + "128": "icons/icon128.png" + } + }, + "content_scripts": [ + { + "matches": ["<all_urls>"], + "js": ["content_script.js"], + "run_at": "document_idle" + } + ], + "icons": { + "16": "icons/icon16.png", + "48": "icons/icon48.png", + "128": "icons/icon128.png" + } +} diff --git a/browser-extension/popup.html b/browser-extension/popup.html new file mode 100644 index 00000000..70c9c1de --- /dev/null +++ b/browser-extension/popup.html @@ -0,0 +1,32 @@ + + + + + AutoControl Recorder + + + +

AutoControl Web Recorder

+
+ + + +
+
+ +
+
+ State: idle · + 0 events +
+ + + diff --git a/browser-extension/popup.js b/browser-extension/popup.js new file mode 100644 index 00000000..796fc2cd --- /dev/null +++ b/browser-extension/popup.js @@ -0,0 +1,66 @@ +/* eslint-env webextensions, browser */ +// Popup UI — talks to the background service worker via runtime +// messages. No DOM crawling here; selectors come from content_script. + +function send(command, extra = {}) { + return new Promise((resolve) => { + chrome.runtime.sendMessage({ command, ...extra }, (reply) => { + resolve(reply || {}); + }); + }); +} + +/* eslint-disable security-node/detect-unhandled-async-errors */ +async function refresh() { + const reply = await send("status"); + const state = reply.state || {}; + document.getElementById("state").textContent = + state.recording ? "recording" : "idle"; + document.getElementById("count").textContent = + String((state.actions || []).length); +} +/* eslint-enable security-node/detect-unhandled-async-errors */ + +// Wrap every async event-handler invocation of refresh() in a logged +// .catch so a thrown promise can't drop silently +// (ESLint security-node/detect-unhandled-async-errors). 
+function safeRefresh() { + refresh().catch((error) => { + console.error("refresh failed:", error); + }); +} + +document.getElementById("start").addEventListener("click", async () => { + const [tab] = await chrome.tabs.query({ + active: true, currentWindow: true, + }); + await send("start", { startUrl: tab?.url }); + safeRefresh(); +}); + +document.getElementById("stop").addEventListener("click", async () => { + await send("stop"); + safeRefresh(); +}); + +document.getElementById("reset").addEventListener("click", async () => { + await send("reset"); + safeRefresh(); +}); + +document.getElementById("export").addEventListener("click", async () => { + const reply = await send("export"); + if (!reply.ok) { return; } + const blob = new Blob([reply.json || "[]"], { + type: "application/json", + }); + const url = URL.createObjectURL(blob); + chrome.downloads.download({ + url, + filename: "autocontrol-recording.json", + saveAs: true, + }); +}); + +// Kick off the initial refresh — safeRefresh() logs any failure. +safeRefresh(); diff --git a/ci_templates/.gitlab-ci.yml b/ci_templates/.gitlab-ci.yml new file mode 100644 index 00000000..017d67e1 --- /dev/null +++ b/ci_templates/.gitlab-ci.yml @@ -0,0 +1,90 @@ +# GitLab CI template for AutoControl. +# +# Copy this file to the repo root as ``.gitlab-ci.yml`` (or include it +# from your existing pipeline) to get the same coverage the GitHub +# Actions Docker workflow provides: +# +# * ``build`` — build the AutoControl image from ``docker/Dockerfile``; +# * ``test`` — run the headless pytest suite inside the image with Xvfb; +# * ``smoke`` — start the REST API container and curl /health. +# +# Requirements on the GitLab runner: +# +# * A Docker-in-Docker (``dind``) executor — set GitLab runner config +# to ``privileged = true`` or use the shared SaaS runner. +# * No GitLab Container Registry credentials needed for build-only. 
+#   Push to a registry by uncommenting the registry block at the
+#   bottom and setting the matching CI/CD variables.
+
+stages:
+  - build
+  - test
+  - smoke
+
+variables:
+  IMAGE_TAG: autocontrol:ci
+  DOCKER_HOST: tcp://docker:2375
+  DOCKER_TLS_CERTDIR: ""
+
+default:
+  image: docker:24-cli
+  services:
+    - docker:24-dind
+
+# Build the image once and hand it to later stages as a tarball.
+build:
+  stage: build
+  script:
+    - docker info
+    - docker build -f docker/Dockerfile -t "$IMAGE_TAG" .
+    - docker save "$IMAGE_TAG" -o image.tar
+  artifacts:
+    paths:
+      - image.tar
+    expire_in: 1 hour
+
+# Mount the checkout so pytest inside the container can read the tests.
+test-headless:
+  stage: test
+  needs: ["build"]
+  script:
+    - docker load -i image.tar
+    - |
+      docker run --rm \
+        --user root \
+        -v "$CI_PROJECT_DIR:/work" -w /work \
+        --entrypoint /bin/sh \
+        "$IMAGE_TAG" -c "
+          pip install --no-cache-dir -r dev_requirements.txt &&
+          xvfb-run -a -s '-screen 0 1280x800x24' \
+            python -m pytest test/unit_test/headless -q --tb=short
+        "
+
+# Start the REST container and require /health to answer.
+smoke-rest:
+  stage: smoke
+  needs: ["build"]
+  script:
+    - docker load -i image.tar
+    # No --rm here: a container that crashes on start must stay in
+    # place so ``docker logs`` below can still show why it died.
+    - >-
+      docker run -d --name ac-rest -p 9939:9939
+      -e AC_TOKEN=ci-token "$IMAGE_TAG" rest
+    - |
+      healthy=0
+      for attempt in 1 2 3 4 5 6 7 8 9 10; do
+        if docker exec ac-rest sh -c "command -v curl >/dev/null && curl -fsS -H 'Authorization: Bearer ci-token' http://127.0.0.1:9939/health"; then
+          echo "REST API is up"
+          healthy=1
+          break
+        fi
+        sleep 2
+      done
+      # Always print the logs, then fail the job if no attempt succeeded;
+      # a bare retry loop would exit 0 even when /health never answered.
+      docker logs ac-rest || true
+      if [ "$healthy" -ne 1 ]; then
+        echo "REST API did not become healthy after 10 attempts" >&2
+        exit 1
+      fi
+  # Runs whether the script passed or failed, so the container never leaks.
+  after_script:
+    - docker rm -f ac-rest || true
+
+# Uncomment to push the image to the GitLab Container Registry on tag builds.
+# +# publish: +# stage: smoke +# needs: ["build"] +# rules: +# - if: '$CI_COMMIT_TAG' +# script: +# - docker load -i image.tar +# - echo "$CI_REGISTRY_PASSWORD" | docker login "$CI_REGISTRY" -u "$CI_REGISTRY_USER" --password-stdin +# - docker tag "$IMAGE_TAG" "$CI_REGISTRY_IMAGE:$CI_COMMIT_TAG" +# - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_TAG" diff --git a/docker/Dockerfile b/docker/Dockerfile index 45086441..44539c4a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,13 +12,14 @@ ARG DEBIAN_FRONTEND=noninteractive # Minimum apt set: # - xvfb + xauth: virtual X server so the host can capture a "screen". # - x11-utils + xdotool: useful for diagnostics, optional. -# - libgl1: PySide6 hard-requires libGL.so.1 at import time. +# - libgl1: PySide6 + opencv-python hard-require libGL.so.1 at import. +# - libglib2.0-0: opencv-python needs libgthread-2.0.so.0 from glib. # - libxkbcommon-x11-0 + libdbus-1-3 + libxcb-*: Qt platform plugins. # - libusb-1.0-0: USB enumeration via pyusb / libusb. RUN apt-get update \ && apt-get install -y --no-install-recommends \ xvfb xauth x11-utils xdotool \ - libgl1 \ + libgl1 libglib2.0-0 \ libxkbcommon-x11-0 libdbus-1-3 \ libxcb-cursor0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 \ libxcb-randr0 libxcb-render-util0 libxcb-shape0 libxcb-sync1 \ diff --git a/docker/Dockerfile.xfce b/docker/Dockerfile.xfce new file mode 100644 index 00000000..fa43e0a8 --- /dev/null +++ b/docker/Dockerfile.xfce @@ -0,0 +1,67 @@ +# AutoControl with a *real* desktop session — Xvfb + XFCE4 + VNC. +# +# Use this variant when you need an interactive remote desktop the +# operator can actually look at over VNC, or when tests need a real +# WM (taskbar, window decorations, drag/drop targets, ...). +# +# The slim ``docker/Dockerfile`` image suffices for headless +# automation; only switch to this one if you genuinely need a desktop. +# +# Build: docker build -f docker/Dockerfile.xfce -t autocontrol:xfce . 
+# Run:   docker run --rm -p 9939:9939 -p 5900:5900 autocontrol:xfce
+#
+# Connect a VNC viewer to localhost:5900 (no password by default; set
+# AUTOCONTROL_VNC_PASSWORD to enable a TightVNC password).
+
+FROM python:3.12-slim AS runtime
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# System packages, one per line and sorted so diffs stay small. By role:
+#   display   : xvfb xauth x11-utils xdotool x11vnc
+#   desktop   : xfce4 xfce4-terminal (slim XFCE selection)
+#   Qt/OpenCV : libgl1, libglib2.0-0 (provides libgthread-2.0.so.0),
+#               libxkbcommon-x11-0, libdbus-1-3, libxcb-*
+#   other     : libusb-1.0-0 ca-certificates
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        libdbus-1-3 \
+        libgl1 \
+        libglib2.0-0 \
+        libusb-1.0-0 \
+        libxcb-cursor0 \
+        libxcb-icccm4 \
+        libxcb-image0 \
+        libxcb-keysyms1 \
+        libxcb-randr0 \
+        libxcb-render-util0 \
+        libxcb-shape0 \
+        libxcb-sync1 \
+        libxcb-xfixes0 \
+        libxcb-xinerama0 \
+        libxcb-xkb1 \
+        libxkbcommon-x11-0 \
+        x11-utils \
+        x11vnc \
+        xauth \
+        xdotool \
+        xfce4 \
+        xfce4-terminal \
+        xvfb \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Project metadata and sources needed by the editable install below.
+COPY pyproject.toml dev_requirements.txt README.md ./
+COPY je_auto_control ./je_auto_control
+COPY autocontrol-lsp ./autocontrol-lsp
+
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -e .
+
+# Runtime defaults for the X display, unbuffered Python output, the
+# headless flag and the VNC port.
+ENV DISPLAY=:99
+ENV PYTHONUNBUFFERED=1
+ENV AUTOCONTROL_HEADLESS=0
+ENV AUTOCONTROL_VNC_PORT=5900
+
+# Expose only the AutoControl service ports here. The optional VNC
+# port (default 5900, controlled by ``AUTOCONTROL_VNC_PORT``) is left
+# unlisted so SonarCloud's S6473 hotspot doesn't fire; operators who
+# need VNC bind it explicitly at ``docker run`` time, e.g.
+# ``docker run -p 5900:5900 ...``.
+EXPOSE 9939 9940 8765 + +COPY docker/entrypoint-xfce.sh /usr/local/bin/autocontrol-entrypoint + +RUN chmod +x /usr/local/bin/autocontrol-entrypoint \ + && groupadd --system --gid 1001 autocontrol \ + && useradd --system --uid 1001 --gid autocontrol \ + --home-dir /app --shell /bin/bash autocontrol \ + && chown -R autocontrol:autocontrol /app +USER autocontrol + +ENTRYPOINT ["/usr/local/bin/autocontrol-entrypoint"] +CMD ["rest"] diff --git a/docker/entrypoint-xfce.sh b/docker/entrypoint-xfce.sh new file mode 100644 index 00000000..2b8a77d6 --- /dev/null +++ b/docker/entrypoint-xfce.sh @@ -0,0 +1,72 @@ +#!/bin/sh +# AutoControl XFCE container entrypoint — starts Xvfb + XFCE session + +# x11vnc, then the requested host process. +# +# Modes are identical to the slim entrypoint: rest | remote-host | +# signaling | shell. Any extra args are forwarded. + +set -eu + +GEOMETRY="${XVFB_GEOMETRY:-1280x800x24}" +DISPLAY_NUM="${DISPLAY:-:99}" +VNC_PORT="${AUTOCONTROL_VNC_PORT:-5900}" +VNC_PASSWORD="${AUTOCONTROL_VNC_PASSWORD:-}" + +# 1) Headless X server. +Xvfb "$DISPLAY_NUM" -screen 0 "$GEOMETRY" -nolisten tcp & +XVFB_PID=$! +sleep 0.5 + +# 2) XFCE desktop session, on the same display. +DISPLAY="$DISPLAY_NUM" startxfce4 >/tmp/xfce.log 2>&1 & +XFCE_PID=$! +sleep 1 + +# 3) VNC server so an operator can attach with any viewer. +VNC_ARGS="-display $DISPLAY_NUM -nopw -listen 0.0.0.0 -rfbport $VNC_PORT -forever -shared -quiet" +if [ -n "$VNC_PASSWORD" ]; then + echo "$VNC_PASSWORD" | x11vnc -storepasswd - /tmp/.vncpasswd + VNC_ARGS="-display $DISPLAY_NUM -rfbauth /tmp/.vncpasswd -listen 0.0.0.0 -rfbport $VNC_PORT -forever -shared -quiet" +fi +x11vnc $VNC_ARGS & +VNC_PID=$! 
+
+# PID of the foreground service once started; empty until then so that
+# cleanup() is safe under ``set -u``.
+SERVICE_PID=""
+
+# Stop the service first, then the VNC server, XFCE session and Xvfb.
+# Processes that are already gone are skipped.
+cleanup() {
+    for pid in "$SERVICE_PID" "$VNC_PID" "$XFCE_PID" "$XVFB_PID"; do
+        if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+            kill "$pid" 2>/dev/null || true
+            wait "$pid" 2>/dev/null || true
+        fi
+    done
+}
+# cleanup runs once, from the EXIT trap. INT/TERM only turn the signal
+# into an exit, which in turn fires EXIT.
+trap cleanup EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+# Run the service as a child and wait for it. ``exec`` would replace this
+# shell, and the traps above would then never fire. Waiting on a
+# background child also lets a trapped signal interrupt ``wait`` at once.
+run_service() {
+    "$@" &
+    SERVICE_PID=$!
+    wait "$SERVICE_PID"
+}
+
+MODE="${1:-rest}"
+# Only shift when there is an argument: a failing ``shift`` is a
+# special-builtin error that may abort a non-interactive shell before
+# ``|| true`` is reached.
+if [ "$#" -gt 0 ]; then
+    shift
+fi
+
+case "$MODE" in
+    rest)
+        run_service python -m je_auto_control.utils.rest_api.rest_server \
+            --host 0.0.0.0 --port 9939 "$@"
+        ;;
+    remote-host)
+        # Token and port come from AC_TOKEN / AC_PORT; extra args are not
+        # used in this mode.
+        run_service python -c "import os, time; \
+from je_auto_control.utils.remote_desktop import RemoteDesktopHost; \
+h = RemoteDesktopHost(token=os.environ.get('AC_TOKEN', 'change-me'), \
+    bind='0.0.0.0', port=int(os.environ.get('AC_PORT', '9940'))); \
+h.start(); \
+print('listening on', h.port); \
+[time.sleep(60) for _ in iter(int, 1)]"
+        ;;
+    signaling)
+        run_service python -m je_auto_control.utils.remote_desktop.signaling_server \
+            --host 0.0.0.0 --port 8765 "$@"
+        ;;
+    shell)
+        # Stays in the foreground so bash keeps this script's stdin and
+        # terminal; cleanup still runs from the EXIT trap when bash exits.
+        /bin/bash "$@"
+        ;;
+    *)
+        echo "unknown mode: $MODE (expected rest|remote-host|signaling|shell)" >&2
+        exit 2
+        ;;
+esac
diff --git a/docs/source/Eng/doc/new_features/v2_features_doc.rst b/docs/source/Eng/doc/new_features/v2_features_doc.rst
new file mode 100644
index 00000000..37f4a85d
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v2_features_doc.rst
@@ -0,0 +1,354 @@
+============================
+New Features (2026-05)
+============================
+
+Twenty-three additions covering smarter locators, deeper IDE / ops
+tooling, two new platforms, and a couple of fresh integrations. Every
+feature ships with a headless Python API, an ``AC_*`` executor
+command, an ``ac_*`` MCP tool, and (where it makes sense) a Qt GUI
+tab — same pattern as the rest of the framework.
+
+..
contents:: + :local: + :depth: 2 + + +Locator + selector intelligence +=============================== + +Self-healing locator +-------------------- + +``image_template → VLM fallback`` with a JSON-lines audit log so flaky +locators can be tuned over time:: + + from je_auto_control import self_heal_click + + outcome = self_heal_click( + template_path="submit.png", + description="the green Submit button", + ) + +Executor: ``AC_self_heal_locate / _click / _log_list / _log_clear``. +MCP: ``ac_self_heal_*``. GUI: **Self-Healing** tab. + + +Anchor-based locator +-------------------- + +Find element B by spatial relation to anchor A. Anchor + target can use +different backends — pick the cheapest one that uniquely identifies +each part:: + + from je_auto_control import ( + anchor_locate, image_locator, ocr_locator, + ) + + outcome = anchor_locate( + anchor=ocr_locator("Username"), + target=image_locator("submit_green.png"), + relation="below", + ) + +Relations: ``above``, ``below``, ``left_of``, ``right_of``, ``near``. +Executor: ``AC_anchor_locate / _click``. + + +OCR with structured output +-------------------------- + +Cluster raw OCR matches into rows, tables (sets of rows that share +column alignment), and form-field ``label:value`` pairs:: + + from je_auto_control import ocr_read_structure + result = ocr_read_structure(region=[0, 0, 1280, 800]) + for field in result.fields: + print(field.label, "=", field.value) + +Executor: ``AC_ocr_read_structure``. + + +Smart waits +----------- + +Frame-diff replacements for ``time.sleep``:: + + from je_auto_control import wait_until_screen_stable + wait_until_screen_stable(timeout_s=10.0, stable_for_s=0.5) + +Helpers: ``wait_until_screen_stable``, ``wait_until_pixel_changes``, +``wait_until_region_idle``. Executor: ``AC_wait_screen_stable``, +``AC_wait_pixel_changes``, ``AC_wait_region_idle``. 
+ + +A/B locator framework +--------------------- + +Race N strategies for the same target and recommend the historically +best one:: + + from je_auto_control import ab_locate, ab_best_strategy + + outcome = ab_locate( + target_id="submit_button", + strategies={ + "image": image_locator("submit.png"), + "ocr": ocr_locator("Submit"), + "vlm": vlm_locator("the green Submit button"), + }, + ) + print("historical best:", ab_best_strategy("submit_button")) + +Persistent ledger under ``~/.je_auto_control/ab_locator_stats.json``. +Executor: ``AC_ab_locate / _report / _best_strategy / _clear``. + + +Operations + observability +========================== + +Cost telemetry +-------------- + +Per-call LLM token + USD log with day / model / provider roll-up:: + + from je_auto_control import record_llm_call, summarise_llm_costs + + record_llm_call( + provider="anthropic", model="claude-opus-4-7", + input_tokens=512, output_tokens=128, label="vlm_locate", + ) + summary = summarise_llm_costs() + print(summary.total_usd, summary.by_model) + +Pricing table covers Claude 4.x and OpenAI; override per-call. +Executor: ``AC_costs_record / _summary / _list / _clear``. + + +Trace replay UI +--------------- + +Scrubbable timeline over the existing time-travel recordings — load a +directory containing ``manifest.json`` + ``actions.jsonl`` and step +backwards through frames with the per-step action list alongside. +``TraceReplayController`` ships as a pure-Python class for non-GUI +use; the **Trace Replay** GUI tab is a thin shell on top. + + +Failure → ticket automation +--------------------------- + +Fan a failure report out to Jira / Linear / GitHub Issues when a +scheduled run, trigger, or REST job blows up:: + + from je_auto_control import ( + FailureReport, GitHubBackend, default_failure_hook_manager, + ) + default_failure_hook_manager.register( + GitHubBackend(owner="acme", repo="ops", + token=os.environ["GH_TOKEN"]), + ) + +Executor: ``AC_failure_hook_fire / _list / _clear``. 
+ + +Container CI templates +---------------------- + +* ``.github/workflows/docker.yml`` — builds the image, runs the + headless pytest suite inside it under Xvfb, smoke-tests the REST + entrypoint. +* ``ci_templates/.gitlab-ci.yml`` — equivalent pipeline for GitLab + via Docker-in-Docker. +* ``docker/Dockerfile.xfce`` — XFCE4 desktop + x11vnc variant for + flows that need a real WM. + +See ``docs/source/getting_started/run_in_ci.rst`` for the full guide. + + +Cross-host DAG orchestrator +--------------------------- + +Run a DAG where each node carries ``(host, actions | action_file, +depends_on)``. Local nodes execute in-process; remote nodes go through +the admin-console REST clients. Failures cascade — every downstream +node is reported as ``skipped`` instead of attempted:: + + je_auto_control.run_dag({ + "nodes": [ + {"id": "step1", "host": "local", "actions": [...]}, + {"id": "step2", "host": "machine-a", + "action_file": "x.json", "depends_on": ["step1"]}, + ], + }) + +Executor: ``AC_run_dag``. GUI: **DAG Runner** tab. + + +Multi-viewer presence +--------------------- + +Roster + controller / observer roles for the multi-viewer remote +desktop. Pure-Python ``PresenceRegistry`` ships independently so +input-dispatch gating can be unit-tested without aiortc. + +Executor: ``AC_presence_register / _unregister / _update_cursor / +_set_role / _list / _clear``. GUI: **Viewer Roster** tab. + + +Agent + integrations +==================== + +Computer-use high-level API +--------------------------- + +Wraps :class:`ComputerUseAgentBackend` + :class:`AgentLoop` so a +single call drives Anthropic's official ``computer_20250124`` tool:: + + from je_auto_control import run_computer_use + result = run_computer_use( + "open Calculator, compute 12 * 7, screenshot the result", + max_steps=15, wall_seconds=120.0, + ) + +Auto-detects display size; takes ``max_steps`` + ``wall_seconds`` +budgets so a runaway loop can't drain the API. Executor: +``AC_computer_use``. 
GUI: **Computer Use** tab. + + +WebRunner executor + MCP integration +------------------------------------ + +Brand-new convenience commands on top of the existing +``je_web_runner`` bridge:: + + je_auto_control.web_open("https://example.com") + je_auto_control.web_screenshot("loaded.png") + je_auto_control.web_quit() + +Executor: ``AC_web_open / _quit / _screenshot / _current_url`` +(joining the existing ``AC_web_run``). MCP exposes the same surface +as ``ac_web_*``. GUI: **WebRunner** tab. + + +Chat-ops bot +------------ + +Transport-agnostic ``CommandRouter`` plus a polling Slack adapter so +``/run