diff --git a/README.md b/README.md index 9d39a0e..a6cc9e6 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,10 @@ plugin that calls this binary, while this adapter owns platform-specific backend Dedicated OpenCoven computer-use boundary for OpenClaw: - macOS: shells to [`peekaboo`](https://peekaboo.boo) with `--json --no-remote` -- Linux/Windows: returns a clean unsupported JSON response for now +- Linux (X11): shells to `scrot`/`maim` (capture), `xdotool` (input), `wmctrl` (focus) +- Linux (Wayland): shells to `grim` (capture), `wtype`/`ydotool` (input), `swaymsg` (focus on Sway) +- Windows: returns a clean unsupported JSON response for now +- Session detection uses `XDG_SESSION_TYPE`, with fallback to `WAYLAND_DISPLAY`/`DISPLAY` - No shell interpolation; uses process argv directly - Interactive desktop actions require OpenClaw approval and adapter `--confirm` - Typed text, clipboard text, file-write content, tokens, cookies, and secrets @@ -35,6 +38,63 @@ All commands print a JSON envelope. The 0.1.0 command names remain as aliases: `permissions -> doctor`, `see -> inspect`, `capture -> screenshot`, `type -> type-text`, and `press -> keypress`. +## Linux / Ubuntu onboarding + +Linux desktop-use shells to per-session helper tools instead of a single +bundled backend. Run the doctor first to see what's installed: + +```bash +coven-desktop-use doctor +``` + +The JSON response includes the detected session (`x11` or `wayland`), a tool +inventory (`found` plus the resolved `path` per tool), and a `setupGuide` with +the session's `apt install` line and a `missingTools` list. + +Recommended packages by session: + +```bash +# X11 (default on Ubuntu 22.04 GNOME with "Login on Xorg" selected, +# or any KDE/X session): +sudo apt install scrot xdotool wmctrl + +# Wayland (default on Ubuntu 22.04+ GNOME, Sway, Hyprland, KDE Wayland): +sudo apt install grim wtype ydotool +``` + +### Wayland notes + +- `ydotool` synthesises mouse events through `/dev/uinput`. 
After installing + it, enable the daemon and ensure your user can talk to it: + + ```bash + sudo systemctl enable --now ydotoold + sudo usermod -aG input "$USER" + # log out and back in + ``` + +- `wtype` only works on wlroots-based compositors (Sway, Hyprland, river). + GNOME Mutter and KDE KWin do not accept `wtype` events. The adapter types + with `ydotool` only when `wtype` is not on `PATH`, so skip `wtype` there. +- Window focus on Wayland is compositor-specific. `focus` works on Sway when + `SWAYSOCK` is set and `swaymsg` is on `PATH`. GNOME Mutter has no public + CLI for window activation. +- `scroll` on Wayland degrades to `Page_Up`/`Page_Down` keystrokes via + `wtype` because there is no portable scroll-wheel injector across + Wayland compositors. If you need real wheel events on wlroots compositors, + run `wlrctl` yourself; the adapter only reports it in `doctor`. Every + Wayland `scroll` response includes `degraded: ...` to flag the fallback. + +### What is *not* supported on Linux yet + +- AT-SPI element annotation. `inspect` captures a screenshot but does not + return `B1`/`T2`-style element ids, so `click --on B1` is unavailable. + Use `click --coords x,y --confirm` instead, after a screenshot. +- "Active window" capture on vanilla Wayland (`grim` has no notion of + focused window). On X11, `scrot --focused` and `maim -i $(xdotool + getactivewindow)` both work and are picked automatically when + `--mode window` is requested. 
+ ## macOS onboarding Desktop inspection and interaction require two macOS privacy grants because the diff --git a/src/main.rs b/src/main.rs index baaf538..12668b3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,67 +23,146 @@ fn run(args: Vec) -> String { ]); } - let command = normalize_command(args[0].as_str()); - if !cfg!(target_os = "macos") { - return json_obj(vec![ + let command = match normalize_command(args[0].as_str()) { + Some(c) => c, + None => { + return json_obj(vec![ + ("ok", "false".to_string()), + ( + "error", + json_string(&format!("unknown command: {}", args[0])), + ), + ( + "help", + json_string( + "commands: doctor, inspect, screenshot, click, type-text, keypress, scroll, focus", + ), + ), + ]); + } + }; + + match detect_platform() { + Platform::Macos => run_macos(command, &args[1..]), + Platform::LinuxX11 => run_linux(LinuxSession::X11, command, &args[1..]), + Platform::LinuxWayland => run_linux(LinuxSession::Wayland, command, &args[1..]), + Platform::LinuxNoDisplay => json_obj(vec![ ("ok", "false".to_string()), ("supported", "false".to_string()), ("platform", json_string(env::consts::OS)), - ("backend", json_string("none")), - ("message", json_string("coven-desktop-use currently supports macOS via Peekaboo. 
This platform is unsupported.")), - ]); - } - - match command { - Some("doctor") => run_peekaboo(vec!["permissions".into()], false, true), - Some("inspect") => run_peekaboo(build_inspect_args(&args[1..]), false, false), - Some("screenshot") => run_peekaboo(build_screenshot_args(&args[1..]), false, false), - Some("click" | "type-text" | "keypress" | "scroll" | "focus") - if !has_flag(&args[1..], "--confirm") => - { - confirmation_required(command.unwrap()) - } - Some("click") => run_peekaboo( - build_click_args(&strip_flag(&args[1..], "--confirm")), - false, - false, - ), - Some("type-text") => run_peekaboo( - build_type_args(&strip_flag(&args[1..], "--confirm")), - true, - false, - ), - Some("keypress") => run_peekaboo( - build_press_args(&strip_flag(&args[1..], "--confirm")), - false, - false, - ), - Some("scroll") => run_peekaboo( - build_scroll_args(&strip_flag(&args[1..], "--confirm")), - false, - false, - ), - Some("focus") => run_peekaboo( - build_focus_args(&strip_flag(&args[1..], "--confirm")), - false, - false, - ), - Some(_) | None => json_obj(vec![ + ("backend", json_string("linux")), + ("error", json_string("No display session detected. XDG_SESSION_TYPE, WAYLAND_DISPLAY, and DISPLAY are all unset. coven-desktop-use requires an active graphical session (X11 or Wayland).")), + ("hint", json_string("Connect to the machine with display forwarding (ssh -X), or run in an active graphical session. Set DISPLAY=:0 if a local X11 server is running.")), + ]), + Platform::Other => json_obj(vec![ ("ok", "false".to_string()), + ("supported", "false".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string("none")), ( - "error", - json_string(&format!("unknown command: {}", args[0])), - ), - ( - "help", + "message", json_string( - "commands: doctor, inspect, screenshot, click, type-text, keypress, scroll, focus", + "coven-desktop-use supports macOS (via Peekaboo) and Linux (X11 / Wayland). 
This platform is unsupported.", ), ), ]), } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Platform { + Macos, + LinuxX11, + LinuxWayland, + LinuxNoDisplay, + Other, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum LinuxSession { + X11, + Wayland, +} + +impl LinuxSession { + fn label(self) -> &'static str { + match self { + LinuxSession::X11 => "x11", + LinuxSession::Wayland => "wayland", + } + } +} + +fn detect_platform() -> Platform { + if cfg!(target_os = "macos") { + return Platform::Macos; + } + if cfg!(target_os = "linux") { + return detect_linux_session(); + } + Platform::Other +} + +fn detect_linux_session() -> Platform { + let session = env::var("XDG_SESSION_TYPE") + .unwrap_or_default() + .to_lowercase(); + match session.as_str() { + "wayland" => Platform::LinuxWayland, + "x11" => Platform::LinuxX11, + _ => { + if env::var_os("WAYLAND_DISPLAY").is_some() { + Platform::LinuxWayland + } else if env::var_os("DISPLAY").is_some() { + Platform::LinuxX11 + } else { + // No display variables set (headless / SSH session). + // Return LinuxNoDisplay so all commands report a clean error. 
+ Platform::LinuxNoDisplay + } + } + } +} + +fn run_macos(command: &'static str, args: &[String]) -> String { + match command { + "doctor" => run_peekaboo(vec!["permissions".into()], false, true), + "inspect" => run_peekaboo(build_inspect_args(args), false, false), + "screenshot" => run_peekaboo(build_screenshot_args(args), false, false), + "click" | "type-text" | "keypress" | "scroll" | "focus" + if !has_flag(args, "--confirm") => + { + confirmation_required(command) + } + "click" => run_peekaboo(build_click_args(&strip_flag(args, "--confirm")), false, false), + "type-text" => run_peekaboo(build_type_args(&strip_flag(args, "--confirm")), true, false), + "keypress" => run_peekaboo(build_press_args(&strip_flag(args, "--confirm")), false, false), + "scroll" => run_peekaboo(build_scroll_args(&strip_flag(args, "--confirm")), false, false), + "focus" => run_peekaboo(build_focus_args(&strip_flag(args, "--confirm")), false, false), + _ => unreachable!("normalize_command returned {} but match did not handle it", command), + } +} + +fn run_linux(session: LinuxSession, command: &'static str, args: &[String]) -> String { + if matches!(command, "click" | "type-text" | "keypress" | "scroll" | "focus") + && !has_flag(args, "--confirm") + { + return confirmation_required(command); + } + let stripped: Vec = strip_flag(args, "--confirm"); + match command { + "doctor" => linux_doctor(session), + "inspect" => linux_inspect(session, args), + "screenshot" => linux_screenshot(session, args), + "click" => linux_click(session, &stripped), + "type-text" => linux_type_text(session, &stripped), + "keypress" => linux_keypress(session, &stripped), + "scroll" => linux_scroll(session, &stripped), + "focus" => linux_focus(session, &stripped), + _ => unreachable!("normalize_command returned {} but match did not handle it", command), + } +} + fn normalize_command(command: &str) -> Option<&'static str> { match command { "doctor" | "permissions" => Some("doctor"), @@ -407,13 +486,909 @@ fn 
is_executable_file(path: &Path) -> bool { path.is_file() } +// ============================================================================ +// Linux backend (X11 + Wayland) +// ---------------------------------------------------------------------------- +// The Linux path mirrors the macOS Peekaboo dispatch but shells out to small +// per-session tools instead of a single bundled backend: +// +// X11: scrot/maim (capture) + xdotool (input) + wmctrl (focus) +// Wayland: grim (capture) + wtype/ydotool (input) +// +// Element-id targeting (`--on B1`) is intentionally not supported in this v1 — +// it requires AT-SPI integration. Linux callers must use `--coords x,y` or, for +// `focus`, `--app` / `--window-title`. `doctor` reports which tools are +// installed and which apt packages cover the gaps. +// ============================================================================ + +fn linux_doctor(session: LinuxSession) -> String { + let tools = linux_tool_inventory(session); + let ok = linux_minimum_tools_present(session, &tools); + json_obj(vec![ + ("ok", bool_json(ok)), + ("supported", "true".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string("linux")), + ("session", json_string(session.label())), + ("tools", linux_tool_inventory_json(&tools)), + ("setupGuide", linux_setup_guide_json(session, &tools)), + ]) +} + +#[derive(Debug)] +struct LinuxToolStatus { + name: &'static str, + found: bool, + path: Option, +} + +fn linux_tool_inventory(session: LinuxSession) -> Vec { + let names: &[&'static str] = match session { + LinuxSession::X11 => &["scrot", "maim", "xdotool", "wmctrl", "xprop"], + LinuxSession::Wayland => &["grim", "wtype", "ydotool", "wlrctl", "swaymsg"], + }; + names + .iter() + .map(|name| { + let path = resolve_path_binary(name); + LinuxToolStatus { + name, + found: path.is_some(), + path, + } + }) + .collect() +} + +fn linux_minimum_tools_present(session: LinuxSession, tools: &[LinuxToolStatus]) -> bool { + let has = 
|name: &str| tools.iter().any(|t| t.name == name && t.found); + match session { + LinuxSession::X11 => (has("scrot") || has("maim")) && has("xdotool"), + LinuxSession::Wayland => has("grim") && has("ydotool"), + } +} + +fn linux_tool_inventory_json(tools: &[LinuxToolStatus]) -> String { + let body = tools + .iter() + .map(|t| { + let mut item = vec![("found", bool_json(t.found))]; + if let Some(p) = &t.path { + item.push(("path", json_string(p))); + } + format!("{}:{}", json_string(t.name), json_obj(item)) + }) + .collect::>() + .join(","); + format!("{{{}}}", body) +} + +fn required_tools(session: LinuxSession) -> &'static [&'static str] { + match session { + LinuxSession::X11 => &["scrot", "xdotool"], // maim is alt for scrot; wmctrl optional + LinuxSession::Wayland => &["grim", "ydotool"], // wtype optional fallback + } +} + +fn linux_setup_guide_json(session: LinuxSession, tools: &[LinuxToolStatus]) -> String { + let missing: Vec = tools + .iter() + .filter(|t| !t.found && required_tools(session).contains(&t.name)) + .map(|t| t.name.to_string()) + .collect(); + let install_command = match session { + LinuxSession::X11 => "sudo apt install scrot xdotool wmctrl", + LinuxSession::Wayland => "sudo apt install grim wtype ydotool", + }; + let summary = match session { + LinuxSession::X11 => { + "X11 desktop-use uses scrot (or maim) for screen capture, xdotool for input synthesis, and wmctrl for window focus." + } + LinuxSession::Wayland => { + "Wayland desktop-use uses grim for screen capture and wtype/ydotool for input. ydotool needs the ydotoold daemon and uinput permissions; wtype requires a wlroots-based compositor (Sway, Hyprland)." 
+ } + }; + let mut fields = vec![ + ("session", json_string(session.label())), + ("summary", json_string(summary)), + ("installCommand", json_string(install_command)), + ("missingTools", json_array_strings(&missing)), + ]; + if matches!(session, LinuxSession::Wayland) { + fields.push(( + "ydotoolNote", + json_string( + "ydotool requires the ydotoold systemd service running and your user in the 'input' group, or a uinput udev rule. See https://github.com/ReimuNotMoe/ydotool#installation", + ), + )); + fields.push(( + "focusNote", + json_string( + "Window focus on Wayland is compositor-specific. Sway/wlroots use swaymsg; GNOME Mutter has no public CLI for window activation.", + ), + )); + fields.push(( + "scrollNote", + json_string( + "Wayland scroll uses Page_Up/Page_Down via wtype as a fallback because true scroll-wheel emulation requires compositor-specific support not available through a portable CLI.", + ), + )); + } else { + fields.push(( + "elementIdNote", + json_string( + "Linux v1 does not implement AT-SPI annotation; element-id targeting (`--on B1`) is not yet available. Use `--coords x,y` or `--app`/`--window-title`.", + ), + )); + } + fields.push(( + "verificationCommand", + json_string("coven-desktop-use doctor"), + )); + json_obj(fields) +} + +fn linux_inspect(session: LinuxSession, args: &[String]) -> String { + let path = output_path(value(args, "--path"), "inspect", "png"); + let mode = value(args, "--mode").unwrap_or_else(|| "screen".to_string()); + let extras: Vec<(&'static str, String)> = vec![ + ("purpose", json_string("inspect")), + ("path", json_string(&path)), + ("elementsAvailable", "false".to_string()), + ( + "note", + json_string( + "Linux inspect captures a screenshot but does not yet emit AT-SPI element ids. 
Use `--coords x,y` for click targeting.", + ), + ), + ]; + capture_screenshot(session, &path, &mode, &extras) +} + +fn linux_screenshot(session: LinuxSession, args: &[String]) -> String { + let format = value(args, "--format").unwrap_or_else(|| "png".to_string()); + let path = output_path(value(args, "--path"), "screenshot", &format); + let mode = value(args, "--mode").unwrap_or_else(|| "screen".to_string()); + let extras: Vec<(&'static str, String)> = vec![ + ("purpose", json_string("screenshot")), + ("path", json_string(&path)), + ]; + capture_screenshot(session, &path, &mode, &extras) +} + +fn capture_screenshot( + session: LinuxSession, + path: &str, + mode: &str, + extras: &[(&'static str, String)], +) -> String { + match session { + LinuxSession::X11 => x11_screenshot(path, mode, extras), + LinuxSession::Wayland => wayland_screenshot(path, mode, extras), + } +} + +fn x11_screenshot(path: &str, mode: &str, extras: &[(&'static str, String)]) -> String { + if resolve_path_binary("scrot").is_some() { + let mut a: Vec = vec!["--overwrite".into()]; + if matches!(mode, "window" | "frontmost") { + a.push("--focused".into()); + } + a.push(path.into()); + return run_linux_command("scrot", a, "scrot", false, extras); + } + if resolve_path_binary("maim").is_some() { + // maim has no built-in active-window selector; require xdotool for that. 
+ if matches!(mode, "window" | "frontmost") && resolve_path_binary("xdotool").is_some() { + let active = Command::new("xdotool") + .args(["getactivewindow"]) + .output() + .ok() + .and_then(|o| { + if o.status.success() { + Some(String::from_utf8_lossy(&o.stdout).trim().to_string()) + } else { + None + } + }); + if let Some(id) = active { + let a: Vec = vec!["-i".into(), id.into(), path.into()]; + return run_linux_command("maim", a, "maim", false, extras); + } + } + let a: Vec = vec![path.into()]; + return run_linux_command("maim", a, "maim", false, extras); + } + missing_tool_json( + "scrot or maim", + "Install with: sudo apt install scrot", + "x11-screenshot", + ) +} + +fn wayland_screenshot(path: &str, mode: &str, extras: &[(&'static str, String)]) -> String { + if resolve_path_binary("grim").is_none() { + return missing_tool_json( + "grim", + "Install with: sudo apt install grim", + "wayland-screenshot", + ); + } + // `mode=window`/`frontmost` is degraded on vanilla wlroots: grim has no + // notion of "active window". Sway users can pre-resolve via `swaymsg -t + // get_tree` but that's out of scope for v1; we just take the full screen. + let _ = mode; + let a: Vec = vec![path.into()]; + run_linux_command("grim", a, "grim", false, extras) +} + +fn linux_click(session: LinuxSession, args: &[String]) -> String { + let coords = match value(args, "--coords") { + Some(c) => c, + None => { + return linux_error_json( + "Linux click requires --coords x,y. Element-id targeting (`--on B1`) needs AT-SPI which is not yet implemented; query/window-title fallbacks are not available either.", + Some("Take a screenshot, identify pixel coordinates, then call click with --coords x,y --confirm."), + ); + } + }; + let (x, y) = match parse_coords(&coords) { + Some(c) => c, + None => { + return linux_error_json( + "--coords must be in x,y form, e.g. 
120,240.", + None, + ); + } + }; + let button = if has_flag(args, "--right") { 3u32 } else { 1u32 }; + let times = if has_flag(args, "--double") { 2u32 } else { 1u32 }; + match session { + LinuxSession::X11 => x11_click(x, y, button, times), + LinuxSession::Wayland => wayland_click(x, y, button, times), + } +} + +fn x11_click(x: i32, y: i32, button: u32, times: u32) -> String { + if resolve_path_binary("xdotool").is_none() { + return missing_tool_json( + "xdotool", + "Install with: sudo apt install xdotool", + "x11-click", + ); + } + let cmd_args: Vec = vec![ + "mousemove".into(), + "--sync".into(), + x.to_string().into(), + y.to_string().into(), + "click".into(), + "--repeat".into(), + times.to_string().into(), + button.to_string().into(), + ]; + run_linux_command("xdotool", cmd_args, "xdotool", false, &[]) +} + +fn wayland_click(x: i32, y: i32, button: u32, times: u32) -> String { + if resolve_path_binary("ydotool").is_none() { + return missing_tool_json( + "ydotool", + "Install with: sudo apt install ydotool, then enable ydotoold (see setup guide).", + "wayland-click", + ); + } + // ydotool button codes for press+release: left=0xC0, right=0xC1, middle=0xC2. 
+ let click_code = match button { + 3 => "0xC1", + 2 => "0xC2", + _ => "0xC0", + }; + let mut steps: Vec = vec![LinuxStep { + program: "ydotool", + args: vec![ + "mousemove".into(), + "--absolute".into(), + "-x".into(), + x.to_string().into(), + "-y".into(), + y.to_string().into(), + ], + }]; + for _ in 0..times { + steps.push(LinuxStep { + program: "ydotool", + args: vec!["click".into(), click_code.into()], + }); + } + run_linux_steps(steps, "ydotool", false, &[]) +} + +fn linux_type_text(session: LinuxSession, args: &[String]) -> String { + let text = value(args, "--text").unwrap_or_default(); + let clear = has_flag(args, "--clear"); + let press_return = has_flag(args, "--return"); + match session { + LinuxSession::X11 => x11_type_text(&text, clear, press_return), + LinuxSession::Wayland => wayland_type_text(&text, clear, press_return), + } +} + +fn x11_type_text(text: &str, clear: bool, press_return: bool) -> String { + if resolve_path_binary("xdotool").is_none() { + return missing_tool_json( + "xdotool", + "Install with: sudo apt install xdotool", + "x11-type", + ); + } + let mut steps: Vec = Vec::new(); + if clear { + steps.push(LinuxStep { + program: "xdotool", + args: vec![ + "key".into(), + "--clearmodifiers".into(), + "ctrl+a".into(), + ], + }); + steps.push(LinuxStep { + program: "xdotool", + args: vec!["key".into(), "--clearmodifiers".into(), "Delete".into()], + }); + } + steps.push(LinuxStep { + program: "xdotool", + args: vec![ + "type".into(), + "--clearmodifiers".into(), + "--delay".into(), + "0".into(), + "--".into(), + text.into(), + ], + }); + if press_return { + steps.push(LinuxStep { + program: "xdotool", + args: vec!["key".into(), "Return".into()], + }); + } + run_linux_steps(steps, "xdotool", true, &[]) +} + +fn wayland_type_text(text: &str, clear: bool, press_return: bool) -> String { + let use_wtype = resolve_path_binary("wtype").is_some(); + let use_ydotool = resolve_path_binary("ydotool").is_some(); + if !use_wtype && !use_ydotool { + 
return missing_tool_json( + "wtype or ydotool", + "Install with: sudo apt install wtype (wlroots compositors) or sudo apt install ydotool (any compositor; needs ydotoold).", + "wayland-type", + ); + } + let mut steps: Vec = Vec::new(); + if use_wtype { + if clear { + steps.push(LinuxStep { + program: "wtype", + args: vec![ + "-M".into(), + "ctrl".into(), + "a".into(), + "-m".into(), + "ctrl".into(), + ], + }); + steps.push(LinuxStep { + program: "wtype", + args: vec!["-k".into(), "Delete".into()], + }); + } + steps.push(LinuxStep { + program: "wtype", + args: vec!["--".into(), text.into()], + }); + if press_return { + steps.push(LinuxStep { + program: "wtype", + args: vec!["-k".into(), "Return".into()], + }); + } + return run_linux_steps(steps, "wtype", true, &[]); + } + // ydotool fallback. ydotool's `type` command takes the literal string. + if clear { + // Ctrl+A : keycode 29 (LCTRL) + 30 (A); Delete: keycode 111. + steps.push(LinuxStep { + program: "ydotool", + args: vec![ + "key".into(), + "29:1".into(), + "30:1".into(), + "30:0".into(), + "29:0".into(), + ], + }); + steps.push(LinuxStep { + program: "ydotool", + args: vec!["key".into(), "111:1".into(), "111:0".into()], + }); + } + steps.push(LinuxStep { + program: "ydotool", + args: vec!["type".into(), text.into()], + }); + if press_return { + // Return: keycode 28. 
+ steps.push(LinuxStep { + program: "ydotool", + args: vec!["key".into(), "28:1".into(), "28:0".into()], + }); + } + run_linux_steps(steps, "ydotool", true, &[]) +} + +fn linux_keypress(session: LinuxSession, args: &[String]) -> String { + let raw = match value(args, "--keys") { + Some(s) => s, + None => return linux_error_json("--keys is required for keypress.", None), + }; + let keys: Vec<&str> = raw.split(',').filter(|s| !s.is_empty()).collect(); + if keys.is_empty() { + return linux_error_json("--keys must contain at least one key.", None); + } + match session { + LinuxSession::X11 => x11_keypress(&keys), + LinuxSession::Wayland => wayland_keypress(&keys), + } +} + +fn x11_keypress(keys: &[&str]) -> String { + if resolve_path_binary("xdotool").is_none() { + return missing_tool_json( + "xdotool", + "Install with: sudo apt install xdotool", + "x11-keypress", + ); + } + let mut steps: Vec = Vec::new(); + for key in keys { + steps.push(LinuxStep { + program: "xdotool", + args: vec![ + "key".into(), + "--clearmodifiers".into(), + map_key_xdotool(key).into(), + ], + }); + } + run_linux_steps(steps, "xdotool", false, &[]) +} + +fn wayland_keypress(keys: &[&str]) -> String { + if resolve_path_binary("wtype").is_none() { + return missing_tool_json( + "wtype", + "Install with: sudo apt install wtype (wlroots compositors). 
For non-wlroots, use ydotool with raw keycodes.", + "wayland-keypress", + ); + } + let mut steps: Vec = Vec::new(); + for key in keys { + steps.push(LinuxStep { + program: "wtype", + args: vec!["-k".into(), map_key_wtype(key).into()], + }); + } + run_linux_steps(steps, "wtype", false, &[]) +} + +fn linux_scroll(session: LinuxSession, args: &[String]) -> String { + let direction = value(args, "--direction").unwrap_or_else(|| "down".to_string()); + let amount: u32 = value(args, "--amount") + .as_deref() + .and_then(|s| s.parse().ok()) + .unwrap_or(3); + match session { + LinuxSession::X11 => x11_scroll(&direction, amount), + LinuxSession::Wayland => wayland_scroll(&direction, amount), + } +} + +fn x11_scroll(direction: &str, amount: u32) -> String { + if resolve_path_binary("xdotool").is_none() { + return missing_tool_json( + "xdotool", + "Install with: sudo apt install xdotool", + "x11-scroll", + ); + } + let button = match direction { + "up" => "4", + "down" => "5", + "left" => "6", + "right" => "7", + _ => "5", + }; + let cmd_args: Vec = vec![ + "click".into(), + "--repeat".into(), + amount.to_string().into(), + button.into(), + ]; + run_linux_command("xdotool", cmd_args, "xdotool", false, &[]) +} + +fn wayland_scroll(direction: &str, amount: u32) -> String { + // True scroll-wheel emulation needs wlrctl or compositor support; fall back + // to Page_Up/Page_Down via wtype as a degraded behavior. Make the trade-off + // explicit in the response. 
+ if resolve_path_binary("wtype").is_none() { + return missing_tool_json( + "wtype", + "Install wtype for Wayland scroll support.", + "wayland-scroll", + ); + } + let key = match direction { + "up" => "Page_Up", + "down" => "Page_Down", + "left" => "Left", + "right" => "Right", + _ => "Page_Down", + }; + let mut steps: Vec = Vec::new(); + for _ in 0..amount { + steps.push(LinuxStep { + program: "wtype", + args: vec!["-k".into(), key.into()], + }); + } + run_linux_steps( + steps, + "wtype", + false, + &[( + "degraded", + json_string( + "Wayland scroll uses Page_Up/Page_Down via wtype as a fallback because true scroll-wheel emulation requires compositor-specific support not available through a portable CLI.", + ), + )], + ) +} + +fn linux_focus(session: LinuxSession, args: &[String]) -> String { + let app = value(args, "--app"); + let title = value(args, "--window-title"); + let window_id = value(args, "--window-id"); + + if window_id.is_none() && app.is_none() && title.is_none() { + return linux_error_json( + "focus requires --app, --window-title, or --window-id on Linux.", + None, + ); + } + + match session { + LinuxSession::X11 => x11_focus(app.as_deref(), title.as_deref(), window_id.as_deref()), + LinuxSession::Wayland => wayland_focus(app.as_deref(), title.as_deref(), window_id.as_deref()), + } +} + +fn x11_focus(app: Option<&str>, title: Option<&str>, window_id: Option<&str>) -> String { + if let Some(id) = window_id { + if resolve_path_binary("wmctrl").is_some() { + let cmd_args: Vec = vec!["-i".into(), "-a".into(), id.into()]; + return run_linux_command("wmctrl", cmd_args, "wmctrl", false, &[]); + } + if resolve_path_binary("xdotool").is_some() { + let cmd_args: Vec = vec!["windowactivate".into(), id.into()]; + return run_linux_command("xdotool", cmd_args, "xdotool", false, &[]); + } + return missing_tool_json( + "wmctrl or xdotool", + "Install with: sudo apt install wmctrl", + "x11-focus", + ); + } + if let Some(a) = app { + if 
resolve_path_binary("wmctrl").is_some() { + let cmd_args: Vec = vec!["-x".into(), "-a".into(), a.into()]; + return run_linux_command("wmctrl", cmd_args, "wmctrl", false, &[]); + } + if resolve_path_binary("xdotool").is_some() { + let cmd_args: Vec = + vec!["search".into(), "--class".into(), a.into(), "windowactivate".into()]; + return run_linux_command("xdotool", cmd_args, "xdotool", false, &[]); + } + return missing_tool_json( + "wmctrl or xdotool", + "Install with: sudo apt install wmctrl", + "x11-focus", + ); + } + if let Some(t) = title { + if resolve_path_binary("wmctrl").is_some() { + let cmd_args: Vec = vec!["-a".into(), t.into()]; + return run_linux_command("wmctrl", cmd_args, "wmctrl", false, &[]); + } + if resolve_path_binary("xdotool").is_some() { + let cmd_args: Vec = + vec!["search".into(), "--name".into(), t.into(), "windowactivate".into()]; + return run_linux_command("xdotool", cmd_args, "xdotool", false, &[]); + } + return missing_tool_json( + "wmctrl or xdotool", + "Install with: sudo apt install wmctrl", + "x11-focus", + ); + } + linux_error_json("focus requires --app, --window-title, or --window-id on Linux.", None) +} + +fn wayland_focus(app: Option<&str>, title: Option<&str>, window_id: Option<&str>) -> String { + if env::var_os("SWAYSOCK").is_some() && resolve_path_binary("swaymsg").is_some() { + let selector = if let Some(id) = window_id { + format!("[con_id={}] focus", id) + } else if let Some(a) = app { + let escaped = a.replace('"', "\\\""); + format!("[app_id=\"{}\"] focus", escaped) + } else if let Some(t) = title { + let escaped = t.replace('"', "\\\""); + format!("[title=\"{}\"] focus", escaped) + } else { + return linux_error_json("focus requires --app, --window-title, or --window-id on Linux.", None); + }; + let cmd_args: Vec = vec![selector.into()]; + return run_linux_command("swaymsg", cmd_args, "swaymsg", false, &[]); + } + if window_id.is_some() { + return linux_error_json( + "Wayland --window-id focus requires SWAYSOCK and 
swaymsg (Sway compositor).", + Some("Connect via Sway or use an X11 session."), + ); + } + linux_error_json( + "Wayland focus is compositor-specific. SWAYSOCK is not set and swaymsg is not available; no portable CLI exists for window focus on GNOME Mutter or KDE KWin.", + Some("Switch to an X11 session, run a wlroots compositor (Sway/Hyprland), or use the compositor's own IPC."), + ) +} + +fn parse_coords(s: &str) -> Option<(i32, i32)> { + let mut parts = s.split(','); + let x = parts.next()?.trim().parse::().ok()?; + let y = parts.next()?.trim().parse::().ok()?; + if parts.next().is_some() { + return None; + } + Some((x, y)) +} + +fn map_key_xdotool(key: &str) -> String { + let trimmed = key.trim(); + match trimmed.to_lowercase().as_str() { + "return" | "enter" => "Return".into(), + "tab" => "Tab".into(), + "escape" | "esc" => "Escape".into(), + "space" | "spacebar" => "space".into(), + "up" => "Up".into(), + "down" => "Down".into(), + "left" => "Left".into(), + "right" => "Right".into(), + "backspace" => "BackSpace".into(), + "delete" | "del" => "Delete".into(), + "home" => "Home".into(), + "end" => "End".into(), + "pageup" | "page_up" | "pgup" => "Page_Up".into(), + "pagedown" | "page_down" | "pgdn" => "Page_Down".into(), + // Allow XKeysym names through unchanged (e.g. "ctrl+c", "F5", "shift+Tab"). + _ => trimmed.to_string(), + } +} + +fn map_key_wtype(key: &str) -> String { + // wtype shares XKB key names with xdotool for the common cases. 
+ let trimmed = key.trim(); + match trimmed.to_lowercase().as_str() { + "return" | "enter" => "Return".into(), + "tab" => "Tab".into(), + "escape" | "esc" => "Escape".into(), + "space" | "spacebar" => "space".into(), + "up" => "Up".into(), + "down" => "Down".into(), + "left" => "Left".into(), + "right" => "Right".into(), + "backspace" => "BackSpace".into(), + "delete" | "del" => "Delete".into(), + "home" => "Home".into(), + "end" => "End".into(), + "pageup" | "page_up" | "pgup" => "Page_Up".into(), + "pagedown" | "page_down" | "pgdn" => "Page_Down".into(), + _ => trimmed.to_string(), + } +} + +#[derive(Debug)] +struct LinuxStep { + program: &'static str, + args: Vec, +} + +fn run_linux_command( + program: &str, + args: Vec, + backend_label: &str, + redact_type_text: bool, + extra_fields: &[(&'static str, String)], +) -> String { + let mut cmd = Command::new(program); + cmd.args(&args); + match cmd.output() { + Ok(output) => { + let ok = output.status.success(); + let code = output.status.code().unwrap_or(-1); + let stdout = if redact_type_text { + "".to_string() + } else { + String::from_utf8_lossy(&output.stdout).to_string() + }; + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let mut fields = vec![ + ("ok", bool_json(ok)), + ("supported", "true".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string(backend_label)), + ("exitCode", code.to_string()), + ( + "command", + json_array_program_args(program, &redact_args(&args, redact_type_text)), + ), + ("stdout", json_string(&stdout)), + ("stdoutRedacted", bool_json(redact_type_text)), + ("stderr", json_string(&stderr)), + ]; + for (k, v) in extra_fields { + fields.push((k, v.clone())); + } + json_obj(fields) + } + Err(err) => json_obj(vec![ + ("ok", "false".to_string()), + ("supported", "true".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string(backend_label)), + ("error", json_string(&err.to_string())), + ( + "hint", + 
json_string(&format!( + "Failed to invoke `{}`. Run `coven-desktop-use doctor` for installation guidance.", + program + )), + ), + ]), + } +} + +fn run_linux_steps( + steps: Vec, + backend_label: &str, + redact_type_text: bool, + extra_fields: &[(&'static str, String)], +) -> String { + let mut step_jsons: Vec = Vec::with_capacity(steps.len()); + let mut overall_ok = true; + let mut last_exit: i32 = 0; + for step in &steps { + let mut cmd = Command::new(step.program); + cmd.args(&step.args); + let entry = match cmd.output() { + Ok(output) => { + let ok = output.status.success(); + if !ok { + overall_ok = false; + } + let code = output.status.code().unwrap_or(-1); + last_exit = code; + let stdout = if redact_type_text { + "".to_string() + } else { + String::from_utf8_lossy(&output.stdout).to_string() + }; + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + json_obj(vec![ + ("program", json_string(step.program)), + ( + "args", + json_array_strings(&redact_args(&step.args, redact_type_text)), + ), + ("exitCode", code.to_string()), + ("ok", bool_json(ok)), + ("stdout", json_string(&stdout)), + ("stderr", json_string(&stderr)), + ]) + } + Err(err) => { + overall_ok = false; + json_obj(vec![ + ("program", json_string(step.program)), + ("ok", "false".to_string()), + ("error", json_string(&err.to_string())), + ]) + } + }; + step_jsons.push(entry); + } + let mut fields = vec![ + ("ok", bool_json(overall_ok)), + ("supported", "true".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string(backend_label)), + ("exitCode", last_exit.to_string()), + ("steps", format!("[{}]", step_jsons.join(","))), + ("stdoutRedacted", bool_json(redact_type_text)), + ]; + for (k, v) in extra_fields { + fields.push((k, v.clone())); + } + json_obj(fields) +} + +fn missing_tool_json(tool_label: &str, install_hint: &str, backend_label: &str) -> String { + json_obj(vec![ + ("ok", "false".to_string()), + ("supported", "true".to_string()), + ("platform", 
json_string(env::consts::OS)), + ("backend", json_string(backend_label)), + ( + "error", + json_string(&format!("Missing required tool: {}.", tool_label)), + ), + ("hint", json_string(install_hint)), + ( + "verificationCommand", + json_string("coven-desktop-use doctor"), + ), + ]) +} + +fn linux_error_json(message: &str, hint: Option<&str>) -> String { + let mut fields = vec![ + ("ok", "false".to_string()), + ("supported", "true".to_string()), + ("platform", json_string(env::consts::OS)), + ("backend", json_string("linux")), + ("error", json_string(message)), + ]; + if let Some(h) = hint { + fields.push(("hint", json_string(h))); + } + json_obj(fields) +} + +fn json_array_program_args(program: &str, args: &[String]) -> String { + let mut all: Vec = Vec::with_capacity(args.len() + 1); + all.push(program.to_string()); + all.extend_from_slice(args); + json_array_strings(&all) +} + fn redact_args(args: &[OsString], redact_type_text: bool) -> Vec { let mut items: Vec = args .iter() .map(|a| a.to_string_lossy().to_string()) .collect(); if redact_type_text { - if let Some(pos) = items.iter().position(|v| v == "type") { + // Linux backends use `--` to separate flags from positional text + // (xdotool type, wtype). When present, redact everything after it. + if let Some(sep_pos) = items.iter().position(|v| v == "--") { + for item in items.iter_mut().skip(sep_pos + 1) { + let len = item.len(); + *item = format!("<{}-byte text>", len); + } + } else if let Some(pos) = items.iter().position(|v| v == "type") { + // Peekaboo (`type TEXT`) and ydotool (`type TEXT`): redact the + // arg right after `type`. 
// ------------------------------------------------------------------
// Linux backend tests
// ------------------------------------------------------------------

/// `parse_coords` accepts trimmed signed `x,y` pairs and rejects anything else.
#[test]
fn parse_coords_accepts_x_y_form_and_rejects_garbage() {
    assert_eq!(parse_coords("120,240"), Some((120, 240)));
    assert_eq!(parse_coords(" 0 , 0 "), Some((0, 0)));
    assert_eq!(parse_coords("-3,-7"), Some((-3, -7)));
    assert_eq!(parse_coords("120"), None);
    assert_eq!(parse_coords("a,b"), None);
    assert_eq!(parse_coords("1,2,3"), None);
}

/// Friendly key aliases map to canonical names; everything else passes through.
#[test]
fn key_mapping_translates_common_names() {
    assert_eq!(map_key_xdotool("return"), "Return");
    assert_eq!(map_key_xdotool("ENTER"), "Return");
    assert_eq!(map_key_xdotool("esc"), "Escape");
    assert_eq!(map_key_xdotool("pageup"), "Page_Up");
    // Unknown names pass through preserving original case (xdotool accepts
    // XKeysym names directly).
    assert_eq!(map_key_xdotool("F5"), "F5");
    assert_eq!(map_key_xdotool("ctrl+c"), "ctrl+c");

    assert_eq!(map_key_wtype("return"), "Return");
    assert_eq!(map_key_wtype("pgdn"), "Page_Down");
}

/// Everything after a `--` separator is treated as typed text and redacted.
#[test]
fn redact_args_handles_dash_dash_separator() {
    // xdotool: `type --clearmodifiers --delay 0 -- secret`.
    let args = vec![
        OsString::from("type"),
        OsString::from("--clearmodifiers"),
        OsString::from("--delay"),
        OsString::from("0"),
        OsString::from("--"),
        OsString::from("hunter2"),
    ];
    let redacted = redact_args(&args, true);
    assert_eq!(redacted.last().unwrap(), "<7-byte text>");

    // wtype: `-- TEXT`.
    let args = vec![OsString::from("--"), OsString::from("hi there")];
    let redacted = redact_args(&args, true);
    assert_eq!(redacted.last().unwrap(), "<8-byte text>");
}

/// Without a `--` separator, the argument right after `type` is redacted.
#[test]
fn redact_args_legacy_type_form_still_redacts() {
    // ydotool / Peekaboo: `type TEXT`.
    let args = vec![OsString::from("type"), OsString::from("hello")];
    let redacted = redact_args(&args, true);
    assert_eq!(redacted, vec!["type".to_string(), "<5-byte text>".to_string()]);
}

/// The doctor envelope names the session and the matching `apt install` line.
#[test]
fn linux_doctor_envelope_advertises_session_and_install_command() {
    let x11 = linux_doctor(LinuxSession::X11);
    assert!(x11.contains("\"session\":\"x11\""));
    assert!(x11.contains("apt install scrot xdotool wmctrl"));
    assert!(x11.contains("\"backend\":\"linux\""));

    let wl = linux_doctor(LinuxSession::Wayland);
    assert!(wl.contains("\"session\":\"wayland\""));
    assert!(wl.contains("apt install grim wtype ydotool"));
    assert!(wl.contains("ydotoolNote"));
    assert!(wl.contains("focusNote"));
}

/// `click` with no arguments fails with guidance pointing at `--coords`.
#[test]
fn linux_click_without_coords_returns_actionable_error() {
    // NOTE(review): the element type is not visible in this chunk; it is
    // left to inference from `linux_click`'s signature.
    let stripped: Vec<_> = vec![];
    let result = linux_click(LinuxSession::X11, &stripped);
    assert!(result.contains("\"ok\":false"));
    assert!(result.contains("--coords"));
    assert!(result.contains("AT-SPI"));
}

/// `keypress` with no arguments fails and mentions `--keys`.
#[test]
fn linux_keypress_without_keys_errors() {
    // NOTE(review): element type left to inference; see the click test.
    let stripped: Vec<_> = vec![];
    let result = linux_keypress(LinuxSession::X11, &stripped);
    assert!(result.contains("\"ok\":false"));
    assert!(result.contains("--keys"));
}

/// `focus` with no target fails and lists every accepted target flag.
#[test]
fn linux_focus_without_target_errors() {
    // NOTE(review): element type left to inference; see the click test.
    let stripped: Vec<_> = vec![];
    let result = linux_focus(LinuxSession::X11, &stripped);
    assert!(result.contains("\"ok\":false"));
    assert!(result.contains("--app"));
    assert!(result.contains("--window-title"));
    assert!(result.contains("--window-id"));
}
/**
 * Determines the Linux session type reported by an adapter result.
 *
 * Prefers a non-empty string `session` field; otherwise falls back to the
 * prefix of the `backend` field (`wayland…` or `x11…`).
 *
 * @returns `"wayland"`, `"x11"`, the raw `session` string, or `undefined`
 *   when neither field identifies a session.
 */
function inferLinuxSession(result: unknown): string | undefined {
  const session = readStringField(result, "session");
  if (session) return session;
  const backend = readStringField(result, "backend") ?? "";
  if (backend.startsWith("wayland")) return "wayland";
  if (backend.startsWith("x11")) return "x11";
  return undefined;
}

/**
 * Builds the onboarding/permission guidance object for a Linux adapter result.
 *
 * Guidance is produced for every `doctor` action, and for other actions only
 * when the result reports a missing tool or carries a `setupGuide`.
 *
 * @param result - Parsed adapter output (shape is not trusted).
 * @param action - The desktop_use action that produced `result`.
 * @returns The guidance object, or `undefined` when none applies.
 */
function linuxPermissionFlow(result: unknown, action: string): unknown | undefined {
  const setupGuide = readObjectField(result, "setupGuide");
  const tools = readObjectField(result, "tools");
  const session = inferLinuxSession(result);
  const errorText = readStringField(result, "error") ?? "";
  const missingTool = errorText.startsWith("Missing required tool");

  if (action !== "doctor" && !missingTool && !setupGuide) {
    return undefined;
  }

  return {
    platform: "linux",
    session: session ?? null,
    summary:
      readStringField(setupGuide, "summary") ??
      "Linux desktop-use requires per-session helper tools (scrot/xdotool on X11, grim/wtype/ydotool on Wayland).",
    // Without an adapter-supplied command, fall back to the package set for
    // the detected session; an unknown session gets the X11 set.
    installCommand:
      readStringField(setupGuide, "installCommand") ??
      (session === "wayland"
        ? "sudo apt install grim wtype ydotool"
        : "sudo apt install scrot xdotool wmctrl"),
    missingTools: readArrayField(setupGuide, "missingTools") ?? [],
    tools: tools ?? null,
    sessionNotes: collectSessionNotes(setupGuide),
    afterInstall:
      "After installing the listed packages, rerun desktop_use action=doctor. Wayland users may also need to start the ydotoold service and add their user to the 'input' group.",
    verification: { tool: "desktop_use", args: { action: "doctor" } },
  };
}

/**
 * Views an arbitrary value as a string-keyed record when it is a non-null
 * object. Arrays qualify, since `typeof [] === "object"`.
 */
function asFieldRecord(value: unknown): Record<string, unknown> | undefined {
  return value && typeof value === "object" ? (value as Record<string, unknown>) : undefined;
}

/** Returns `value[key]` when it is a string; otherwise `undefined`. */
function readStringField(value: unknown, key: string): string | undefined {
  const field = asFieldRecord(value)?.[key];
  return typeof field === "string" ? field : undefined;
}

/** Returns `value[key]` when it is a non-null object; otherwise `undefined`. */
function readObjectField(value: unknown, key: string): Record<string, unknown> | undefined {
  return asFieldRecord(asFieldRecord(value)?.[key]);
}

/** Returns `value[key]` when it is an array; otherwise `undefined`. */
function readArrayField(value: unknown, key: string): unknown[] | undefined {
  const field = asFieldRecord(value)?.[key];
  return Array.isArray(field) ? field : undefined;
}

/**
 * Copies the known session-note strings out of a setup guide.
 *
 * Only non-empty string values under the recognised note keys are kept.
 */
function collectSessionNotes(
  setupGuide: Record<string, unknown> | undefined,
): Record<string, string> {
  const notes: Record<string, string> = {};
  for (const key of ["ydotoolNote", "focusNote", "scrollNote", "elementIdNote"]) {
    const note = readStringField(setupGuide, key);
    if (note) notes[key] = note;
  }
  return notes;
}