diff --git a/extension/src/background/service-worker.ts b/extension/src/background/service-worker.ts
index f1f8d73..21046aa 100644
--- a/extension/src/background/service-worker.ts
+++ b/extension/src/background/service-worker.ts
@@ -87,6 +87,8 @@ async function handleRequest(req: Request): Promise {
       return await forwardToContent(req);
     case 'screenshot':
       return await handleScreenshot(req);
+    case 'download':
+      return await handleDownload(req);
     default:
       return { id: req.id, ok: false, error: `Unknown action: ${req.action}` };
   }
@@ -342,6 +344,144 @@ async function handleScreenshot(req: Request): Promise {
   };
 }
 
+/**
+ * Fetch a resource with the browser's credentials and stream it to the native
+ * host as base64 `download_chunk` messages.
+ *
+ * `req.params.target` is either an http(s) or protocol-relative URL, or an
+ * element ref that the content script resolves to a URL.
+ */
+async function handleDownload(req: Request): Promise {
+  const session = sessionFromRequest(req);
+  if (!session.ok) {
+    return { id: req.id, ok: false, error: session.error };
+  }
+
+  const target = req.params.target;
+  if (typeof target !== 'string' || target.length === 0) {
+    return { id: req.id, ok: false, error: 'target is required' };
+  }
+
+  let url: string;
+
+  // Check if target looks like a URL (starts with http://, https://, or //)
+  if (/^https?:\/\//.test(target) || target.startsWith('//')) {
+    url = target.startsWith('//') ? `https:${target}` : target;
+  } else {
+    // Treat as element ref — resolve via content script
+    const result = await sendToContent(session.value.tab_id, {
+      type: 'resolve_url',
+      params: {
+        session_id: session.value.session_id,
+        request_id: req.id,
+        ref: target,
+      },
+    });
+    if (!result.ok || !result.data?.url) {
+      return {
+        id: req.id,
+        ok: false,
+        error: result.error ?? `Could not resolve URL for target: ${target}`,
+      };
+    }
+    url = result.data.url as string;
+  }
+
+  try {
+    const response = await fetch(url, { credentials: 'include' });
+    if (!response.ok) {
+      return {
+        id: req.id,
+        ok: false,
+        error: `Download failed: HTTP ${response.status} ${response.statusText}`,
+      };
+    }
+
+    const contentType = response.headers.get('content-type') ?? 'application/octet-stream';
+    const buffer = await response.arrayBuffer();
+    const size = buffer.byteLength;
+
+    // Extract filename from Content-Disposition, falling back to the URL path
+    let filename = 'download';
+    const disposition = response.headers.get('content-disposition');
+    if (disposition) {
+      const match = /filename\*?=(?:UTF-8''|"?)([^";]+)"?/i.exec(disposition);
+      if (match?.[1]) {
+        try {
+          filename = decodeURIComponent(match[1]);
+        } catch {
+          // Value is not valid percent-encoding (e.g. a literal '%'); use it as-is
+          filename = match[1];
+        }
+      }
+    }
+    if (filename === 'download') {
+      // No usable header filename (header absent, or present without one)
+      try {
+        const urlPath = new URL(url).pathname;
+        const lastSegment = urlPath.split('/').filter(Boolean).pop();
+        if (lastSegment && lastSegment.includes('.')) {
+          filename = decodeURIComponent(lastSegment);
+        }
+      } catch {
+        // keep default
+      }
+    }
+
+    // If the native port is gone every chunk would be dropped silently while
+    // this handler still reported success, so fail explicitly instead.
+    const nativePort = port;
+    if (!nativePort) {
+      return {
+        id: req.id,
+        ok: false,
+        error: 'Download failed: native messaging port is not connected',
+      };
+    }
+
+    // Stream base64 data as download_chunk messages through the native port
+    // to stay under Chrome's native messaging size limits.
+    const bytes = new Uint8Array(buffer);
+    const CHUNK_SIZE = 3 * 1024 * 1024; // 3MB raw -> ~4MB base64, safe under limits; a multiple of 3 so non-final chunks carry no '=' padding
+    const totalChunks = Math.max(1, Math.ceil(bytes.length / CHUNK_SIZE));
+
+    for (let i = 0; i < totalChunks; i++) {
+      const start = i * CHUNK_SIZE;
+      const slice = bytes.subarray(start, Math.min(start + CHUNK_SIZE, bytes.length));
+
+      // Encode slice to base64 in sub-chunks to avoid call stack limits
+      let binary = '';
+      for (let j = 0; j < slice.length; j += 8192) {
+        const sub = slice.subarray(j, Math.min(j + 8192, slice.length));
+        binary += String.fromCharCode(...sub);
+      }
+      const chunkData = btoa(binary);
+
+      nativePort.postMessage({
+        type: 'download_chunk',
+        session_id: session.value.session_id,
+        request_id: req.id,
+        chunk_index: i,
+        data: chunkData,
+        done: i === totalChunks - 1,
+        ...(i === 0 ? { filename, content_type: contentType, size } : {}),
+      });
+    }
+
+    return {
+      id: req.id,
+      ok: true,
+      data: { streamed: true, filename, content_type: contentType, size },
+    };
+  } catch (error) {
+    return {
+      id: req.id,
+      ok: false,
+      error: `Download failed: ${error instanceof Error ? error.message : String(error)}`,
+    };
+  }
+}
+
 function sessionFromRequest(
   req: Request,
 ): { ok: true; value: Session } | { ok: false; error: string } {
@@ -618,7 +758,7 @@ function isChunkEvent(message: unknown): message is ChunkEvent {
     return false;
   }
   const value = message as { type?: unknown; chunk?: unknown };
-  return value.type === 'page_chunk' && typeof value.chunk === 'object';
+  return (value.type === 'page_chunk' || value.type === 'download_chunk') && typeof value.chunk === 'object';
 }
 
 chrome.tabs.onRemoved.addListener((tabId) => {
diff --git a/extension/src/content/content-script.ts b/extension/src/content/content-script.ts
index 9f2cd8f..9f6b148 100644
--- a/extension/src/content/content-script.ts
+++ b/extension/src/content/content-script.ts
@@ -456,6 +456,8 @@ async function handleMessage(req: ContentRequest): Promise {
       return handlePresenceStart(req);
     case 'presence_stop':
       return handlePresenceStop(req);
+    case 'resolve_url':
+      return handleResolveUrl(req);
     default:
       return {
         ok: false,
@@ -494,6 +496,32 @@ function handlePresenceStop(req: ContentRequest): ContentResponse {
   };
 }
 
+function handleResolveUrl(req: ContentRequest): ContentResponse {
+  const refId = requireString(req.params.ref, 'ref');
+  const target = resolveTarget(refId);
+  if (!target) {
+    return { ok: false, error: `Element not found: ${refId}` };
+  }
+
+  const url =
+    target.getAttribute('src') ??
+    target.getAttribute('href') ??
+    (target as HTMLObjectElement).data ??
+ null; + + if (!url) { + return { ok: false, error: `Element ${refId} has no src or href attribute` }; + } + + // Resolve to absolute URL using the page's base URL + try { + const absolute = new URL(url, document.baseURI).href; + return { ok: true, data: { url: absolute } }; + } catch { + return { ok: false, error: `Invalid URL: ${url}` }; + } +} + async function handleSnapshot(req: ContentRequest): Promise { const sessionId = requireString(req.params.session_id, 'session_id'); const requestId = requireString(req.params.request_id, 'request_id'); diff --git a/extension/src/shared/types.ts b/extension/src/shared/types.ts index d16e1e1..5c45cfe 100644 --- a/extension/src/shared/types.ts +++ b/extension/src/shared/types.ts @@ -58,7 +58,7 @@ export interface PageChunk { } export interface ContentRequest { - type: 'snapshot' | 'click' | 'type' | 'wait' | 'presence_start' | 'presence_stop'; + type: 'snapshot' | 'click' | 'type' | 'wait' | 'presence_start' | 'presence_stop' | 'resolve_url'; params: Record; } @@ -68,7 +68,19 @@ export interface ContentResponse { error?: string; } +export interface DownloadChunk { + type: 'download_chunk'; + session_id: string; + request_id: string; + chunk_index: number; + data: string; + done: boolean; + filename?: string; + content_type?: string; + size?: number; +} + export interface ChunkEvent { - type: 'page_chunk'; - chunk: PageChunk; + type: 'page_chunk' | 'download_chunk'; + chunk: PageChunk | DownloadChunk; } diff --git a/src/cli/commands.rs b/src/cli/commands.rs index 43c74fa..57b2111 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -884,6 +884,144 @@ pub async fn view( Ok(()) } +fn sanitize_filename(raw: &str) -> String { + let name = std::path::Path::new(raw) + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("download") + .to_string(); + + let name = name.replace(['/', '\\'], "_"); + + if name.is_empty() || name == "." || name == ".." 
{
+        "download".to_string()
+    } else {
+        name
+    }
+}
+
+/// Check whether a target string looks like a URL (contains `://` or starts with `//`).
+fn is_url_target(target: &str) -> bool {
+    target.contains("://") || target.starts_with("//")
+}
+
+/// Resolve a download target that looks like an element ID (`e3`, `3`) into an
+/// absolute URL by looking up the element in the cached page snapshot.
+///
+/// Returns `(element_key, resolved_url)` on success.
+fn resolve_element_to_url(
+    target: &str,
+    snapshot: &crate::protocol::messages::RawSnapshot,
+    page: &PageData,
+) -> Result<(String, String)> {
+    let element_key = normalize_element_target(target).ok_or_else(|| {
+        anyhow::anyhow!("'{target}' is not a valid element ID or URL")
+    })?;
+
+    let ref_id = page.element_refs.get(&element_key).ok_or_else(|| {
+        anyhow::anyhow!(
+            "element {element_key} not found on the current page. \
+             Run `browser-cli page ` to see available elements."
+        )
+    })?;
+
+    // Find the raw node matching the ref_id and extract href or src
+    let raw_node = snapshot
+        .nodes
+        .iter()
+        .find(|n| n.ref_id == *ref_id)
+        .ok_or_else(|| {
+            anyhow::anyhow!("internal error: ref {ref_id} for {element_key} not in snapshot")
+        })?;
+
+    let url_attr = raw_node
+        .attrs
+        .get("href")
+        .or_else(|| raw_node.attrs.get("src"))
+        .ok_or_else(|| {
+            anyhow::anyhow!(
+                "element {element_key} has no downloadable URL (no href or src attribute)"
+            )
+        })?;
+
+    let resolved = resolve_link_url(&page.url, url_attr)?;
+    Ok((element_key, resolved))
+}
+
+pub async fn download(
+    session_id: &str,
+    target: &str,
+    output: Option<&str>,
+    json_mode: bool,
+) -> Result<()> {
+    use base64::Engine;
+
+    // Resolve the target: if it looks like an element ID, look up its URL from
+    // the snapshot; if it's already a URL, pass it through directly.
+    let download_url = if is_url_target(target) {
+        target.to_string()
+    } else {
+        let snapshot = fetch_snapshot(session_id, actions::GET_PAGE).await?;
+        let page = parse_page_from_snapshot(&snapshot, None)?;
+        let (_element_key, url) = resolve_element_to_url(target, &snapshot, &page)?;
+        url
+    };
+
+    let data = send_ok(Request::new(
+        actions::DOWNLOAD,
+        json!({
+            "session_id": session_id,
+            "target": download_url,
+        }),
+    ))
+    .await?;
+
+    // A reply without `data` means no payload was assembled: fail, don't write an empty file.
+    let b64 = data.get("data").and_then(|v| v.as_str()).ok_or_else(|| {
+        anyhow::anyhow!("download response did not include file data")
+    })?;
+    let filename = data
+        .get("filename")
+        .and_then(|v| v.as_str())
+        .unwrap_or("download");
+    let content_type = data
+        .get("content_type")
+        .and_then(|v| v.as_str())
+        .unwrap_or("application/octet-stream");
+    let size = data
+        .get("size")
+        .and_then(|v| v.as_u64())
+        .unwrap_or(0);
+
+    let bytes = base64::engine::general_purpose::STANDARD
+        .decode(b64)
+        .map_err(|e| anyhow::anyhow!("failed to decode base64 data: {e}"))?;
+
+    let out_path = match output {
+        Some(p) => PathBuf::from(p),
+        None => PathBuf::from(sanitize_filename(filename)),
+    };
+
+    fs::write(&out_path, &bytes)?;
+
+    if json_mode {
+        print_json(&json!({
+            "path": out_path.display().to_string(),
+            "content_type": content_type,
+            "size": size,
+        }))?;
+    } else {
+        println!(
+            "Downloaded {} ({}, {} bytes)",
+            out_path.display(),
+            content_type,
+            size,
+        );
+    }
+
+    Ok(())
+}
+
 pub async fn plugin(name: &str, session_id: &str, json_mode: bool) -> Result<()> {
     let plugin = crate::plugin::loader::load_plugin(name)?;
     let summary = crate::plugin::runner::run_plugin(&plugin, session_id).await?;
@@ -1389,4 +1527,133 @@ mod tests {
         };
         assert!(is_wait_timeout_error(&response));
     }
+
+    #[test]
+    fn sanitize_filename_strips_path_components() {
+        assert_eq!(sanitize_filename("/tmp/secret/file.pdf"), "file.pdf");
+        assert_eq!(sanitize_filename("report.csv"), "report.csv");
+        assert_eq!(sanitize_filename(""), "download");
+        assert_eq!(sanitize_filename("."), "download");
assert_eq!(sanitize_filename(".."), "download"); + assert_eq!(sanitize_filename("dir/sub\\file.txt"), "sub_file.txt"); + } + + #[test] + fn is_url_target_detects_urls() { + assert!(is_url_target("https://example.com/file.zip")); + assert!(is_url_target("http://example.com/file.zip")); + assert!(!is_url_target("e3")); + assert!(!is_url_target("42")); + } + + #[test] + fn resolve_element_to_url_finds_href() { + use crate::protocol::messages::{RawNode, RawSnapshot, Rect, ScrollState, Viewport}; + let snapshot = RawSnapshot { + url: "https://example.com".into(), + title: "Test".into(), + viewport: Viewport { width: 1200.0, height: 800.0 }, + scroll: ScrollState { top: 0.0, height: 800.0 }, + nodes: vec![RawNode { + ref_id: "r5".into(), + parent: None, + tag: "a".into(), + text: "Download".into(), + attrs: std::collections::HashMap::from([ + ("href".into(), "/files/report.pdf".into()), + ]), + rect: Rect { x: 0.0, y: 0.0, w: 100.0, h: 20.0 }, + }], + }; + let page = PageData { + url: "https://example.com".into(), + title: "Test".into(), + current_page: 1, + total_pages: 1, + truncated: false, + shown: 1, + total: 1, + nodes: vec![], + element_refs: std::collections::HashMap::from([("e3".into(), "r5".into())]), + full_texts: Default::default(), + full_blocks: Default::default(), + }; + + let (eid, url) = resolve_element_to_url("e3", &snapshot, &page).unwrap(); + assert_eq!(eid, "e3"); + assert_eq!(url, "https://example.com/files/report.pdf"); + } + + #[test] + fn resolve_element_to_url_finds_src() { + use crate::protocol::messages::{RawNode, RawSnapshot, Rect, ScrollState, Viewport}; + let snapshot = RawSnapshot { + url: "https://example.com".into(), + title: "Test".into(), + viewport: Viewport { width: 1200.0, height: 800.0 }, + scroll: ScrollState { top: 0.0, height: 800.0 }, + nodes: vec![RawNode { + ref_id: "r10".into(), + parent: None, + tag: "img".into(), + text: "".into(), + attrs: std::collections::HashMap::from([ + ("src".into(), 
"https://cdn.example.com/image.png".into()), + ]), + rect: Rect { x: 0.0, y: 0.0, w: 100.0, h: 100.0 }, + }], + }; + let page = PageData { + url: "https://example.com".into(), + title: "Test".into(), + current_page: 1, + total_pages: 1, + truncated: false, + shown: 1, + total: 1, + nodes: vec![], + element_refs: std::collections::HashMap::from([("e7".into(), "r10".into())]), + full_texts: Default::default(), + full_blocks: Default::default(), + }; + + let (eid, url) = resolve_element_to_url("7", &snapshot, &page).unwrap(); + assert_eq!(eid, "e7"); + assert_eq!(url, "https://cdn.example.com/image.png"); + } + + #[test] + fn resolve_element_to_url_errors_on_missing_url_attr() { + use crate::protocol::messages::{RawNode, RawSnapshot, Rect, ScrollState, Viewport}; + let snapshot = RawSnapshot { + url: "https://example.com".into(), + title: "Test".into(), + viewport: Viewport { width: 1200.0, height: 800.0 }, + scroll: ScrollState { top: 0.0, height: 800.0 }, + nodes: vec![RawNode { + ref_id: "r1".into(), + parent: None, + tag: "button".into(), + text: "Submit".into(), + attrs: std::collections::HashMap::new(), + rect: Rect { x: 0.0, y: 0.0, w: 80.0, h: 30.0 }, + }], + }; + let page = PageData { + url: "https://example.com".into(), + title: "Test".into(), + current_page: 1, + total_pages: 1, + truncated: false, + shown: 1, + total: 1, + nodes: vec![], + element_refs: std::collections::HashMap::from([("e1".into(), "r1".into())]), + full_texts: Default::default(), + full_blocks: Default::default(), + }; + + let err = resolve_element_to_url("e1", &snapshot, &page).unwrap_err(); + assert!(err.to_string().contains("no downloadable URL")); + } } diff --git a/src/main.rs b/src/main.rs index 3d114f4..bfc9d11 100644 --- a/src/main.rs +++ b/src/main.rs @@ -293,6 +293,19 @@ enum Command { #[arg(long)] json: bool, }, + /// Download a browser-accessible resource to a local file + Download { + /// Session ID + session_id: String, + /// Target element ID (e.g. 
e3) or URL + target: String, + /// Output file path (default: auto-detect from URL/content-type) + #[arg(short, long)] + output: Option, + /// Output as JSON + #[arg(long)] + json: bool, + }, /// Manage and run plugins Plugin { #[command(subcommand)] @@ -489,6 +502,12 @@ async fn main() -> anyhow::Result<()> { quality, json, } => cli::commands::screenshot(session_id, output.as_deref(), full_page, quality, json).await?, + Command::Download { + ref session_id, + ref target, + ref output, + json, + } => cli::commands::download(session_id, target, output.as_deref(), json).await?, Command::Plugin { ref cmd } => match cmd { PluginCommand::Run { name, diff --git a/src/protocol/messages.rs b/src/protocol/messages.rs index 753e131..95afe0f 100644 --- a/src/protocol/messages.rs +++ b/src/protocol/messages.rs @@ -153,9 +153,28 @@ pub mod actions { pub const WAIT: &str = "wait"; pub const GET_TEXT: &str = "get_text"; pub const SCREENSHOT: &str = "screenshot"; + pub const DOWNLOAD: &str = "download"; } pub const PAGE_CHUNK_TYPE: &str = "page_chunk"; +pub const DOWNLOAD_CHUNK_TYPE: &str = "download_chunk"; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct DownloadChunk { + #[serde(rename = "type")] + pub message_type: String, + pub session_id: String, + pub request_id: String, + pub chunk_index: usize, + pub data: String, + pub done: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub filename: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub content_type: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub size: Option, +} #[cfg(test)] mod tests { diff --git a/src/relay/server.rs b/src/relay/server.rs index 364b123..18c7fbb 100644 --- a/src/relay/server.rs +++ b/src/relay/server.rs @@ -7,7 +7,8 @@ use tokio::net::TcpListener; use tokio::sync::{Mutex, oneshot}; use crate::protocol::messages::{ - PAGE_CHUNK_TYPE, PageChunk, RawSnapshot, Request, Response, actions, + DOWNLOAD_CHUNK_TYPE, DownloadChunk, 
PAGE_CHUNK_TYPE, PageChunk, RawSnapshot, Request, Response, + actions, }; use super::native_msg; @@ -16,6 +17,16 @@ const RELAY_ADDR: &str = "127.0.0.1:12899"; type PendingMap = Arc>>>; type SessionMap = Arc>>; +type DownloadMap = Arc>>; + +#[derive(Debug, Clone)] +struct DownloadBuffer { + chunks: Vec, + filename: Option, + content_type: Option, + size: Option, + complete: bool, +} #[derive(Debug, Clone)] struct SessionCache { @@ -32,10 +43,12 @@ pub async fn run() -> Result<()> { let pending: PendingMap = Arc::new(Mutex::new(HashMap::new())); let sessions: SessionMap = Arc::new(Mutex::new(HashMap::new())); + let downloads: DownloadMap = Arc::new(Mutex::new(HashMap::new())); let stdout = Arc::new(Mutex::new(tokio::io::stdout())); let pending_clone = Arc::clone(&pending); let sessions_clone = Arc::clone(&sessions); + let downloads_clone = Arc::clone(&downloads); tokio::spawn(async move { let mut stdin = tokio::io::stdin(); loop { @@ -48,6 +61,17 @@ pub async fn run() -> Result<()> { continue; } + if msg.get("type").and_then(|value| value.as_str()) + == Some(DOWNLOAD_CHUNK_TYPE) + { + if let Err(err) = + handle_download_chunk(msg, &downloads_clone).await + { + eprintln!("relay: failed to handle download chunk: {err}"); + } + continue; + } + let response: Response = match serde_json::from_value(msg.clone()) { Ok(response) => response, Err(err) => { @@ -75,10 +99,11 @@ pub async fn run() -> Result<()> { let (stream, addr) = listener.accept().await?; let pending = Arc::clone(&pending); let sessions = Arc::clone(&sessions); + let downloads = Arc::clone(&downloads); let stdout = Arc::clone(&stdout); tokio::spawn(async move { - if let Err(err) = handle_client(stream, pending, sessions, stdout).await { + if let Err(err) = handle_client(stream, pending, sessions, downloads, stdout).await { eprintln!("relay: client {addr} error: {err}"); } }); @@ -89,6 +114,7 @@ async fn handle_client( stream: tokio::net::TcpStream, pending: PendingMap, sessions: SessionMap, + downloads: 
DownloadMap,
     stdout: Arc>,
 ) -> Result<()> {
     let (reader, mut writer) = stream.into_split();
@@ -122,7 +148,7 @@ async fn handle_client(
             }
         };
 
-        let response = finalize_response(request, extension_response, &sessions).await?;
+        let response = finalize_response(request, extension_response, &sessions, &downloads).await?;
         write_response(&mut writer, &response).await?;
     }
 
@@ -180,6 +206,10 @@ async fn finalize_response(
     request: Request,
     extension_response: Response,
     sessions: &SessionMap,
+    downloads: &DownloadMap,
 ) -> Result {
     if !extension_response.is_success() {
+        // Drop any chunks already buffered for this request; a download that
+        // failed mid-stream would otherwise stay in the map indefinitely.
+        downloads.lock().await.remove(&request.id);
         return Ok(extension_response);
@@ -212,6 +242,25 @@ async fn finalize_response(
             }
             Ok(extension_response)
         }
+        actions::DOWNLOAD => {
+            // If the extension streamed download data via download_chunk messages,
+            // assemble the chunks and include the full base64 data in the response.
+            let buf = downloads.lock().await.remove(&request.id);
+            if let Some(buf) = buf {
+                if buf.complete {
+                    let assembled_data = buf.chunks.join("");
+                    let data = serde_json::json!({
+                        "data": assembled_data,
+                        "filename": buf.filename.unwrap_or_else(|| "download".into()),
+                        "content_type": buf.content_type.unwrap_or_else(|| "application/octet-stream".into()),
+                        "size": buf.size.unwrap_or(0),
+                    });
+                    return Ok(Response::success(request.id, data));
+                }
+            }
+            // Fallback: return the extension response as-is (non-streamed or error)
+            Ok(extension_response)
+        }
         _ => Ok(extension_response),
     }
 }
@@ -263,6 +312,37 @@ async fn handle_page_chunk(msg: serde_json::Value, sessions: &SessionMap) -> Res
     Ok(())
 }
 
+/// Buffer one `download_chunk` message from the extension under its request ID.
+///
+/// Chunk 0 resets the buffer and records the file metadata; the `done` flag of
+/// the most recently received chunk marks the buffer as complete.
+async fn handle_download_chunk(msg: serde_json::Value, downloads: &DownloadMap) -> Result<()> {
+    let chunk: DownloadChunk = serde_json::from_value(msg)?;
+    let mut downloads = downloads.lock().await;
+    let buf = downloads
+        .entry(chunk.request_id.clone())
+        .or_insert_with(|| DownloadBuffer {
+            chunks: Vec::new(),
+            filename: None,
+            content_type: None,
+            size: None,
+            complete: false,
+        });
+
+    // First chunk carries metadata
+    if chunk.chunk_index == 0 {
+        buf.filename = chunk.filename;
+        buf.content_type = chunk.content_type;
+        buf.size = chunk.size;
+        buf.chunks.clear();
+    }
+
+    buf.chunks.push(chunk.data);
+    buf.complete = chunk.done;
+
+    Ok(())
+}
+
 async fn write_response(
     writer: &mut tokio::net::tcp::OwnedWriteHalf,
     response: &Response,