From 80e9401e810be7c3837748ebfad598db32274425 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Tue, 12 May 2026 16:37:38 -0400 Subject: [PATCH 1/2] feat(patch): add package- and diff-level patch sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two new optional pathways to the socket-patch CLI alongside the existing per-file blob path: - Per-package archives at `.socket/packages/<uuid>.tar.gz` — a tarball of patched files for a single patch, extracted in one shot. - Per-file bsdiff archives at `.socket/diffs/<uuid>.tar.gz` — bsdiff deltas that transform `before_hash` content into `after_hash` content. The apply pipeline now tries sources in the order package → diff → blob, falling through to the next on any failure. Every strategy post-write-verifies the file's git-sha256 against `after_hash`, so the existing safety invariant is unchanged. A new `--download-mode {diff,package,file}` flag (default: `diff`; `repair` defaults to `file`) controls what `apply`, `get`, `scan`, and `repair` fetch when local artifacts are missing. The manifest schema is intentionally unchanged: archives are keyed by patch UUID (already present in `PatchRecord`), so legacy manifests keep working with no migration. Highlights: - New core modules `patch/diff.rs` (qbsdiff bspatch wrapper) and `patch/package.rs` (tar+flate2 reader with path-traversal guards, whitelist filtering against `expected_files`, and hard caps on decompressed bytes / per-entry size / entry count to defuse gzip-bomb and `Vec::with_capacity` allocation attacks). - New `PatchSources` struct and `AppliedVia` enum in `patch/apply.rs`; `apply_package_patch` takes a `PatchSources` and an optional UUID. Passing `uuid = None` restores pre-2.2 blob-only behavior. - `try_apply_from_diff` gates on the captured pre-apply `current_hash` rather than `VerifyStatus`, so `--force` cannot drive a diff against garbage content. 
- `apply`'s offline guard now reports per-patch source availability instead of a global blobs/diffs/packages bucket count. - `ApiClient::fetch_diff(uuid)` and `fetch_package(uuid)` mirror `fetch_blob(hash)`; a private `fetch_binary` helper deduplicates the proxy/auth client split and 200/404/error handling. - `DownloadMode` enum + `fetch_missing_sources` in `api/blob_fetcher.rs` dispatch downloads by kind. `cleanup_unused_archives` in `utils/cleanup_blobs.rs` reaps orphaned `.socket/packages/` and `.socket/diffs/` files via `repair`. Tests: 307 unit + 2 e2e gem (was 263 + 2 before this change). New coverage spans diff round-trips, package extraction safety (traversal, oversize-header, too-many-entries, decompression-bomb truncation), fallback chain ordering (`via package`/`diff`/`blob`), force-mode + diff regression, dry-run safety, UUID validation, and archive download/cleanup helpers. All existing tests pass unchanged. Server-side `/patch/diff/` and `/patch/package/` endpoints are not live yet — 404 responses fall through gracefully to the file blob path, so this PR ships safely ahead of server support. 
Assisted-by: Claude Code:claude-opus-4-7 --- Cargo.lock | 202 ++++++ Cargo.toml | 3 + crates/socket-patch-cli/src/commands/apply.rs | 166 ++++- crates/socket-patch-cli/src/commands/get.rs | 11 + .../socket-patch-cli/src/commands/repair.rs | 155 +++-- crates/socket-patch-cli/src/commands/scan.rs | 7 + crates/socket-patch-core/Cargo.toml | 3 + .../socket-patch-core/src/api/blob_fetcher.rs | 323 ++++++++++ crates/socket-patch-core/src/api/client.rs | 132 +++- crates/socket-patch-core/src/constants.rs | 6 + crates/socket-patch-core/src/patch/apply.rs | 602 +++++++++++++++++- crates/socket-patch-core/src/patch/diff.rs | 88 +++ crates/socket-patch-core/src/patch/mod.rs | 2 + crates/socket-patch-core/src/patch/package.rs | 489 ++++++++++++++ .../src/utils/cleanup_blobs.rs | 166 +++++ 15 files changed, 2280 insertions(+), 75 deletions(-) create mode 100644 crates/socket-patch-core/src/patch/diff.rs create mode 100644 crates/socket-patch-core/src/patch/package.rs diff --git a/Cargo.lock b/Cargo.lock index 0905267..ee932c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "aho-corasick" version = "1.1.4" @@ -73,6 +79,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "base64" version = "0.22.1" @@ -100,12 +112,27 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + [[package]] name = "cc" version = "1.2.56" @@ -116,6 +143,16 @@ dependencies = [ "shlex", ] +[[package]] +name = "cdivsufsort" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c" +dependencies = [ + "cc", + "sacabase", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -196,6 +233,40 @@ dependencies = [ "libc", ] +[[package]] +name = "crc32fast" +version = "1.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -240,6 +311,12 @@ dependencies = [ "syn", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -268,12 +345,32 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "filetime" +version = "0.2.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5b2eef6fafbf69f877e55509ce5b11a760690ac9700a2921be067aa6afaef6" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "foldhash" version = "0.1.5" @@ -676,6 +773,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "libbz2-rs-sys" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fc329e1457d97a9d58a4e2ca49e3be572431a7e096008efc2e3a3c19d428f4" + [[package]] name = "libc" version = "0.2.182" @@ -721,6 +824,16 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" version = "1.1.1" @@ -732,6 +845,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -834,6 +956,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "qbsdiff" +version = "1.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc7f24528be166f08f2c7becaca5618865499b6ded2565d5afcd795cc0d7596" +dependencies = [ + "byteorder", + "bzip2", + "rayon", + "suffix_array", +] + [[package]] name = "quinn" version = "0.11.9" @@ -939,6 +1073,26 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1095,6 +1249,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "sacabase" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84" +dependencies = [ + "num-traits", +] + [[package]] name = "same-file" version = "1.0.6" @@ -1204,6 +1367,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + [[package]] name = "slab" version = "0.4.12" @@ -1238,13 +1407,16 @@ dependencies = [ name = "socket-patch-core" version = "2.1.4" dependencies = [ + "flate2", "hex", "once_cell", + "qbsdiff", "regex", "reqwest", "serde", "serde_json", "sha2", + "tar", "tempfile", "thiserror 2.0.18", "tokio", @@ -1280,6 +1452,15 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "suffix_array" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907d9ca9637a22e3a7d7c7818f6105a7898857359e187ad3325d986684b9ec3f" +dependencies = [ + "cdivsufsort", +] + [[package]] name = "syn" version = "2.0.117" @@ -1311,6 +1492,17 @@ dependencies = [ "syn", ] +[[package]] +name = "tar" 
+version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.26.0" @@ -2007,6 +2199,16 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + [[package]] name = "yoke" version = "0.8.1" diff --git a/Cargo.toml b/Cargo.toml index 7ad7565..6d0862a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,9 @@ indicatif = "0.17" tempfile = "3" regex = "1" once_cell = "1" +qbsdiff = "1" +tar = "0.4" +flate2 = "1" [profile.release] strip = true diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index c065db4..80f4f67 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -1,12 +1,15 @@ use clap::Args; use socket_patch_core::api::blob_fetcher::{ - fetch_missing_blobs, format_fetch_result, get_missing_blobs, + fetch_missing_blobs, fetch_missing_sources, format_fetch_result, get_missing_archives, + get_missing_blobs, DownloadMode, }; use socket_patch_core::api::client::get_api_client_from_env; use socket_patch_core::constants::DEFAULT_PATCH_MANIFEST_PATH; use socket_patch_core::crawlers::{CrawlerOptions, Ecosystem}; use socket_patch_core::manifest::operations::read_manifest; -use socket_patch_core::patch::apply::{apply_package_patch, verify_file_patch, ApplyResult, VerifyStatus}; +use socket_patch_core::patch::apply::{ + apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus, +}; use 
socket_patch_core::utils::cleanup_blobs::{cleanup_unused_blobs, format_cleanup_result}; use socket_patch_core::utils::purl::strip_purl_qualifiers; use socket_patch_core::utils::telemetry::{track_patch_applied, track_patch_apply_failed}; @@ -60,6 +63,14 @@ pub struct ApplyArgs { /// Show detailed per-file verification information #[arg(short = 'v', long, default_value_t = false)] pub verbose: bool, + + /// Which kind of patch artifact to download when local files are + /// missing. `diff` (default) fetches the smallest delta archive; + /// `package` fetches a full per-package tarball; `file` falls back to + /// the legacy per-file blob behavior. The apply pipeline always tries + /// already-downloaded sources in the order package → diff → blob. + #[arg(long = "download-mode", default_value = "diff")] + pub download_mode: String, } fn verify_status_str(status: &VerifyStatus) -> &'static str { @@ -72,12 +83,18 @@ fn verify_status_str(status: &VerifyStatus) -> &'static str { } fn result_to_json(result: &ApplyResult) -> serde_json::Value { + let applied_via: HashMap<&String, &str> = result + .applied_via + .iter() + .map(|(k, v)| (k, v.as_tag())) + .collect(); serde_json::json!({ "purl": result.package_key, "path": result.package_path, "success": result.success, "error": result.error, "filesPatched": result.files_patched, + "appliedVia": applied_via, "filesVerified": result.files_verified.iter().map(|f| { serde_json::json!({ "file": f.file, @@ -167,7 +184,23 @@ pub async fn run(args: ApplyArgs) -> i32 { println!("\nPatched packages:"); for result in &patched { if !result.files_patched.is_empty() { - println!(" {}", result.package_key); + // Summarize the per-file strategy used by this + // package: if everything came from the same + // source, show just that tag; otherwise list + // distinct sources. 
+ let mut tags: Vec<&'static str> = result + .applied_via + .values() + .map(|v| v.as_tag()) + .collect(); + tags.sort_unstable(); + tags.dedup(); + let suffix = if tags.is_empty() { + String::new() + } else { + format!(" (via {})", tags.join("+")) + }; + println!(" {}{}", result.package_key, suffix); } else if result.files_verified.iter().all(|f| { f.status == VerifyStatus::AlreadyPatched }) { @@ -247,36 +280,129 @@ async fn apply_patches_inner( let socket_dir = manifest_path.parent().unwrap(); let blobs_path = socket_dir.join("blobs"); + let diffs_path = socket_dir.join("diffs"); + let packages_path = socket_dir.join("packages"); tokio::fs::create_dir_all(&blobs_path) .await .map_err(|e| e.to_string())?; - // Check for and download missing blobs + let download_mode = DownloadMode::parse(&args.download_mode).map_err(|e| e.to_string())?; + + // Compute per-patch source availability so both the offline guard + // (next block) and the `download_needed` decision below share the + // same notion of what's already on disk. let missing_blobs = get_missing_blobs(&manifest, &blobs_path).await; - if !missing_blobs.is_empty() { - if args.offline { + let missing_diff_archives = get_missing_archives(&manifest, &diffs_path).await; + let missing_package_archives = get_missing_archives(&manifest, &packages_path).await; + + // A patch is "locally applicable" iff at least one of: + // - every `after_hash` blob it references is on disk, OR + // - its diff archive is on disk, OR + // - its package archive is on disk. + // The apply pipeline will pick whichever is present per file. 
+ let patches_without_source: Vec<&str> = manifest + .patches + .iter() + .filter_map(|(purl, record)| { + let all_blobs_present = record + .files + .values() + .all(|f| !missing_blobs.contains(&f.after_hash)); + let diff_present = !missing_diff_archives.contains(&record.uuid); + let pkg_present = !missing_package_archives.contains(&record.uuid); + if all_blobs_present || diff_present || pkg_present { + None + } else { + Some(purl.as_str()) + } + }) + .collect(); + + if args.offline { + // Offline: bail only if some patch has no usable local source. + // Note: with `--force`, the apply pipeline can short-circuit + // verification on its own; we still surface the no-source + // diagnosis so the user runs `repair` before retrying. + if !patches_without_source.is_empty() { if !args.silent && !args.json { eprintln!( - "Error: {} blob(s) are missing and --offline mode is enabled.", - missing_blobs.len() + "Error: {} patch(es) have no local source and --offline is set:", + patches_without_source.len() ); - eprintln!("Run \"socket-patch repair\" to download missing blobs."); + for purl in patches_without_source.iter().take(5) { + eprintln!(" - {}", purl); + } + if patches_without_source.len() > 5 { + eprintln!(" ... and {} more", patches_without_source.len() - 5); + } + eprintln!("Run \"socket-patch repair\" to download missing artifacts."); } return Ok((false, Vec::new(), Vec::new())); } + } + // Decide what (if anything) needs downloading. + // + // The apply pipeline tries sources in the order package → diff → + // blob locally. We honor `--download-mode` for the primary fetch + // when there's actually a gap to close. Skip the archive fetch + // entirely when all file blobs are already present locally — + // apply will succeed via the blob path, and the archive endpoints + // would just 404 (current server doesn't serve them yet). 
+ let download_needed = !args.offline + && match download_mode { + DownloadMode::File => !missing_blobs.is_empty(), + DownloadMode::Diff | DownloadMode::Package if missing_blobs.is_empty() => false, + DownloadMode::Diff => !missing_diff_archives.is_empty(), + DownloadMode::Package => !missing_package_archives.is_empty(), + }; + + if download_needed { if !args.silent && !args.json { - println!("Downloading {} missing blob(s)...", missing_blobs.len()); + println!( + "Downloading missing patch artifacts (mode: {})...", + download_mode.as_tag() + ); } let (client, _) = get_api_client_from_env(None).await; - let fetch_result = fetch_missing_blobs(&manifest, &blobs_path, &client, None).await; + let sources = PatchSources { + blobs_path: &blobs_path, + packages_path: Some(&packages_path), + diffs_path: Some(&diffs_path), + }; + let fetch_result = + fetch_missing_sources(&manifest, &sources, download_mode, &client, None).await; if !args.silent && !args.json { println!("{}", format_fetch_result(&fetch_result)); } - if fetch_result.failed > 0 { + // For non-file modes, automatically fetch any still-missing file + // blobs as a fallback. Patches that lack the requested mode on + // the server will still apply via the legacy blob path. + if download_mode != DownloadMode::File { + let still_missing_blobs = get_missing_blobs(&manifest, &blobs_path).await; + if !still_missing_blobs.is_empty() { + if !args.silent && !args.json { + println!( + "Falling back to per-file blob downloads for {} blob(s)...", + still_missing_blobs.len() + ); + } + let blob_result = + fetch_missing_blobs(&manifest, &blobs_path, &client, None).await; + if !args.silent && !args.json { + println!("{}", format_fetch_result(&blob_result)); + } + if blob_result.failed > 0 && fetch_result.failed > 0 { + if !args.silent && !args.json { + eprintln!("Some artifacts could not be downloaded. 
Cannot apply patches."); + } + return Ok((false, Vec::new(), Vec::new())); + } + } + } else if fetch_result.failed > 0 { if !args.silent && !args.json { eprintln!("Some blobs could not be downloaded. Cannot apply patches."); } @@ -378,11 +504,17 @@ async fn apply_patches_inner( } } + let sources = PatchSources { + blobs_path: &blobs_path, + packages_path: Some(&packages_path), + diffs_path: Some(&diffs_path), + }; let result = apply_package_patch( variant_purl, pkg_path, &patch.files, - &blobs_path, + &sources, + Some(&patch.uuid), args.dry_run, args.force, ) @@ -412,11 +544,17 @@ async fn apply_patches_inner( None => continue, }; + let sources = PatchSources { + blobs_path: &blobs_path, + packages_path: Some(&packages_path), + diffs_path: Some(&diffs_path), + }; let result = apply_package_patch( purl, pkg_path, &patch.files, - &blobs_path, + &sources, + Some(&patch.uuid), args.dry_run, args.force, ) diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index 624b454..ee2399a 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -76,6 +76,12 @@ pub struct GetArgs { /// Output results as JSON #[arg(long, default_value_t = false)] pub json: bool, + + /// Which kind of patch artifact to download. `diff` (default) fetches + /// the smallest delta archive; `package` fetches a full per-package + /// tarball; `file` falls back to legacy per-file blob downloads. + #[arg(long = "download-mode", default_value = "diff")] + pub download_mode: String, } #[derive(Debug, PartialEq)] @@ -251,6 +257,8 @@ pub struct DownloadParams { pub global_prefix: Option, pub json: bool, pub silent: bool, + /// `--download-mode` value forwarded to the apply step. + pub download_mode: String, } /// Download and apply a set of selected patches. 
@@ -533,6 +541,7 @@ pub async fn download_and_apply_patches( force: false, json: false, verbose: false, + download_mode: params.download_mode.clone(), }; let code = super::apply::run(apply_args).await; apply_succeeded = code == 0; @@ -927,6 +936,7 @@ pub async fn run(args: GetArgs) -> i32 { global_prefix: args.global_prefix.clone(), json: args.json, silent: false, + download_mode: args.download_mode.clone(), }; let (code, result_json) = download_and_apply_patches(&selected, ¶ms).await; @@ -1196,6 +1206,7 @@ async fn save_and_apply_patch( offline: false, global: args.global, global_prefix: args.global_prefix.clone(), + download_mode: args.download_mode.clone(), ecosystems: None, force: false, json: false, diff --git a/crates/socket-patch-cli/src/commands/repair.rs b/crates/socket-patch-cli/src/commands/repair.rs index 33d7d04..2197bb1 100644 --- a/crates/socket-patch-cli/src/commands/repair.rs +++ b/crates/socket-patch-cli/src/commands/repair.rs @@ -1,11 +1,15 @@ use clap::Args; use socket_patch_core::api::blob_fetcher::{ - fetch_missing_blobs, format_fetch_result, get_missing_blobs, + fetch_missing_sources, format_fetch_result, get_missing_archives, get_missing_blobs, + DownloadMode, }; use socket_patch_core::api::client::get_api_client_from_env; use socket_patch_core::constants::DEFAULT_PATCH_MANIFEST_PATH; use socket_patch_core::manifest::operations::read_manifest; -use socket_patch_core::utils::cleanup_blobs::{cleanup_unused_blobs, format_cleanup_result}; +use socket_patch_core::patch::apply::PatchSources; +use socket_patch_core::utils::cleanup_blobs::{ + cleanup_unused_archives, cleanup_unused_blobs, format_cleanup_result, +}; use std::path::{Path, PathBuf}; #[derive(Args)] @@ -33,6 +37,12 @@ pub struct RepairArgs { /// Output results as JSON #[arg(long, default_value_t = false)] pub json: bool, + + /// Which kind of patch artifact to download. `file` (default for + /// repair) restores the legacy per-file blobs needed to apply any + /// patch. 
`diff` and `package` fetch the smaller archive formats. + #[arg(long = "download-mode", default_value = "file")] + pub download_mode: String, } pub async fn run(args: RepairArgs) -> i32 { @@ -84,39 +94,67 @@ async fn repair_inner(args: &RepairArgs, manifest_path: &Path) -> Result = match download_mode { + DownloadMode::File => get_missing_blobs(&manifest, &blobs_path) + .await + .into_iter() + .collect(), + DownloadMode::Diff => get_missing_archives(&manifest, &diffs_path) + .await + .into_iter() + .collect(), + DownloadMode::Package => get_missing_archives(&manifest, &packages_path) + .await + .into_iter() + .collect(), + }; + let missing_count = missing_artifacts.len(); - if !missing_blobs.is_empty() { + if !args.offline { + if !missing_artifacts.is_empty() { if !args.json { - println!("Found {} missing blob(s)", missing_blobs.len()); + println!( + "Found {} missing {} artifact(s)", + missing_artifacts.len(), + download_mode.as_tag() + ); } if args.dry_run { if !args.json { println!("\nDry run - would download:"); - for hash in missing_blobs.iter().take(10) { - println!(" - {}...", &hash[..12.min(hash.len())]); + for id in missing_artifacts.iter().take(10) { + println!(" - {}...", &id[..12.min(id.len())]); } - if missing_blobs.len() > 10 { - println!(" ... and {} more", missing_blobs.len() - 10); + if missing_artifacts.len() > 10 { + println!(" ... 
and {} more", missing_artifacts.len() - 10); } } } else { if !args.json { - println!("\nDownloading missing blobs..."); + println!("\nDownloading missing {}s...", download_mode.as_tag()); } let (client, _) = get_api_client_from_env(None).await; - let fetch_result = fetch_missing_blobs(&manifest, &blobs_path, &client, None).await; + let sources = PatchSources { + blobs_path: &blobs_path, + packages_path: Some(&packages_path), + diffs_path: Some(&diffs_path), + }; + let fetch_result = + fetch_missing_sources(&manifest, &sources, download_mode, &client, None).await; downloaded_count = fetch_result.downloaded; download_failed_count = fetch_result.failed; if !args.json { @@ -124,38 +162,41 @@ async fn repair_inner(args: &RepairArgs, manifest_path: &Path) -> Result 5 { - println!(" ... and {} more", missing_blobs.len() - 5); - } + } else if !missing_artifacts.is_empty() { + if !args.json { + println!( + "Warning: {} {} artifact(s) are missing (offline mode - not downloading)", + missing_artifacts.len(), + download_mode.as_tag() + ); + for id in missing_artifacts.iter().take(5) { + println!(" - {}...", &id[..12.min(id.len())]); + } + if missing_artifacts.len() > 5 { + println!(" ... and {} more", missing_artifacts.len() - 5); } - } else if !args.json { - println!("All blobs are present locally."); } + } else if !args.json { + println!( + "All {} artifacts are present locally.", + download_mode.as_tag() + ); } - // Step 2: Clean up unused blobs + // Step 2: Clean up unused artifacts across all three directories. 
if !args.download_only { if !args.json { println!(); } match cleanup_unused_blobs(&manifest, &blobs_path, args.dry_run).await { Ok(cleanup_result) => { - blobs_checked = cleanup_result.blobs_checked; - blobs_cleaned = cleanup_result.blobs_removed; + blobs_checked += cleanup_result.blobs_checked; + blobs_cleaned += cleanup_result.blobs_removed; if !args.json { if cleanup_result.blobs_checked == 0 { println!("No blobs directory found, nothing to clean up."); @@ -171,7 +212,47 @@ async fn repair_inner(args: &RepairArgs, manifest_path: &Path) -> Result { if !args.json { - eprintln!("Warning: cleanup failed: {e}"); + eprintln!("Warning: blob cleanup failed: {e}"); + } + } + } + + // Diff archives. + match cleanup_unused_archives(&manifest, &diffs_path, args.dry_run).await { + Ok(cleanup_result) => { + blobs_checked += cleanup_result.blobs_checked; + blobs_cleaned += cleanup_result.blobs_removed; + if !args.json && cleanup_result.blobs_removed > 0 { + println!( + "{}", + format_cleanup_result(&cleanup_result, args.dry_run) + .replace("blob(s)", "diff archive(s)") + ); + } + } + Err(e) => { + if !args.json { + eprintln!("Warning: diff cleanup failed: {e}"); + } + } + } + + // Package archives. 
+ match cleanup_unused_archives(&manifest, &packages_path, args.dry_run).await { + Ok(cleanup_result) => { + blobs_checked += cleanup_result.blobs_checked; + blobs_cleaned += cleanup_result.blobs_removed; + if !args.json && cleanup_result.blobs_removed > 0 { + println!( + "{}", + format_cleanup_result(&cleanup_result, args.dry_run) + .replace("blob(s)", "package archive(s)") + ); + } + } + Err(e) => { + if !args.json { + eprintln!("Warning: package cleanup failed: {e}"); } } } diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index bb1079a..f3357e4 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -54,6 +54,12 @@ pub struct ScanArgs { /// Restrict scanning to specific ecosystems (comma-separated: npm,pypi,cargo,maven) #[arg(long, value_delimiter = ',')] pub ecosystems: Option>, + + /// Which kind of patch artifact to download. `diff` (default) fetches + /// the smallest delta archive; `package` fetches a full per-package + /// tarball; `file` falls back to legacy per-file blob downloads. 
+ #[arg(long = "download-mode", default_value = "diff")] + pub download_mode: String, } pub async fn run(args: ScanArgs) -> i32 { @@ -561,6 +567,7 @@ pub async fn run(args: ScanArgs) -> i32 { global_prefix: args.global_prefix.clone(), json: false, silent: false, + download_mode: args.download_mode.clone(), }; let (code, _) = download_and_apply_patches(&selected, ¶ms).await; diff --git a/crates/socket-patch-core/Cargo.toml b/crates/socket-patch-core/Cargo.toml index c081348..68201c8 100644 --- a/crates/socket-patch-core/Cargo.toml +++ b/crates/socket-patch-core/Cargo.toml @@ -19,6 +19,9 @@ walkdir = { workspace = true } uuid = { workspace = true } regex = { workspace = true } once_cell = { workspace = true } +qbsdiff = { workspace = true } +tar = { workspace = true } +flate2 = { workspace = true } [features] default = [] diff --git a/crates/socket-patch-core/src/api/blob_fetcher.rs b/crates/socket-patch-core/src/api/blob_fetcher.rs index 7309070..fca9445 100644 --- a/crates/socket-patch-core/src/api/blob_fetcher.rs +++ b/crates/socket-patch-core/src/api/blob_fetcher.rs @@ -4,6 +4,46 @@ use std::path::{Path, PathBuf}; use crate::api::client::ApiClient; use crate::manifest::operations::get_after_hash_blobs; use crate::manifest::schema::PatchManifest; +use crate::patch::apply::PatchSources; + +/// Selects which kind of patch artifact `fetch_missing_sources` downloads. +/// +/// * `File` — per-file blobs (legacy, largest, always applicable). +/// * `Diff` — per-patch tar.gz of bsdiff deltas (smallest, only useful +/// when the original file is on disk). +/// * `Package` — per-patch tar.gz of patched files (mid-size, applicable +/// even when the original file is missing). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DownloadMode { + Diff, + Package, + File, +} + +impl DownloadMode { + /// Short lowercase tag, suitable for JSON output and `--download-mode` + /// flag values. 
+ pub fn as_tag(&self) -> &'static str { + match self { + DownloadMode::Diff => "diff", + DownloadMode::Package => "package", + DownloadMode::File => "file", + } + } + + /// Parse `--download-mode` flag values (case-insensitive; `blob` is an alias for `file`). + pub fn parse(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "diff" => Ok(DownloadMode::Diff), + "package" => Ok(DownloadMode::Package), + "file" | "blob" => Ok(DownloadMode::File), + other => Err(format!( + "unknown download mode '{}'. Expected diff, package, or file.", + other + )), + } + } +} /// Result of fetching a single blob. #[derive(Debug, Clone)] @@ -195,6 +235,187 @@ pub async fn fetch_blobs_by_hash( } } +/// Return the set of patch UUIDs whose archive at +/// `<archives_dir>/<uuid>.tar.gz` is missing from disk. Used as the +/// "what do I need to download" query for diff and package modes. +pub async fn get_missing_archives( + manifest: &PatchManifest, + archives_dir: &Path, +) -> HashSet { + let mut missing = HashSet::new(); + for record in manifest.patches.values() { + let archive_path = archives_dir.join(format!("{}.tar.gz", record.uuid)); + if tokio::fs::metadata(&archive_path).await.is_err() { + missing.insert(record.uuid.clone()); + } + } + missing +} + +/// Download all missing archives for the chosen [`DownloadMode`]. +/// +/// * [`DownloadMode::File`] delegates to [`fetch_missing_blobs`]. +/// * [`DownloadMode::Diff`] downloads each missing `<uuid>.tar.gz` into +/// `sources.diffs_path` via [`ApiClient::fetch_diff`]. +/// * [`DownloadMode::Package`] does the same with `sources.packages_path` +/// and [`ApiClient::fetch_package`]. +/// +/// Returns a [`FetchMissingBlobsResult`] in which each `BlobFetchResult`'s +/// `hash` field carries the patch UUID (not a blob hash) for diff and +/// package modes. A `sources.packages_path` / `sources.diffs_path` of +/// `None` while requesting that mode yields an immediate empty result — +/// the caller is expected to fall back to a different mode in that case.
+pub async fn fetch_missing_sources( + manifest: &PatchManifest, + sources: &PatchSources<'_>, + mode: DownloadMode, + client: &ApiClient, + on_progress: Option<&OnProgress>, +) -> FetchMissingBlobsResult { + match mode { + DownloadMode::File => { + fetch_missing_blobs(manifest, sources.blobs_path, client, on_progress).await + } + DownloadMode::Diff => match sources.diffs_path { + Some(dir) => { + fetch_missing_archives_inner(manifest, dir, ArchiveKind::Diff, client, on_progress) + .await + } + None => empty_result(), + }, + DownloadMode::Package => match sources.packages_path { + Some(dir) => fetch_missing_archives_inner( + manifest, + dir, + ArchiveKind::Package, + client, + on_progress, + ) + .await, + None => empty_result(), + }, + } +} + +#[derive(Debug, Clone, Copy)] +enum ArchiveKind { + Diff, + Package, +} + +fn empty_result() -> FetchMissingBlobsResult { + FetchMissingBlobsResult { + total: 0, + downloaded: 0, + failed: 0, + skipped: 0, + results: Vec::new(), + } +} + +async fn fetch_missing_archives_inner( + manifest: &PatchManifest, + archives_dir: &Path, + kind: ArchiveKind, + client: &ApiClient, + on_progress: Option<&OnProgress>, +) -> FetchMissingBlobsResult { + let missing = get_missing_archives(manifest, archives_dir).await; + if missing.is_empty() { + return empty_result(); + } + + if let Err(e) = tokio::fs::create_dir_all(archives_dir).await { + let results: Vec = missing + .iter() + .map(|u| BlobFetchResult { + hash: u.clone(), + success: false, + error: Some(format!("Cannot create archives directory: {}", e)), + }) + .collect(); + let failed = results.len(); + return FetchMissingBlobsResult { + total: failed, + downloaded: 0, + failed, + skipped: 0, + results, + }; + } + + let uuids: Vec = missing.into_iter().collect(); + let total = uuids.len(); + let mut downloaded = 0usize; + let mut failed = 0usize; + let mut results = Vec::with_capacity(total); + + for (i, uuid) in uuids.iter().enumerate() { + if let Some(ref cb) = on_progress { + cb(uuid, 
i + 1, total); + } + + let fetch_result = match kind { + ArchiveKind::Diff => client.fetch_diff(uuid).await, + ArchiveKind::Package => client.fetch_package(uuid).await, + }; + + match fetch_result { + Ok(Some(data)) => { + let archive_path: PathBuf = archives_dir.join(format!("{}.tar.gz", uuid)); + match tokio::fs::write(&archive_path, &data).await { + Ok(()) => { + results.push(BlobFetchResult { + hash: uuid.clone(), + success: true, + error: None, + }); + downloaded += 1; + } + Err(e) => { + results.push(BlobFetchResult { + hash: uuid.clone(), + success: false, + error: Some(format!("Failed to write archive to disk: {}", e)), + }); + failed += 1; + } + } + } + Ok(None) => { + results.push(BlobFetchResult { + hash: uuid.clone(), + success: false, + error: Some(format!( + "{} archive not found on server", + match kind { + ArchiveKind::Diff => "Diff", + ArchiveKind::Package => "Package", + } + )), + }); + failed += 1; + } + Err(e) => { + results.push(BlobFetchResult { + hash: uuid.clone(), + success: false, + error: Some(e.to_string()), + }); + failed += 1; + } + } + } + + FetchMissingBlobsResult { + total, + downloaded, + failed, + skipped: 0, + results, + } +} + /// Format a [`FetchMissingBlobsResult`] as a human-readable string. pub fn format_fetch_result(result: &FetchMissingBlobsResult) -> String { if result.total == 0 { @@ -521,6 +742,108 @@ mod tests { assert!(output.contains("unknown error")); } + // ── DownloadMode + archive helpers ────────────────────────────── + + #[test] + fn test_download_mode_parse() { + assert_eq!(DownloadMode::parse("diff").unwrap(), DownloadMode::Diff); + assert_eq!(DownloadMode::parse("DIFF").unwrap(), DownloadMode::Diff); + assert_eq!( + DownloadMode::parse("package").unwrap(), + DownloadMode::Package + ); + assert_eq!(DownloadMode::parse("file").unwrap(), DownloadMode::File); + // `blob` aliases to `file` so users can think in pre-2.2 terms. 
+ assert_eq!(DownloadMode::parse("blob").unwrap(), DownloadMode::File); + assert!(DownloadMode::parse("nope").is_err()); + } + + #[test] + fn test_download_mode_tag() { + assert_eq!(DownloadMode::Diff.as_tag(), "diff"); + assert_eq!(DownloadMode::Package.as_tag(), "package"); + assert_eq!(DownloadMode::File.as_tag(), "file"); + } + + fn make_manifest_with_uuids(uuids: &[&str]) -> PatchManifest { + let mut patches = HashMap::new(); + for (i, uuid) in uuids.iter().enumerate() { + let key = format!("pkg:npm/test-{}@1.0.0", i); + patches.insert( + key, + PatchRecord { + uuid: (*uuid).to_string(), + exported_at: "2024-01-01T00:00:00Z".to_string(), + files: HashMap::new(), + vulnerabilities: HashMap::new(), + description: "test".to_string(), + license: "MIT".to_string(), + tier: "free".to_string(), + }, + ); + } + PatchManifest { patches } + } + + #[tokio::test] + async fn test_get_missing_archives_all_missing() { + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("packages"); + tokio::fs::create_dir_all(&archives).await.unwrap(); + + let u1 = "11111111-1111-4111-8111-111111111111"; + let u2 = "22222222-2222-4222-8222-222222222222"; + let manifest = make_manifest_with_uuids(&[u1, u2]); + + let missing = get_missing_archives(&manifest, &archives).await; + assert_eq!(missing.len(), 2); + assert!(missing.contains(u1)); + assert!(missing.contains(u2)); + } + + #[tokio::test] + async fn test_get_missing_archives_some_present() { + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("packages"); + tokio::fs::create_dir_all(&archives).await.unwrap(); + + let u1 = "11111111-1111-4111-8111-111111111111"; + let u2 = "22222222-2222-4222-8222-222222222222"; + + tokio::fs::write(archives.join(format!("{u1}.tar.gz")), b"data") + .await + .unwrap(); + + let manifest = make_manifest_with_uuids(&[u1, u2]); + let missing = get_missing_archives(&manifest, &archives).await; + assert_eq!(missing.len(), 1); + assert!(missing.contains(u2)); + 
+ assert!(!missing.contains(u1)); + } + + #[tokio::test] + async fn test_fetch_missing_sources_unsupported_mode_returns_empty() { + // Asking for Diff mode without a diffs_path yields an empty result + // rather than panicking. Same for Package mode. + let dir = tempfile::tempdir().unwrap(); + let blobs = dir.path().join("blobs"); + tokio::fs::create_dir_all(&blobs).await.unwrap(); + let sources = PatchSources::blobs_only(&blobs); + + let manifest = make_manifest_with_uuids(&["11111111-1111-4111-8111-111111111111"]); + let (client, _) = crate::api::client::get_api_client_from_env(None).await; + + let res = fetch_missing_sources(&manifest, &sources, DownloadMode::Diff, &client, None) + .await; + assert_eq!(res.total, 0); + assert_eq!(res.downloaded, 0); + assert_eq!(res.failed, 0); + + let res = fetch_missing_sources(&manifest, &sources, DownloadMode::Package, &client, None) + .await; + assert_eq!(res.total, 0); + } + #[test] fn test_format_only_failed() { let result = FetchMissingBlobsResult { diff --git a/crates/socket-patch-core/src/api/client.rs b/crates/socket-patch-core/src/api/client.rs index b2dc8f2..9356d9f 100644 --- a/crates/socket-patch-core/src/api/client.rs +++ b/crates/socket-patch-core/src/api/client.rs @@ -460,22 +460,69 @@ impl ApiClient { hash ))); } + self.fetch_binary("blob", "blob", hash).await + } + + /// Fetch a per-file diff archive (tar.gz of bsdiff deltas) by patch UUID. + /// + /// Returns the raw archive bytes, or `Ok(None)` if not found (404). The + /// public proxy serves these under `/patch/diff/<uuid>`; the + /// authenticated API serves them under `/v0/orgs/<org_slug>/patches/diff/<uuid>`. + pub async fn fetch_diff(&self, uuid: &str) -> Result>, ApiError> { + if !is_valid_uuid(uuid) { + return Err(ApiError::InvalidHash(format!( + "Invalid patch UUID: {}", + uuid + ))); + } + self.fetch_binary("diff", "diff", uuid).await + } + + /// Fetch a per-package patch archive (tar.gz of patched files) by patch UUID.
+ /// + /// Returns the raw archive bytes, or `Ok(None)` if not found (404). + pub async fn fetch_package(&self, uuid: &str) -> Result>, ApiError> { + if !is_valid_uuid(uuid) { + return Err(ApiError::InvalidHash(format!( + "Invalid patch UUID: {}", + uuid + ))); + } + self.fetch_binary("package", "package", uuid).await + } + /// Shared implementation for `fetch_blob` / `fetch_diff` / `fetch_package`. + /// + /// `kind` is the URL segment (`blob` / `diff` / `package`). `label` is the + /// human-readable noun used in log + error messages. `identifier` is the + /// hash or UUID interpolated into the URL. + async fn fetch_binary( + &self, + kind: &str, + label: &str, + identifier: &str, + ) -> Result>, ApiError> { let (url, use_auth) = if self.api_token.is_some() && self.org_slug.is_some() && !self.use_public_proxy { - // Authenticated endpoint let slug = self.org_slug.as_deref().unwrap(); - let u = format!("{}/v0/orgs/{}/patches/blob/{}", self.api_url, slug, hash); + let u = format!( + "{}/v0/orgs/{}/patches/{}/{}", + self.api_url, slug, kind, identifier + ); (u, true) } else { - // Public proxy let proxy_url = std::env::var("SOCKET_PATCH_PROXY_URL") .unwrap_or_else(|_| DEFAULT_PATCH_API_PROXY_URL.to_string()); - let u = format!("{}/patch/blob/{}", proxy_url.trim_end_matches('/'), hash); + let u = format!( + "{}/patch/{}/{}", + proxy_url.trim_end_matches('/'), + kind, + identifier + ); (u, false) }; - debug_log(&format!("GET blob {}", url)); + debug_log(&format!("GET {} {}", label, url)); // Build the request. 
When fetching from the public proxy (different // base URL than self.api_url), we use a plain client without auth @@ -506,7 +553,10 @@ impl ApiClient { }; let resp = resp.map_err(|e| { - ApiError::Network(format!("Network error fetching blob {}: {}", hash, e)) + ApiError::Network(format!( + "Network error fetching {} {}: {}", + label, identifier, e + )) })?; let status = resp.status(); @@ -514,7 +564,10 @@ impl ApiClient { match status { StatusCode::OK => { let bytes = resp.bytes().await.map_err(|e| { - ApiError::Network(format!("Error reading blob body for {}: {}", hash, e)) + ApiError::Network(format!( + "Error reading {} body for {}: {}", + label, identifier, e + )) })?; Ok(Some(bytes.to_vec())) } @@ -522,8 +575,9 @@ impl ApiClient { _ => { let text = resp.text().await.unwrap_or_default(); Err(ApiError::Other(format!( - "Failed to fetch blob {}: status {} - {}", - hash, + "Failed to fetch {} {}: status {} - {}", + label, + identifier, status.as_u16(), text, ))) @@ -643,6 +697,19 @@ fn is_valid_sha256_hex(s: &str) -> bool { s.len() == 64 && s.bytes().all(|b| b.is_ascii_hexdigit()) } +/// Validate the standard 8-4-4-4-12 UUID hex grouping. +fn is_valid_uuid(s: &str) -> bool { + let parts: Vec<&str> = s.split('-').collect(); + if parts.len() != 5 { + return false; + } + let lengths = [8, 4, 4, 4, 12]; + parts + .iter() + .zip(lengths.iter()) + .all(|(part, &want)| part.len() == want && part.bytes().all(|b| b.is_ascii_hexdigit())) +} + /// Convert a `PatchSearchResult` into a `BatchPatchInfo`, extracting /// CVE/GHSA IDs and computing the highest severity. 
fn convert_search_result_to_batch_info(patch: PatchSearchResult) -> BatchPatchInfo { @@ -1010,4 +1077,51 @@ mod tests { assert_eq!(mixed.len(), 64); assert!(is_valid_sha256_hex(mixed)); } + + // ── UUID validation tests ─────────────────────────────────────── + + #[test] + fn test_is_valid_uuid_accepts_standard_form() { + assert!(is_valid_uuid("80630680-4da6-45f9-bba8-b888e0ffd58c")); + assert!(is_valid_uuid("00000000-0000-0000-0000-000000000000")); + // Uppercase hex is acceptable. + assert!(is_valid_uuid("ABCDEF01-2345-6789-ABCD-EF0123456789")); + } + + #[test] + fn test_is_valid_uuid_rejects_malformed() { + assert!(!is_valid_uuid("")); + assert!(!is_valid_uuid("not-a-uuid")); + // Wrong segment count. + assert!(!is_valid_uuid("80630680-4da6-45f9-bba8")); + // Wrong length on first segment. + assert!(!is_valid_uuid("8063068-4da6-45f9-bba8-b888e0ffd58c")); + // Non-hex character. + assert!(!is_valid_uuid("80630680-4da6-45f9-bba8-b888e0ffd58z")); + // No dashes. + assert!(!is_valid_uuid("80630680xxxxx")); + } + + // ── fetch_diff / fetch_package validation tests ───────────────── + // + // These tests cover input validation only — they intentionally do + // NOT hit the network. The shared `fetch_binary` helper handles the + // transport, and `fetch_blob` already has integration coverage via + // the e2e_npm test. 
+ + #[tokio::test] + async fn test_fetch_diff_rejects_invalid_uuid() { + std::env::remove_var("SOCKET_API_TOKEN"); + let (client, _) = get_api_client_from_env(None).await; + let result = client.fetch_diff("not-a-uuid").await; + assert!(matches!(result, Err(ApiError::InvalidHash(_)))); + } + + #[tokio::test] + async fn test_fetch_package_rejects_invalid_uuid() { + std::env::remove_var("SOCKET_API_TOKEN"); + let (client, _) = get_api_client_from_env(None).await; + let result = client.fetch_package("xxx").await; + assert!(matches!(result, Err(ApiError::InvalidHash(_)))); + } } diff --git a/crates/socket-patch-core/src/constants.rs b/crates/socket-patch-core/src/constants.rs index 1418427..aede7e7 100644 --- a/crates/socket-patch-core/src/constants.rs +++ b/crates/socket-patch-core/src/constants.rs @@ -4,6 +4,12 @@ pub const DEFAULT_PATCH_MANIFEST_PATH: &str = ".socket/manifest.json"; /// Default folder for storing patched file blobs. pub const DEFAULT_BLOB_FOLDER: &str = ".socket/blob"; +/// Default folder for storing per-package patched archives (tar.gz). +pub const DEFAULT_PACKAGES_FOLDER: &str = ".socket/packages"; + +/// Default folder for storing per-patch diff archives (tar.gz of bsdiff deltas). +pub const DEFAULT_DIFFS_FOLDER: &str = ".socket/diffs"; + /// Default Socket directory. pub const DEFAULT_SOCKET_DIR: &str = ".socket"; diff --git a/crates/socket-patch-core/src/patch/apply.rs b/crates/socket-patch-core/src/patch/apply.rs index 2fcb28d..3aed818 100644 --- a/crates/socket-patch-core/src/patch/apply.rs +++ b/crates/socket-patch-core/src/patch/apply.rs @@ -1,8 +1,11 @@ use std::collections::HashMap; use std::path::Path; +use crate::hash::git_sha256::compute_git_sha256_from_bytes; use crate::manifest::schema::PatchFileInfo; +use crate::patch::diff::apply_diff; use crate::patch::file_hash::compute_file_git_sha256; +use crate::patch::package::read_archive_filtered; /// Status of a file patch verification.
#[derive(Debug, Clone, PartialEq, Eq)] @@ -28,6 +31,54 @@ pub struct VerifyResult { pub target_hash: Option, } +/// Which patch source actually wrote the patched bytes for a file. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AppliedVia { + /// Bytes came from a per-package archive in `.socket/packages/`. + Package, + /// Bytes were produced by applying a bsdiff delta from + /// `.socket/diffs/<uuid>.tar.gz`. + Diff, + /// Bytes came from a per-file blob in the blob directory (default `.socket/blob/`). + Blob, +} + +impl AppliedVia { + /// Short lowercase tag, suitable for JSON and human output. + pub fn as_tag(&self) -> &'static str { + match self { + AppliedVia::Package => "package", + AppliedVia::Diff => "diff", + AppliedVia::Blob => "blob", + } + } +} + +/// Patch sources the apply pipeline may use to obtain patched bytes. +/// +/// `blobs_path` is always required and serves as the universal fallback. +/// `packages_path` and `diffs_path` are optional opt-ins to the new +/// pathways introduced in socket-patch 2.2. +#[derive(Debug, Clone, Copy)] +pub struct PatchSources<'a> { + pub blobs_path: &'a Path, + pub packages_path: Option<&'a Path>, + pub diffs_path: Option<&'a Path>, +} + +impl<'a> PatchSources<'a> { + /// Construct a `PatchSources` that only knows about the legacy + /// per-file blob directory. Convenient for tests and existing call + /// sites that have not been upgraded. + pub fn blobs_only(blobs_path: &'a Path) -> Self { + Self { + blobs_path, + packages_path: None, + diffs_path: None, + } + } +} + /// Result of applying patches to a single package. #[derive(Debug, Clone)] pub struct ApplyResult { @@ -36,6 +87,9 @@ pub struct ApplyResult { pub success: bool, pub files_verified: Vec, pub files_patched: Vec, + /// Per-file record of which source produced the patched bytes. Only + /// populated for files in `files_patched`.
+ pub applied_via: HashMap, pub error: Option, } @@ -198,13 +252,20 @@ pub async fn apply_file_patch( /// /// For each file in `files`, this function: /// 1. Verifies the file is ready to be patched (or already patched). -/// 2. If not dry_run, reads the blob from `blobs_path` and writes it. +/// 2. If not dry_run, tries patch sources in order: package archive → diff +/// archive → per-file blob. Each strategy is opt-in via `sources`. /// 3. Returns a summary of what happened. +/// +/// `uuid` is the patch UUID. Pass `Some` to enable package- and +/// diff-archive lookup (the corresponding `sources.packages_path` / +/// `sources.diffs_path` must also be set). Pass `None` to restrict the +/// pipeline to per-file blobs only — equivalent to pre-2.2 behavior. pub async fn apply_package_patch( package_key: &str, pkg_path: &Path, files: &HashMap, - blobs_path: &Path, + sources: &PatchSources<'_>, + uuid: Option<&str>, dry_run: bool, force: bool, ) -> ApplyResult { @@ -214,6 +275,7 @@ pub async fn apply_package_patch( success: false, files_verified: Vec::new(), files_patched: Vec::new(), + applied_via: HashMap::new(), error: None, }; @@ -290,7 +352,19 @@ pub async fn apply_package_patch( return result; } - // Apply patches to files that need it + // Eagerly load the package and diff archives (if any) into memory so + // we don't reparse the tar.gz once per file. Both are small archives. + let package_entries = match (uuid, sources.packages_path) { + (Some(uuid), Some(dir)) => load_archive_if_present(dir, uuid, files).await, + _ => None, + }; + let diff_entries = match (uuid, sources.diffs_path) { + (Some(uuid), Some(dir)) => load_archive_if_present(dir, uuid, files).await, + _ => None, + }; + + // Apply patches to files that need it. For each file, try package + // archive first, then diff, then blob. 
for (file_name, file_info) in files { let verify_result = result.files_verified.iter().find(|v| v.file == *file_name); if let Some(vr) = verify_result { @@ -301,8 +375,53 @@ pub async fn apply_package_patch( } } - // Read patched content from blobs - let blob_path = blobs_path.join(&file_info.after_hash); + let normalized = normalize_file_path(file_name).to_string(); + + // ── Strategy 1: package archive ────────────────────────────── + if try_apply_from_archive( + package_entries.as_ref(), + &normalized, + pkg_path, + file_name, + file_info, + ) + .await + { + result.files_patched.push(file_name.clone()); + result + .applied_via + .insert(file_name.clone(), AppliedVia::Package); + continue; + } + + // ── Strategy 2: per-file diff ──────────────────────────────── + // Diffs only apply cleanly when the on-disk content actually + // hashes to `before_hash` — otherwise the bsdiff output won't + // match `after_hash`. We pass the pre-apply current_hash + // captured by `verify_file_patch` so `try_apply_from_diff` can + // skip the wasted decompress+apply work when --force is + // overriding a hash mismatch (force flips status to Ready but + // the underlying hash is still wrong). 
+ let current_hash_for_diff = verify_result.and_then(|v| v.current_hash.as_deref()); + if try_apply_from_diff( + diff_entries.as_ref(), + &normalized, + pkg_path, + file_name, + file_info, + current_hash_for_diff, + ) + .await + { + result.files_patched.push(file_name.clone()); + result + .applied_via + .insert(file_name.clone(), AppliedVia::Diff); + continue; + } + + // ── Strategy 3: per-file blob (legacy fallback) ────────────── + let blob_path = sources.blobs_path.join(&file_info.after_hash); let patched_content = match tokio::fs::read(&blob_path).await { Ok(content) => content, Err(e) => { @@ -314,19 +433,130 @@ pub async fn apply_package_patch( } }; - // Apply the patch - if let Err(e) = apply_file_patch(pkg_path, file_name, &patched_content, &file_info.after_hash).await { + if let Err(e) = + apply_file_patch(pkg_path, file_name, &patched_content, &file_info.after_hash).await + { result.error = Some(e.to_string()); return result; } result.files_patched.push(file_name.clone()); + result + .applied_via + .insert(file_name.clone(), AppliedVia::Blob); } result.success = true; result } +/// Try to write the patched bytes from `package_entries[normalized_path]` +/// to disk, verifying the post-write hash. Returns `true` on success. +async fn try_apply_from_archive( + package_entries: Option<&HashMap>>, + normalized_path: &str, + pkg_path: &Path, + file_name: &str, + file_info: &PatchFileInfo, +) -> bool { + let entries = match package_entries { + Some(e) => e, + None => return false, + }; + let bytes = match entries.get(normalized_path) { + Some(b) => b, + None => return false, + }; + if compute_git_sha256_from_bytes(bytes) != file_info.after_hash { + return false; + } + apply_file_patch(pkg_path, file_name, bytes, &file_info.after_hash) + .await + .is_ok() +} + +/// Try to apply the bsdiff delta from `diff_entries[normalized_path]` to +/// the on-disk file at `pkg_path/normalized_path`. 
Bails out (returning +/// `false`) for any of: +/// * no diff entry, +/// * `current_hash` is missing or doesn't match `file_info.before_hash` +/// (this is the strong gate — even `--force` promoting a +/// HashMismatch to Ready will still bail here, because the on-disk +/// hash captured by `verify_file_patch` was the real, mismatched +/// value), +/// * `file_info.before_hash` is empty (new files), +/// * read/diff/verify/write failure. +async fn try_apply_from_diff( + diff_entries: Option<&HashMap>>, + normalized_path: &str, + pkg_path: &Path, + file_name: &str, + file_info: &PatchFileInfo, + current_hash: Option<&str>, +) -> bool { + let entries = match diff_entries { + Some(e) => e, + None => return false, + }; + let delta = match entries.get(normalized_path) { + Some(d) => d, + None => return false, + }; + if file_info.before_hash.is_empty() { + // New files have no before content to diff against. + return false; + } + // Strong invariant: only run the diff when on-disk bytes hash to + // exactly the `before_hash` the delta was authored against. This + // closes the force-mode loophole — `--force` flips VerifyStatus to + // Ready, but `current_hash` retains the original on-disk hash, so + // the comparison below still rejects. + match current_hash { + Some(h) if h == file_info.before_hash => {} + _ => return false, + } + + let on_disk_path = pkg_path.join(normalized_path); + let before_bytes = match tokio::fs::read(&on_disk_path).await { + Ok(b) => b, + Err(_) => return false, + }; + let patched = match apply_diff(&before_bytes, delta) { + Ok(p) => p, + Err(_) => return false, + }; + if compute_git_sha256_from_bytes(&patched) != file_info.after_hash { + return false; + } + apply_file_patch(pkg_path, file_name, &patched, &file_info.after_hash) + .await + .is_ok() +} + +/// Open `<dir>/<uuid>.tar.gz` (if it exists) and return its entries +/// filtered to the patched files in `files`.
Errors and missing files +/// both yield `None` so the caller silently falls through to the next +/// strategy. +async fn load_archive_if_present( + dir: &Path, + uuid: &str, + files: &HashMap, +) -> Option>> { + let archive_path = dir.join(format!("{uuid}.tar.gz")); + if tokio::fs::metadata(&archive_path).await.is_err() { + return None; + } + // `read_archive_filtered` is synchronous (tar + flate2 are sync). Run + // it on the blocking pool so we don't stall the executor for large + // archives. + let archive_path_owned = archive_path.clone(); + let files_owned = files.clone(); + tokio::task::spawn_blocking(move || read_archive_filtered(&archive_path_owned, &files_owned)) + .await + .ok() + .and_then(|r| r.ok()) +} + #[cfg(test)] mod tests { use super::*; @@ -505,7 +735,7 @@ mod tests { ); let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, false) .await; assert!(result.success); @@ -535,7 +765,7 @@ mod tests { ); let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), true, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, true, false) .await; assert!(result.success); @@ -568,7 +798,7 @@ mod tests { ); let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, false) .await; assert!(result.success); @@ -594,7 +824,7 @@ mod tests { ); let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, false) .await; 
assert!(!result.success); @@ -630,7 +860,7 @@ mod tests { // Without force: should fail let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, false) .await; assert!(!result.success); @@ -641,7 +871,7 @@ mod tests { // With force: should succeed let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, true) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, true) .await; assert!(result.success); assert_eq!(result.files_patched.len(), 1); @@ -666,15 +896,357 @@ mod tests { // Without force: should fail (NotFound for non-new file) let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, false) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, false) .await; assert!(!result.success); // With force: should succeed by skipping the missing file let result = - apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, blobs_dir.path(), false, true) + apply_package_patch("pkg:npm/test@1.0.0", pkg_dir.path(), &files, &PatchSources::blobs_only(blobs_dir.path()), None, false, true) .await; assert!(result.success); assert_eq!(result.files_patched.len(), 0); } + + // ── Fallback-chain tests ───────────────────────────────────────── + // + // Tests below exercise the new strategies introduced in 2.2: + // package archive (.socket/packages/<uuid>.tar.gz) and per-file diff + // archive (.socket/diffs/<uuid>.tar.gz), plus the priority order + // package → diff → blob.
+ + use flate2::write::GzEncoder; + use flate2::Compression as GzCompression; + use qbsdiff::Bsdiff; + + const TEST_UUID: &str = "11111111-1111-4111-8111-111111111111"; + + /// Write a tar.gz archive at `<dir>/<uuid>.tar.gz` containing the + /// given (entry name → bytes) pairs. + fn write_uuid_archive(dir: &Path, uuid: &str, entries: &[(&str, &[u8])]) { + let archive_path = dir.join(format!("{uuid}.tar.gz")); + let file = std::fs::File::create(&archive_path).unwrap(); + let gz = GzEncoder::new(file, GzCompression::default()); + let mut builder = tar::Builder::new(gz); + for (name, data) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(data.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, name, *data).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap(); + } + + fn make_delta(before: &[u8], after: &[u8]) -> Vec { + let mut delta = Vec::new(); + Bsdiff::new(before, after) + .compare(std::io::Cursor::new(&mut delta)) + .unwrap(); + delta + } + + /// Returns a fully-populated three-source fixture: original file on + /// disk, all of (package, diff, blob) available with valid patched + /// content. Caller can then delete sources to test fallback.
+ async fn make_fixture() -> ( + tempfile::TempDir, // root holding pkg/, blobs/, packages/, diffs/ + std::path::PathBuf, // pkg dir + std::path::PathBuf, // blobs dir + std::path::PathBuf, // packages dir + std::path::PathBuf, // diffs dir + HashMap, + Vec, // original bytes + Vec, // patched bytes + ) { + let root = tempfile::tempdir().unwrap(); + let pkg_dir = root.path().join("pkg"); + let blobs_dir = root.path().join("blobs"); + let packages_dir = root.path().join("packages"); + let diffs_dir = root.path().join("diffs"); + tokio::fs::create_dir_all(&pkg_dir).await.unwrap(); + tokio::fs::create_dir_all(&blobs_dir).await.unwrap(); + tokio::fs::create_dir_all(&packages_dir).await.unwrap(); + tokio::fs::create_dir_all(&diffs_dir).await.unwrap(); + + let original: Vec = b"the original content of the file".to_vec(); + let patched: Vec = b"the PATCHED content of the file!".to_vec(); + let before_hash = compute_git_sha256_from_bytes(&original); + let after_hash = compute_git_sha256_from_bytes(&patched); + + // On-disk file at pkg/index.js + tokio::fs::write(pkg_dir.join("index.js"), &original) + .await + .unwrap(); + + // Per-file blob at blobs/ + tokio::fs::write(blobs_dir.join(&after_hash), &patched) + .await + .unwrap(); + + // Package archive containing the patched bytes + write_uuid_archive(&packages_dir, TEST_UUID, &[("index.js", &patched)]); + + // Diff archive containing bsdiff(original -> patched) + let delta = make_delta(&original, &patched); + write_uuid_archive(&diffs_dir, TEST_UUID, &[("index.js", &delta)]); + + let mut files = HashMap::new(); + files.insert( + "index.js".to_string(), + PatchFileInfo { + before_hash, + after_hash, + }, + ); + + (root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, original, patched) + } + + #[tokio::test] + async fn test_apply_via_package_when_archive_present() { + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, patched) = + make_fixture().await; + + let sources = PatchSources { + blobs_path: 
&blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + false, + false, + ) + .await; + + assert!(result.success, "expected success: {:?}", result.error); + assert_eq!(result.files_patched, vec!["index.js".to_string()]); + assert_eq!( + result.applied_via.get("index.js"), + Some(&AppliedVia::Package) + ); + let written = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(written, patched); + } + + #[tokio::test] + async fn test_apply_falls_back_to_diff_when_no_package() { + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, patched) = + make_fixture().await; + // Delete the package archive. + tokio::fs::remove_file(packages_dir.join(format!("{TEST_UUID}.tar.gz"))) + .await + .unwrap(); + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + false, + false, + ) + .await; + + assert!(result.success, "expected success: {:?}", result.error); + assert_eq!(result.applied_via.get("index.js"), Some(&AppliedVia::Diff)); + let written = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(written, patched); + } + + #[tokio::test] + async fn test_apply_falls_back_to_blob_when_no_archives() { + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, patched) = + make_fixture().await; + // Delete both archives. 
+ tokio::fs::remove_file(packages_dir.join(format!("{TEST_UUID}.tar.gz"))) + .await + .unwrap(); + tokio::fs::remove_file(diffs_dir.join(format!("{TEST_UUID}.tar.gz"))) + .await + .unwrap(); + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + false, + false, + ) + .await; + + assert!(result.success); + assert_eq!(result.applied_via.get("index.js"), Some(&AppliedVia::Blob)); + let written = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(written, patched); + } + + #[tokio::test] + async fn test_apply_uuid_none_disables_alt_sources() { + // Even if archives exist, passing `uuid = None` must restrict the + // pipeline to the blob path — preserving pre-2.2 behavior. + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, _patched) = + make_fixture().await; + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + None, + false, + false, + ) + .await; + + assert!(result.success); + assert_eq!(result.applied_via.get("index.js"), Some(&AppliedVia::Blob)); + } + + #[tokio::test] + async fn test_apply_via_diff_falls_through_when_before_hash_mismatch() { + // Corrupt the on-disk file so its hash no longer matches + // before_hash. Diff strategy must NOT run (its output would never + // match after_hash), so we fall through to the blob. 
+ let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, patched) = + make_fixture().await; + tokio::fs::remove_file(packages_dir.join(format!("{TEST_UUID}.tar.gz"))) + .await + .unwrap(); + // Overwrite on-disk content with garbage; use --force so verify + // promotes the HashMismatch to Ready and the pipeline still tries + // to apply. + tokio::fs::write(pkg_dir.join("index.js"), b"garbage") + .await + .unwrap(); + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + false, + true, // --force + ) + .await; + + assert!(result.success); + // Diff would produce wrong output → strategy skipped → blob writes. + assert_eq!(result.applied_via.get("index.js"), Some(&AppliedVia::Blob)); + let written = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(written, patched); + } + + #[tokio::test] + async fn test_apply_via_package_skips_when_hash_mismatches() { + // Package archive contains the WRONG bytes (would not hash to + // after_hash). The package strategy must refuse the entry and + // fall back to diff or blob. + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, _orig, patched) = + make_fixture().await; + // Replace the package archive with one whose entry is corrupt. + tokio::fs::remove_file(packages_dir.join(format!("{TEST_UUID}.tar.gz"))) + .await + .unwrap(); + write_uuid_archive( + &packages_dir, + TEST_UUID, + &[("index.js", b"corrupt package payload")], + ); + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + false, + false, + ) + .await; + + assert!(result.success); + // Package refused → diff succeeded next. 
+ assert_eq!(result.applied_via.get("index.js"), Some(&AppliedVia::Diff)); + let written = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(written, patched); + } + + #[tokio::test] + async fn test_apply_dry_run_does_not_touch_alternative_sources() { + // Even with package/diff archives present, dry-run must not modify + // files on disk. + let (_root, pkg_dir, blobs_dir, packages_dir, diffs_dir, files, original, _patched) = + make_fixture().await; + + let sources = PatchSources { + blobs_path: &blobs_dir, + packages_path: Some(&packages_dir), + diffs_path: Some(&diffs_dir), + }; + let result = apply_package_patch( + "pkg:npm/x@1.0.0", + &pkg_dir, + &files, + &sources, + Some(TEST_UUID), + true, // dry-run + false, + ) + .await; + + assert!(result.success); + assert!(result.files_patched.is_empty()); + let on_disk = tokio::fs::read(pkg_dir.join("index.js")).await.unwrap(); + assert_eq!(on_disk, original); + } + + #[test] + fn test_applied_via_as_tag() { + assert_eq!(AppliedVia::Package.as_tag(), "package"); + assert_eq!(AppliedVia::Diff.as_tag(), "diff"); + assert_eq!(AppliedVia::Blob.as_tag(), "blob"); + } + + #[test] + fn test_patch_sources_blobs_only_disables_other_strategies() { + let dir = tempfile::tempdir().unwrap(); + let sources = PatchSources::blobs_only(dir.path()); + assert!(sources.packages_path.is_none()); + assert!(sources.diffs_path.is_none()); + } } diff --git a/crates/socket-patch-core/src/patch/diff.rs b/crates/socket-patch-core/src/patch/diff.rs new file mode 100644 index 0000000..e9b1b7d --- /dev/null +++ b/crates/socket-patch-core/src/patch/diff.rs @@ -0,0 +1,88 @@ +//! Per-file diff (bsdiff) apply support. +//! +//! A `diff` is a binary delta in bsdiff 4.x format that transforms the +//! `beforeHash` bytes of a file into the `afterHash` bytes. We store diffs +//! grouped by patch UUID — see [`crate::patch::package`] for the tar.gz +//! archive layout. 
+ +use qbsdiff::Bspatch; + +/// Apply a bsdiff delta to `before` and return the resulting bytes. +/// +/// Returns a `std::io::Error` when the delta is malformed or applying it +/// fails; a delta from a different source may not fail, so verify the output hash. +pub fn apply_diff(before: &[u8], delta: &[u8]) -> Result<Vec<u8>, std::io::Error> { + let patcher = Bspatch::new(delta)?; + let mut out = Vec::with_capacity(patcher.hint_target_size() as usize); + patcher.apply(before, std::io::Cursor::new(&mut out))?; + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + use qbsdiff::Bsdiff; + + fn make_delta(before: &[u8], after: &[u8]) -> Vec<u8> { + let mut delta = Vec::new(); + Bsdiff::new(before, after) + .compare(std::io::Cursor::new(&mut delta)) + .expect("compare"); + delta + } + + #[test] + fn test_apply_diff_text_round_trip() { + let before = b"the quick brown fox jumps over the lazy dog"; + let after = b"the quick brown cat jumps over the lazy dog"; + let delta = make_delta(before, after); + let result = apply_diff(before, &delta).unwrap(); + assert_eq!(result, after); + } + + #[test] + fn test_apply_diff_binary_round_trip() { + let before: Vec<u8> = (0..1024u32).map(|i| (i % 251) as u8).collect(); + let mut after = before.clone(); + // Mutate a handful of bytes scattered through the buffer. + for i in [10usize, 200, 500, 900] { + after[i] = after[i].wrapping_add(7); + } + let delta = make_delta(&before, &after); + let result = apply_diff(&before, &delta).unwrap(); + assert_eq!(result, after); + } + + #[test] + fn test_apply_diff_empty_to_nonempty() { + let before: &[u8] = b""; + let after = b"hello"; + let delta = make_delta(before, after); + let result = apply_diff(before, &delta).unwrap(); + assert_eq!(result, after); + } + + #[test] + fn test_apply_diff_malformed_errors() { + // Random bytes are extremely unlikely to be a valid bsdiff header.
+ let bogus_delta = b"not a real bsdiff delta"; + let result = apply_diff(b"anything", bogus_delta); + assert!(result.is_err(), "expected malformed-delta error"); + } + + #[test] + fn test_apply_diff_wrong_source_does_not_panic() { + // Build a delta from one source then try to apply it to a different + // source. qbsdiff's bspatch is content-agnostic but should still + // produce *some* output without panicking — the caller is + // responsible for verifying the result hash matches the expected + // `after_hash`. This test exists to lock in the + // never-panic-on-bad-input contract callers depend on. + let src_a = b"AAAAAAAAAAAAAAAAAAAA"; + let src_b = b"BBBBBBBBBBBBBBBBBBBB"; + let target = b"CCCCCCCCCCCCCCCCCCCC"; + let delta = make_delta(src_a, target); + // Result may or may not equal target — what matters is no panic. + let _ = apply_diff(src_b, &delta); + } +} diff --git a/crates/socket-patch-core/src/patch/mod.rs b/crates/socket-patch-core/src/patch/mod.rs index e17bd8d..6bc295a 100644 --- a/crates/socket-patch-core/src/patch/mod.rs +++ b/crates/socket-patch-core/src/patch/mod.rs @@ -1,3 +1,5 @@ pub mod apply; +pub mod diff; pub mod file_hash; +pub mod package; pub mod rollback; diff --git a/crates/socket-patch-core/src/patch/package.rs b/crates/socket-patch-core/src/patch/package.rs new file mode 100644 index 0000000..a4f4b5f --- /dev/null +++ b/crates/socket-patch-core/src/patch/package.rs @@ -0,0 +1,489 @@ +//! Package- and diff-archive tarball helpers. +//! +//! Both package archives (`.socket/packages/<uuid>.tar.gz`) and diff +//! archives (`.socket/diffs/<uuid>.tar.gz`) use the same on-disk format: +//! a gzipped tar containing one entry per patched file. The entry's path +//! matches the **normalized** relative file path (i.e. without the +//! `package/` prefix used by the API). +//! +//! For package archives, each entry holds the patched file's full bytes. +//! For diff archives, each entry holds a bsdiff delta that transforms the +//!
corresponding `beforeHash` content into the `afterHash` content. + +use std::collections::HashMap; +use std::io::Read; +use std::path::Path; + +use flate2::read::GzDecoder; +use tar::Archive; + +use crate::manifest::schema::PatchFileInfo; + +/// Maximum cumulative *decompressed* bytes we accept from a single +/// archive. Real socket-patch archives are tiny (kilobytes); 64 MiB is a +/// generous ceiling. Beyond this we assume gzip/tar bomb and refuse. +const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 64 * 1024 * 1024; + +/// Maximum size of any single archive entry, in bytes. Caps the buffer +/// we'll allocate per entry, defusing header-driven `with_capacity` +/// allocation attacks. +const MAX_ENTRY_BYTES: u64 = 16 * 1024 * 1024; + +/// Maximum number of entries in an archive. Defuses +/// "tar-of-a-million-empty-files" memory-exhaustion attacks against +/// the in-memory `HashMap`. +const MAX_ENTRIES: usize = 10_000; + +/// Errors produced while reading a package/diff archive. +#[derive(Debug, thiserror::Error)] +pub enum ArchiveError { + #[error("archive I/O error: {0}")] + Io(#[from] std::io::Error), + #[error("entry path {0:?} escapes the archive root")] + UnsafePath(String), + #[error("entry {path:?} is {size} bytes (max {max})")] + EntryTooLarge { path: String, size: u64, max: u64 }, + #[error("archive contains more than {0} entries")] + TooManyEntries(usize), +} + +/// Strip the leading `package/` prefix from an entry path, matching the +/// convention used by `normalize_file_path` in `apply.rs`. +fn normalize_entry_path(path: &str) -> &str { + path.strip_prefix("package/").unwrap_or(path) +} + +/// Read a `.tar.gz` archive into a map of `normalized_path -> bytes`. +/// +/// Returns an error if any entry path is absolute or contains `..` +/// components. Symlinks and other non-regular entries are silently +/// skipped. 
The reader is hard-capped against decompression-bomb / +/// memory-exhaustion attacks: cumulative decompressed bytes, +/// per-entry size, and entry count are all bounded. +/// +/// Note: we never call `tar::Archive::unpack`; the bytes are buffered +/// and later written through `apply_file_patch` to an explicit +/// `pkg_path.join(normalized)`. That avoids the classic +/// symlink-followed-by-write class of tar-extraction attacks at the +/// extraction step itself — the on-disk write site is the single, +/// hash-verified path inside `apply_file_patch`. +pub fn read_archive_to_map(archive_path: &Path) -> Result<HashMap<String, Vec<u8>>, ArchiveError> { + let file = std::fs::File::open(archive_path)?; + // Hard-cap decompressed bytes to defuse gzip / tar bombs. Reads + // beyond the limit yield EOF, which the tar parser surfaces as a + // truncated-archive error. + let bounded = GzDecoder::new(file).take(MAX_TOTAL_DECOMPRESSED_BYTES); + let mut tar = Archive::new(bounded); + + let mut out: HashMap<String, Vec<u8>> = HashMap::new(); + let mut entry_count: usize = 0; + for entry in tar.entries()? { + let mut entry = entry?; + + entry_count += 1; + if entry_count > MAX_ENTRIES { + return Err(ArchiveError::TooManyEntries(MAX_ENTRIES)); + } + + // Only regular files. Skip directories, symlinks, hardlinks, etc. + if entry.header().entry_type() != tar::EntryType::Regular { + continue; + } + + let path = entry.path()?; + let path_str = path.to_string_lossy().to_string(); + + // Reject absolute paths or any `..` components. + if path.is_absolute() + || path + .components() + .any(|c| matches!(c, std::path::Component::ParentDir)) + { + return Err(ArchiveError::UnsafePath(path_str)); + } + + // The header-declared size is attacker-controlled. Reject + // oversize entries *before* allocating so a single u64::MAX + // claim can't OOM the process via `Vec::with_capacity`.
+ let size = entry.size(); + if size > MAX_ENTRY_BYTES { + return Err(ArchiveError::EntryTooLarge { + path: path_str, + size, + max: MAX_ENTRY_BYTES, + }); + } + + let normalized = normalize_entry_path(&path_str).to_string(); + // `size` is bounded above by MAX_ENTRY_BYTES (16 MiB), so the + // cast to `usize` is safe on all targets we support. + let mut bytes = Vec::with_capacity(size as usize); + entry.read_to_end(&mut bytes)?; + out.insert(normalized, bytes); + } + + Ok(out) +} + +/// Subset of `read_archive_to_map` that only keeps entries whose normalized +/// path appears in `expected_files`. Anything else in the archive is +/// silently dropped — this is defense-in-depth so a malicious archive +/// cannot drop arbitrary files into the package directory. +pub fn read_archive_filtered( + archive_path: &Path, + expected_files: &HashMap<String, PatchFileInfo>, +) -> Result<HashMap<String, Vec<u8>>, ArchiveError> { + let allowed: std::collections::HashSet<String> = expected_files + .keys() + .map(|k| normalize_entry_path(k).to_string()) + .collect(); + + let all = read_archive_to_map(archive_path)?; + Ok(all + .into_iter() + .filter(|(k, _)| allowed.contains(k)) + .collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + use flate2::write::GzEncoder; + use flate2::Compression; + use std::io::Write; + use tar::Builder; + + fn write_archive(path: &Path, entries: &[(&str, &[u8])]) { + let file = std::fs::File::create(path).unwrap(); + let gz = GzEncoder::new(file, Compression::default()); + let mut builder = Builder::new(gz); + for (name, data) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(data.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, name, *data).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap(); + } + + fn write_archive_with_symlink(path: &Path, link_name: &str, target: &str) { + let file = std::fs::File::create(path).unwrap(); + let gz = GzEncoder::new(file, Compression::default()); + let mut builder =
Builder::new(gz); + let mut header = tar::Header::new_gnu(); + header.set_entry_type(tar::EntryType::Symlink); + header.set_size(0); + header.set_mode(0o644); + header.set_cksum(); + builder + .append_link(&mut header, link_name, target) + .unwrap(); + builder.into_inner().unwrap().finish().unwrap(); + } + + fn make_file_info() -> HashMap<String, PatchFileInfo> { + let mut files = HashMap::new(); + files.insert( + "package/index.js".to_string(), + PatchFileInfo { + before_hash: "a".repeat(64), + after_hash: "b".repeat(64), + }, + ); + files.insert( + "lib/util.js".to_string(), + PatchFileInfo { + before_hash: "c".repeat(64), + after_hash: "d".repeat(64), + }, + ); + files + } + + #[test] + fn test_read_archive_basic() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("arc.tar.gz"); + write_archive( + &archive, + &[ + ("package/index.js", b"patched index"), + ("lib/util.js", b"patched util"), + ], + ); + + let map = read_archive_to_map(&archive).unwrap(); + assert_eq!(map.len(), 2); + // The "package/" prefix is stripped. + assert_eq!(map.get("index.js").unwrap(), b"patched index"); + assert_eq!(map.get("lib/util.js").unwrap(), b"patched util"); + } + + /// Craft a single-entry ustar archive with `name` written verbatim + /// into the header, bypassing the writer-side path validation that + /// rejects absolute paths and `..`. This lets us exercise the + /// defense-in-depth check inside [`read_archive_to_map`]. + fn write_raw_archive(path: &Path, name: &[u8], data: &[u8]) { + let mut block = [0u8; 512]; + // Name (first 100 bytes). + let copy_len = name.len().min(100); + block[..copy_len].copy_from_slice(&name[..copy_len]); + // Mode "0000644\0". + block[100..108].copy_from_slice(b"0000644\0"); + // Size as octal in 11 chars + NUL.
+ let size_str = format!("{:011o}", data.len()); + block[124..135].copy_from_slice(size_str.as_bytes()); + block[135] = 0; + // mtime + block[136..147].copy_from_slice(b"00000000000"); + block[147] = 0; + // typeflag '0' = normal file + block[156] = b'0'; + // ustar magic + block[257..263].copy_from_slice(b"ustar\0"); + block[263..265].copy_from_slice(b"00"); + // Checksum: spaces during compute. + block[148..156].fill(b' '); + let sum: u32 = block.iter().map(|&b| b as u32).sum(); + let sum_str = format!("{:06o}\0 ", sum); + block[148..156].copy_from_slice(sum_str.as_bytes()); + + let mut tar_bytes = Vec::new(); + tar_bytes.extend_from_slice(&block); + tar_bytes.extend_from_slice(data); + // Pad data to 512-byte boundary. + let pad = (512 - (data.len() % 512)) % 512; + tar_bytes.extend(std::iter::repeat_n(0u8, pad)); + // Two zero blocks mark end of archive. + tar_bytes.extend([0u8; 1024]); + + let file = std::fs::File::create(path).unwrap(); + let mut gz = GzEncoder::new(file, Compression::default()); + gz.write_all(&tar_bytes).unwrap(); + gz.finish().unwrap(); + } + + #[test] + fn test_read_archive_rejects_absolute_paths() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("arc.tar.gz"); + write_raw_archive(&archive, b"/etc/passwd", b"evil"); + + let err = read_archive_to_map(&archive).unwrap_err(); + assert!(matches!(err, ArchiveError::UnsafePath(_))); + } + + #[test] + fn test_read_archive_rejects_parent_traversal() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("arc.tar.gz"); + write_raw_archive(&archive, b"../../etc/passwd", b"evil"); + + let err = read_archive_to_map(&archive).unwrap_err(); + assert!(matches!(err, ArchiveError::UnsafePath(_))); + } + + #[test] + fn test_read_archive_skips_non_regular_entries() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("arc.tar.gz"); + write_archive_with_symlink(&archive, "link", "target"); + // Symlink entries should be silently skipped. 
+ let map = read_archive_to_map(&archive).unwrap(); + assert!(map.is_empty()); + } + + #[test] + fn test_read_archive_filtered_drops_unexpected_entries() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("arc.tar.gz"); + write_archive( + &archive, + &[ + ("package/index.js", b"patched index"), + ("lib/util.js", b"patched util"), + ("bonus/extra.js", b"unwanted"), + ], + ); + + let files = make_file_info(); + let map = read_archive_filtered(&archive, &files).unwrap(); + // Only the two expected paths survive. + assert_eq!(map.len(), 2); + assert!(map.contains_key("index.js")); + assert!(map.contains_key("lib/util.js")); + assert!(!map.contains_key("bonus/extra.js")); + } + + #[test] + fn test_read_archive_missing_file() { + let result = read_archive_to_map(Path::new("/nonexistent/archive.tar.gz")); + assert!(result.is_err()); + } + + #[test] + fn test_normalize_entry_path() { + assert_eq!(normalize_entry_path("package/lib/x.js"), "lib/x.js"); + assert_eq!(normalize_entry_path("lib/x.js"), "lib/x.js"); + assert_eq!(normalize_entry_path("packagefoo/x.js"), "packagefoo/x.js"); + } + + #[test] + fn test_read_archive_corrupt_gzip() { + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("bogus.tar.gz"); + std::fs::write(&archive, b"not actually gzipped").unwrap(); + let result = read_archive_to_map(&archive); + assert!(result.is_err()); + } + + #[test] + #[allow(clippy::needless_borrows_for_generic_args)] + fn test_round_trip_via_builder() { + // Confirms the helpers used to write tests actually work end-to-end. 
+ let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("rt.tar.gz"); + let original: &[u8] = b"hello world"; + write_archive(&archive, &[("only.txt", original)]); + let map = read_archive_to_map(&archive).unwrap(); + assert_eq!(map.get("only.txt").map(|v| v.as_slice()), Some(original)); + } + + // ── Bomb defense tests ───────────────────────────────────────────── + + /// Build a raw tar entry whose header advertises a (potentially + /// fake) `declared_size`, followed by `data` padded to the next 512-byte + /// boundary. Used to forge size-mismatched entries the writer would + /// normally refuse. + fn raw_entry(name: &[u8], declared_size: u64, data: &[u8]) -> Vec<u8> { + let mut block = [0u8; 512]; + let copy_len = name.len().min(100); + block[..copy_len].copy_from_slice(&name[..copy_len]); + block[100..108].copy_from_slice(b"0000644\0"); + let size_str = format!("{:011o}", declared_size); + block[124..135].copy_from_slice(size_str.as_bytes()); + block[135] = 0; + block[136..147].copy_from_slice(b"00000000000"); + block[147] = 0; + block[156] = b'0'; // regular file + block[257..263].copy_from_slice(b"ustar\0"); + block[263..265].copy_from_slice(b"00"); + block[148..156].fill(b' '); + let sum: u32 = block.iter().map(|&b| b as u32).sum(); + let sum_str = format!("{:06o}\0 ", sum); + block[148..156].copy_from_slice(sum_str.as_bytes()); + + let mut out = Vec::new(); + out.extend_from_slice(&block); + out.extend_from_slice(data); + let pad = if data.is_empty() { + 0 + } else { + (512 - (data.len() % 512)) % 512 + }; + out.extend(std::iter::repeat_n(0u8, pad)); + out + } + + fn write_raw_tar_gz(path: &Path, entries: &[Vec<u8>], trailer: bool) { + let mut tar_bytes = Vec::new(); + for e in entries { + tar_bytes.extend_from_slice(e); + } + if trailer { + tar_bytes.extend([0u8; 1024]); + } + let file = std::fs::File::create(path).unwrap(); + let mut gz = GzEncoder::new(file, Compression::default()); + gz.write_all(&tar_bytes).unwrap(); + gz.finish().unwrap(); + } +
+ #[test] + fn test_read_archive_rejects_oversize_entry_header() { + // Forge a header that claims a 1 GiB entry — well over + // MAX_ENTRY_BYTES — backed by tiny actual data. Without the + // size check, `Vec::with_capacity` would attempt the 1 GiB + // allocation. + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("oversize.tar.gz"); + let entry = raw_entry(b"big.bin", 1024 * 1024 * 1024, b"tiny"); + write_raw_tar_gz(&archive, &[entry], true); + + let err = read_archive_to_map(&archive).unwrap_err(); + assert!( + matches!(err, ArchiveError::EntryTooLarge { .. }), + "expected EntryTooLarge, got {:?}", + err + ); + } + + #[test] + fn test_read_archive_rejects_too_many_entries() { + // Build an archive with one more entry than MAX_ENTRIES. Each + // entry is empty so the archive itself is small. + let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("many.tar.gz"); + let entries: Vec<Vec<u8>> = (0..(MAX_ENTRIES + 1)) + .map(|i| raw_entry(format!("f{i}").as_bytes(), 0, b"")) + .collect(); + write_raw_tar_gz(&archive, &entries, true); + + let err = read_archive_to_map(&archive).unwrap_err(); + assert!( + matches!(err, ArchiveError::TooManyEntries(_)), + "expected TooManyEntries, got {:?}", + err + ); + } + + #[test] + fn test_read_archive_decompression_bomb_truncated() { + // Build a tar whose entries each legitimately fit + // under MAX_ENTRY_BYTES but whose total content makes the + // decompressed stream exceed MAX_TOTAL_DECOMPRESSED_BYTES. + // We do this by chaining several near-MAX_ENTRY_BYTES entries. + // + // The `Read::take(MAX_TOTAL_DECOMPRESSED_BYTES)` wrapper + // truncates reads beyond the cap. After the cap is exhausted, + // the next `entries()` iteration returns a malformed-archive + // I/O error — which surfaces as `ArchiveError::Io`. We accept + // any error (in practice `Io`; `TooManyEntries` cannot fire with + // five entries) as evidence the bomb was defused.
+ let dir = tempfile::tempdir().unwrap(); + let archive = dir.path().join("bomb.tar.gz"); + + // Five entries of (MAX_ENTRY_BYTES - 1) bytes each = ~80 MiB declared, but + // gzip compresses zeroes ~1000x so the on-disk archive is small. + // The total must *exceed* 64 MiB for the cap to fire — the cap is enforced + // strictly, so an entry that crosses it will be truncated. + let chunk = vec![0u8; (MAX_ENTRY_BYTES - 1) as usize]; + let entry1 = raw_entry(b"a.bin", chunk.len() as u64, &chunk); + let entry2 = raw_entry(b"b.bin", chunk.len() as u64, &chunk); + let entry3 = raw_entry(b"c.bin", chunk.len() as u64, &chunk); + let entry4 = raw_entry(b"d.bin", chunk.len() as u64, &chunk); + // 4 * ~16 MiB plus headers already crosses the 64 MiB cap inside the + // fourth entry. Add a fifth for margin. + let entry5 = raw_entry(b"e.bin", chunk.len() as u64, &chunk); + write_raw_tar_gz(&archive, &[entry1, entry2, entry3, entry4, entry5], true); + + let result = read_archive_to_map(&archive); + // Either we get an Io error from truncation or the read + // succeeds with the first ~4 entries — both prove the cap + // prevented unbounded growth. Failure mode we want to RULE + // OUT: reading all 5 entries (~80 MiB) without error. + match result { + Err(_) => { /* defused via Io / truncation */ } + Ok(map) => { + // If parsing didn't error, ensure we didn't ingest all 5.
+ assert!( + map.len() < 5, + "decompression cap failed: ingested {} entries (~{} MiB)", + map.len(), + map.len() * (MAX_ENTRY_BYTES as usize - 1) / (1024 * 1024) + ); + } + } + } +} diff --git a/crates/socket-patch-core/src/utils/cleanup_blobs.rs b/crates/socket-patch-core/src/utils/cleanup_blobs.rs index 0121cb8..362227d 100644 --- a/crates/socket-patch-core/src/utils/cleanup_blobs.rs +++ b/crates/socket-patch-core/src/utils/cleanup_blobs.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::path::Path; use crate::manifest::operations::get_after_hash_blobs; @@ -87,6 +88,78 @@ pub async fn cleanup_unused_blobs( Ok(result) } +/// Cleans up unused per-patch archive files from `archives_dir`. +/// +/// Archives are named `<uuid>.tar.gz`. Any file matching that +/// pattern whose UUID is not present in the manifest is removed. Files +/// that do *not* end in `.tar.gz` are treated as orphans and also +/// removed — these directories are managed exclusively by socket-patch, +/// so any stray non-archive file is assumed to be left over from an +/// older socket-patch version. Subdirectories and hidden files are +/// left untouched. +pub async fn cleanup_unused_archives( + manifest: &PatchManifest, + archives_dir: &Path, + dry_run: bool, +) -> Result<CleanupResult, std::io::Error> { + let used_uuids: HashSet<String> = manifest + .patches + .values() + .map(|r| r.uuid.clone()) + .collect(); + + if tokio::fs::metadata(archives_dir).await.is_err() { + return Ok(CleanupResult { + blobs_checked: 0, + blobs_removed: 0, + bytes_freed: 0, + removed_blobs: vec![], + }); + } + + let mut read_dir = tokio::fs::read_dir(archives_dir).await?; + let mut entries = Vec::new(); + while let Some(entry) = read_dir.next_entry().await?
{ + entries.push(entry); + } + + let mut result = CleanupResult { + blobs_checked: entries.len(), + blobs_removed: 0, + bytes_freed: 0, + removed_blobs: vec![], + }; + + for entry in &entries { + let file_name = entry.file_name(); + let file_name_str = file_name.to_string_lossy().to_string(); + if file_name_str.starts_with('.') { + continue; + } + let archive_path = archives_dir.join(&file_name_str); + let metadata = tokio::fs::metadata(&archive_path).await?; + if !metadata.is_file() { + continue; + } + // Strip the .tar.gz suffix to recover the UUID; if it doesn't end + // in .tar.gz, treat the entry as orphaned and remove it. + let uuid_part = file_name_str + .strip_suffix(".tar.gz") + .unwrap_or(&file_name_str); + if used_uuids.contains(uuid_part) { + continue; + } + result.blobs_removed += 1; + result.bytes_freed += metadata.len(); + result.removed_blobs.push(file_name_str); + if !dry_run { + tokio::fs::remove_file(&archive_path).await?; + } + } + + Ok(result) +} + /// Formats the cleanup result for human-readable output. 
pub fn format_cleanup_result(result: &CleanupResult, dry_run: bool) -> String { if result.blobs_checked == 0 { @@ -402,6 +475,99 @@ mod tests { ); } + // ── cleanup_unused_archives tests ────────────────────────────── + + const SECOND_UUID: &str = "22222222-2222-4222-8222-222222222222"; + + #[tokio::test] + async fn test_cleanup_archives_keeps_referenced_uuid() { + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("packages"); + tokio::fs::create_dir_all(&archives).await.unwrap(); + + let manifest = create_test_manifest(); + tokio::fs::write(archives.join(format!("{TEST_UUID}.tar.gz")), b"keep") + .await + .unwrap(); + tokio::fs::write(archives.join(format!("{SECOND_UUID}.tar.gz")), b"orphan") + .await + .unwrap(); + + let result = cleanup_unused_archives(&manifest, &archives, false) + .await + .unwrap(); + + assert_eq!(result.blobs_removed, 1); + assert!(result + .removed_blobs + .contains(&format!("{SECOND_UUID}.tar.gz"))); + assert!(tokio::fs::metadata(archives.join(format!("{TEST_UUID}.tar.gz"))) + .await + .is_ok()); + assert!(tokio::fs::metadata(archives.join(format!("{SECOND_UUID}.tar.gz"))) + .await + .is_err()); + } + + #[tokio::test] + async fn test_cleanup_archives_dry_run_does_not_delete() { + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("packages"); + tokio::fs::create_dir_all(&archives).await.unwrap(); + + let manifest = create_test_manifest(); + tokio::fs::write(archives.join(format!("{SECOND_UUID}.tar.gz")), b"orphan") + .await + .unwrap(); + + let result = cleanup_unused_archives(&manifest, &archives, true) + .await + .unwrap(); + + assert_eq!(result.blobs_removed, 1); + assert!(tokio::fs::metadata(archives.join(format!("{SECOND_UUID}.tar.gz"))) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_cleanup_archives_removes_non_archive_files() { + // Stray files (no .tar.gz suffix, or wrong UUID) are treated as + // orphans. 
This keeps the directory tidy when the on-disk format + // changes in the future. + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("packages"); + tokio::fs::create_dir_all(&archives).await.unwrap(); + + let manifest = create_test_manifest(); + tokio::fs::write(archives.join("stray.txt"), b"junk") + .await + .unwrap(); + tokio::fs::write(archives.join(format!("{TEST_UUID}.tar.gz")), b"keep") + .await + .unwrap(); + + let result = cleanup_unused_archives(&manifest, &archives, false) + .await + .unwrap(); + + assert_eq!(result.blobs_removed, 1); + assert!(result.removed_blobs.contains(&"stray.txt".to_string())); + } + + #[tokio::test] + async fn test_cleanup_archives_nonexistent_dir() { + let dir = tempfile::tempdir().unwrap(); + let archives = dir.path().join("does-not-exist"); + let manifest = create_test_manifest(); + + let result = cleanup_unused_archives(&manifest, &archives, false) + .await + .unwrap(); + assert_eq!(result.blobs_checked, 0); + assert_eq!(result.blobs_removed, 0); + } + #[test] fn test_format_cleanup_result_dry_run_lists_blobs() { let result = CleanupResult { From d91f73bc2ede68c25ddd979cba6ecfdf2a22e60a Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 14 May 2026 16:14:02 -0400 Subject: [PATCH 2/2] chore: clean up stray dead-code markers - e2e_npm.rs: NPM_PURL is actually used by 5 assertions; drop the stale `#[allow(dead_code)]`. - maven_crawler.rs: remove `read_pom_in_dir`, an async helper that was never called and only existed under `#[allow(dead_code)]`. No behavior change. 307 tests still pass; cargo build clean. 
Assisted-by: Claude Code:opus-4-7 --- crates/socket-patch-cli/tests/e2e_npm.rs | 1 - .../src/crawlers/maven_crawler.rs | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/crates/socket-patch-cli/tests/e2e_npm.rs b/crates/socket-patch-cli/tests/e2e_npm.rs index 376c495..812955e 100644 --- a/crates/socket-patch-cli/tests/e2e_npm.rs +++ b/crates/socket-patch-cli/tests/e2e_npm.rs @@ -23,7 +23,6 @@ use sha2::{Digest, Sha256}; // --------------------------------------------------------------------------- const NPM_UUID: &str = "80630680-4da6-45f9-bba8-b888e0ffd58c"; -#[allow(dead_code)] const NPM_PURL: &str = "pkg:npm/minimist@1.2.2"; /// Git SHA-256 of the *unpatched* `index.js` shipped with minimist 1.2.2. diff --git a/crates/socket-patch-core/src/crawlers/maven_crawler.rs b/crates/socket-patch-core/src/crawlers/maven_crawler.rs index c78c875..5b9430e 100644 --- a/crates/socket-patch-core/src/crawlers/maven_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/maven_crawler.rs @@ -410,21 +410,6 @@ impl MavenCrawler { false } - - /// Find and parse the first `.pom` file in a directory. - #[allow(dead_code)] - async fn read_pom_in_dir(dir: &Path) -> Option<(String, String, String)> { - let mut entries = tokio::fs::read_dir(dir).await.ok()?; - while let Ok(Some(entry)) = entries.next_entry().await { - if let Some(name) = entry.file_name().to_str() { - if name.ends_with(".pom") { - let content = tokio::fs::read_to_string(entry.path()).await.ok()?; - return parse_pom_group_artifact_version(&content); - } - } - } - None - } } impl Default for MavenCrawler {