From 2f7a7861651baac096b3318bcc0feee55476bae8 Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 01:33:28 +0200 Subject: [PATCH 1/6] some performance improvements --- Cargo.lock | 33 ++++++++++++++ Cargo.toml | 4 +- src/ingest.rs | 122 +++++++++++++++++++++++++++++++++++++------------- src/main.rs | 21 ++++++++- 4 files changed, 147 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4e26a2..6ddaa2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,6 +166,28 @@ dependencies = [ "memchr", ] +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -185,6 +207,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -266,6 +297,8 @@ dependencies = [ "anyhow", "clap", "content_inspector", + "crossbeam", + "crossbeam-channel", "env_logger", "glob", "ignore", diff --git a/Cargo.toml b/Cargo.toml index 93fd3be..b438927 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ categories = ["command-line-utilities"] anyhow = "1.0.100" clap = { version = "4.5.54", features = ["derive"] } content_inspector = "0.2.4" +crossbeam = "0.8.4" +crossbeam-channel = "0.5.15" env_logger = "0.11.8" glob = "0.3.3" ignore = "0.4.25" @@ -28,4 +30,4 @@ tempfile = "3.24.0" [profile.release] strip = true lto = true -codegen-units = 1 \ No newline at end of file +codegen-units = 1 diff --git a/src/ingest.rs b/src/ingest.rs index 1d8653d..6f0616b 100644 --- a/src/ingest.rs +++ b/src/ingest.rs @@ -1,9 +1,11 @@ use crate::decorator::{ContentDecorator, GlobalDecorator}; use anyhow::Result; +use crossbeam_channel::bounded; use log::{error, info, warn}; use rayon::prelude::*; +use std::collections::BTreeMap; use std::fs::File; -use std::io::{self, Write}; +use std::io::{self, BufWriter, Write}; use std::path::PathBuf; use tiktoken_rs::{CoreBPE, cl100k_base}; @@ -17,7 +19,12 @@ pub enum OutputDestination { pub const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024; // 10MB +pub struct IngestMetrics { + pub total_tokens: usize, +} + struct ProcessedFile { + index: usize, content: String, tokens: usize, } @@ -27,7 +34,7 @@ pub fn ingest( output_dest: OutputDestination, content_decorator: &dyn ContentDecorator, global_decorator: Option<&dyn GlobalDecorator>, -) -> Result<()> { +) -> Result> { match &output_dest { OutputDestination::File(path) => info!("Writing digest to {}", path.display()), OutputDestination::Stdout => info!("Writing digest to stdout"), @@ -37,41 +44,88 @@ pub fn ingest( // Pre-load tokenizer let tokenizer = cl100k_base().ok(); - // Process files in parallel - let processed_results: Vec = files - .par_iter() - .filter_map(|path| process_single_file(path, content_decorator, tokenizer.as_ref())) - .collect(); + let (tx, rx) = bounded(32); // Buffer some results to keep cores busy + + let metrics = crossbeam::scope(|scope| -> Result { + // Spawn writer thread + let rx = rx; // Move rx into the scope, but it's shared + let writer_handle = scope.spawn(move |_| -> Result { + let mut writer: Option> = match output_dest { + OutputDestination::File(path) => { + Some(Box::new(BufWriter::new(File::create(path)?))) + } + OutputDestination::Stdout => Some(Box::new(io::stdout())), + OutputDestination::Null => None, + }; + + let mut total_tokens = 0; + let mut pending = BTreeMap::new(); + let mut next_index = 0; + + if let Some(prologue) = global_decorator.and_then(|g| g.prologue(files)) { + if let Some(ref mut w) = writer { + writeln!(w, "{prologue}")?; + } + } + + while next_index < files.len() { + // Check if we already have the next segment + while let Some(processed) = pending.remove(&next_index) { + let processed: ProcessedFile = processed; + if let Some(ref mut w) = writer { + writeln!(w, "{}", processed.content)?; + } + total_tokens += processed.tokens; + next_index += 1; + } + + if next_index >= files.len() { + break; + } + + // Wait for more results + if let Ok(processed) = rx.recv() { + let processed: ProcessedFile = processed; + pending.insert(processed.index, processed); + } else { + break; // Channel closed + } + } + + if let Some(ref mut w) = writer { + w.flush()?; + } + + Ok(total_tokens) + }); - let mut writer: Option> = match output_dest { - OutputDestination::File(path) => Some(Box::new(File::create(path)?)), - OutputDestination::Stdout => Some(Box::new(io::stdout())), - OutputDestination::Null => None, - }; + // Process files in parallel + files.par_iter().enumerate().for_each(|(idx, path)| { + if let Some(processed) = + process_single_file(idx, path, content_decorator, tokenizer.as_ref()) + { + let _ = tx.send(processed); + } + }); - let mut total_tokens = 0; + drop(tx); // Signal completion - if let Some(prologue) = global_decorator.and_then(|g| g.prologue(files)) { - if let Some(ref mut w) = writer { - writeln!(w, "{prologue}")?; - } - } + let total_tokens = writer_handle + .join() + .map_err(|_| anyhow::anyhow!("Writer thread panicked"))??; - // Write results sequentially to maintain order (files was sorted in traversal) - for processed in processed_results { - if let Some(ref mut w) = writer { - writeln!(w, "{}", processed.content)?; - } - total_tokens += processed.tokens; - } + Ok(IngestMetrics { total_tokens }) + }) + .map_err(|e| anyhow::anyhow!("Scope error: {:?}", e))??; - info!("Total estimated tokens: {total_tokens}"); - println!("Total estimated tokens: {total_tokens}"); + info!("Total estimated tokens: {}", metrics.total_tokens); + println!("Total estimated tokens: {}", metrics.total_tokens); - Ok(()) + Ok(Some(metrics)) } fn process_single_file( + index: usize, path: &PathBuf, content_decorator: &dyn ContentDecorator, tokenizer: Option<&CoreBPE>, @@ -85,6 +139,7 @@ fn process_single_file( metadata.len() ); return Some(ProcessedFile { + index, content: format!("----- {} (Skipped: >10MB) -----", path.display()), tokens: 0, }); @@ -97,27 +152,30 @@ fn process_single_file( Err(e) => { error!("Error opening {}: {e}", path.display()); return Some(ProcessedFile { + index, content: format!("----- {} (Error opening file) -----", path.display()), tokens: 0, }); } }; - let mut buffer = [0u8; 1024]; - let n = match std::io::Read::read(&mut file, &mut buffer) { + let mut prelude_buffer = [0u8; 1024]; + let n = match std::io::Read::read(&mut file, &mut prelude_buffer) { Ok(n) => n, Err(e) => { error!("Error reading prelude of {}: {e}", path.display()); return Some(ProcessedFile { + index, content: format!("----- {} (Error reading prelude) -----", path.display()), tokens: 0, }); } }; - if n > 0 && content_inspector::inspect(&buffer[..n]).is_binary() { + if n > 0 && content_inspector::inspect(&prelude_buffer[..n]).is_binary() { warn!("Skipping binary file: {}", path.display()); return Some(ProcessedFile { + index, content: format!("----- {} (Skipped: Binary) -----", path.display()), tokens: 0, }); @@ -133,6 +191,7 @@ fn process_single_file( if let Err(e) = std::io::Read::read_to_string(&mut file, &mut content) { error!("Error reading {}: {e}", path.display()); return Some(ProcessedFile { + index, content: format!("----- {} (Error reading content) -----", path.display()), tokens: 0, }); @@ -164,6 +223,7 @@ fn process_single_file( }; Some(ProcessedFile { + index, content: final_output.trim_end().to_string(), tokens, }) diff --git a/src/main.rs b/src/main.rs index 9cbc8ab..044db73 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use clap::Parser; use log::{LevelFilter, info}; use std::env; use std::path::PathBuf; +use std::time::Instant; use traversal::TraversalOptions; use crate::decorator::{ContentDecorator, DefaultDecorator, FileTreeDecorator, XmlDecorator}; @@ -62,6 +63,10 @@ struct Cli { /// Dry run (only token estimation) #[arg(long)] dry: bool, + + /// Show detailed timing information + #[arg(short, long)] + timing: bool, } fn init_logger(verbose: bool) { @@ -79,6 +84,7 @@ fn init_logger(verbose: bool) { } fn main() -> Result<()> { + let global_start = Instant::now(); let cli = Cli::parse(); init_logger(cli.verbose); @@ -102,7 +108,9 @@ fn main() -> Result<()> { }; info!("Traversing files in {}", options.root.display()); + let discovery_start = Instant::now(); let files = traversal::traverse(&options)?; + let discovery_duration = discovery_start.elapsed(); info!("Found {} files", files.len()); if files.is_empty() { @@ -133,14 +141,25 @@ fn main() -> Result<()> { mode: cli.prologue, }; - ingest::ingest( + let ingest_start = Instant::now(); + let _ingest_metrics = ingest::ingest( &files, output_dest, content_decorator.as_ref(), Some(&global_decorator), )?; + let ingest_duration = ingest_start.elapsed(); info!("Done!"); + if cli.timing { + println!("\nTiming Summary:"); + println!("----------------------------------------"); + println!("Discovery: {:?}", discovery_duration); + println!("Ingestion: {:?}", ingest_duration); + println!("Total Runtime: {:?}", global_start.elapsed()); + println!("----------------------------------------"); + } + Ok(()) } From 72afa03cde147a51a465ef38bcdaad4c922277d7 Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 01:54:49 +0200 Subject: [PATCH 2/6] add no tokens mode, rethink token count strategy --- src/ingest.rs | 56 +++++++++++++++++++++++---------------------------- src/main.rs | 5 +++++ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/src/ingest.rs b/src/ingest.rs index 6f0616b..69dfb53 100644 --- a/src/ingest.rs +++ b/src/ingest.rs @@ -26,7 +26,6 @@ pub struct IngestMetrics { struct ProcessedFile { index: usize, content: String, - tokens: usize, } pub fn ingest( @@ -34,6 +33,7 @@ pub fn ingest( output_dest: OutputDestination, content_decorator: &dyn ContentDecorator, global_decorator: Option<&dyn GlobalDecorator>, + count_tokens: bool, ) -> Result> { match &output_dest { OutputDestination::File(path) => info!("Writing digest to {}", path.display()), @@ -41,8 +41,12 @@ pub fn ingest( OutputDestination::Null => info!("Dry run: only token estimation will be performed"), } - // Pre-load tokenizer - let tokenizer = cl100k_base().ok(); + // Pre-load tokenizer if needed + let tokenizer = if count_tokens { + cl100k_base().ok() + } else { + None + }; let (tx, rx) = bounded(32); // Buffer some results to keep cores busy @@ -51,31 +55,28 @@ pub fn ingest( let rx = rx; // Move rx into the scope, but it's shared let writer_handle = scope.spawn(move |_| -> Result { let mut writer: Option> = match output_dest { - OutputDestination::File(path) => { + OutputDestination::File(ref path) => { Some(Box::new(BufWriter::new(File::create(path)?))) } OutputDestination::Stdout => Some(Box::new(io::stdout())), OutputDestination::Null => None, }; - let mut total_tokens = 0; + let mut full_content = String::new(); let mut pending = BTreeMap::new(); let mut next_index = 0; if let Some(prologue) = global_decorator.and_then(|g| g.prologue(files)) { - if let Some(ref mut w) = writer { - writeln!(w, "{prologue}")?; - } + full_content.push_str(&prologue); + full_content.push('\n'); } while next_index < files.len() { // Check if we already have the next segment while let Some(processed) = pending.remove(&next_index) { let processed: ProcessedFile = processed; - if let Some(ref mut w) = writer { - writeln!(w, "{}", processed.content)?; - } - total_tokens += processed.tokens; + full_content.push_str(&processed.content); + full_content.push('\n'); next_index += 1; } @@ -92,7 +93,17 @@ pub fn ingest( } } + // Trim the last newline if we have content + let final_output = full_content.trim_end(); + + let total_tokens = if let Some(t) = tokenizer { + t.encode_with_special_tokens(final_output).len() + } else { + 0 + }; + if let Some(ref mut w) = writer { + writeln!(w, "{}", final_output)?; w.flush()?; } @@ -101,9 +112,7 @@ pub fn ingest( // Process files in parallel files.par_iter().enumerate().for_each(|(idx, path)| { - if let Some(processed) = - process_single_file(idx, path, content_decorator, tokenizer.as_ref()) - { + if let Some(processed) = process_single_file(idx, path, content_decorator) { let _ = tx.send(processed); } }); @@ -128,7 +137,6 @@ fn process_single_file( index: usize, path: &PathBuf, content_decorator: &dyn ContentDecorator, - tokenizer: Option<&CoreBPE>, ) -> Option { // 1. Check file size if let Ok(metadata) = std::fs::metadata(path) { @@ -141,7 +149,6 @@ fn process_single_file( return Some(ProcessedFile { index, content: format!("----- {} (Skipped: >10MB) -----", path.display()), - tokens: 0, }); } } @@ -154,7 +161,6 @@ fn process_single_file( return Some(ProcessedFile { index, content: format!("----- {} (Error opening file) -----", path.display()), - tokens: 0, }); } }; @@ -167,7 +173,6 @@ fn process_single_file( return Some(ProcessedFile { index, content: format!("----- {} (Error reading prelude) -----", path.display()), - tokens: 0, }); } }; @@ -177,7 +182,6 @@ fn process_single_file( return Some(ProcessedFile { index, content: format!("----- {} (Skipped: Binary) -----", path.display()), - tokens: 0, }); } @@ -193,7 +197,6 @@ fn process_single_file( return Some(ProcessedFile { index, content: format!("----- {} (Error reading content) -----", path.display()), - tokens: 0, }); } @@ -213,19 +216,9 @@ fn process_single_file( final_output.push('\n'); } - // Count tokens - let tokens = if let Some(tokenizer) = tokenizer { - tokenizer - .encode_with_special_tokens(&transformed_content) - .len() - } else { - 0 - }; - Some(ProcessedFile { index, content: final_output.trim_end().to_string(), - tokens, }) } @@ -256,6 +249,7 @@ mod tests { OutputDestination::File(output_path.clone()), &decorator, None, + true, )?; assert!(output_path.exists()); diff --git a/src/main.rs b/src/main.rs index 044db73..3c9100e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -64,6 +64,10 @@ struct Cli { #[arg(long)] dry: bool, + /// Disable token counting + #[arg(long)] + no_tokens: bool, + /// Show detailed timing information #[arg(short, long)] timing: bool, @@ -147,6 +151,7 @@ fn main() -> Result<()> { output_dest, content_decorator.as_ref(), Some(&global_decorator), + !cli.no_tokens, )?; let ingest_duration = ingest_start.elapsed(); From 26b59d6ff8791ef28eb06c30988d96e67d5ea77d Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 02:02:27 +0200 Subject: [PATCH 3/6] fix README.md typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 606a203..be6c252 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,6 @@ and `.toml` files in the current directory and its subdirectories ## Build ```bash -cargo build --relase +cargo build --release cargo install --path . ``` From 4856704b341e55735457b8e3cd91a263029126b3 Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 02:04:54 +0200 Subject: [PATCH 4/6] added tests, markdown decorator, few tokenization performance improvements --- Cargo.lock | 105 +++++++++++++++++++++++++++-- Cargo.toml | 3 +- src/cloner.rs | 12 +++- src/decorator/markdown.rs | 22 +++++++ src/decorator/mod.rs | 2 + src/ingest.rs | 39 ++++++----- src/main.rs | 4 +- tests/integration_tests.rs | 131 +++++++++++++++++++++++++++++++++++++ 8 files changed, 290 insertions(+), 28 deletions(-) create mode 100644 src/decorator/markdown.rs create mode 100644 tests/integration_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 6ddaa2e..eb9ced4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,6 +67,27 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "assert_cmd" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c5bcfa8749ac45dd12cb11055aeeb6b27a3895560d60d71e3c23bf979e60514" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "base64" version = "0.22.1" @@ -222,6 +243,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "either" version = "1.15.0" @@ -278,6 +305,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -295,25 +331,20 @@ name = "gitmelt" version = "0.3.0" dependencies = [ "anyhow", + "assert_cmd", "clap", "content_inspector", "crossbeam", "crossbeam-channel", "env_logger", - "glob", "ignore", "log", + "predicates", "rayon", "tempfile", "tiktoken-rs", ] -[[package]] -name = "glob" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" - [[package]] name = "globset" version = "0.4.18" @@ -409,6 +440,21 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -436,6 +482,36 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "predicates" +version = "3.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa" + +[[package]] +name = "predicates-tree" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" +dependencies = [ + "predicates-core", + "termtree", +] + [[package]] name = "proc-macro2" version = "1.0.105" @@ -596,6 +672,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + [[package]] name = "tiktoken-rs" version = "0.9.1" @@ -623,6 +705,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index b438927..ccd9e6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ content_inspector = "0.2.4" crossbeam = "0.8.4" crossbeam-channel = "0.5.15" env_logger = "0.11.8" -glob = "0.3.3" ignore = "0.4.25" log = "0.4.29" rayon = "1.11.0" @@ -26,6 +25,8 @@ tiktoken-rs = "0.9.1" [dev-dependencies] tempfile = "3.24.0" +assert_cmd = "2.0.16" +predicates = "3.1.3" [profile.release] strip = true diff --git a/src/cloner.rs b/src/cloner.rs index f14ab70..12315e1 100644 --- a/src/cloner.rs +++ b/src/cloner.rs @@ -1,9 +1,19 @@ -use anyhow::{Context, Result}; +use anyhow::{Context, Result, bail}; use log::info; use std::process::Command; use tempfile::TempDir; +fn check_git_installed() -> Result<()> { + match Command::new("git").arg("--version").output() { + Ok(output) if output.status.success() => Ok(()), + _ => { + bail!("Git is not installed or not in PATH. Please install Git to clone repositories.") + } + } +} + pub fn clone_repo(url: &str, branch: Option<&str>) -> Result { + check_git_installed()?; let temp_dir = TempDir::new()?; let target_path = temp_dir.path(); diff --git a/src/decorator/markdown.rs b/src/decorator/markdown.rs new file mode 100644 index 0000000..4d8aa30 --- /dev/null +++ b/src/decorator/markdown.rs @@ -0,0 +1,22 @@ +use super::{ContentDecorator, format_path}; +use std::path::Path; + +pub struct MarkdownDecorator; + +impl ContentDecorator for MarkdownDecorator { + fn before(&self, path: &Path) -> Option { + let path_str = format_path(path); + // Extract extension for syntax highlighting (e.g., "rs", "toml") + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); + + Some(format!("## File: {}\n```{}", path_str, ext)) + } + + fn after(&self, _path: &Path) -> Option { + Some("```".to_string()) + } + + fn transform(&self, _path: &Path, content: String) -> String { + content + } +} diff --git a/src/decorator/mod.rs b/src/decorator/mod.rs index c60236c..d71c8eb 100644 --- a/src/decorator/mod.rs +++ b/src/decorator/mod.rs @@ -2,10 +2,12 @@ use std::path::Path; pub mod default; pub mod file_tree; +pub mod markdown; pub mod xml; pub use default::DefaultDecorator; pub use file_tree::FileTreeDecorator; +pub use markdown::MarkdownDecorator; pub use xml::XmlDecorator; #[derive(clap::ValueEnum, Clone, Debug, Default, PartialEq)] diff --git a/src/ingest.rs b/src/ingest.rs index 69dfb53..df78acc 100644 --- a/src/ingest.rs +++ b/src/ingest.rs @@ -7,7 +7,7 @@ use std::collections::BTreeMap; use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -use tiktoken_rs::{CoreBPE, cl100k_base}; +use tiktoken_rs::cl100k_base; pub const DIGEST_FILENAME: &str = "digest.txt"; @@ -62,21 +62,32 @@ pub fn ingest( OutputDestination::Null => None, }; - let mut full_content = String::new(); + let mut total_tokens = 0; let mut pending = BTreeMap::new(); let mut next_index = 0; if let Some(prologue) = global_decorator.and_then(|g| g.prologue(files)) { - full_content.push_str(&prologue); - full_content.push('\n'); + if let Some(ref t) = tokenizer { + total_tokens += t.encode_with_special_tokens(&prologue).len(); + } + if let Some(ref mut w) = writer { + writeln!(w, "{}", prologue)?; + } } while next_index < files.len() { // Check if we already have the next segment while let Some(processed) = pending.remove(&next_index) { let processed: ProcessedFile = processed; - full_content.push_str(&processed.content); - full_content.push('\n'); + + if let Some(ref t) = tokenizer { + total_tokens += t.encode_with_special_tokens(&processed.content).len(); + } + + if let Some(ref mut w) = writer { + writeln!(w, "{}", processed.content)?; + } + next_index += 1; } @@ -93,17 +104,7 @@ pub fn ingest( } } - // Trim the last newline if we have content - let final_output = full_content.trim_end(); - - let total_tokens = if let Some(t) = tokenizer { - t.encode_with_special_tokens(final_output).len() - } else { - 0 - }; - if let Some(ref mut w) = writer { - writeln!(w, "{}", final_output)?; w.flush()?; } @@ -191,8 +192,8 @@ fn process_single_file( return None; } - let mut content = String::new(); - if let Err(e) = std::io::Read::read_to_string(&mut file, &mut content) { + let mut buffer = Vec::new(); + if let Err(e) = std::io::Read::read_to_end(&mut file, &mut buffer) { error!("Error reading {}: {e}", path.display()); return Some(ProcessedFile { index, @@ -200,6 +201,8 @@ fn process_single_file( }); } + let content = String::from_utf8_lossy(&buffer).to_string(); + // Apply decoration let mut final_output = String::new(); if let Some(before) = content_decorator.before(path) { diff --git a/src/main.rs b/src/main.rs index 3c9100e..db31a4d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,12 +11,13 @@ use std::path::PathBuf; use std::time::Instant; use traversal::TraversalOptions; -use crate::decorator::{ContentDecorator, DefaultDecorator, FileTreeDecorator, XmlDecorator}; +use crate::decorator::{ContentDecorator, DefaultDecorator, FileTreeDecorator, MarkdownDecorator, XmlDecorator}; use crate::ingest::OutputDestination; #[derive(clap::ValueEnum, Clone, Debug)] enum Preset { Default, + Markdown, Xml, } @@ -137,6 +138,7 @@ fn main() -> Result<()> { let content_decorator: Box = match cli.preset { Preset::Default => Box::new(DefaultDecorator), + Preset::Markdown => Box::new(MarkdownDecorator), Preset::Xml => Box::new(XmlDecorator), }; diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs new file mode 100644 index 0000000..10fcb2e --- /dev/null +++ b/tests/integration_tests.rs @@ -0,0 +1,131 @@ +use assert_cmd::Command; +use predicates::prelude::*; +use std::fs::{self, File}; +use std::io::Write; +use tempfile::TempDir; + +#[test] +fn test_cli_ignores_binaries() -> Result<(), Box> { + let temp = TempDir::new()?; + let root = temp.path(); + + // 1. Create a binary file + let bin_path = root.join("program.exe"); + let mut f = File::create(&bin_path)?; + // Write null bytes to look like binary + f.write_all(&[0u8; 100])?; + + // 2. Create a text file + let text_path = root.join("readme.md"); + let mut f = File::create(&text_path)?; + writeln!(f, "Important Context")?; + + // 3. Run gitmelt + let mut cmd = Command::cargo_bin("gitmelt")?; + cmd.arg(root.to_str().unwrap()) + .arg("--stdout") + .arg("--no-tokens"); + + // 4. Verify output + cmd.assert() + .success() + .stdout(predicate::str::contains("Important Context")) + .stdout(predicate::str::contains("Skipped: Binary")); + + Ok(()) +} + +#[test] +fn test_gitignore_logic() -> Result<(), Box> { + let temp = TempDir::new()?; + let root = temp.path(); + + // Create .git directory to ensure ignore crate respects .gitignore + fs::create_dir(root.join(".git"))?; + + // Create .gitignore + let mut gitignore = File::create(root.join(".gitignore"))?; + writeln!(gitignore, "secret.txt")?; + + // Create files + File::create(root.join("secret.txt"))?; + let mut public = File::create(root.join("public.txt"))?; + writeln!(public, "Public info")?; + + let mut cmd = Command::cargo_bin("gitmelt")?; + cmd.arg(root.to_str().unwrap()) + .arg("--stdout") + .arg("--no-tokens"); + + cmd.assert() + .success() + .stdout(predicate::str::contains("public.txt")) + .stdout(predicate::str::contains("secret.txt").not()); + + Ok(()) +} + +#[test] +fn test_file_ordering() -> Result<(), Box> { + let temp = TempDir::new()?; + let root = temp.path(); + + fs::create_dir_all(root.join("a"))?; + fs::create_dir_all(root.join("b"))?; + + let mut az = File::create(root.join("a/z.txt"))?; + writeln!(az, "Content A/Z")?; + + let mut ba = File::create(root.join("b/a.txt"))?; + writeln!(ba, "Content B/A")?; + + let mut cmd = Command::cargo_bin("gitmelt")?; + cmd.arg(root.to_str().unwrap()) + .arg("--stdout") + .arg("--no-tokens"); + + let output = cmd.assert().success().get_output().stdout.clone(); + let output_str = String::from_utf8(output)?; + + let pos_az = output_str.find("a/z.txt").unwrap(); + let pos_ba = output_str.find("b/a.txt").unwrap(); + + assert!(pos_az < pos_ba, "a/z.txt should come before b/a.txt"); + + Ok(()) +} + +#[test] +fn test_include_exclude_complexity() -> Result<(), Box> { + let temp = TempDir::new()?; + let root = temp.path(); + + fs::create_dir_all(root.join("src"))?; + fs::create_dir_all(root.join("tests"))?; + + let mut main_rs = File::create(root.join("src/main.rs"))?; + writeln!(main_rs, "fn main() {{}}")?; + + let mut utils_rs = File::create(root.join("src/utils.rs"))?; + writeln!(utils_rs, "fn utils() {{}}")?; + + let mut test_rs = File::create(root.join("tests/main_test.rs"))?; + writeln!(test_rs, "test")?; + + let mut cmd = Command::cargo_bin("gitmelt")?; + cmd.arg(root.to_str().unwrap()) + .arg("--stdout") + .arg("--no-tokens") + .arg("--include") + .arg("src/*.rs") + .arg("--exclude") + .arg("utils.rs"); + + cmd.assert() + .success() + .stdout(predicate::str::contains("src/main.rs")) + .stdout(predicate::str::contains("src/utils.rs").not()) + .stdout(predicate::str::contains("tests/main_test.rs").not()); + + Ok(()) +} From a1c96bbf37364e7ebbe1d5d6b9c99ee7a5d8803f Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 02:11:23 +0200 Subject: [PATCH 5/6] pedantic clippy fix --- src/decorator/file_tree.rs | 2 +- src/decorator/markdown.rs | 2 +- src/ingest.rs | 9 ++++----- src/main.rs | 9 ++++++--- src/traversal.rs | 5 ++--- tests/integration_tests.rs | 8 ++++---- 6 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/decorator/file_tree.rs b/src/decorator/file_tree.rs index 0e6fa47..c927536 100644 --- a/src/decorator/file_tree.rs +++ b/src/decorator/file_tree.rs @@ -74,7 +74,7 @@ fn print_tree(node: &TreeNode, prefix: &str, output: &mut String) { prefix, connector, name, - if !child.is_file { "/" } else { "" } + if child.is_file { "" } else { "/" } ); if !child.children.is_empty() { diff --git a/src/decorator/markdown.rs b/src/decorator/markdown.rs index 4d8aa30..d4cf4db 100644 --- a/src/decorator/markdown.rs +++ b/src/decorator/markdown.rs @@ -9,7 +9,7 @@ impl ContentDecorator for MarkdownDecorator { // Extract extension for syntax highlighting (e.g., "rs", "toml") let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - Some(format!("## File: {}\n```{}", path_str, ext)) + Some(format!("## File: {path_str}\n```{ext}")) } fn after(&self, _path: &Path) -> Option { diff --git a/src/ingest.rs b/src/ingest.rs index df78acc..b308e2b 100644 --- a/src/ingest.rs +++ b/src/ingest.rs @@ -71,7 +71,7 @@ pub fn ingest( total_tokens += t.encode_with_special_tokens(&prologue).len(); } if let Some(ref mut w) = writer { - writeln!(w, "{}", prologue)?; + writeln!(w, "{prologue}")?; } } @@ -126,7 +126,7 @@ pub fn ingest( Ok(IngestMetrics { total_tokens }) }) - .map_err(|e| anyhow::anyhow!("Scope error: {:?}", e))??; + .map_err(|e| anyhow::anyhow!("Scope error: {e:?}"))??; info!("Total estimated tokens: {}", metrics.total_tokens); println!("Total estimated tokens: {}", metrics.total_tokens); @@ -140,8 +140,8 @@ fn process_single_file( content_decorator: &dyn ContentDecorator, ) -> Option { // 1. Check file size - if let Ok(metadata) = std::fs::metadata(path) { - if metadata.len() > MAX_FILE_SIZE { + if let Ok(metadata) = std::fs::metadata(path) + && metadata.len() > MAX_FILE_SIZE { error!( "Skipping large file: {} ({} bytes)", path.display(), @@ -152,7 +152,6 @@ fn process_single_file( content: format!("----- {} (Skipped: >10MB) -----", path.display()), }); } - } // 2. Check for binary content & Read let mut file = match File::open(path) { diff --git a/src/main.rs b/src/main.rs index db31a4d..45bb24a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,9 @@ use std::path::PathBuf; use std::time::Instant; use traversal::TraversalOptions; -use crate::decorator::{ContentDecorator, DefaultDecorator, FileTreeDecorator, MarkdownDecorator, XmlDecorator}; +use crate::decorator::{ + ContentDecorator, DefaultDecorator, FileTreeDecorator, MarkdownDecorator, XmlDecorator, +}; use crate::ingest::OutputDestination; #[derive(clap::ValueEnum, Clone, Debug)] @@ -24,6 +26,7 @@ enum Preset { #[derive(Parser)] #[command(name = "gitmelt")] #[command(about = "Concatenates file contents into a single digest file", long_about = None)] +#[allow(clippy::struct_excessive_bools)] struct Cli { /// Path to traverse or Git URL #[arg(default_value = ".")] @@ -162,8 +165,8 @@ fn main() -> Result<()> { if cli.timing { println!("\nTiming Summary:"); println!("----------------------------------------"); - println!("Discovery: {:?}", discovery_duration); - println!("Ingestion: {:?}", ingest_duration); + println!("Discovery: {discovery_duration:?}"); + println!("Ingestion: {ingest_duration:?}"); println!("Total Runtime: {:?}", global_start.elapsed()); println!("----------------------------------------"); } diff --git a/src/traversal.rs b/src/traversal.rs index 4cb4758..de7f44d 100644 --- a/src/traversal.rs +++ b/src/traversal.rs @@ -169,13 +169,12 @@ mod tests { let files = traverse(&options)?; for f in &files { - eprintln!("Found: {:?}", f); + eprintln!("Found: {f:?}"); } assert!( files.iter().any(|p| p.ends_with("main.rs")), - "Files found: {:?}", - files + "Files found: {files:?}" ); assert!(!files.iter().any(|p| p.ends_with("Cargo.lock"))); assert!(!files.iter().any(|p| p.ends_with("test.lock"))); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 10fcb2e..dd8165c 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -21,7 +21,7 @@ fn test_cli_ignores_binaries() -> Result<(), Box> { writeln!(f, "Important Context")?; // 3. Run gitmelt - let mut cmd = Command::cargo_bin("gitmelt")?; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_gitmelt")); cmd.arg(root.to_str().unwrap()) .arg("--stdout") .arg("--no-tokens"); @@ -52,7 +52,7 @@ fn test_gitignore_logic() -> Result<(), Box> { let mut public = File::create(root.join("public.txt"))?; writeln!(public, "Public info")?; - let mut cmd = Command::cargo_bin("gitmelt")?; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_gitmelt")); cmd.arg(root.to_str().unwrap()) .arg("--stdout") .arg("--no-tokens"); @@ -79,7 +79,7 @@ fn test_file_ordering() -> Result<(), Box> { let mut ba = File::create(root.join("b/a.txt"))?; writeln!(ba, "Content B/A")?; - let mut cmd = Command::cargo_bin("gitmelt")?; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_gitmelt")); cmd.arg(root.to_str().unwrap()) .arg("--stdout") .arg("--no-tokens"); @@ -112,7 +112,7 @@ fn test_include_exclude_complexity() -> Result<(), Box> { let mut test_rs = File::create(root.join("tests/main_test.rs"))?; writeln!(test_rs, "test")?; - let mut cmd = Command::cargo_bin("gitmelt")?; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_gitmelt")); cmd.arg(root.to_str().unwrap()) .arg("--stdout") .arg("--no-tokens") From d6a69009f281b6d4747a993c2db1a3ba387e2692 Mon Sep 17 00:00:00 2001 From: qustrolabe Date: Sat, 10 Jan 2026 02:15:55 +0200 Subject: [PATCH 6/6] version increment --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eb9ced4..468ecc0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -328,7 +328,7 @@ dependencies = [ [[package]] name = "gitmelt" -version = "0.3.0" +version = "0.4.0" dependencies = [ "anyhow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index ccd9e6c..eea26af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gitmelt" -version = "0.3.0" +version = "0.4.0" edition = "2024" authors = ["qustrolabe "] description = "a tool to turn repository into single file text digest to conveniently feed into LLM"