diff --git a/.codex/config.toml b/.codex/config.toml new file mode 100644 index 0000000..4334324 --- /dev/null +++ b/.codex/config.toml @@ -0,0 +1,4 @@ +[mcp_servers.tessl] +type = "stdio" +command = "tessl" +args = [ "mcp", "start" ] diff --git a/.cursor/mcp.json b/.cursor/mcp.json new file mode 100644 index 0000000..ebfccaa --- /dev/null +++ b/.cursor/mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "tessl": { + "type": "stdio", + "command": "tessl", + "args": [ + "mcp", + "start" + ] + } + } +} diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 0000000..ebfccaa --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "tessl": { + "type": "stdio", + "command": "tessl", + "args": [ + "mcp", + "start" + ] + } + } +} diff --git a/.gitignore b/.gitignore index d732ec5..61b5300 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,8 @@ tests/fixtures/* # Git worktrees .worktrees/ + + +# tessl-generated files +**/tessl__* +.tessl/tiles/ diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..ebfccaa --- /dev/null +++ b/.mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "tessl": { + "type": "stdio", + "command": "tessl", + "args": [ + "mcp", + "start" + ] + } + } +} diff --git a/.mergify.yml b/.mergify.yml index 233b9b4..2d967d1 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -1,6 +1,41 @@ queue_rules: + # Dosubot: only needs quality (fmt + clippy) to pass + - name: dosubot + merge_method: squash + autoqueue: true + queue_conditions: + - author = dosubot[bot] + - base = main + merge_conditions: + - check-success = quality + + # Dependabot: full CI required + - name: dependabot + merge_method: squash + autoqueue: true + queue_conditions: + - author = dependabot[bot] + - base = main + merge_conditions: + - check-success = quality + - check-success = msrv (stable) + - check-success = msrv (stable minus 1 releases) + - check-success = msrv (stable minus 2 releases) + - check-success = msrv (stable 
minus 3 releases) + - check-success = msrv (stable minus 4 releases) + - check-success = test + - check-success = test-cross-platform (ubuntu-latest, Linux) + - check-success = test-cross-platform (macos-latest, macOS) + - check-success = test-cross-platform (windows-latest, Windows) + - check-success = coverage + + # Human PRs: manually enqueued via /queue command (repo permissions restrict to maintainers) - name: default merge_method: squash + queue_conditions: + - base = main + - author != dependabot[bot] + - author != dosubot[bot] merge_conditions: - check-success = quality - check-success = msrv (stable) @@ -13,58 +48,50 @@ queue_rules: - check-success = test-cross-platform (macos-latest, macOS) - check-success = test-cross-platform (windows-latest, Windows) - check-success = coverage + pull_request_rules: - - name: Queue maintainer PRs with lgtm label + - name: Auto-approve dosubot PRs + description: Approve dosubot PRs so they can proceed through the queue conditions: - base = main - - author=@maintainers - - label = lgtm - - label != do-not-merge + - author = dosubot[bot] actions: - queue: - name: default - - name: Auto-approve and queue dependabot PRs + review: + type: APPROVE + message: Automatically approved by Mergify + + - name: Auto-approve dependabot PRs + description: Approve dependabot PRs so they can proceed through the queue conditions: - base = main - author = dependabot[bot] - - label != do-not-merge - - -files~=\.github/workflows/release\.yml actions: review: type: APPROVE message: Automatically approved by Mergify - queue: - name: default - - name: Queue external PRs when approved by maintainer - conditions: - - base = main - - -author=@maintainers - - author != dependabot[bot] - - approved-reviews-by=@maintainers - - label != do-not-merge - actions: - queue: - name: default + - name: Keep PRs up to date with main conditions: - base = main - -conflict - -draft - - label != do-not-merge actions: update: {} + merge_protections: - name: Enforce 
conventional commit description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/ if: - base = main + - author != dependabot[bot] + - author != dosubot[bot] success_conditions: - "title ~= ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\ \\))?:" - - name: CI must pass - description: All CI checks must pass. This protection prevents manual merges - that bypass the merge queue. + + - name: Full CI must pass + description: All CI checks must pass. This protection prevents manual merges that bypass the merge queue. if: - base = main success_conditions: @@ -79,6 +106,7 @@ merge_protections: - check-success = test-cross-platform (macos-latest, macOS) - check-success = test-cross-platform (windows-latest, Windows) - check-success = coverage + - name: Do not merge outdated PRs description: Make sure PRs are within 10 commits of the base branch before merging if: diff --git a/AGENTS.md b/AGENTS.md index 8091021..3fe6122 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -67,7 +67,7 @@ Use idiomatic `clap` derive API patterns. 
Push validation into clap wherever pos | `--no-tags` | | `Vec` | Repeatable, runtime overlap check with `--only-tags` | | `--min-len` | `-m` | `Option` | Custom parser enforces >= 1 | | `--top` | `-t` | `Option` | Custom parser enforces >= 1 | -| `--enc` | `-e` | `Option` | ascii, utf8, utf16, utf16le, utf16be | +| `--enc` | | `Option` | ascii, utf8, utf16, utf16le, utf16be | | `--raw` | | bool | Conflicts with `--only-tags`, `--no-tags`, `--top`, `--debug`, `--yara` | | `--summary` | | bool | Conflicts with `--json`, `--yara`; runtime TTY check | | `--debug` | | bool | Conflicts with `--raw` | diff --git a/GOTCHAS.md b/GOTCHAS.md index 8a4b9c9..8d6f971 100644 --- a/GOTCHAS.md +++ b/GOTCHAS.md @@ -29,8 +29,9 @@ Changing default values in `ExtractionConfig::default()` requires updating asser - `--raw` mode performs extraction only and then early-exits: ranking, normalization, and pipeline-level classification are skipped. `tags` are cleared, `score` is forced to 0, and `display_score` is set to `Some(0)`. `assert_cmd` tests run piped (non-TTY); use `format_table_with_mode(&strings, &metadata, true)` to test TTY table rendering - Exit codes are typed: 0=success, 2=config/validation error, 3=file not found, 4=permission denied, 1=other. Tests asserting exit codes must match `StringyError::exit_code()` in `types/error.rs` - `--no-tags` is the canonical flag name (kebab-case). Previously was `--notags` -- update all references when touching CLI flag names -- Short flags: `-j` (json), `-m` (min-len), `-t` (top), `-e` (enc). Do not add short flags for infrequent flags (--yara, --raw, --summary, --debug) +- Short flags: `-j` (json), `-m` (min-len), `-t` (top). Do not add short flags for infrequent flags (--enc, --yara, --raw, --summary, --debug) - `NO_COLOR` env var disables progress spinner. The spinner is also hidden when stderr is not a TTY +- Clap derive attributes (`long_help`, `about`, etc.) 
require string literals -- `const` values and `concat!` with consts do not work. The `cli_help_lists_all_canonical_tags` test in `integration_cli.rs` verifies help text stays in sync with `Tag::from_str()` ## Dependencies diff --git a/README.md b/README.md index 5880d40..cab1b7b 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,26 @@ -![Stupid Sentient Yarn Ball Logo](docs/src/images/logo-320.png) +![Stringy Logo](docs/src/images/logo-320.png) # Stringy -A smarter alternative to the standard `strings` command that uses binary analysis to extract meaningful strings from executables, focusing on data structures rather than arbitrary byte runs. +[![License][license-badge]][license] [![Sponsors][sponsors-badge]][sponsors] ---- - -## The Problem with `strings` - -The standard `strings` command dumps every printable byte sequence it finds, which means you get: - -- Padding bytes and table data -- Interleaved garbage in UTF-16 strings -- No context about where strings come from -- No prioritization of what's actually useful - -**Stringy** solves this by being data-structure aware, section-aware, and semantically intelligent. - ---- - -## What Makes Stringy Different - -### **Data-Structure Aware** - -Only extracts strings that are part of the binary's actual data structures, not arbitrary byte runs. - -### **Section-Aware** - -Prioritizes `.rodata`/`.rdata`/`__cstring`, resources, and version info; de-emphasizes writable `.data`; avoids `.bss`. - -### **Encoding-Aware** +[![CI][ci-badge]][ci] [![dependency status][deps-badge]][deps] -Supports ASCII/UTF-8, UTF-16LE (PE), and UTF-16BE; detects null-interleaved text. - -### **Semantically Tagged** - -Identifies URLs, domains, IPs, file paths, registry keys, GUIDs, user agents, format strings, Base64 runs, crypto constants, and cloud metadata. - -### **Runtime-Specific** - -Handles import/export names, demangled Rust symbols, section names, Go build info, .NET metadata, and PE resources. 
- -### **Ranked** - -Presents the most relevant strings first using a scoring algorithm. +[![codecov][codecov-badge]][codecov] [![Issues][issues-badge]][issues] [![Last Commit][commits-badge]][commits] [![OpenSSF Scorecard][scorecard-badge]][scorecard] --- -## Features +A smarter alternative to `strings` that uses binary format knowledge and semantic classification to extract the strings that actually matter from ELF, PE, and Mach-O executables. -- **Format-aware parsing** via [`goblin`](https://docs.rs/goblin): ELF, PE, Mach-O -- **Section targeting**: `.rodata`, `.rdata`, `__cstring`, resources, manifests -- **Encoding support**: ASCII, UTF-8, UTF-16LE/BE with confidence scoring -- **Smart classification**: - - URLs, domains, IPv4/IPv6 addresses (implemented) - - Filepaths & registry keys - - GUIDs & user agents - - Format strings (`%s`, `%d`, etc.) - - Base64 & crypto constants -- **Rust symbol demangling** (`rustc-demangle`) -- **JSON output** for pipelines -- **YARA-friendly output** for rule generation -- **Ranking & scoring**: high-signal strings first +The standard `strings` command dumps every printable byte sequence it finds -- padding, table data, interleaved garbage. Stringy is section-aware, encoding-aware, and semantically intelligent: it knows where strings live in a binary, what they mean, and which ones you care about. ---- +## Quick Start -## Installation +### Installation -**Note**: Stringy is currently in development and not yet published to crates.io. +**Pre-built binaries** are available on the [Releases] page for Linux, macOS, and Windows. 
-### From Source +**From source:** ```bash git clone https://github.com/EvilBit-Labs/Stringy @@ -78,18 +29,10 @@ cargo build --release ./target/release/stringy --help ``` -### Development Build +### Basic Usage ```bash -cargo run -- --help -``` - ---- - -## Usage - -```bash -# Basic analysis with ranked output +# Ranked output with semantic tags stringy target_binary # Filter by semantic tags @@ -109,19 +52,20 @@ stringy --json target_binary stringy --yara target_binary stringy --json target_binary | jq '.[] | select(.tags[] | contains("Url"))' -# Raw extraction (no classification/ranking) +# Raw extraction (no classification or ranking) stringy --raw target_binary # Debug and summary modes stringy --debug target_binary stringy --summary target_binary -``` ---- +# Read from stdin +cat target_binary | stringy - +``` ## Example Output -**Human-readable mode (TTY):** +**TTY table:** ``` String | Tags | Score | Section @@ -132,7 +76,7 @@ https://api.example.com/v1/ | url | 95 | .rdata Error: %s at line %d | fmt | 78 | .rdata ``` -**JSON mode (JSONL):** +**JSON (JSONL):** ```json { @@ -152,42 +96,74 @@ Error: %s at line %d | fmt | 78 | .rdata } ``` ---- +## Features -## Advantages Over Standard `strings` +- **Format-aware parsing**: ELF, PE, and Mach-O via [goblin], with section-level weight prioritization +- **Encoding support**: ASCII, UTF-8, UTF-16LE/BE with confidence scoring +- **Semantic classification**: URLs, domains, IPv4/IPv6, file paths, registry keys, GUIDs, user agents, format strings, Base64, crypto constants +- **Symbol demangling**: C++, Rust, and other mangled symbol name recovery +- **PE resources**: VERSIONINFO, STRINGTABLE, and MANIFEST extraction +- **Import/export analysis**: Symbol extraction from all supported formats +- **Ranking**: Section-aware scoring with band-mapped 0-100 normalization +- **Deduplication**: Canonical string grouping with configurable similarity threshold +- **Output formats**: TTY table, plain text, JSONL, YARA rules +- 
**Pipeline architecture**: Configurable orchestrator with filtering, encoding selection, and top-N support -- **Eliminates noise**: Stops dumping padding, tables, and interleaved garbage -- **UTF-16 support**: Surfaces UTF-16 (crucial for PE) cleanly -- **Actionable buckets**: Provides categorized results (URLs, keys, UAs, registry paths) first -- **Provenance tracking**: Keeps offset/section info for pivoting to other tools -- **YARA integration**: Feeds only high-signal candidates +## Security --- +- Zero `unsafe` code (`#![forbid(unsafe_code)]` enforced project-wide) +- [cargo-deny] and [cargo-audit] run in CI +- Vulnerability reporting via [SECURITY.md] -## Features +### Verifying Releases -- **Format Detection**: ELF, PE, and Mach-O via `goblin` with single-parse optimization -- **Container Parsing**: Section classification with weight-based prioritization (1.0-10.0 scale) -- **String Extraction**: ASCII, UTF-8, and UTF-16 (LE/BE/Auto) with noise filtering -- **Semantic Classification**: URLs, IPs, domains, file paths, GUIDs, format strings, registry keys, and more -- **Symbol Demangling**: C++, Rust, and other mangled symbol name recovery -- **Ranking**: Section-aware scoring with band-mapped 0-100 normalization -- **Deduplication**: Canonical string grouping with configurable similarity threshold -- **Output Formats**: TTY table, plain text, JSONL, YARA rules -- **PE Resources**: VERSIONINFO, STRINGTABLE, and MANIFEST extraction -- **Import/Export Analysis**: Symbol extraction from all supported binary formats -- **Pipeline Architecture**: Configurable orchestrator with filtering, encoding selection, and top-N support +All release artifacts are signed via [Sigstore](https://www.sigstore.dev/) using GitHub Attestations: --- +```bash +gh attestation verify <path/to/downloaded-artifact> --repo EvilBit-Labs/Stringy +``` -## License +## Documentation -Licensed under Apache 2.0. +Full documentation is available at **[evilbitlabs.io/stringy](https://evilbitlabs.io/stringy/)**.
---- +Quick links: [Installation](docs/src/installation.md) | [Quick Start](docs/src/quickstart.md) | [CLI Reference](docs/src/cli.md) | [Architecture](docs/src/architecture.md) | [Troubleshooting](docs/src/troubleshooting.md) -## Acknowledgements +## Contributing + +See [CONTRIBUTING.md] for development setup, coding guidelines, and submission process. + +## License + +Licensed under the [Apache License, Version 2.0][license]. + +## Acknowledgments - Inspired by `strings(1)` and the need for better binary analysis tools -- Built with Rust ecosystem crates: `goblin`, `bstr`, `regex`, `rustc-demangle` +- Built with [goblin], [bstr](https://docs.rs/bstr), [regex](https://docs.rs/regex), and [rustc-demangle](https://docs.rs/rustc-demangle) - My coworkers, for their excellent input on the original name selection + + + +[cargo-audit]: https://github.com/EvilBit-Labs/Stringy/actions/workflows/audit.yml +[cargo-deny]: https://github.com/EvilBit-Labs/Stringy/actions/workflows/security.yml +[ci]: https://github.com/EvilBit-Labs/Stringy/actions/workflows/ci.yml +[ci-badge]: https://img.shields.io/github/actions/workflow/status/EvilBit-Labs/Stringy/ci.yml?style=flat-square&label=CI +[codecov]: https://codecov.io/gh/EvilBit-Labs/Stringy +[codecov-badge]: https://img.shields.io/codecov/c/github/EvilBit-Labs/Stringy?style=flat-square +[commits]: https://github.com/EvilBit-Labs/Stringy/commits/main +[commits-badge]: https://img.shields.io/github/last-commit/EvilBit-Labs/Stringy?style=flat-square +[contributing.md]: CONTRIBUTING.md +[deps]: https://deps.rs/repo/github/EvilBit-Labs/Stringy +[deps-badge]: https://deps.rs/repo/github/EvilBit-Labs/Stringy/status.svg?style=flat-square +[goblin]: https://docs.rs/goblin +[issues]: https://github.com/EvilBit-Labs/Stringy/issues +[issues-badge]: https://img.shields.io/github/issues/EvilBit-Labs/Stringy?style=flat-square +[license]: https://github.com/EvilBit-Labs/Stringy/blob/main/LICENSE +[license-badge]: 
https://img.shields.io/github/license/EvilBit-Labs/Stringy?style=flat-square +[releases]: https://github.com/EvilBit-Labs/Stringy/releases +[scorecard]: https://scorecard.dev/viewer/?uri=github.com/EvilBit-Labs/Stringy +[scorecard-badge]: https://img.shields.io/ossf-scorecard/github.com/EvilBit-Labs/Stringy?style=flat-square +[security.md]: SECURITY.md +[sponsors]: https://github.com/sponsors/EvilBit-Labs +[sponsors-badge]: https://img.shields.io/github/sponsors/EvilBit-Labs?style=flat-square diff --git a/docs/src/cli.md b/docs/src/cli.md index 68990eb..62080a7 100644 --- a/docs/src/cli.md +++ b/docs/src/cli.md @@ -109,7 +109,7 @@ stringy --only-tags filepath --only-tags regpath app.exe When stdout is a TTY, results are shown as a table with columns: -``` +```text String | Tags | Score | Section ``` @@ -128,8 +128,12 @@ Generates a YARA rule template. See [Output Formats](./output-formats.md) for de | Code | Meaning | | ---- | -------------------------------------------------------------------------- | | 0 | Success (including unknown binary format, empty binary, no filter matches) | -| 1 | Runtime error (file not found, tag overlap, `--summary` in non-TTY) | -| 2 | Argument parsing error (invalid flag, flag conflict, invalid tag name) | +| 1 | General runtime error | +| 2 | Configuration or validation error (tag overlap, `--summary` in non-TTY) | +| 3 | File not found | +| 4 | Permission denied | + +Clap argument parsing errors (invalid flag, flag conflict, invalid tag name) use clap's own exit code (typically 2). ## Advanced Usage diff --git a/docs/src/installation.md b/docs/src/installation.md index 6b7963a..f5725c5 100644 --- a/docs/src/installation.md +++ b/docs/src/installation.md @@ -1,35 +1,23 @@ # Installation -Stringy is currently in active development and not yet published to crates.io. You can install it from source or use development builds. 
+## Pre-built Binaries -## Prerequisites +Pre-built binaries for Linux, macOS, and Windows are available on the [Releases] page. -- **Rust**: Version 1.70 or later -- **Git**: For cloning the repository -- **Build tools**: Platform-specific C compiler (for some dependencies) - -### Installing Rust +Download the appropriate archive for your platform, extract it, and place the `stringy` binary somewhere on your PATH. -If you don't have Rust installed, get it from [rustup.rs](https://rustup.rs/): +## From Source -```bash -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -source ~/.cargo/env -``` +### Prerequisites -## From Source (Recommended) +- **Rust**: Version 1.91 or later (see [rustup.rs](https://rustup.rs/) if you need to install Rust) +- **Git**: For cloning the repository -### Clone and Build +### Build and Install ```bash git clone https://github.com/EvilBit-Labs/Stringy cd Stringy -cargo build --release -``` - -### Install Locally - -```bash cargo install --path . ``` @@ -38,93 +26,31 @@ This installs the `stringy` binary to `~/.cargo/bin/`, which should be in your P ### Verify Installation ```bash -stringy --help +stringy --version ``` ## Development Build -For development and testing: +For development and testing, Stringy uses [just](https://just.systems/) and [mise](https://mise.jdx.dev/) to manage tooling: ```bash git clone https://github.com/EvilBit-Labs/Stringy cd Stringy -cargo run -- --help -``` - -## Platform-Specific Notes - -### Linux - -Most distributions include the necessary build tools. 
If you encounter issues: - -```bash -# Ubuntu/Debian -sudo apt update -sudo apt install build-essential - -# Fedora/RHEL -sudo dnf groupinstall "Development Tools" - -# Arch Linux -sudo pacman -S base-devel +just setup # Install tools and components +just gen-fixtures # Generate test fixtures (requires Zig via mise) +just test # Run tests ``` -### macOS - -Install Xcode command line tools: +If you do not use `just`, the minimum requirements are: ```bash -xcode-select --install -``` - -### Windows - -Install Visual Studio Build Tools or Visual Studio Community with C++ support. - -Alternatively, use the GNU toolchain: - -```bash -rustup toolchain install stable-x86_64-pc-windows-gnu -rustup default stable-x86_64-pc-windows-gnu -``` - -## Docker (Alternative) - -If you prefer containerized builds: - -```dockerfile -FROM rust:1.70 as builder -WORKDIR /app -COPY . . -RUN cargo build --release - -FROM debian:bookworm-slim -RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* -COPY --from=builder /app/target/release/stringy /usr/local/bin/ -ENTRYPOINT ["stringy"] -``` - -Build and run: - -```bash -docker build -t stringy . -docker run --rm -v $(pwd):/data stringy /data/binary_file +cargo build --release +cargo test ``` ## Troubleshooting -### Common Issues - -#### "cargo: command not found" - -Ensure Rust is properly installed and `~/.cargo/bin` is in your PATH: - -```bash -echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc -``` - -#### Build Failures +### Build Failures Update Rust to the latest version: @@ -139,26 +65,16 @@ cargo clean cargo build --release ``` -#### Permission Denied - -On Unix systems, ensure the binary is executable: - -```bash -chmod +x ~/.cargo/bin/stringy -``` - ### Getting Help If you encounter issues: 1. Check the [troubleshooting guide](./troubleshooting.md) 2. Search existing [GitHub issues](https://github.com/EvilBit-Labs/Stringy/issues) -3. 
Open a new issue with: - - Your operating system and version - - Rust version (`rustc --version`) - - Complete error output - - Steps to reproduce +3. Open a new issue with your OS, Rust version (`rustc --version`), and complete error output ## Next Steps Once installed, see the [Quick Start](./quickstart.md) guide to begin using Stringy. + +[releases]: https://github.com/EvilBit-Labs/Stringy/releases diff --git a/docs/src/performance.md b/docs/src/performance.md index 6a42066..0be6258 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -1,393 +1,79 @@ # Performance -Stringy is designed for efficient analysis of binary files, from small executables to large system libraries. This guide covers performance characteristics, optimization techniques, and best practices. +Stringy is designed for efficient analysis of binary files, from small executables to large system libraries. -## Performance Overview +## How It Works -### Typical Performance +Stringy memory-maps input files via [mmap-guard] for zero-copy access, then processes sections in weight-priority order. Regex patterns for semantic classification are compiled once using `LazyLock` statics. -| File Size | Processing Time | Memory Usage | Notes | -| --------- | --------------- | ------------ | ------------------------------ | -| < 1MB | < 100ms | < 10MB | Small executables | -| 1-10MB | 100ms - 1s | 10-50MB | Typical applications | -| 10-100MB | 1-10s | 50-200MB | Large applications, libraries | -| > 100MB | 10s+ | 200MB+ | System libraries, packed files | +The processing pipeline is single-threaded and sequential: -### Factors Affecting Performance +1. **Format detection and section analysis** -- O(n) where n = number of sections +2. **String extraction** -- O(m) where m = total section size +3. **Deduplication** -- hash-based grouping of identical strings +4. **Classification** -- O(k) where k = number of unique strings +5. **Ranking and sorting** -- O(k log k) -1. 
**File size**: Larger files take longer to process -2. **Section count**: More sections require more analysis -3. **String density**: Files with many strings take longer -4. **Encoding complexity**: UTF-16 detection is more expensive than ASCII -5. **Classification depth**: More semantic patterns increase processing time +## Reducing Processing Time -## Memory Management - -### Memory Mapping - -Stringy uses memory mapping for efficient file access: - -```text -// Automatic memory mapping for large files -if file_size > MEMORY_MAP_THRESHOLD { - let mmap = unsafe { Mmap::map(&file)? }; - process_data(&mmap[..]) -} else { - let data = std::fs::read(path)?; - process_data(&data) -} -``` - -**Benefits:** - -- Reduced memory usage for large files -- Faster access to file data -- OS-level caching optimization - -**Configuration:** - -```bash -# Adjust memory mapping threshold -stringy --mmap-threshold 5MB large_file.exe - -# Disable memory mapping -stringy --no-mmap file.exe -``` - -### Memory Usage Patterns - -``` -Peak Memory = Base Memory + File Size + String Storage + Classification Data -``` - -- **Base Memory**: ~5-10MB for the application -- **File Size**: Full file size if not memory-mapped -- **String Storage**: ~2-5x the total extracted string length -- **Classification Data**: ~1-2MB for regex engines and caches - -### Memory Optimization - -```bash -# Limit string length to reduce memory usage -stringy --max-len 200 large_file.exe - -# Limit results to reduce output memory -stringy --top 100 large_file.exe - -# Process specific sections only -stringy --sections .rodata,.rdata large_file.exe -``` - -## CPU Performance - -### Single-Threaded Performance - -Core extraction pipeline is optimized for single-threaded performance: - -1. **Section Analysis**: O(n) where n = number of sections -2. **String Extraction**: O(m) where m = total section size -3. **Classification**: O(k) where k = number of extracted strings -4. 
**Ranking**: O(k log k) for sorting - -### Parallel Processing - -Future versions will support parallel processing: - -```text -// Planned parallel section processing -sections.par_iter() - .flat_map(|section| extract_from_section(section, data)) - .collect() -``` - -**Parallelization opportunities:** - -- Section-level extraction -- Classification of string batches -- Multiple file processing - -### CPU Optimization Techniques - -#### Regex Caching - -```text -lazy_static! { - static ref URL_REGEX: Regex = Regex::new(r"https?://[^\s]+").unwrap(); - static ref DOMAIN_REGEX: Regex = Regex::new(r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap(); -} -``` - -#### Efficient String Scanning - -```text -// Optimized ASCII scanning with SIMD potential -fn scan_ascii_optimized(data: &[u8]) -> Vec { - let mut matches = Vec::new(); - let mut current_start = None; - - for (i, &byte) in data.iter().enumerate() { - if is_printable_ascii(byte) { - if current_start.is_none() { - current_start = Some(i); - } - } else if let Some(start) = current_start.take() { - if i - start >= MIN_LENGTH { - matches.push(StringMatch { start, end: i }); - } - } - } - - matches -} -``` - -## I/O Performance - -### File Access Patterns - -Stringy uses sequential access patterns optimized for modern storage: - -```text -// Sequential section processing -for section in container.sections { - let section_data = &data[section.offset..section.offset + section.size]; - process_section(section_data); -} -``` - -### Storage Type Impact - -| Storage Type | Relative Performance | Notes | -| ------------ | -------------------- | ---------------------------------- | -| NVMe SSD | 1.0x (baseline) | Optimal performance | -| SATA SSD | 0.8-0.9x | Good performance | -| HDD | 0.3-0.5x | Slower, especially for large files | -| Network | 0.1-0.3x | Highly variable | - -### I/O Optimization - -```bash -# Process from faster storage when possible -cp /slow/path/binary /tmp/binary -stringy /tmp/binary - -# Use memory mapping for 
network files -stringy --force-mmap network_file.exe -``` - -## Optimization Strategies - -### For Interactive Use - -Optimize for fast feedback: +Use CLI flags to narrow the work Stringy does: ```bash -# Quick scan of high-value sections -stringy --sections .rodata,.rdata --top 20 binary.exe +# Limit to top results (skip sorting the long tail) +stringy --top 50 binary -# ASCII only for faster processing -stringy --enc ascii --min-len 6 binary.exe +# Increase minimum length to reduce noise and string count +stringy --min-len 8 binary -# Skip expensive classification -stringy --no-classify --json binary.exe | jq '.[] | select(.length > 10)' -``` - -### For Batch Processing - -Optimize for throughput: - -```bash -# Process multiple files efficiently -find /binaries -name "*.exe" -exec stringy --json {} \; > all_strings.jsonl - -# Use minimal output for large batches -stringy --json --no-metadata --top 10 *.dll - -# Parallel processing with xargs -find /binaries -name "*.so" | xargs -P 4 -I {} stringy --json {} > results.jsonl -``` - -### For Large Files - -Handle large files efficiently: - -```bash -# Focus on high-value sections -stringy --sections .rodata,.rdata,.rsrc huge_file.exe - -# Increase minimum length to reduce noise -stringy --min-len 8 --top 50 huge_file.exe +# Restrict to a single encoding (skip UTF-16 detection) +stringy --enc ascii binary -# Use streaming output for very large results -stringy --json huge_file.exe | head -1000 > sample.jsonl +# Skip classification and ranking entirely +stringy --raw binary ``` -## Performance Monitoring +`--raw` mode is the fastest option -- it extracts and deduplicates strings without running the classifier or ranker. 
-### Built-in Timing +## Benchmarking -```bash -# Enable timing information -stringy --timing binary.exe -``` - -Output includes: - -- File loading time -- Format detection time -- Section analysis time -- String extraction time -- Classification time -- Output formatting time - -### Memory Profiling - -```bash -# Monitor memory usage (Unix systems) -/usr/bin/time -v stringy large_file.exe - -# macOS -/usr/bin/time -l stringy large_file.exe -``` - -### Benchmarking - -Use the built-in benchmark suite: +Stringy includes [Criterion](https://docs.rs/criterion) benchmarks for core components: ```bash -# Run performance benchmarks -cargo bench +# Run all benchmarks +just bench -# Benchmark specific components -cargo bench --bench extraction +# Run a specific benchmark +cargo bench --bench elf +cargo bench --bench pe cargo bench --bench classification +cargo bench --bench ascii_extraction ``` -## Performance Tuning - -### Configuration Tuning - -```toml -[performance] -# Memory mapping threshold (bytes) -memory_map_threshold = 10485760 # 10MB - -# Maximum memory usage (bytes) -max_memory_usage = 1073741824 # 1GB - -# String extraction chunk size -chunk_size = 1048576 # 1MB - -# Enable performance optimizations -fast_mode = true -skip_low_confidence = true -``` - -### Runtime Tuning - -```bash -# Adjust for available memory -export STRINGY_MAX_MEMORY=512MB - -# Tune for CPU cores -export STRINGY_THREADS=4 - -# Enable aggressive caching -export STRINGY_CACHE_SIZE=100MB -``` - -## Bottleneck Analysis - -### Common Bottlenecks - -1. **Large UTF-16 sections**: UTF-16 detection is CPU-intensive -2. **Many small strings**: Classification overhead per string -3. **Complex regex patterns**: Some semantic patterns are expensive -4. 
**Large output**: JSON serialization and formatting - -### Profiling Tools +## Profiling ```bash # CPU profiling with perf (Linux) -perf record --call-graph dwarf stringy large_file.exe +perf record --call-graph dwarf -- stringy large_file.exe perf report -# Memory profiling with valgrind -valgrind --tool=massif stringy binary.exe - # macOS profiling with Instruments -instruments -t "Time Profiler" stringy binary.exe -``` - -## Optimization Examples +xcrun xctrace record --template "Time Profiler" --launch -- stringy binary -### Fast Security Scan - -```bash -# Optimized for security indicators -stringy \ - --enc ascii,utf8 \ - --min-len 8 \ - --only url,domain,ipv4,filepath \ - --top 20 \ - --sections .rodata,.rdata \ - malware.exe +# Memory profiling +/usr/bin/time -l stringy large_file.exe # macOS +/usr/bin/time -v stringy large_file.exe # Linux ``` -### Comprehensive Analysis - -```bash -# Thorough but efficient analysis -stringy \ - --enc ascii,utf16le \ - --min-len 4 \ - --max-len 500 \ - --top 200 \ - --json \ - application.exe > analysis.jsonl -``` +## Batch Processing -### Batch Processing Script +Stringy processes one file per invocation. For batch workflows, use standard Unix tools: ```bash -#!/bin/bash -# Efficient batch processing +# Process multiple files +find /path/to/binaries -type f -exec stringy --json {} \; > all_strings.jsonl -TEMP_DIR=$(mktemp -d) -trap "rm -rf $TEMP_DIR" EXIT - -for file in "$@"; do - # Copy to fast storage if needed - if [[ "$file" == /slow/* ]]; then - cp "$file" "$TEMP_DIR/" - file="$TEMP_DIR/$(basename "$file")" - fi - - # Process with optimized settings - stringy \ - --json \ - --top 50 \ - --min-len 6 \ - --sections .rodata,.rdata,.rsrc \ - "$file" >> results.jsonl -done +# Parallel processing with xargs +find /binaries -name "*.exe" -print0 | xargs -0 -P 4 -I {} stringy --json {} > results.jsonl ``` -## Future Optimizations - -### Planned Improvements - -1. **SIMD acceleration**: Vectorized string scanning -2. 
**Parallel processing**: Multi-threaded extraction and classification -3. **Incremental analysis**: Cache results for repeated analysis -4. **Streaming processing**: Handle arbitrarily large files -5. **GPU acceleration**: Parallel pattern matching on GPU - -### Performance Roadmap - -- **v0.2**: Basic parallel processing -- **v0.3**: SIMD-optimized string scanning -- **v0.4**: Incremental analysis and caching -- **v1.0**: Full streaming support - -This performance guide helps you get the most out of Stringy for your specific use case, whether you're doing interactive analysis or processing large batches of files. +[mmap-guard]: https://docs.rs/mmap-guard diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index e8781f5..0be03cd 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -78,23 +78,27 @@ See [Output Formats](./output-formats.md) for the full band-mapping table. Semantic classifications help identify string types: -| Tag | Description | Example | -| ----------------- | ----------------------- | ------------------------- | -| `url` | Web URLs | `https://example.com/api` | -| `domain` | Domain names | `api.example.com` | -| `ipv4`/`ipv6` | IP addresses | `192.168.1.1` | -| `filepath` | File paths | `/usr/bin/app` | -| `regpath` | Registry paths | `HKEY_LOCAL_MACHINE\...` | -| `guid` | GUIDs/UUIDs | `{12345678-1234-...}` | -| `email` | Email addresses | `user@example.com` | -| `b64` | Base64 data | `SGVsbG8gV29ybGQ=` | -| `fmt` | Format strings | `Error: %s` | -| `import`/`export` | Symbol names | `CreateFileW` | -| `user-agent-ish` | User-agent-like strings | `Mozilla/5.0 ...` | -| `dylib-path` | Dynamic library paths | `/usr/lib/libfoo.dylib` | -| `rpath` | Runtime search paths | `/usr/local/lib` | -| `rpath-var` | Rpath variables | `@loader_path/../lib` | -| `framework-path` | Framework paths (macOS) | `/System/Library/...` | +| Tag | Description | Example | +| ----------------- | ----------------------- | -------------------------- 
| +| `url` | Web URLs | `https://example.com/api` | +| `domain` | Domain names | `api.example.com` | +| `ipv4`/`ipv6` | IP addresses | `192.168.1.1` | +| `filepath` | File paths | `/usr/bin/app` | +| `regpath` | Registry paths | `HKEY_LOCAL_MACHINE\...` | +| `guid` | GUIDs/UUIDs | `{12345678-1234-...}` | +| `email` | Email addresses | `user@example.com` | +| `b64` | Base64 data | `SGVsbG8gV29ybGQ=` | +| `fmt` | Format strings | `Error: %s` | +| `import`/`export` | Symbol names | `CreateFileW` | +| `demangled` | Demangled symbols | `std::io::Read::read` | +| `user-agent-ish` | User-agent-like strings | `Mozilla/5.0 ...` | +| `version` | Version strings | `v1.2.3` | +| `manifest` | Manifest data | PE/Mach-O embedded XML | +| `resource` | Resource strings | PE VERSIONINFO/STRINGTABLE | +| `dylib-path` | Dynamic library paths | `/usr/lib/libfoo.dylib` | +| `rpath` | Runtime search paths | `/usr/local/lib` | +| `rpath-var` | Rpath variables | `@loader_path/../lib` | +| `framework-path` | Framework paths (macOS) | `/System/Library/...` | ### Sections diff --git a/src/main.rs b/src/main.rs index cb0fe55..fb4f688 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,7 +14,9 @@ use stringy::output::OutputFormat; use stringy::types::{StringyError, Tag}; use stringy::{Encoding, EncodingFilter, FilterConfig, Pipeline, PipelineConfig}; -/// Encoding filter for string extraction +/// CLI-specific encoding enum that maps to `EncodingFilter`. +/// +/// Variant doc comments are shown in `--help` output. #[derive(Debug, Clone, Copy, ValueEnum)] enum CliEncoding { /// ASCII-encoded strings only @@ -42,17 +44,23 @@ fn parse_positive_usize(s: &str) -> Result { Ok(value) } -/// Map CLI encoding variant to pipeline `EncodingFilter`. 
-fn cli_encoding_to_filter(enc: CliEncoding) -> EncodingFilter { - match enc { - CliEncoding::Ascii => EncodingFilter::Exact(Encoding::Ascii), - CliEncoding::Utf8 => EncodingFilter::Exact(Encoding::Utf8), - CliEncoding::Utf16 => EncodingFilter::Utf16Any, - CliEncoding::Utf16Le => EncodingFilter::Exact(Encoding::Utf16Le), - CliEncoding::Utf16Be => EncodingFilter::Exact(Encoding::Utf16Be), +impl From for EncodingFilter { + fn from(enc: CliEncoding) -> Self { + match enc { + CliEncoding::Ascii => EncodingFilter::Exact(Encoding::Ascii), + CliEncoding::Utf8 => EncodingFilter::Exact(Encoding::Utf8), + CliEncoding::Utf16 => EncodingFilter::Utf16Any, + CliEncoding::Utf16Le => EncodingFilter::Exact(Encoding::Utf16Le), + CliEncoding::Utf16Be => EncodingFilter::Exact(Encoding::Utf16Be), + } } } +// The tag list in --only-tags and --no-tags long_help must stay in sync with +// Tag::from_str() in src/types/mod.rs. A compile-time const can't be used in +// Clap derive attributes, so tests/integration_cli.rs verifies the help text +// contains all known tags. + /// A smarter alternative to the strings command that leverages format-specific knowledge #[derive(Parser)] #[command(name = "stringy", author, version)] @@ -71,6 +79,12 @@ fn cli_encoding_to_filter(enc: CliEncoding) -> EncodingFilter { cat binary.exe | stringy -\n \ stringy -m 8 --only-tags url --only-tags domain binary.exe\n \ stringy -t 50 -j binary.elf\n\n\ + EXIT CODES:\n \ + 0 Success\n \ + 1 General runtime error\n \ + 2 Configuration or validation error\n \ + 3 File not found\n \ + 4 Permission denied\n\n\ More info: https://github.com/EvilBit-Labs/Stringy")] struct Cli { /// Input binary file to analyze (use "-" for stdin) @@ -91,9 +105,9 @@ struct Cli { action = ArgAction::Append, value_parser = Tag::from_str, value_name = "TAG", - long_help = "Include only strings with this tag. 
Repeat the flag for multiple tags (OR logic).\n\ - Valid tags: url, domain, ipv4, ipv6, filepath, regpath, guid, email, b64, fmt,\n\ - user-agent-ish, demangled, import, export, version, manifest, resource,\n\ + long_help = "Include only strings with this tag. Repeat the flag for multiple tags \ + (OR logic).\nValid tags: url, domain, ipv4, ipv6, filepath, regpath, guid, email, \ + b64, fmt, user-agent-ish, demangled, import, export, version, manifest, resource, \ dylib-path, rpath, rpath-var, framework-path" )] only_tags: Vec, @@ -104,9 +118,9 @@ struct Cli { action = ArgAction::Append, value_parser = Tag::from_str, value_name = "TAG", - long_help = "Exclude strings with this tag. Repeat the flag for multiple tags (OR logic).\n\ - Valid tags: url, domain, ipv4, ipv6, filepath, regpath, guid, email, b64, fmt,\n\ - user-agent-ish, demangled, import, export, version, manifest, resource,\n\ + long_help = "Exclude strings with this tag. Repeat the flag for multiple tags \ + (OR logic).\nValid tags: url, domain, ipv4, ipv6, filepath, regpath, guid, email, \ + b64, fmt, user-agent-ish, demangled, import, export, version, manifest, resource, \ dylib-path, rpath, rpath-var, framework-path" )] no_tags: Vec, @@ -120,7 +134,7 @@ struct Cli { top: Option, /// Filter by encoding [possible values: ascii, utf8, utf16, utf16le, utf16be] - #[arg(short = 'e', long, value_enum, value_name = "ENCODING")] + #[arg(long, value_enum, value_name = "ENCODING")] enc: Option, /// Raw output: no tags, no scores, no headers @@ -189,7 +203,7 @@ fn run(cli: &Cli) -> Result<(), StringyError> { filter_config = filter_config.with_min_length(n); } if let Some(enc) = cli.enc { - filter_config = filter_config.with_encoding(cli_encoding_to_filter(enc)); + filter_config = filter_config.with_encoding(enc.into()); } if !cli.only_tags.is_empty() { filter_config = filter_config.with_include_tags(cli.only_tags.clone()); diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index 69b29ed..e0ad13e 100644 --- 
a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -108,7 +108,9 @@ impl Pipeline { // -- Informational diagnostic when filters match nothing -- if filtered.is_empty() && total_count > 0 { eprintln!( - "Info: No strings matched the current filters ({total_count} extracted, 0 shown)" + "Info: No strings matched the current filters \ + ({total_count} extracted, 0 shown)\n \ + Try adjusting --min-len, --only-tags, --no-tags, or --enc to see more results" ); } diff --git a/tessl.json b/tessl.json new file mode 100644 index 0000000..6102c19 --- /dev/null +++ b/tessl.json @@ -0,0 +1,40 @@ +{ + "name": "stringy", + "mode": "vendored", + "dependencies": { + "actionbook/rust-skills": { + "version": "3ea748280d2fa5680675fe4abe1a5e764f7c021e", + "source": "https://github.com/actionbook/rust-skills", + "include": { + "skills": [ + "coding-guidelines", + "domain-cli", + "m01-ownership", + "m02-resource", + "m03-mutability", + "m04-zero-cost", + "m05-type-driven", + "m06-error-handling", + "m07-concurrency", + "m09-domain", + "m10-performance", + "m11-ecosystem", + "m12-lifecycle", + "m13-domain-error", + "m14-mental-model", + "m15-anti-pattern", + "meta-cognition-parallel", + "rust-call-graph", + "rust-code-navigator", + "rust-deps-visualizer", + "rust-learner", + "rust-refactor-helper", + "rust-skill-creator", + "rust-symbol-analyzer", + "rust-trait-explorer", + "unsafe-checker" + ] + } + } + } +} diff --git a/tests/integration_cli.rs b/tests/integration_cli.rs index 677d786..2b84057 100644 --- a/tests/integration_cli.rs +++ b/tests/integration_cli.rs @@ -188,6 +188,15 @@ fn cli_long_help_has_examples() { .stdout(predicate::str::contains("EXAMPLES:")); } +#[test] +fn cli_help_shows_exit_codes() { + stringy() + .arg("--help") + .assert() + .success() + .stdout(predicate::str::contains("EXIT CODES:")); +} + #[test] fn cli_top_flag() { let top_output = stringy() diff --git a/tests/integration_cli_short_flags.rs b/tests/integration_cli_short_flags.rs index 6d1e3d2..ebc3958 100644 
--- a/tests/integration_cli_short_flags.rs +++ b/tests/integration_cli_short_flags.rs @@ -101,15 +101,16 @@ fn test_short_flag_top_equivalence() { } #[test] -fn test_short_flag_encoding_equivalence() { +fn test_enc_long_flag_only() { let elf_path = "tests/fixtures/test_binary_elf"; - let long_result = stringy().arg(elf_path).arg("--enc").arg("ascii").assert(); - - let short_result = stringy().arg(elf_path).arg("-e").arg("ascii").assert(); - - long_result.success(); - short_result.success(); + // --enc has no short form (infrequent flag) + stringy() + .arg(elf_path) + .arg("--enc") + .arg("ascii") + .assert() + .success(); } #[test] @@ -123,7 +124,7 @@ fn test_short_flag_combination() { .arg("10") .arg("-t") .arg("5") - .arg("-e") + .arg("--enc") .arg("ascii") .assert() .success(); diff --git a/tests/integration_cli_stdin.rs b/tests/integration_cli_stdin.rs new file mode 100644 index 0000000..ba01566 --- /dev/null +++ b/tests/integration_cli_stdin.rs @@ -0,0 +1,94 @@ +use assert_cmd::Command; +use assert_cmd::cargo_bin_cmd; +use predicates::prelude::*; + +fn stringy() -> Command { + cargo_bin_cmd!("stringy") +} + +// ---------- Stdin edge cases ---------- + +#[test] +fn stdin_pipe_pe_binary() { + let fixture_data = + std::fs::read("tests/fixtures/test_binary_pe.exe").expect("PE fixture should exist"); + + stringy() + .arg("-") + .write_stdin(fixture_data) + .assert() + .success() + .stdout(predicate::str::is_empty().not()); +} + +#[test] +fn stdin_pipe_macho_binary() { + let fixture_data = + std::fs::read("tests/fixtures/test_binary_macho").expect("Mach-O fixture should exist"); + + // Mach-O fixtures may not parse on all platforms; verify it runs without panicking + let result = stringy() + .arg("-") + .write_stdin(fixture_data) + .output() + .expect("should execute"); + + assert!(result.status.success() || !result.stderr.is_empty()); +} + +#[test] +fn stdin_pipe_unknown_data() { + let fixture_data = + 
std::fs::read("tests/fixtures/test_unknown.bin").expect("unknown fixture should exist"); + + // Unknown format falls back to unstructured byte scan (succeeds, may emit info) + stringy() + .arg("-") + .write_stdin(fixture_data) + .assert() + .success(); +} + +#[test] +fn stdin_pipe_json_output() { + let fixture_data = + std::fs::read("tests/fixtures/test_binary_elf").expect("ELF fixture should exist"); + + stringy() + .arg("-") + .arg("--json") + .write_stdin(fixture_data) + .assert() + .success() + .stdout(predicate::str::is_empty().not()); +} + +#[test] +fn stdin_pipe_raw_mode() { + let fixture_data = + std::fs::read("tests/fixtures/test_binary_elf").expect("ELF fixture should exist"); + + stringy() + .arg("-") + .arg("--raw") + .write_stdin(fixture_data) + .assert() + .success() + .stdout(predicate::str::is_empty().not()); +} + +#[test] +fn stdin_pipe_with_filters() { + let fixture_data = + std::fs::read("tests/fixtures/test_binary_elf").expect("ELF fixture should exist"); + + stringy() + .arg("-") + .arg("--min-len") + .arg("8") + .arg("--top") + .arg("10") + .write_stdin(fixture_data) + .assert() + .success(); +} diff --git a/tests/integration_flow8_diagnostics.rs b/tests/integration_flow8_diagnostics.rs index 73b56f1..71a9c28 100644 --- a/tests/integration_flow8_diagnostics.rs +++ b/tests/integration_flow8_diagnostics.rs @@ -137,7 +137,8 @@ fn flow8_filters_match_nothing_info_stderr_exit_0() { .assert() .success() .stdout(predicate::str::is_empty().trim()) - .stderr(predicate::str::contains("Info:")); + .stderr(predicate::str::contains("Info:")) + .stderr(predicate::str::contains("Try adjusting")); } // ---------------------------------------------------------------------------