diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d1c5a3..8cbadac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,22 @@ All notable changes to microbench are documented here. +## [Unreleased] + +### Enhancements + +- **`file-hash` mixin — automatic argument file scanning** (CLI): the + default hash list now includes not only the command executable (`cmd[0]`) + but also any command-line arguments (`cmd[1:]`) that resolve to existing + files on disk prior to command execution. Passing `--hash-file` still + overrides the default entirely; the Python API default (`sys.argv[0]`) is unchanged. The hash + algorithm name is now stored under `mb.file_hash_algorithm` (CLI and Python API). + +### Documentation + +- Fix documentation on writing custom mixins to note that they must be + added to the registry if they are to be detected by the CLI. + ## [2.0.0] - 2026-03-17 Microbench v2 is a significant upgrade with many new features versus v1.1.0. diff --git a/docs/cli.md b/docs/cli.md index 353c1da..a880ac5 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -182,11 +182,26 @@ so the CLI defaults to the working directory instead. | Option | Description | |---|---| -| `--hash-file FILE [FILE ...]` | File(s) to hash. | +| `--hash-file FILE [FILE ...]` | File(s) to hash. Overrides the default entirely. | | `--hash-algorithm ALGORITHM` | Hash algorithm (e.g. `sha256`, `md5`). Default: `sha256`. | **CLI default for `--hash-file`:** the benchmarked command executable -(`cmd[0]`), e.g. `./run_simulation.sh`. +(`cmd[0]`) **plus any arguments that resolve to existing files on disk** +(`cmd[1:]`). For example, given the command: + +```bash +microbench --mixin file-hash -- ./run.sh input.csv --config params.yaml +``` + +microbench will automatically hash `./run.sh`, `input.csv`, and +`params.yaml` — capturing the provenance of inputs without any extra +flags. Note that file hashes are computed prior to command execution. 
+Tokens that don't correspond to existing files (flags such as +`--config`, or paths that do not exist yet, such as output files that have not been created) are +silently ignored. An output file that already exists from a previous run is hashed as it stands before execution. + +Passing `--hash-file` overrides this default entirely; only the +explicitly named files are hashed. **Python API default:** the running script (`sys.argv[0]`). The same `sys.argv[0]` issue applies here, so the CLI defaults to hashing the diff --git a/docs/user-guide/mixins.md b/docs/user-guide/mixins.md index d28c4fa..2956843 100644 --- a/docs/user-guide/mixins.md +++ b/docs/user-guide/mixins.md @@ -395,7 +395,8 @@ microbench --mixin git-info --git-repo /path/to/repo -- ./run.sh Records a cryptographic checksum of one or more files alongside benchmark results. This ties a result to the exact version of the script that produced it — useful when benchmarks evolve over time and you need to know which code -generated which numbers. +generated which numbers. Hashes are computed as a pre-hook, i.e. before the +enclosed code is run. ```python from microbench import MicroBench, MBFileHash @@ -432,10 +433,13 @@ class Bench(MicroBench, MBFileHash): ``` Each record will contain a `file_hashes` dict mapping each path to its -hex digest: +hex digest. The hashing algorithm is stored under `mb.file_hash_algorithm`: ```json { + "mb": { + "file_hash_algorithm": "sha256" + }, "file_hashes": { "run_experiment.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "config.yaml": "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824" @@ -466,11 +470,17 @@ Any algorithm name accepted by `hashlib.new()` works: `'sha256'` (default), ``` **CLI:** use `--hash-file FILE [FILE ...]` and `--hash-algorithm ALGORITHM`. 
-The CLI defaults to hashing the benchmarked command executable rather than -`sys.argv[0]`: +The CLI default hashes the benchmarked command executable *plus* any +arguments that resolve to existing files on disk: ```bash +# Automatically hashes run.sh, input.csv, and params.yaml +microbench --mixin file-hash -- ./run.sh input.csv --config params.yaml + +# Hash a specific set of files (overrides the default entirely) microbench --mixin file-hash --hash-file run_experiment.py config.yaml -- ./run.sh + +# Change the hash algorithm microbench --mixin file-hash --hash-algorithm md5 -- ./run.sh ``` diff --git a/microbench/mixins/vcs.py b/microbench/mixins/vcs.py index ee821ee..ddcc4e8 100644 --- a/microbench/mixins/vcs.py +++ b/microbench/mixins/vcs.py @@ -28,16 +28,33 @@ def _existing_dir(value): def _resolve_cmd_path(cmd): - """Resolve cmd[0] to an absolute file path for use as a hash target.""" + """Resolve cmd[0] and scan arguments for file paths to hash. + + Resolves the command executable (``cmd[0]``) to an absolute path via + :func:`shutil.which`, then scans the remaining arguments + (``cmd[1:]``) for tokens that correspond to existing files on disk. + This transparently captures input file paths that appear + on the command line without requiring the user to specify + ``--hash-file`` explicitly. + """ import shutil + paths = [] + + # Resolve the command executable. path = cmd[0] resolved = shutil.which(path) if resolved: - return [resolved] - if os.path.isfile(path): - return [path] - return [] + paths.append(resolved) + elif os.path.isfile(path): + paths.append(path) + + # Scan remaining arguments for tokens that name existing files. + for arg in cmd[1:]: + if os.path.isfile(arg): + paths.append(arg) + + return paths class MBGitInfo: @@ -55,7 +72,7 @@ class MBGitInfo: useful when the script and the repository root are in different locations. 
- **CLI usage** (``python -m microbench``): the default is the current + **CLI usage**: the default is the current working directory rather than the script directory, since ``sys.argv[0]`` points to the microbench package itself. Use ``--git-repo DIR`` to override. @@ -149,15 +166,16 @@ class MBFileHash: instead. Files are read in 64 KB chunks, so large files are handled without loading them fully into memory. - **CLI usage** (``python -m microbench``): the default is the - benchmarked command executable (``cmd[0]``) rather than the running - script, since ``sys.argv[0]`` points to the microbench package - itself. Use ``--hash-file FILE [FILE ...]`` to override, and - ``--hash-algorithm`` to change the algorithm. + **CLI usage**: the default list of files to hash is the + benchmarked command executable (``cmd[0]``) *plus* any arguments + that resolve to existing files on disk (``cmd[1:]``). This + transparently captures input files without requiring + ``--hash-file``. Use ``--hash-file FILE [FILE ...]`` to override the + default entirely, and ``--hash-algorithm`` to change the algorithm. Attributes: hash_files (iterable of str, optional): File paths to hash. - Defaults to ``[sys.argv[0]]``. + Defaults to ``[sys.argv[0]]`` in the Python API. hash_algorithm (str, optional): Hash algorithm name accepted by :func:`hashlib.new`. Defaults to ``'sha256'``. Use ``'md5'`` for faster hashing of large files where cryptographic strength @@ -167,11 +185,14 @@ class MBFileHash: { "file_hashes": { - "run_experiment.py": "e3b0c44298fc1c14..." + "run_experiment.py": "e3b0c44298fc1c14...", + "input.csv": "2cf24dba5fb0a30e..." } } Note: + The hashing algorithm name is stored under mb.file_hash_algorithm. + CLI compatible. """ @@ -183,8 +204,10 @@ class MBFileHash: nargs='+', type=_existing_file, help=( - 'File(s) to hash with the file-hash mixin. ' - 'CLI default: the benchmarked command executable. ' + 'File(s) to hash with the file-hash mixin. Overrides ' + 'the default entirely. 
' + 'CLI default: the command executable plus any arguments ' + 'that are existing files. ' 'Python API default: the running script.' ), cli_default=_resolve_cmd_path, @@ -218,4 +241,8 @@ def capture_file_hashes(self, bm_data): for chunk in iter(lambda: f.read(65536), b''): h.update(chunk) hashes[path] = h.hexdigest() + + if hashes: + bm_data['mb']['file_hash_algorithm'] = algorithm + bm_data['file_hashes'] = hashes diff --git a/tests/test_cli.py b/tests/test_cli.py index 551b07c..43b199a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1224,6 +1224,134 @@ def test_cli_hash_algorithm(tmp_path): assert sha256_hex != md5_hex +def test_cli_hash_file_default_includes_arg_files(tmp_path): + """file-hash default scans cmd[1:] and hashes arguments that are files.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + input_file = tmp_path / 'input.csv' + input_file.write_bytes(b'a,b,c\n1,2,3\n') + config = tmp_path / 'params.yaml' + config.write_bytes(b'lr: 0.001\n') + + _, record, _ = _run_main( + [ + '--mixin', + 'file-hash', + '--', + str(script), + str(input_file), + '--flag', + str(config), + ] + ) + hashes = record.get('file_hashes', {}) + assert str(script) in hashes + assert str(input_file) in hashes + assert str(config) in hashes + + +def test_cli_hash_file_default_arg_skips_nonexistent(tmp_path): + """file-hash default ignores cmd[1:] tokens that are not existing files.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + + _, record, _ = _run_main( + ['--mixin', 'file-hash', '--', str(script), 'no_such_file.csv'] + ) + hashes = record.get('file_hashes', {}) + assert str(script) in hashes + assert 'no_such_file.csv' not in hashes + assert 'capture_errors' not in record.get('call', {}) + + +def test_cli_hash_file_default_arg_skips_flags(tmp_path): + """file-hash default does not attempt to hash flag-like arguments.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + + _, record, _ = 
_run_main( + [ + '--mixin', + 'file-hash', + '--', + str(script), + '--verbose', + '-n', + '10', + ] + ) + hashes = record.get('file_hashes', {}) + assert str(script) in hashes + # Flag tokens should not appear as hash keys + assert '--verbose' not in hashes + assert '-n' not in hashes + assert '10' not in hashes + + +def test_cli_hash_file_default_arg_skips_directories(tmp_path): + """file-hash default does not hash directory paths passed as arguments.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + subdir = tmp_path / 'output_dir' + subdir.mkdir() + + _, record, _ = _run_main(['--mixin', 'file-hash', '--', str(script), str(subdir)]) + hashes = record.get('file_hashes', {}) + assert str(script) in hashes + assert str(subdir) not in hashes + + +def test_cli_hash_file_explicit_overrides_arg_scan(tmp_path): + """--hash-file overrides the default entirely; argument files are not scanned.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + input_file = tmp_path / 'input.csv' + input_file.write_bytes(b'data\n') + explicit = tmp_path / 'specific.dat' + explicit.write_bytes(b'specific\n') + + _, record, _ = _run_main( + [ + '--mixin', + 'file-hash', + '--hash-file', + str(explicit), + '--', + str(script), + str(input_file), + ] + ) + hashes = record.get('file_hashes', {}) + # Only the explicitly named file should appear + assert str(explicit) in hashes + assert str(script) not in hashes + assert str(input_file) not in hashes + + +def test_cli_hash_file_default_arg_duplicate_file(tmp_path): + """file-hash handles the same file appearing multiple times in args.""" + script = tmp_path / 'script.sh' + script.write_bytes(b'#!/bin/sh') + data = tmp_path / 'data.csv' + data.write_bytes(b'x\n') + + _, record, _ = _run_main( + [ + '--mixin', + 'file-hash', + '--', + str(script), + str(data), + str(data), # duplicated + ] + ) + hashes = record.get('file_hashes', {}) + assert str(script) in hashes + # dict assignment means the second 
write is idempotent; key appears once + assert str(data) in hashes + assert 'capture_errors' not in record.get('call', {}) + + def test_cli_timeout_grace_period_requires_timeout(): """--timeout-grace-period without --timeout is an error.""" with pytest.raises(SystemExit) as exc: