From 67a2a1642ef289812ec28fae84d00b5581e01359 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Sun, 15 Feb 2026 20:55:25 +0500 Subject: [PATCH 1/2] fix(perf): reduce redundant syscalls and hash computations --- CHANGELOG.md | 51 +++++++++++++++++++++ codeclone/blocks.py | 21 ++++++++- codeclone/cache.py | 10 ++-- codeclone/cli.py | 101 ++++++++++++++++------------------------- codeclone/extractor.py | 61 +++++++++++++++---------- codeclone/scanner.py | 20 ++++---- pyproject.toml | 2 +- tests/test_cli_unit.py | 10 ++-- tests/test_security.py | 28 +++++++++--- 9 files changed, 194 insertions(+), 110 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d24daf0..78fe855 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,56 @@ # Changelog +## [1.4.2] - 2026-02-17 + +### Overview + +This patch release is a maintenance update. Determinism remains guaranteed: reports are stable and ordering is +unchanged. + +### Performance & Implementation Cleanup + +- `process_file()` now uses a single `os.stat()` call to obtain both size (size guard) and `st_mtime_ns`/`st_size` (file + stat signature), removing a redundant `os.path.getsize()` call. +- Discovery logic was deduplicated by extracting `_discover_files()`; quiet/non-quiet behavior differs only by UI status + wrapper, not by semantics or filtering. +- Cache path wiring now precomputes `wire_map` so `_wire_filepath_from_runtime()` is evaluated once per key. + +### Hash Reuse for Block/Segment Analysis + +- `extract_blocks()` and `extract_segments()` accept optional `precomputed_hashes`. When provided, they reuse hashes + instead of recomputing. +- The extractor computes function body hashes once and passes them to both block and segment extraction when both + analyses run for the same function. + +### Scanner Efficiency (No Semantic Change) + +- `iter_py_files()` now filters candidates before sorting, so only valid candidates are sorted. The final order remains + deterministic and equivalent to previous behavior. + +### Contract Tightening + +- `precomputed_hashes` type strengthened: `list[str] | None` → `Sequence[str] | None` (read-only intent in the type + contract). +- Added `assert len(precomputed_hashes) == len(body)` in both `extract_blocks()` and `extract_segments()` to catch + mismatched inputs early (development-time invariant). + +### Testing & Determinism + +- Byte-identical JSON reports verified across repeated runs; differences, when present, are limited to + volatile/provenance meta fields (e.g., cache status/path, timestamps), while semantic payload remains stable. +- Unit tests updated to mock `os.stat` instead of `os.path.getsize` where applicable (`test_process_file_stat_error`, + `test_process_file_size_limit`). + +### Notes + +- No changes to: + - detection semantics / fingerprints + - baseline hash inputs (`payload_sha256` semantic payload) + - exit code contract and precedence + - schema versions (baseline v1.0, cache v1.2, report v1.1) + +--- + ## [1.4.1] - 2026-02-15 ### CLI diff --git a/codeclone/blocks.py b/codeclone/blocks.py index 3469361..12a5526 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks.py @@ -9,6 +9,7 @@ from __future__ import annotations import ast +from collections.abc import Sequence from dataclasses import dataclass from .blockhash import stmt_hash @@ -45,12 +46,20 @@ def extract_blocks( cfg: NormalizationConfig, block_size: int, max_blocks: int, + precomputed_hashes: Sequence[str] | None = None, ) -> list[BlockUnit]: body = getattr(func_node, "body", None) if not isinstance(body, list) or len(body) < block_size: return [] - stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] + if precomputed_hashes is not None: + assert len(precomputed_hashes) == len(body), ( + f"precomputed_hashes length {len(precomputed_hashes)} " + f"!= body length {len(body)}" + ) + stmt_hashes = precomputed_hashes + else: + stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] blocks: list[BlockUnit] = [] last_start: int | None = None @@ -94,12 +103,20 @@ def extract_segments( cfg: NormalizationConfig, window_size: int, max_segments: int, + precomputed_hashes: Sequence[str] | None = None, ) -> list[SegmentUnit]: body = getattr(func_node, "body", None) if not isinstance(body, list) or len(body) < window_size: return [] - stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] + if precomputed_hashes is not None: + assert len(precomputed_hashes) == len(body), ( + f"precomputed_hashes length {len(precomputed_hashes)} " + f"!= body length {len(body)}" + ) + stmt_hashes = precomputed_hashes + else: + stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] segments: list[SegmentUnit] = [] diff --git a/codeclone/cache.py b/codeclone/cache.py index 3753148..942c46b 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -344,14 +344,14 @@ def save(self) -> None: try: self.path.parent.mkdir(parents=True, exist_ok=True) wire_files: dict[str, object] = {} - for runtime_path in sorted( - self.data["files"], key=self._wire_filepath_from_runtime - ): + wire_map = { + rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"] + } + for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__): entry = self.get_file_entry(runtime_path) if entry is None: continue - wire_path = self._wire_filepath_from_runtime(runtime_path) - wire_files[wire_path] = _encode_wire_file_entry(entry) + wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry) payload: dict[str, object] = { "py": current_python_tag(), diff --git a/codeclone/cli.py b/codeclone/cli.py index b949089..5ac9ce2 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -122,14 +122,14 @@ def process_file( """ try: - # Check file size + # Single os.stat() for both size check and cache signature try: - st_size = os.path.getsize(filepath) - if st_size > MAX_FILE_SIZE: + st = os.stat(filepath) + if st.st_size > MAX_FILE_SIZE: return ProcessingResult( filepath=filepath, success=False, - error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})", + error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})", error_kind="file_too_large", ) except OSError as e: @@ -140,6 +140,8 @@ def process_file( error_kind="stat_error", ) + stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size} + try: source = Path(filepath).read_text("utf-8") except UnicodeDecodeError as e: @@ -157,7 +159,6 @@ def process_file( error_kind="source_read_error", ) - stat = file_stat_signature(filepath) module_name = module_name_from_path(root, filepath) units, blocks, segments = extract_units_from_source( @@ -355,68 +356,44 @@ def _safe_future_result( return None, str(e) # Discovery phase - try: - if args.quiet: - for fp in iter_py_files(str(root_path)): - files_found += 1 - stat, cached, warn = _get_cached_entry(fp) - if warn: - console.print(warn) - files_skipped += 1 - continue - if cached and cached.get("stat") == stat: - cache_hits += 1 - all_units.extend( - cast( - list[GroupItem], - cast(object, cached.get("units", [])), - ) + def _discover_files() -> None: + nonlocal files_found, cache_hits, files_skipped + for fp in iter_py_files(str(root_path)): + files_found += 1 + stat, cached, warn = _get_cached_entry(fp) + if warn: + console.print(warn) + files_skipped += 1 + continue + if cached and cached.get("stat") == stat: + cache_hits += 1 + all_units.extend( + cast( + list[GroupItem], + cast(object, cached.get("units", [])), ) - all_blocks.extend( - cast( - list[GroupItem], - cast(object, cached.get("blocks", [])), - ) + ) + all_blocks.extend( + cast( + list[GroupItem], + cast(object, cached.get("blocks", [])), ) - all_segments.extend( - cast( - list[GroupItem], - cast(object, cached.get("segments", [])), - ) + ) + all_segments.extend( + cast( + list[GroupItem], + cast(object, cached.get("segments", [])), ) - else: - files_to_process.append(fp) + ) + else: + files_to_process.append(fp) + + try: + if args.quiet: + _discover_files() else: with console.status(ui.STATUS_DISCOVERING, spinner="dots"): - for fp in iter_py_files(str(root_path)): - files_found += 1 - stat, cached, warn = _get_cached_entry(fp) - if warn: - console.print(warn) - files_skipped += 1 - continue - if cached and cached.get("stat") == stat: - cache_hits += 1 - all_units.extend( - cast( - list[GroupItem], - cast(object, cached.get("units", [])), - ) - ) - all_blocks.extend( - cast( - list[GroupItem], - cast(object, cached.get("blocks", [])), - ) - ) - all_segments.extend( - cast( - list[GroupItem], - cast(object, cached.get("segments", [])), - ) - ) - else: - files_to_process.append(fp) + _discover_files() except OSError as e: console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e))) sys.exit(ExitCode.CONTRACT_ERROR) diff --git a/codeclone/extractor.py b/codeclone/extractor.py index d3b81c7..a2e814f 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -16,6 +16,7 @@ from contextlib import contextmanager from dataclasses import dataclass +from .blockhash import stmt_hash from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments from .cfg import CFGBuilder from .errors import ParseError @@ -250,28 +251,42 @@ def extract_units_from_source( ) ) - # Block-level units (exclude __init__) - if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10: - blocks = extract_blocks( - node, - filepath=filepath, - qualname=qualname, - cfg=cfg, - block_size=4, - max_blocks=15, - ) - block_units.extend(blocks) - - # Segment-level units (windows within functions, for internal clones) - if loc >= 30 and stmt_count >= 12: - segments = extract_segments( - node, - filepath=filepath, - qualname=qualname, - cfg=cfg, - window_size=6, - max_segments=60, - ) - segment_units.extend(segments) + # Block-level and segment-level units share statement hashes + needs_blocks = ( + not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10 + ) + needs_segments = loc >= 30 and stmt_count >= 12 + + if needs_blocks or needs_segments: + body = getattr(node, "body", None) + hashes: list[str] | None = None + if isinstance(body, list): + hashes = [stmt_hash(stmt, cfg) for stmt in body] + + if needs_blocks: + block_units.extend( + extract_blocks( + node, + filepath=filepath, + qualname=qualname, + cfg=cfg, + block_size=4, + max_blocks=15, + precomputed_hashes=hashes, + ) + ) + + if needs_segments: + segment_units.extend( + extract_segments( + node, + filepath=filepath, + qualname=qualname, + cfg=cfg, + window_size=6, + max_segments=60, + precomputed_hashes=hashes, + ) + ) return units, block_units, segment_units diff --git a/codeclone/scanner.py b/codeclone/scanner.py index b2421bf..0588701 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -77,8 +77,9 @@ def iter_py_files( if root_str.startswith(sensitive + "/"): raise ValidationError(f"Cannot scan under sensitive directory: {root}") - file_count = 0 - for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)): + # Collect and filter first, then sort — avoids sorting excluded paths + candidates: list[Path] = [] + for p in rootp.rglob("*.py"): # Verify path is actually under root (prevent symlink attacks) try: p.resolve().relative_to(rootp) @@ -90,12 +91,15 @@ def iter_py_files( if any(ex in parts for ex in excludes): continue - file_count += 1 - if file_count > max_files: - raise ValidationError( - f"File count exceeds limit of {max_files}. " - "Use more specific root or increase limit." - ) + candidates.append(p) + + if len(candidates) > max_files: + raise ValidationError( + f"File count exceeds limit of {max_files}. " + "Use more specific root or increase limit." + ) + + for p in sorted(candidates, key=lambda path: str(path)): yield str(p) diff --git a/pyproject.toml b/pyproject.toml index 3ef98f5..a5eb966 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codeclone" -version = "1.4.1" +version = "1.4.2" description = "AST and CFG-based code clone detector for Python focused on architectural duplication" readme = { file = "README.md", content-type = "text/markdown" } license = { text = "MIT" } diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index 304a201..684a12c 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -28,10 +28,14 @@ def test_process_file_stat_error( src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") - def _boom(_path: str) -> int: - raise OSError("nope") + _original_stat = os.stat - monkeypatch.setattr(os.path, "getsize", _boom) + def _boom(path: str, *args: object, **kwargs: object) -> os.stat_result: + if str(path) == str(src): + raise OSError("nope") + return _original_stat(path, *args, **kwargs) # type: ignore[arg-type] + + monkeypatch.setattr(os, "stat", _boom) result = process_file(str(src), str(tmp_path), NormalizationConfig(), 1, 1) assert result.success is False assert result.error is not None diff --git a/tests/test_security.py b/tests/test_security.py index 9bf64a1..3e5c474 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -30,18 +30,34 @@ def test_process_file_size_limit() -> None: try: cfg = NormalizationConfig() + real_stat = os.stat(tmp_path) - # Mock os.path.getsize to return huge size - with patch("os.path.getsize", return_value=MAX_FILE_SIZE + 1): + # Mock os.stat to return huge st_size + def _huge_stat(path: str, *args: object, **kwargs: object) -> os.stat_result: + return os.stat_result( + ( + real_stat.st_mode, + real_stat.st_ino, + real_stat.st_dev, + real_stat.st_nlink, + real_stat.st_uid, + real_stat.st_gid, + MAX_FILE_SIZE + 1, # st_size + int(real_stat.st_atime), + int(real_stat.st_mtime), + int(real_stat.st_ctime), + ) + ) + + with patch("os.stat", side_effect=_huge_stat): result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0) assert result.success is False assert result.error is not None assert "File too large" in result.error - # Normal size should pass - with patch("os.path.getsize", return_value=10): - result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0) - assert result.success is True + # Normal size should pass (no mock — real stat) + result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0) + assert result.success is True finally: os.remove(tmp_path) From 9b247efff483558abb6beca1a6eebdaee4bd740e Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Tue, 17 Feb 2026 19:10:09 +0500 Subject: [PATCH 2/2] test(extractor): cover block/segment gate branches and hash reuse fallback --- tests/test_extractor.py | 105 ++++++++++++++++++++++++++++++++++++++++ uv.lock | 2 +- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/tests/test_extractor.py b/tests/test_extractor.py index c1bc568..5849aa0 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -374,6 +374,111 @@ def f(): assert segments == [] +def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() -> None: + lines = ["def f():"] + for i in range(12): + lines.append(f" x{i} = {i}") + lines.append("") + lines.append("") + src = "\n".join(lines) + + units, blocks, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert units + assert blocks == [] + assert segments + + +def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> None: + lines = ["def f():"] + for i in range(10): + lines.append(f" x{i} = {i}") + lines.append("") + lines.append("") + lines.append("") + lines.append("") + src = "\n".join(lines) + + units, blocks, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert units + assert blocks + assert segments == [] + + +def test_extract_handles_non_list_function_body_for_hash_reuse( + monkeypatch: pytest.MonkeyPatch, +) -> None: + lines = ["def f():"] + for i in range(12): + lines.append(f" x{i} = {i}") + lines.append("") + lines.append("") + tree = ast.parse("\n".join(lines)) + func = tree.body[0] + assert isinstance(func, ast.FunctionDef) + func.body = tuple(func.body) # type: ignore[assignment] + + captured_hashes: dict[str, object] = {} + + def _fake_parse(_source: str, _timeout_s: int) -> ast.AST: + return tree + + def _fake_fingerprint( + _node: ast.FunctionDef | ast.AsyncFunctionDef, + _cfg: NormalizationConfig, + _qualname: str, + ) -> str: + return "f" * 40 + + def _fake_extract_segments( + _node: ast.FunctionDef | ast.AsyncFunctionDef, + filepath: str, + qualname: str, + cfg: NormalizationConfig, + window_size: int = 6, + max_segments: int = 60, + *, + precomputed_hashes: list[str] | None = None, + ) -> list[object]: + del filepath, qualname, cfg, window_size, max_segments + captured_hashes["value"] = precomputed_hashes + return [] + + monkeypatch.setattr(extractor, "_parse_with_limits", _fake_parse) + monkeypatch.setattr(extractor, "_stmt_count", lambda _node: 12) + monkeypatch.setattr(extractor, "get_cfg_fingerprint", _fake_fingerprint) + monkeypatch.setattr(extractor, "extract_segments", _fake_extract_segments) + + units, blocks, segments = extract_units_from_source( + source="def f():\n pass\n", + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert len(units) == 1 + assert blocks == [] + assert segments == [] + assert captured_hashes["value"] is None + + def test_extract_skips_invalid_positions(monkeypatch: pytest.MonkeyPatch) -> None: tree = ast.parse( """ diff --git a/uv.lock b/uv.lock index 21ffe1c..3bc62ff 100644 --- a/uv.lock +++ b/uv.lock @@ -189,7 +189,7 @@ wheels = [ [[package]] name = "codeclone" -version = "1.4.1" +version = "1.4.2" source = { editable = "." } dependencies = [ { name = "pygments" },