From 67a2a1642ef289812ec28fae84d00b5581e01359 Mon Sep 17 00:00:00 2001
From: Den Rozhnovskiy <pytelemonbot@mail.ru>
Date: Sun, 15 Feb 2026 20:55:25 +0500
Subject: [PATCH 1/2] fix(perf): reduce redundant syscalls and hash
 computations

---
 CHANGELOG.md           |  51 +++++++++++++++++++++
 codeclone/blocks.py    |  21 ++++++++-
 codeclone/cache.py     |  10 ++--
 codeclone/cli.py       | 101 ++++++++++++++++-------------------------
 codeclone/extractor.py |  61 +++++++++++++++----------
 codeclone/scanner.py   |  20 ++++----
 pyproject.toml         |   2 +-
 tests/test_cli_unit.py |  10 ++--
 tests/test_security.py |  28 +++++++++---
 9 files changed, 194 insertions(+), 110 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d24daf0..78fe855 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,56 @@
 # Changelog
 
+## [1.4.2] - 2026-02-17
+
+### Overview
+
+This patch release is a maintenance update. Determinism remains guaranteed: reports are stable and ordering is
+unchanged.
+
+### Performance & Implementation Cleanup
+
+- `process_file()` now uses a single `os.stat()` call to obtain both size (size guard) and `st_mtime_ns`/`st_size` (file
+  stat signature), removing a redundant `os.path.getsize()` call.
+- Discovery logic was deduplicated by extracting `_discover_files()`; quiet/non-quiet behavior differs only by UI status
+  wrapper, not by semantics or filtering.
+- Cache path wiring now precomputes `wire_map` so `_wire_filepath_from_runtime()` is evaluated once per key.
+
+### Hash Reuse for Block/Segment Analysis
+
+- `extract_blocks()` and `extract_segments()` accept optional `precomputed_hashes`. When provided, they reuse hashes
+  instead of recomputing.
+- The extractor computes function body hashes once and passes them to both block and segment extraction when both
+  analyses run for the same function.
+
+### Scanner Efficiency (No Semantic Change)
+
+- `iter_py_files()` now filters candidates before sorting, so only valid candidates are sorted. Output order stays
+  deterministic and unchanged; the `max_files` limit is now checked once, before any path is yielded.
+
+### Contract Tightening
+
+- `precomputed_hashes` annotation changed: `list[str] | None` → `Sequence[str] | None` (accepts any sequence and
+  signals read-only intent in the type contract).
+- Added `assert len(precomputed_hashes) == len(body)` in both `extract_blocks()` and `extract_segments()` to catch
+  mismatched inputs early (development-time invariant).
+
+### Testing & Determinism
+
+- JSON reports verified across repeated runs: the semantic payload is byte-identical, and any differences are limited
+  to volatile/provenance meta fields (e.g., cache status/path, timestamps).
+- Unit tests updated to mock `os.stat` instead of `os.path.getsize` where applicable (`test_process_file_stat_error`,
+  `test_process_file_size_limit`).
+
+### Notes
+
+- No changes to:
+  - detection semantics / fingerprints
+  - baseline hash inputs (`payload_sha256` semantic payload)
+  - exit code contract and precedence
+  - schema versions (baseline v1.0, cache v1.2, report v1.1)
+
+---
+
 ## [1.4.1] - 2026-02-15
 
 ### CLI
diff --git a/codeclone/blocks.py b/codeclone/blocks.py
index 3469361..12a5526 100644
--- a/codeclone/blocks.py
+++ b/codeclone/blocks.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import ast
+from collections.abc import Sequence
 from dataclasses import dataclass
 
 from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
     cfg: NormalizationConfig,
     block_size: int,
     max_blocks: int,
+    precomputed_hashes: Sequence[str] | None = None,
 ) -> list[BlockUnit]:
     body = getattr(func_node, "body", None)
     if not isinstance(body, list) or len(body) < block_size:
         return []
 
-    stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
+    if precomputed_hashes is not None:
+        assert len(precomputed_hashes) == len(body), (
+            f"precomputed_hashes length {len(precomputed_hashes)} "
+            f"!= body length {len(body)}"
+        )
+        stmt_hashes = precomputed_hashes
+    else:
+        stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
 
     blocks: list[BlockUnit] = []
     last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
     cfg: NormalizationConfig,
     window_size: int,
     max_segments: int,
+    precomputed_hashes: Sequence[str] | None = None,
 ) -> list[SegmentUnit]:
     body = getattr(func_node, "body", None)
     if not isinstance(body, list) or len(body) < window_size:
         return []
 
-    stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
+    if precomputed_hashes is not None:
+        assert len(precomputed_hashes) == len(body), (
+            f"precomputed_hashes length {len(precomputed_hashes)} "
+            f"!= body length {len(body)}"
+        )
+        stmt_hashes = precomputed_hashes
+    else:
+        stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
 
     segments: list[SegmentUnit] = []
 
diff --git a/codeclone/cache.py b/codeclone/cache.py
index 3753148..942c46b 100644
--- a/codeclone/cache.py
+++ b/codeclone/cache.py
@@ -344,14 +344,14 @@ def save(self) -> None:
         try:
             self.path.parent.mkdir(parents=True, exist_ok=True)
             wire_files: dict[str, object] = {}
-            for runtime_path in sorted(
-                self.data["files"], key=self._wire_filepath_from_runtime
-            ):
+            wire_map = {
+                rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
+            }
+            for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
                 entry = self.get_file_entry(runtime_path)
                 if entry is None:
                     continue
-                wire_path = self._wire_filepath_from_runtime(runtime_path)
-                wire_files[wire_path] = _encode_wire_file_entry(entry)
+                wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)
 
             payload: dict[str, object] = {
                 "py": current_python_tag(),
diff --git a/codeclone/cli.py b/codeclone/cli.py
index b949089..5ac9ce2 100644
--- a/codeclone/cli.py
+++ b/codeclone/cli.py
@@ -122,14 +122,14 @@ def process_file(
     """
 
     try:
-        # Check file size
+        # Single os.stat() for both size check and cache signature
         try:
-            st_size = os.path.getsize(filepath)
-            if st_size > MAX_FILE_SIZE:
+            st = os.stat(filepath)
+            if st.st_size > MAX_FILE_SIZE:
                 return ProcessingResult(
                     filepath=filepath,
                     success=False,
-                    error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
+                    error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
                     error_kind="file_too_large",
                 )
         except OSError as e:
@@ -140,6 +140,8 @@ def process_file(
                 error_kind="stat_error",
             )
 
+        stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}
+
         try:
             source = Path(filepath).read_text("utf-8")
         except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@ def process_file(
                 error_kind="source_read_error",
             )
 
-        stat = file_stat_signature(filepath)
         module_name = module_name_from_path(root, filepath)
 
         units, blocks, segments = extract_units_from_source(
@@ -355,68 +356,44 @@ def _safe_future_result(
             return None, str(e)
 
     # Discovery phase
-    try:
-        if args.quiet:
-            for fp in iter_py_files(str(root_path)):
-                files_found += 1
-                stat, cached, warn = _get_cached_entry(fp)
-                if warn:
-                    console.print(warn)
-                    files_skipped += 1
-                    continue
-                if cached and cached.get("stat") == stat:
-                    cache_hits += 1
-                    all_units.extend(
-                        cast(
-                            list[GroupItem],
-                            cast(object, cached.get("units", [])),
-                        )
+    def _discover_files() -> None:
+        nonlocal files_found, cache_hits, files_skipped
+        for fp in iter_py_files(str(root_path)):
+            files_found += 1
+            stat, cached, warn = _get_cached_entry(fp)
+            if warn:
+                console.print(warn)
+                files_skipped += 1
+                continue
+            if cached and cached.get("stat") == stat:
+                cache_hits += 1
+                all_units.extend(
+                    cast(
+                        list[GroupItem],
+                        cast(object, cached.get("units", [])),
                     )
-                    all_blocks.extend(
-                        cast(
-                            list[GroupItem],
-                            cast(object, cached.get("blocks", [])),
-                        )
+                )
+                all_blocks.extend(
+                    cast(
+                        list[GroupItem],
+                        cast(object, cached.get("blocks", [])),
                     )
-                    all_segments.extend(
-                        cast(
-                            list[GroupItem],
-                            cast(object, cached.get("segments", [])),
-                        )
+                )
+                all_segments.extend(
+                    cast(
+                        list[GroupItem],
+                        cast(object, cached.get("segments", [])),
                     )
-                else:
-                    files_to_process.append(fp)
+                )
+            else:
+                files_to_process.append(fp)
+
+    try:
+        if args.quiet:
+            _discover_files()
         else:
             with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
-                for fp in iter_py_files(str(root_path)):
-                    files_found += 1
-                    stat, cached, warn = _get_cached_entry(fp)
-                    if warn:
-                        console.print(warn)
-                        files_skipped += 1
-                        continue
-                    if cached and cached.get("stat") == stat:
-                        cache_hits += 1
-                        all_units.extend(
-                            cast(
-                                list[GroupItem],
-                                cast(object, cached.get("units", [])),
-                            )
-                        )
-                        all_blocks.extend(
-                            cast(
-                                list[GroupItem],
-                                cast(object, cached.get("blocks", [])),
-                            )
-                        )
-                        all_segments.extend(
-                            cast(
-                                list[GroupItem],
-                                cast(object, cached.get("segments", [])),
-                            )
-                        )
-                    else:
-                        files_to_process.append(fp)
+                _discover_files()
     except OSError as e:
         console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
         sys.exit(ExitCode.CONTRACT_ERROR)
diff --git a/codeclone/extractor.py b/codeclone/extractor.py
index d3b81c7..a2e814f 100644
--- a/codeclone/extractor.py
+++ b/codeclone/extractor.py
@@ -16,6 +16,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 
+from .blockhash import stmt_hash
 from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
 from .cfg import CFGBuilder
 from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
             )
         )
 
-        # Block-level units (exclude __init__)
-        if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
-            blocks = extract_blocks(
-                node,
-                filepath=filepath,
-                qualname=qualname,
-                cfg=cfg,
-                block_size=4,
-                max_blocks=15,
-            )
-            block_units.extend(blocks)
-
-        # Segment-level units (windows within functions, for internal clones)
-        if loc >= 30 and stmt_count >= 12:
-            segments = extract_segments(
-                node,
-                filepath=filepath,
-                qualname=qualname,
-                cfg=cfg,
-                window_size=6,
-                max_segments=60,
-            )
-            segment_units.extend(segments)
+        # Block-level and segment-level units share statement hashes
+        needs_blocks = (
+            not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
+        )
+        needs_segments = loc >= 30 and stmt_count >= 12
+
+        if needs_blocks or needs_segments:
+            body = getattr(node, "body", None)
+            hashes: list[str] | None = None
+            if isinstance(body, list):
+                hashes = [stmt_hash(stmt, cfg) for stmt in body]
+
+            if needs_blocks:
+                block_units.extend(
+                    extract_blocks(
+                        node,
+                        filepath=filepath,
+                        qualname=qualname,
+                        cfg=cfg,
+                        block_size=4,
+                        max_blocks=15,
+                        precomputed_hashes=hashes,
+                    )
+                )
+
+            if needs_segments:
+                segment_units.extend(
+                    extract_segments(
+                        node,
+                        filepath=filepath,
+                        qualname=qualname,
+                        cfg=cfg,
+                        window_size=6,
+                        max_segments=60,
+                        precomputed_hashes=hashes,
+                    )
+                )
 
     return units, block_units, segment_units
diff --git a/codeclone/scanner.py b/codeclone/scanner.py
index b2421bf..0588701 100644
--- a/codeclone/scanner.py
+++ b/codeclone/scanner.py
@@ -77,8 +77,9 @@ def iter_py_files(
             if root_str.startswith(sensitive + "/"):
                 raise ValidationError(f"Cannot scan under sensitive directory: {root}")
 
-    file_count = 0
-    for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
+    # Collect and filter first, then sort — avoids sorting excluded paths
+    candidates: list[Path] = []
+    for p in rootp.rglob("*.py"):
         # Verify path is actually under root (prevent symlink attacks)
         try:
             p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@ def iter_py_files(
         if any(ex in parts for ex in excludes):
             continue
 
-        file_count += 1
-        if file_count > max_files:
-            raise ValidationError(
-                f"File count exceeds limit of {max_files}. "
-                "Use more specific root or increase limit."
-            )
+        candidates.append(p)
+
+    if len(candidates) > max_files:
+        raise ValidationError(
+            f"File count exceeds limit of {max_files}. "
+            "Use more specific root or increase limit."
+        )
+
+    for p in sorted(candidates, key=lambda path: str(path)):
         yield str(p)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 3ef98f5..a5eb966 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "codeclone"
-version = "1.4.1"
+version = "1.4.2"
 description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { text = "MIT" }
diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py
index 304a201..684a12c 100644
--- a/tests/test_cli_unit.py
+++ b/tests/test_cli_unit.py
@@ -28,10 +28,14 @@ def test_process_file_stat_error(
     src = tmp_path / "a.py"
     src.write_text("def f():\n    return 1\n", "utf-8")
 
-    def _boom(_path: str) -> int:
-        raise OSError("nope")
+    _original_stat = os.stat
 
-    monkeypatch.setattr(os.path, "getsize", _boom)
+    def _boom(path: str, *args: object, **kwargs: object) -> os.stat_result:
+        if str(path) == str(src):
+            raise OSError("nope")
+        return _original_stat(path, *args, **kwargs)  # type: ignore[arg-type]
+
+    monkeypatch.setattr(os, "stat", _boom)
     result = process_file(str(src), str(tmp_path), NormalizationConfig(), 1, 1)
     assert result.success is False
     assert result.error is not None
diff --git a/tests/test_security.py b/tests/test_security.py
index 9bf64a1..3e5c474 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -30,18 +30,34 @@ def test_process_file_size_limit() -> None:
 
     try:
         cfg = NormalizationConfig()
+        real_stat = os.stat(tmp_path)
 
-        # Mock os.path.getsize to return huge size
-        with patch("os.path.getsize", return_value=MAX_FILE_SIZE + 1):
+        # Mock os.stat to return huge st_size
+        def _huge_stat(path: str, *args: object, **kwargs: object) -> os.stat_result:
+            return os.stat_result(
+                (
+                    real_stat.st_mode,
+                    real_stat.st_ino,
+                    real_stat.st_dev,
+                    real_stat.st_nlink,
+                    real_stat.st_uid,
+                    real_stat.st_gid,
+                    MAX_FILE_SIZE + 1,  # st_size
+                    int(real_stat.st_atime),
+                    int(real_stat.st_mtime),
+                    int(real_stat.st_ctime),
+                )
+            )
+
+        with patch("os.stat", side_effect=_huge_stat):
             result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
             assert result.success is False
             assert result.error is not None
             assert "File too large" in result.error
 
-        # Normal size should pass
-        with patch("os.path.getsize", return_value=10):
-            result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
-            assert result.success is True
+        # Normal size should pass (no mock — real stat)
+        result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
+        assert result.success is True
 
     finally:
         os.remove(tmp_path)

From 9b247efff483558abb6beca1a6eebdaee4bd740e Mon Sep 17 00:00:00 2001
From: Den Rozhnovskiy <pytelemonbot@mail.ru>
Date: Tue, 17 Feb 2026 19:10:09 +0500
Subject: [PATCH 2/2] test(extractor): cover block/segment gate branches and
 hash reuse fallback

---
 tests/test_extractor.py | 105 ++++++++++++++++++++++++++++++++++++++++
 uv.lock                 |   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/tests/test_extractor.py b/tests/test_extractor.py
index c1bc568..5849aa0 100644
--- a/tests/test_extractor.py
+++ b/tests/test_extractor.py
@@ -374,6 +374,111 @@ def f():
     assert segments == []
 
 
+def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() -> None:
+    lines = ["def f():"]
+    for i in range(12):
+        lines.append(f"    x{i} = {i}")
+        lines.append("")
+        lines.append("")
+    src = "\n".join(lines)
+
+    units, blocks, segments = extract_units_from_source(
+        source=src,
+        filepath="x.py",
+        module_name="mod",
+        cfg=NormalizationConfig(),
+        min_loc=1,
+        min_stmt=1,
+    )
+
+    assert units
+    assert blocks == []
+    assert segments
+
+
+def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> None:
+    lines = ["def f():"]
+    for i in range(10):
+        lines.append(f"    x{i} = {i}")
+        lines.append("")
+        lines.append("")
+        lines.append("")
+        lines.append("")
+    src = "\n".join(lines)
+
+    units, blocks, segments = extract_units_from_source(
+        source=src,
+        filepath="x.py",
+        module_name="mod",
+        cfg=NormalizationConfig(),
+        min_loc=1,
+        min_stmt=1,
+    )
+
+    assert units
+    assert blocks
+    assert segments == []
+
+
+def test_extract_handles_non_list_function_body_for_hash_reuse(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    lines = ["def f():"]
+    for i in range(12):
+        lines.append(f"    x{i} = {i}")
+        lines.append("")
+        lines.append("")
+    tree = ast.parse("\n".join(lines))
+    func = tree.body[0]
+    assert isinstance(func, ast.FunctionDef)
+    func.body = tuple(func.body)  # type: ignore[assignment]
+
+    captured_hashes: dict[str, object] = {}
+
+    def _fake_parse(_source: str, _timeout_s: int) -> ast.AST:
+        return tree
+
+    def _fake_fingerprint(
+        _node: ast.FunctionDef | ast.AsyncFunctionDef,
+        _cfg: NormalizationConfig,
+        _qualname: str,
+    ) -> str:
+        return "f" * 40
+
+    def _fake_extract_segments(
+        _node: ast.FunctionDef | ast.AsyncFunctionDef,
+        filepath: str,
+        qualname: str,
+        cfg: NormalizationConfig,
+        window_size: int = 6,
+        max_segments: int = 60,
+        *,
+        precomputed_hashes: list[str] | None = None,
+    ) -> list[object]:
+        del filepath, qualname, cfg, window_size, max_segments
+        captured_hashes["value"] = precomputed_hashes
+        return []
+
+    monkeypatch.setattr(extractor, "_parse_with_limits", _fake_parse)
+    monkeypatch.setattr(extractor, "_stmt_count", lambda _node: 12)
+    monkeypatch.setattr(extractor, "get_cfg_fingerprint", _fake_fingerprint)
+    monkeypatch.setattr(extractor, "extract_segments", _fake_extract_segments)
+
+    units, blocks, segments = extract_units_from_source(
+        source="def f():\n    pass\n",
+        filepath="x.py",
+        module_name="mod",
+        cfg=NormalizationConfig(),
+        min_loc=1,
+        min_stmt=1,
+    )
+
+    assert len(units) == 1
+    assert blocks == []
+    assert segments == []
+    assert captured_hashes["value"] is None
+
+
 def test_extract_skips_invalid_positions(monkeypatch: pytest.MonkeyPatch) -> None:
     tree = ast.parse(
         """
diff --git a/uv.lock b/uv.lock
index 21ffe1c..3bc62ff 100644
--- a/uv.lock
+++ b/uv.lock
@@ -189,7 +189,7 @@ wheels = [
 
 [[package]]
 name = "codeclone"
-version = "1.4.1"
+version = "1.4.2"
 source = { editable = "." }
 dependencies = [
     { name = "pygments" },