Skip to content

Commit f2cb95e

Browse files
committed
fix(core): fix detection integrity
1 parent e99dc57 commit f2cb95e

13 files changed

Lines changed: 909 additions & 86 deletions

CHANGELOG.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@ workflows that reuse old baselines.
1212
### Clone Detection Accuracy
1313

1414
- **Commutative normalization**
15-
Canonicalized operand order for `+`, `*`, `|`, `&`, `^` when operands are free of side
16-
effects, enabling safe detection of reordered expressions.
15+
Canonicalized operand order for `+`, `*`, `|`, `&`, `^` only for provably safe constant
16+
domains. Symbolic operands are no longer reordered.
1717

1818
- **Local logical equivalence**
1919
Normalized `not (x in y)` to `x not in y` and `not (x is y)` to `x is not y` without
2020
De Morgan transformations or broader boolean rewrites.
2121

22+
- **Call-target preservation**
23+
Kept symbolic call targets during normalization to avoid conflating different APIs
24+
(for example, `load_user(...)` vs `delete_user(...)`).
25+
2226
### CFG Precision
2327

2428
- **Short‑circuit modeling**
@@ -28,6 +32,19 @@ workflows that reuse old baselines.
2832
Linked `try/except` only to statements that may raise (calls, attribute access, indexing,
2933
`await`, `yield from`, `raise`) instead of blanket links.
3034

35+
### Detection Integrity
36+
37+
- **Internal CFG marker hardening**
38+
Switched CFG metadata markers to an internal namespace (`__CC_META__::...`) emitted as
39+
synthetic AST names, preventing collisions with user string literals.
40+
41+
- **Ordered control-flow semantics**
42+
Modeled `break`/`continue` as terminating loop transitions, added correct `for/while ... else`
43+
semantics, preserved `match`/`case` evaluation order, and preserved `except` handler order.
44+
45+
- **Deterministic traversal order**
46+
Sorted discovered Python files so that processing and report ordering stay stable across runs and platforms.
47+
3148
### Segment‑Level Detection
3249

3350
- **Window fingerprints**
@@ -36,6 +53,7 @@ workflows that reuse old baselines.
3653
- **Candidate generation**
3754
Used an order‑insensitive signature for candidate grouping and a strict segment hash for
3855
final confirmation. Segment matches do not affect baseline or CI failure logic.
56+
3957
- **Noise reduction (report‑only)**
4058
Merged overlapping segment windows into a single span per function and suppressed
4159
boilerplate-only groups (attribute assignment wiring) with deterministic AST criteria.

codeclone.baseline.json

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
{
2-
"functions": [],
2+
"functions": [
3+
"056967a5e5569762522015b120810353dc84986a|20-49",
4+
"2482e4ffa6c3c349be9626d4a95abbbb57193345|20-49",
5+
"54b7b79d1ff3384fb96b3dd97944d7b67990b3f3|0-19",
6+
"554bfacfa9bf1565bd2f5ea36e89b6efaee29c2d|0-19",
7+
"b0070927d98fa8274982caef45107f4e6b4d6fef|0-19",
8+
"c8e7da40a2dc106d1aa092044f88f01d9abae054|0-19"
9+
],
310
"blocks": [],
411
"python_version": "3.13",
512
"baseline_version": "1.3.0",

codeclone/cfg.py

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010

1111
import ast
1212
from collections.abc import Iterable
13+
from dataclasses import dataclass
1314
from typing import Protocol, cast
1415

1516
from .cfg_model import CFG, Block
17+
from .meta_markers import CFG_META_PREFIX
1618

1719
__all__ = ["CFG", "CFGBuilder"]
1820

@@ -26,17 +28,28 @@ class _TryLike(Protocol):
2628
finalbody: list[ast.stmt]
2729

2830

31+
@dataclass(slots=True)
32+
class _LoopContext:
33+
continue_target: Block
34+
break_target: Block
35+
36+
37+
def _meta_expr(value: str) -> ast.Expr:
38+
return ast.Expr(value=ast.Name(id=f"{CFG_META_PREFIX}{value}", ctx=ast.Load()))
39+
40+
2941
# =========================
3042
# CFG Builder
3143
# =========================
3244

3345

3446
class CFGBuilder:
35-
__slots__ = ("cfg", "current")
47+
__slots__ = ("_loop_stack", "cfg", "current")
3648

3749
def __init__(self) -> None:
3850
self.cfg: CFG
3951
self.current: Block
52+
self._loop_stack: list[_LoopContext] = []
4053

4154
def build(
4255
self,
@@ -73,6 +86,12 @@ def _visit(self, stmt: ast.stmt) -> None:
7386
self.current.is_terminated = True
7487
self.current.add_successor(self.cfg.exit)
7588

89+
case ast.Break():
90+
self._visit_break(stmt)
91+
92+
case ast.Continue():
93+
self._visit_continue(stmt)
94+
7695
case ast.If():
7796
self._visit_if(stmt)
7897

@@ -123,36 +142,61 @@ def _visit_if(self, stmt: ast.If) -> None:
123142
def _visit_while(self, stmt: ast.While) -> None:
124143
cond_block = self.cfg.create_block()
125144
body_block = self.cfg.create_block()
145+
else_block = self.cfg.create_block() if stmt.orelse else None
126146
after_block = self.cfg.create_block()
127147

128148
self.current.add_successor(cond_block)
129149

130150
self.current = cond_block
131-
self._emit_condition(stmt.test, body_block, after_block)
151+
false_target = else_block if else_block is not None else after_block
152+
self._emit_condition(stmt.test, body_block, false_target)
132153

154+
self._loop_stack.append(
155+
_LoopContext(continue_target=cond_block, break_target=after_block)
156+
)
133157
self.current = body_block
134158
self._visit_statements(stmt.body)
135159
if not self.current.is_terminated:
136160
self.current.add_successor(cond_block)
161+
self._loop_stack.pop()
162+
163+
if else_block is not None:
164+
self.current = else_block
165+
self._visit_statements(stmt.orelse)
166+
if not self.current.is_terminated:
167+
self.current.add_successor(after_block)
137168

138169
self.current = after_block
139170

140171
def _visit_for(self, stmt: ast.For | ast.AsyncFor) -> None:
141172
iter_block = self.cfg.create_block()
142173
body_block = self.cfg.create_block()
174+
else_block = self.cfg.create_block() if stmt.orelse else None
143175
after_block = self.cfg.create_block()
144176

145177
self.current.add_successor(iter_block)
146178

147179
self.current = iter_block
148180
self.current.statements.append(ast.Expr(value=stmt.iter))
149181
self.current.add_successor(body_block)
150-
self.current.add_successor(after_block)
182+
self.current.add_successor(
183+
else_block if else_block is not None else after_block
184+
)
151185

186+
self._loop_stack.append(
187+
_LoopContext(continue_target=iter_block, break_target=after_block)
188+
)
152189
self.current = body_block
153190
self._visit_statements(stmt.body)
154191
if not self.current.is_terminated:
155192
self.current.add_successor(iter_block)
193+
self._loop_stack.pop()
194+
195+
if else_block is not None:
196+
self.current = else_block
197+
self._visit_statements(stmt.orelse)
198+
if not self.current.is_terminated:
199+
self.current.add_successor(after_block)
156200

157201
self.current = after_block
158202

@@ -188,19 +232,36 @@ def _visit_try(self, stmt: _TryLike) -> None:
188232
self.current.add_successor(try_entry)
189233
self.current = try_entry
190234

191-
handlers_blocks = [self.cfg.create_block() for _ in stmt.handlers]
235+
handler_test_blocks = [self.cfg.create_block() for _ in stmt.handlers]
236+
handler_body_blocks = [self.cfg.create_block() for _ in stmt.handlers]
192237
else_block = self.cfg.create_block() if stmt.orelse else None
193238
final_block = self.cfg.create_block()
194239

240+
for idx, (handler, test_block, body_block) in enumerate(
241+
zip(stmt.handlers, handler_test_blocks, handler_body_blocks, strict=True)
242+
):
243+
test_block.statements.append(_meta_expr(f"TRY_HANDLER_INDEX:{idx}"))
244+
if handler.type is not None:
245+
type_repr = ast.dump(handler.type, annotate_fields=False)
246+
test_block.statements.append(
247+
_meta_expr(f"TRY_HANDLER_TYPE:{type_repr}")
248+
)
249+
else:
250+
test_block.statements.append(_meta_expr("TRY_HANDLER_TYPE:BARE"))
251+
test_block.add_successor(body_block)
252+
if idx + 1 < len(handler_test_blocks):
253+
test_block.add_successor(handler_test_blocks[idx + 1])
254+
else:
255+
test_block.add_successor(final_block)
256+
195257
# Process each statement in try body
196258
# Link only statements that can raise to exception handlers
197259
for stmt_node in stmt.body:
198260
if self.current.is_terminated:
199261
break
200262

201-
if _stmt_can_raise(stmt_node):
202-
for h_block in handlers_blocks:
203-
self.current.add_successor(h_block)
263+
if _stmt_can_raise(stmt_node) and handler_test_blocks:
264+
self.current.add_successor(handler_test_blocks[0])
204265

205266
self._visit(stmt_node)
206267

@@ -212,11 +273,8 @@ def _visit_try(self, stmt: _TryLike) -> None:
212273
self.current.add_successor(final_block)
213274

214275
# Process handlers
215-
for handler, h_block in zip(stmt.handlers, handlers_blocks, strict=True):
216-
self.current = h_block
217-
if handler.type:
218-
self.current.statements.append(ast.Expr(value=handler.type))
219-
276+
for handler, body_block in zip(stmt.handlers, handler_body_blocks, strict=True):
277+
self.current = body_block
220278
self._visit_statements(handler.body)
221279
if not self.current.is_terminated:
222280
self.current.add_successor(final_block)
@@ -236,25 +294,40 @@ def _visit_try(self, stmt: _TryLike) -> None:
236294
def _visit_match(self, stmt: ast.Match) -> None:
237295
self.current.statements.append(ast.Expr(value=stmt.subject))
238296

239-
subject_block = self.current
297+
previous_test_block: Block | None = None
240298
after_block = self.cfg.create_block()
241299

242-
for case_ in stmt.cases:
243-
case_block = self.cfg.create_block()
244-
subject_block.add_successor(case_block)
300+
for idx, case_ in enumerate(stmt.cases):
301+
case_test_block = self.cfg.create_block()
302+
case_body_block = self.cfg.create_block()
303+
304+
if previous_test_block is None:
305+
self.current.add_successor(case_test_block)
306+
else:
307+
previous_test_block.add_successor(case_test_block)
245308

246-
self.current = case_block
309+
case_test_block.statements.append(_meta_expr(f"MATCH_CASE_INDEX:{idx}"))
247310

248311
# Record pattern structure
249312
pattern_repr = ast.dump(case_.pattern, annotate_fields=False)
250-
self.current.statements.append(
251-
ast.Expr(value=ast.Constant(value=f"PATTERN:{pattern_repr}"))
313+
case_test_block.statements.append(
314+
_meta_expr(f"MATCH_PATTERN:{pattern_repr}")
252315
)
316+
if case_.guard is not None:
317+
case_test_block.statements.append(ast.Expr(value=case_.guard))
318+
319+
case_test_block.add_successor(case_body_block)
253320

321+
self.current = case_body_block
254322
self._visit_statements(case_.body)
255323
if not self.current.is_terminated:
256324
self.current.add_successor(after_block)
257325

326+
previous_test_block = case_test_block
327+
328+
if previous_test_block is not None:
329+
previous_test_block.add_successor(after_block)
330+
258331
self.current = after_block
259332

260333
def _emit_condition(
@@ -300,6 +373,22 @@ def _emit_boolop(
300373

301374
self.current = current
302375

376+
def _visit_break(self, stmt: ast.Break) -> None:
377+
self.current.statements.append(stmt)
378+
self.current.is_terminated = True
379+
if self._loop_stack:
380+
self.current.add_successor(self._loop_stack[-1].break_target)
381+
return
382+
self.current.add_successor(self.cfg.exit)
383+
384+
def _visit_continue(self, stmt: ast.Continue) -> None:
385+
self.current.statements.append(stmt)
386+
self.current.is_terminated = True
387+
if self._loop_stack:
388+
self.current.add_successor(self._loop_stack[-1].continue_target)
389+
return
390+
self.current.add_successor(self.cfg.exit)
391+
303392

304393
def _stmt_can_raise(stmt: ast.stmt) -> bool:
305394
if isinstance(stmt, ast.Raise):

0 commit comments

Comments
 (0)