From 6b395481b64f5df445241a4372ade9dd1fcef5f3 Mon Sep 17 00:00:00 2001 From: Bryce Boe Date: Wed, 20 May 2026 07:04:11 -0700 Subject: [PATCH 1/2] feat(cc): narrow mov eax, imm to mov al, imm for out dx, al MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After peephole_fold_byte_immediate_through_local rewrites the *(uint8_t *)&local byte-load idiom into a full-width mov eax, , the only AX-touching consumer is out dx, al — which reads AL only. Add peephole_narrow_acc_immediate_for_byte_out that walks forward to confirm out dx, al is the consumer and that EAX upper bits are dead until the next full {acc} clobber, then narrows the load. Save: 3 bytes per site in 32-bit (mov eax, imm32 is 5 bytes; mov al, imm8 is 2 bytes), 1 byte in 16-bit. Phase 2 bails conservatively at labels, ret, and jumps, so loop-tail and trailing-port sites where the function may return without clobbering EAX stay unnarrowed. Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/codegen/x86/peephole.py | 111 ++++++++++++++++++++++++++++++++++ docs/CHANGELOG.md | 11 ++++ tests/unit/test_cc_codegen.py | 45 ++++++++++++++ 3 files changed, 167 insertions(+) diff --git a/cc/codegen/x86/peephole.py b/cc/codegen/x86/peephole.py index cf37ec49..18a85bd8 100644 --- a/cc/codegen/x86/peephole.py +++ b/cc/codegen/x86/peephole.py @@ -21,6 +21,8 @@ from cc.codegen.x86.jumps import JUMP_INVERT if TYPE_CHECKING: + from collections.abc import Callable + from cc.target import CodegenTarget @@ -1085,6 +1087,114 @@ def fits_imm8(literal: str, /) -> bool: del self.lines[i + 1 : i + 5] continue + def peephole_narrow_acc_immediate_for_byte_out(self) -> None: + """Narrow ``mov {acc}, <0..255>`` to ``mov al, `` when ``out dx, al`` consumes it. + + After :meth:`peephole_fold_byte_immediate_through_local` rewrites + ``movzx {acc}, byte [bp-N]`` into ``mov {acc}, `` (the + driver port-I/O byte-load idiom), the full-width load survives + even though the only consumer is ``out dx, al``, which reads + AL only. Narrowing to ``mov al, `` saves 3 bytes per site + in 32-bit mode (``mov eax, imm32`` is 5 bytes, ``mov al, imm8`` + is 2 bytes) and 1 byte in 16-bit mode (``mov ax, imm16`` is 3 + bytes, ``mov al, imm8`` is 2 bytes). + + After narrowing, the upper bits of {acc} hold the caller's + previous value instead of zero, so the rewrite is only safe + when nothing reads {acc} wider than AL between the load and + the next full clobber of {acc}. Forward scan from the + candidate ``mov {acc}, imm``: + + Phase 1 — find the consumer. Walk to ``out dx, al``; bail on + any wider-than-AL read of {acc}, any other write to {acc} + (would mean the load is already dead), or any control flow + (label, jump, ret, call). + + Phase 2 — confirm post-out safety. From the line after the + ``out``, walk until a full clobber of {acc} (``mov {acc}, ...``, + ``xor {acc}, {acc}``, ``movzx {acc}, ...``, ``pop {acc}``, + ``call ...``). Bail on any wider read, ``ret``, label, or + jump before that clobber. + + ``xor ah, ah`` and ``in al, dx`` are write-only on their + respective AX halves — neither reads {acc} wider than AL — + so the scan walks past them. They don't constitute a *full* + clobber of {acc} (the high 16 bits in 32-bit mode stay live), + which is why phase 2 must continue searching until a real + full-clobber instruction is found. + """ + acc = self.target.acc + mov_acc_imm_pattern = re.compile(rf"^\s*mov {re.escape(acc)}, (\d+)\s*$") + wider_tokens = ("ah", "ax") if acc == "ax" else ("ah", "ax", "eax") + wider_pattern = re.compile(rf"\b(?:{'|'.join(wider_tokens)})\b") + full_write_prefixes = ( + f"mov {acc}, ", + f"pop {acc}", + f"movzx {acc}, ", + ) + + def find_forward(start: int, *, matches: Callable[[str], bool], extra_bails: tuple[Callable[[str], bool], ...] = ()) -> int | None: + """Walk forward until ``matches`` fires; return that index, else None. + + Common bails for both phases: wider-than-AL read of {acc}, + label, ``ret``, ``jmp`` / Jcc, ``int``. ``extra_bails`` + adds caller-supplied bail predicates (phase 1 uses + :func:`is_full_acc_clobber` as one — reaching a clobber + before ``out`` means our load is already dead). + """ + for line_index in range(start, len(self.lines)): + candidate = self.lines[line_index].strip() + if matches(candidate): + return line_index + if reads_wider_than_al(candidate): + return None + if any(bail(candidate) for bail in extra_bails): + return None + if candidate.endswith(":") or candidate.startswith(("ret", "jmp ", "j", "int ")): + return None + return None + + def is_full_acc_clobber(stripped: str) -> bool: + return ( + any(stripped.startswith(prefix) for prefix in full_write_prefixes) + or stripped == f"xor {acc}, {acc}" + or stripped.startswith("call ") + ) + + def reads_wider_than_al(stripped: str) -> bool: + if any(stripped.startswith(prefix) for prefix in full_write_prefixes): + return False + if stripped == f"xor {acc}, {acc}": + return False + # ``xor ah, ah`` and ``mov ah, `` both write AH without + # reading it as a source; treat as non-reads. + if stripped == "xor ah, ah" or stripped.startswith("mov ah, "): + return False + return bool(wider_pattern.search(stripped)) + + for i, line in enumerate(self.lines): + match = mov_acc_imm_pattern.match(line) + if match is None: + continue + value = int(match.group(1)) + if not 0 <= value <= 255: + continue + # Phase 1: scan to the ``out dx, al`` consumer. An + # intervening full clobber means our load is dead — bail. + out_index = find_forward( + i + 1, + matches=lambda candidate: candidate == "out dx, al", + extra_bails=(is_full_acc_clobber,), + ) + if out_index is None: + continue + # Phase 2: confirm a full clobber follows before any wider + # read or control flow that could leak the upper bits. + if find_forward(out_index + 1, matches=is_full_acc_clobber) is None: + continue + indent = line[: len(line) - len(line.lstrip())] + self.lines[i] = f"{indent}mov al, {value}" + def peephole_redundant_bx(self) -> None: """Remove redundant ``mov bx, X`` / ``mov si, X`` reloads. @@ -1402,6 +1512,7 @@ def run(self) -> list[str]: self.peephole_store_reload() self.peephole_fold_byte_immediate_through_local() self.peephole_dead_temp_slots() + self.peephole_narrow_acc_immediate_for_byte_out() self.peephole_constant_to_register() self.peephole_register_arithmetic() self.peephole_self_move() diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 96ff7ab6..35581497 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -38,6 +38,17 @@ time. block sorts cleanly alphabetically — `strcmp` slides from PR #448's locked offset 52 into its natural slot at +84 — since `make_os.sh` re-emits every consumer against the current table on each run. + +- **cc.py: narrow ``mov eax, `` to ``mov al, `` when ``out dx, + al`` is the only consumer.** After + ``peephole_fold_byte_immediate_through_local`` rewrites the byte-load idiom + into a full-width immediate load, the only consumer is ``out`` which touches + AL only. A new peephole proves the upper bits of EAX are dead between the + load and the next full ``{acc}`` clobber, then narrows. Save: 3 bytes per + site in 32-bit (`B8 imm32` → `B0 imm8`), 1 byte in 16-bit. Phase 2 bails + conservatively at labels and `ret`, so loop-tail and trailing-port sites stay + unnarrowed. ~8 byte kernel-wide reduction in this pass. + - **cc.py: skip save-around-eval push/pop when ``kernel_outb`` / ``kernel_outw`` port is a literal.** The general non-const-value path was unconditionally emitting ``push eax; eval port → DX; pop eax`` to guard the value across port diff --git a/tests/unit/test_cc_codegen.py b/tests/unit/test_cc_codegen.py index 76d24a46..cf56e449 100644 --- a/tests/unit/test_cc_codegen.py +++ b/tests/unit/test_cc_codegen.py @@ -2662,6 +2662,51 @@ def test_peephole_fold_byte_immediate_through_local() -> None: assert "mov eax, 18" in asm or "mov al, 18" in asm, f"folded immediate load missing:\n{asm}" +def test_peephole_narrow_acc_immediate_for_byte_out() -> None: + """``mov eax, `` followed by ``out dx, al`` narrows to ``mov al, ``. + + After :meth:`peephole_fold_byte_immediate_through_local` produces + ``mov eax, `` from a byte-load idiom, the only consumer of AX + is ``out dx, al`` (which touches AL only). The 3-byte saving per + site applies when EAX's upper bits are dead after the out — proved + here by a follow-on call that caller-clobbers EAX. + """ + asm = _kernel( + """ + struct cr { uint8_t start: 1; uint8_t stop: 1; uint8_t txp: 1; + uint8_t reserved: 1; uint8_t page: 2; uint8_t rd: 2; }; + void other(); + void probe() { + struct cr c = { .stop = 1, .rd = 4, .page = 1 }; + kernel_outb(0x300, *(uint8_t *)&c); + other(); + } + """, + bits=32, + ) + assert "mov al, 18" in asm, f"narrowed byte-immediate load missing:\n{asm}" + assert "mov eax, 18" not in asm, f"unnarrowed full-width load survived:\n{asm}" + + +def test_peephole_narrow_acc_immediate_keeps_wider_consumer() -> None: + """Narrow is skipped when something reads {acc} wider than AL before clobber. + + Synthetic 16-bit sequence with an ``mov bx, ax`` between the + load and the ``out`` — the move-to-BX reads AX, which would see + caller-junk in AH if AX were narrowed to AL. The peephole must + leave ``mov ax, 18`` intact. + """ + out = _peephole_run([ + "f:", + " mov ax, 18", + " cmp ax, 17", + " mov dx, 768", + " out dx, al", + " ret", + ]) + assert " mov ax, 18" in out, f"narrowing should be skipped — AX consumed wider:\n{out}" + + def test_peephole_redundant_register_swap_drops_second_mov() -> None: """``mov A, B`` followed by ``mov B, A`` drops the second.""" out = _peephole_run([ From 417a5735ea7aeb726249faeb8984ad42f7d80538 Mon Sep 17 00:00:00 2001 From: Bryce Boe Date: Wed, 20 May 2026 09:53:50 -0700 Subject: [PATCH 2/2] feat(cc): skip dead pinned-register saves around builtin calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _pinned_registers_to_save was saving every pinned register in the builtin's clobber set unconditionally, even when the pinned local hadn't been written yet — preserving garbage from caller-supplied state. Add a per-function pre-pass over the IR that computes the may-defined set of pinned-register values at each builtin call site: - Initial set: registers held by parameters (loaded by the prologue). - On each ir.Copy / ir.BinaryOperation / ir.Index / ir.Call(dest=...) whose destination is a pinned local, add that local's register. - On each ir.Block, peek at the wrapped AST node — VarDecl with init and Assign are stores; MemberAssign / IndexAssign / opaque AST go through pointers or are escape-hatched, skip. - For each loop region (Label..backward-Jump), pre-merge body stores into the loop's entry set so the back-edge sees in-loop stores on every iteration past the first. Scope is intentionally limited to builtin calls. User function calls go through a separate save-set path that the IR-only analysis can't fully model — Block-wrapped statements that fall back to AST codegen, pointer-aliased pinned locals, and ir.CarryBranch wrapping carry-return callees all stay on the conservative save-everything path. Saves: -120 byte kasm reduction kernel-wide. Pre-loop kernel_outb / kernel_inb call sites in drivers (ata_init, etc.) no longer wrap with push/pop edx when the EDX-pinned local hasn't been initialised yet. ping user program shrinks 1544 -> 1522 bytes (archive table updated). Co-Authored-By: Claude Opus 4.7 (1M context) --- archive/README.md | 2 +- cc/codegen/x86/emission.py | 22 +++- cc/codegen/x86/generator.py | 205 +++++++++++++++++++++++++++++----- docs/CHANGELOG.md | 14 +++ tests/unit/test_cc_codegen.py | 58 ++++++++++ 5 files changed, 269 insertions(+), 32 deletions(-) diff --git a/archive/README.md b/archive/README.md index 16b79e73..39759785 100644 --- a/archive/README.md +++ b/archive/README.md @@ -29,7 +29,7 @@ programs (`make_os.sh` passes `--bits 32`); every row in this table is now | ls | 135 | 412 | 697 | +285 | | mkdir | 123 | 142 | 163 | +21 | | mv | 217 | 242 | 270 | +28 | -| ping | 1034 | 1230 | 1544 | +314 | +| ping | 1034 | 1230 | 1522 | +292 | | uptime | 50 | 67 | 102 | +35 | **arp (-99):** The three scratch arrays (`mac_buffer[6]`, `receive_buffer[128]`, diff --git a/cc/codegen/x86/emission.py b/cc/codegen/x86/emission.py index 30f280c5..487f9dc2 100644 --- a/cc/codegen/x86/emission.py +++ b/cc/codegen/x86/emission.py @@ -1672,11 +1672,19 @@ def _lower_ir_instruction(self, instruction: ir.Instruction) -> None: self.emit_store_local(expression=self._ir_value_to_ast(source), name=destination) case ir.Call(destination=None, name=name, args=args): call = Call(args=[self._ir_value_to_ast(a) for a in args], name=name) - self.generate_call(call, discard_return=True) + self._current_call_pinned_initialized = self._ir_call_pinned_initialized.get(id(instruction)) + try: + self.generate_call(call, discard_return=True) + finally: + self._current_call_pinned_initialized = None self.ax_clear() case ir.Call(destination=destination, name=name, args=args): call = Call(args=[self._ir_value_to_ast(a) for a in args], name=name) - self.emit_store_local(expression=call, name=destination) + self._current_call_pinned_initialized = self._ir_call_pinned_initialized.get(id(instruction)) + try: + self.emit_store_local(expression=call, name=destination) + finally: + self._current_call_pinned_initialized = None case ir.Index(destination=destination, base=base, index=index): expression = Index(array=Var(name=base), index=self._ir_value_to_ast(index)) self.emit_store_local(expression=expression, name=destination) @@ -1703,7 +1711,11 @@ def _lower_ir_instruction(self, instruction: ir.Instruction) -> None: # ``if`` / ``while`` condition. ``generate_call`` sets # up args (regparm / stack) the same way a direct call # would. - self.generate_call(call_ast, discard_return=True) + self._current_call_pinned_initialized = self._ir_call_pinned_initialized.get(id(instruction)) + try: + self.generate_call(call_ast, discard_return=True) + finally: + self._current_call_pinned_initialized = None self.emit(f" {'jc' if when == 'set' else 'jnc'} {target}") self.ax_clear() case ir.Return(value=value): @@ -1801,6 +1813,9 @@ def generate_function(self, function: Function | ir.Function, /) -> None: self.current_carry_return = function.carry_return self.current_function_is_main = name == "main" self.current_function_is_naked = function.naked + self._current_function_parameter_names: tuple[str, ...] = tuple(parameter.name for parameter in parameters) + self._ir_call_pinned_initialized = {} + self._current_call_pinned_initialized = None # Per-function user-label bookkeeping for the AST codegen path. # The IR path validates inside ir.Builder; main() and other AST- # path functions validate here after generate_body completes. @@ -2138,6 +2153,7 @@ def _body_has_stack_arrays(stmts: list[Node]) -> bool: if ir_body is not None: # IR path: lower the flat instruction list directly. + self._ir_call_pinned_initialized = self._compute_pinned_initialized_per_call(ir_body) self.lower_ir_body(ir_body) else: # Tail-call: if the last statement is a statement-level user- diff --git a/cc/codegen/x86/generator.py b/cc/codegen/x86/generator.py index bbc90003..b3d3b178 100644 --- a/cc/codegen/x86/generator.py +++ b/cc/codegen/x86/generator.py @@ -16,6 +16,7 @@ from dataclasses import fields from typing import ClassVar, NamedTuple +from cc import ir from cc.ast_nodes import ( AddressOf, ArrayDecl, @@ -271,6 +272,16 @@ def __init__( self.out_register_params: dict[str, dict[int, str]] = {} self.param_in_register: dict[str, str] = {} self.pinned_register: dict[str, str] = {} + # Liveness map for pinned-register saves: maps id(ir.Call / + # ir.CarryBranch) → frozenset of pinned-register names that are + # may-defined at that call site. Populated per function before + # IR lowering by _compute_pinned_initialized_per_call. + # _pinned_registers_to_save consults this to skip saves for + # pinned locals whose value isn't yet meaningful (e.g., + # auto-pinned locals declared but not yet stored to). None + # means "no info available" — fall back to saving everything. + self._ir_call_pinned_initialized: dict[int, frozenset[str]] = {} + self._current_call_pinned_initialized: frozenset[str] | None = None self.register_aliased_globals: dict[str, str] = {} # name → register (e.g. "si") self.store_target_register: str | None = None # known_local_bytes and _last_byte_store support the Phase C @@ -430,33 +441,6 @@ def _arg_pinned_sources(self, arg: Node, /) -> set[str]: return self._arg_pinned_sources(arg.left) | self._arg_pinned_sources(arg.right) return set() - def _collect_pinned_reads(self, node: Node, /) -> set[str]: - """Return every pinned register that *node*'s expression reads. - - Like :meth:`_arg_pinned_sources` but walks the full AST shape — - ``UnaryOperation``, ``AddressOf``, ``Index``, etc. — so it can - be used to schedule syscall-builtin argument loads where the - arg AST is not restricted to the simple-call shape. Returns - a set of register names (e.g. ``{"ebx", "edi"}``). - """ - reads: set[str] = set() - stack: list[Node] = [node] - while stack: - current = stack.pop() - if isinstance(current, Var): - if current.name in self.pinned_register: - reads.add(self.pinned_register[current.name]) - elif current.name in self.param_in_register: - reads.add(self.param_in_register[current.name]) - continue - for slot in getattr(type(current), "__slots__", ()): - child = getattr(current, slot, None) - if isinstance(child, Node): - stack.append(child) - elif isinstance(child, list): - stack.extend(item for item in child if isinstance(item, Node)) - return reads - def _arithmetic_element_size(self, var_name: str, /) -> int: """Return the element stride for pointer/array arithmetic on *var_name*. @@ -524,6 +508,133 @@ def _byte_index_direct(self, node: Index, /) -> str | None: offset = node.index.value return f"{const_base}+{offset}" if offset else const_base + def _collect_pinned_reads(self, node: Node, /) -> set[str]: + """Return every pinned register that *node*'s expression reads. + + Like :meth:`_arg_pinned_sources` but walks the full AST shape — + ``UnaryOperation``, ``AddressOf``, ``Index``, etc. — so it can + be used to schedule syscall-builtin argument loads where the + arg AST is not restricted to the simple-call shape. Returns + a set of register names (e.g. ``{"ebx", "edi"}``). + """ + reads: set[str] = set() + stack: list[Node] = [node] + while stack: + current = stack.pop() + if isinstance(current, Var): + if current.name in self.pinned_register: + reads.add(self.pinned_register[current.name]) + elif current.name in self.param_in_register: + reads.add(self.param_in_register[current.name]) + continue + for slot in getattr(type(current), "__slots__", ()): + child = getattr(current, slot, None) + if isinstance(child, Node): + stack.append(child) + elif isinstance(child, list): + stack.extend(item for item in child if isinstance(item, Node)) + return reads + + def _compute_pinned_initialized_per_call(self, ir_body: list, /) -> dict[int, frozenset[str]]: + """Pre-pass: for each ir.Call / ir.CarryBranch, the may-defined pinned register set. + + Auto-pinned locals are not initialized until the first store to + them. Saving a pinned register around a call before that + store preserves garbage — :meth:`_pinned_registers_to_save` + consults the map this method produces and skips the save when + the local can't yet hold a meaningful value. + + Initial defined set: registers held by parameters (loaded into + their pin in the prologue) and locals declared with + ``__attribute__((pinned_register(R)))`` whose initializer fired + as part of the declaration. Auto-pinned locals start + undefined. + + Loop bodies are pre-merged: any store inside a loop region + (Label..back-Jump) is added to the defined set BEFORE the + first instruction of the loop, so subsequent iterations see + the value as live. Without this, calls inside the loop body + that appear before the store in source order would skip a + save that the second iteration actually needs. + + Returns dict keyed by id(instruction). Empty / missing key + means "no live pin" so callers should treat absence as + ``frozenset()`` — distinct from ``None`` which means "no + analysis was performed" (AST path, naked function, etc.). + """ + pinned_locals: dict[str, str] = dict(self.pinned_register) + if not pinned_locals: + return {} + initial: set[str] = set(self._prologue_initialized_pinned_registers()) + + def store_target(instruction: object) -> str | None: + if isinstance(instruction, (ir.BinaryOperation, ir.Copy, ir.Index)): + return instruction.destination + if isinstance(instruction, ir.Block): + # Block-wrapped AST escape hatch. A VarDecl with + # initialiser is a store to its name; ditto an + # ``unsigned long`` Assign that the IR builder routes + # through Block. Pinned-to-register locals can't be + # ``unsigned long`` (they wouldn't fit a single register), + # so only the VarDecl case can hit a pinned target — + # but we still extract Assign / MemberAssign destinations + # defensively in case future IR shapes wrap them. + node = instruction.node + if isinstance(node, Assign): + return node.name + if isinstance(node, VarDecl) and node.init is not None: + return node.name + # MemberAssign / IndexAssign / inline asm write through + # pointers or are opaque — they don't store to a single + # named local register. Skip. + return None + if isinstance(instruction, ir.Call) and instruction.destination is not None: + return instruction.destination + if isinstance(instruction, ir.IndexAssign): + # IndexAssign writes through a base pointer, not to the + # named base itself — leaves the base's register + # contents unchanged. Not a store to the pin. + return None + return None + + label_positions: dict[str, int] = {} + for index, instruction in enumerate(ir_body): + if isinstance(instruction, ir.Label): + label_positions[instruction.name] = index + loop_ranges: list[tuple[int, int]] = [] + for index, instruction in enumerate(ir_body): + if isinstance(instruction, ir.Jump): + target = label_positions.get(instruction.target) + if target is not None and target < index: + loop_ranges.append((target, index)) + loop_stores: list[set[str]] = [] + for start, end in loop_ranges: + stores: set[str] = set() + for k in range(start, end + 1): + target_name = store_target(ir_body[k]) + if target_name in pinned_locals: + stores.add(pinned_locals[target_name]) + loop_stores.append(stores) + result: dict[int, frozenset[str]] = {} + defined: set[str] = set(initial) + for index, instruction in enumerate(ir_body): + for loop_index, (start, _end) in enumerate(loop_ranges): + if start == index: + defined |= loop_stores[loop_index] + # Only record filter sets for builtin calls. User function + # calls go through a different save-set path that this + # analysis can't fully model — Block-wrapped statements + # (the IR escape hatch) and pointer-aliased pinned locals + # could be invalidated by the call in ways our pre-pass + # doesn't see. CarryBranch always wraps a user-function + # (``carry_return`` callee); same skip. + if isinstance(instruction, ir.Call) and instruction.name in self._builtin_clobbers: + result[id(instruction)] = frozenset(defined) + target_name = store_target(instruction) + if target_name in pinned_locals: + defined.add(pinned_locals[target_name]) + return result + def _emit_bitfield_read(self, info: FieldInfo, /, *, addr: str) -> None: """Emit the load-shift-mask-extend sequence for a bitfield read. @@ -2166,18 +2277,56 @@ def _pinned_registers_to_save(self, clobbers: frozenset[str], /) -> list[str]: E-registers in protected mode and 16-bit aliases in real mode. Normalise both sides through ``target.low_word`` so the comparison still matches when the two halves disagree. + + When :attr:`_current_call_pinned_initialized` is set (by the + IR lowering pass via :meth:`_compute_pinned_initialized_per_call`), + registers whose pinned local has not yet been written are + filtered out — their value is undefined garbage and saving it + is dead. """ low_word = self.target.low_word normalised_clobbers = frozenset(low_word(register) for register in clobbers) + initialized_filter = self._current_call_pinned_initialized # Dedup via ``set``: liveness-driven sharing maps several names # to the same register, and emitting push/pop pairs once per # name would unbalance the stack. return sorted({ register for register in self.pinned_register.values() - if low_word(register) in normalised_clobbers and low_word(register) != "ax" + if low_word(register) in normalised_clobbers + and low_word(register) != "ax" + and (initialized_filter is None or register in initialized_filter) }) + def _prologue_initialized_pinned_registers(self) -> set[str]: + """Return the set of pinned registers whose value is meaningful at function entry. + + Parameters that are pinned (via ``in_register`` attribute, + auto-pin, or fastcall) are loaded into their pin by the + function prologue, so the register holds a meaningful caller- + supplied value from the first instruction onward. Auto-pinned + LOCALS (not parameters) are uninitialized until the first + store and are excluded. + + Locals with explicit ``__attribute__((pinned_register(R)))`` + live entirely in the register (no stack slot) — their first + write IS the initialisation, so they're treated the same as + auto-pinned locals here. + """ + initialized: set[str] = set() + for name, register in self.pinned_register.items(): + if name in self.param_in_register or name in self.in_register_params: + initialized.add(register) + # Catch all parameters that landed in self.pinned_register — + # the prologue loads them either from caller-pushed slots + # ([bp+N]) or from the register-convention fastcall slots + # (acc/dx/cx). Any name from the function's parameter list + # counts; locals do not. + for name in getattr(self, "_current_function_parameter_names", ()): + if name in self.pinned_register: + initialized.add(self.pinned_register[name]) + return initialized + def _register_globals(self, declarations: list[Node], /) -> None: """Record file-scope declarations and validate their shapes. diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 35581497..c7c983e0 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -39,6 +39,20 @@ time. offset 52 into its natural slot at +84 — since `make_os.sh` re-emits every consumer against the current table on each run. +- **cc.py: skip dead pinned-register saves around builtin calls before the + pinned local's first store.** ``_pinned_registers_to_save`` was saving every + pinned register in every clobber set, even when the pinned local hadn't been + assigned to yet — preserving garbage. New per-function pre-pass walks the IR, + computes the may-defined set of pinned registers at each builtin call site + (with loop bodies pre-merged into the entry set so the back-edge sees in-loop + stores), and threads it to the save logic. Block- wrapped ``VarDecl init`` / + ``Assign`` nodes are scanned so the IR escape hatch doesn't leak. Scope is + intentionally limited to builtin call sites — user-function call paths can + mutate pinned state through code the analysis can't see (Block-wrapped + statements, pointer aliases) so they stay on the conservative save-everything + path. -120 byte kasm reduction kernel-wide; ``ping`` user program shrinks + 1544 → 1522 bytes. + - **cc.py: narrow ``mov eax, `` to ``mov al, `` when ``out dx, al`` is the only consumer.** After ``peephole_fold_byte_immediate_through_local`` rewrites the byte-load idiom diff --git a/tests/unit/test_cc_codegen.py b/tests/unit/test_cc_codegen.py index cf56e449..7fb75b0e 100644 --- a/tests/unit/test_cc_codegen.py +++ b/tests/unit/test_cc_codegen.py @@ -2821,6 +2821,64 @@ def test_pinned_register_on_non_function_pointer_rejected() -> None: assert "pinned_register" in error, f"Expected pinned_register error, got: {error}" +def test_pinned_register_save_kept_after_var_decl_initialiser() -> None: + """``int count = n; kernel_outb(...);`` saves EDX once count is pinned to it. + + The IR builder routes ``VarDecl`` with initialiser through a + ``Block`` escape hatch — the pre-pass must look inside Block to + see the implicit store, otherwise it would mis-treat the pin as + uninitialised and elide the save for the first call. Regression + guard for that path. + """ + asm = _kernel( + """ + void test_param(int n) { + int count = n; + kernel_outb(0x1F3, 0); + kernel_outb(0x1F4, count); + } + """, + bits=32, + ) + first_call = asm.split("out dx, al", 1)[0] + assert "push edx" in first_call, f"VarDecl-initialised pin must be saved on first call:\n{first_call}" + + +def test_pinned_register_save_skipped_before_first_store() -> None: + """Builtin calls before the first store to a pinned local skip push/pop of its register. + + ``status_bits`` auto-pins to EDX, but its first write happens + inside the while loop. The two pre-loop ``kernel_outb`` calls + therefore have no meaningful EDX value to preserve — saving it + is dead. The loop-body ``kernel_inb`` still wraps with ``push + edx`` / ``pop edx`` because the body stores to status_bits, so + every iteration past the first sees a live pin. + """ + asm = _kernel( + """ + struct ata_status { uint8_t err: 1; uint8_t idx: 1; + uint8_t corr: 1; uint8_t drq: 1; uint8_t srv: 1; + uint8_t df: 1; uint8_t rdy: 1; uint8_t bsy: 1; }; + void test_init() { + uint8_t status; + struct ata_status *status_bits; + kernel_outb(0x3F6, 4); + kernel_outb(0x3F6, 0); + while (1) { + status = kernel_inb(0x1F7); + status_bits = (struct ata_status *)&status; + if (status_bits->bsy == 0) { break; } + } + } + """, + bits=32, + ) + pre_loop_body, _, after_loop = asm.partition("._ir_wloop") + assert "push edx" not in pre_loop_body, f"pre-loop save should be elided:\n{pre_loop_body}" + # The loop body's call still saves — second iteration sees a live pin. + assert "push edx" in after_loop, f"in-loop save must survive:\n{after_loop}" + + def test_pointer_compared_to_int_literal_is_rejected() -> None: """``char *p; if (p == 0)`` raises — must spell as ``p == NULL``.""" error = _kernel_error("""