From 9fff9597c9b35a52cb1bce4a356a67348897e353 Mon Sep 17 00:00:00 2001 From: Bryce Boe Date: Wed, 20 May 2026 01:20:53 -0700 Subject: [PATCH] feat(cc): fold byte-immediate store + movzx reload through local A bitfield-struct local whose only use is ``*(uint8_t *)&local`` (the driver port-I/O idiom) was emitting: sub esp, 5 mov byte [ebp-1], 18 ; designated-init const-fold movzx eax, byte [ebp-1] ; *(uint8_t *)&c push eax mov edx, 768 pop eax out dx, al The movzx provably reads the value just stored, so a direct ``mov eax, `` is equivalent. The new ``peephole_fold_byte_immediate_through_local`` walks forward from each ``mov byte [bp-N], `` store, scanning past instructions that don't touch ``[bp-N]`` (sibling designated-init stores, the rest of ``kernel_outb``'s codegen), and folds the matching ``movzx , byte [bp-N]`` reload into ``mov , ``. The scan stops at any write to ``[bp-N]``, control flow (``ret``, ``jmp``, ``jcc``, ``call``), or label. Extending ``peephole_dead_temp_slots`` to recognise ``mov byte [bp-N], ...`` as a store (it previously only matched ``mov [bp-N], `` width-less stores) lets it then reclaim the unreferenced byte store. Real-kernel impact: 217-byte reduction across the kernel kasm build, folds firing in ata / fdc / ne2k / ps2 / sb16 / opl3 / console. ``tests/test_cc_casts.py::test_byte_dereference_after_cast`` was using a const-fold struct literal whose load is now folded by the peephole; switch it to a runtime-sourced value so the byte-load idiom remains present (the const-immediate fold path is covered by the new unit test). Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/codegen/x86/peephole.py | 54 ++++++++++++++++++++++++++++++++++- docs/CHANGELOG.md | 12 ++++++++ tests/test_cc_casts.py | 9 +++++- tests/unit/test_cc_codegen.py | 36 +++++++++++++++++++++++ 4 files changed, 109 insertions(+), 2 deletions(-) diff --git a/cc/codegen/x86/peephole.py b/cc/codegen/x86/peephole.py index 7630833d..cf37ec49 100644 --- a/cc/codegen/x86/peephole.py +++ b/cc/codegen/x86/peephole.py @@ -489,7 +489,7 @@ def peephole_dead_temp_slots(self) -> None: """ base_register = self.target.base_register slot_pattern = re.compile(rf"\[{base_register}-(\d+)(?:[+\-][^\]]+)?\]") - store_pattern = re.compile(rf"^mov \[{base_register}-(\d+)\],") + store_pattern = re.compile(rf"^mov (?:byte |word |dword )?\[{base_register}-(\d+)\],") result: list[str] = [] for function_lines in self._iter_function_chunks(): # Collect every slot READ within this function. For ``mov @@ -579,6 +579,57 @@ def peephole_dx_to_memory(self) -> None: continue i += 1 + def peephole_fold_byte_immediate_through_local(self) -> None: + """Fold ``mov byte [bp-N], `` + later ``movzx , byte [bp-N]`` to ``mov , ``. + + Motivating idiom: a bitfield-struct local whose only use is + ``*(uint8_t *)&local`` (the driver port-I/O pattern). cc.py's + const-fold emits a byte store of the designated-init constant, + followed eventually by the pointer-deref load. Intervening + instructions are common — sibling designated-init stores, the + rest of ``kernel_outb``'s codegen — but as long as none of + them write to ``[bp-N]`` or transfer control out of the basic + block, the movzx is provably reading the immediate just stored. + + Scan stops at: + - a write to ``[bp-N]`` (the slot may now hold a different value); + - a control-flow instruction (``ret``, ``jmp``, ``jcc``, ``call``); + - a label (potential branch target — predecessors might have + written ``[bp-N]`` after our store). + + Only the first matching movzx is folded; later reads of + ``[bp-N]`` are left alone so a downstream write+read pair + outside our window stays correct. ``peephole_dead_temp_slots`` + runs after this and reclaims the store if no other reader of + ``[bp-N]`` remains. + """ + base_register = self.target.base_register + accumulator = self.target.acc + store_pattern = re.compile(rf"^\s*mov byte \[{base_register}-(\d+)\], (\d+)\s*$") + label_pattern = re.compile(r"^\s*\.?\w+:\s*$") + flow_prefixes = ("ret", "jmp ", "call ", "j") + i = 0 + while i < len(self.lines): + store_match = store_pattern.match(self.lines[i]) + if store_match is None: + i += 1 + continue + slot, immediate = store_match.group(1), int(store_match.group(2)) + slot_address = f"[{base_register}-{slot}]" + target_load = f"movzx {accumulator}, byte {slot_address}" + write_signature = f"{slot_address}," + for j in range(i + 1, len(self.lines)): + candidate = self.lines[j].strip() + if candidate == target_load: + indent = self.lines[j][: len(self.lines[j]) - len(self.lines[j].lstrip())] + self.lines[j] = f"{indent}mov {accumulator}, {immediate}" + break + if write_signature in candidate: + break + if label_pattern.match(self.lines[j]) or any(candidate.startswith(prefix) for prefix in flow_prefixes): + break + i += 1 + def peephole_fold_zero_save(self) -> None: """Fuse ``xor reg, reg / push reg`` into ``push 0``. @@ -1349,6 +1400,7 @@ def run(self) -> list[str]: self.peephole_memory_arithmetic_byte() self.peephole_dx_to_memory() self.peephole_store_reload() + self.peephole_fold_byte_immediate_through_local() self.peephole_dead_temp_slots() self.peephole_constant_to_register() self.peephole_register_arithmetic() diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2d2a9de0..3eb35bcb 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -11,6 +11,18 @@ time. ## [Unreleased](https://github.com/bboe/BBoeOS/compare/0.11.0...main) +- **cc.py: fold byte-immediate store + movzx reload through local.** When a + one-shot struct literal local is read via ``*(uint8_t *)&local`` — the driver + port-I/O idiom — the compiler was emitting ``mov byte [ebp-N], + ; movzx eax, byte [ebp-N]``. A new peephole observes that the movzx + reads exactly the value just stored (or stored earlier in the same basic + block, as long as nothing else writes ``[ebp-N]`` in between) and rewrites the + load to ``mov eax, ``; ``peephole_dead_temp_slots`` (now extended to + recognise byte stores) reclaims the unreferenced byte store afterwards. Saves + ~5 bytes per fold and trims the kernel's per-function frame traffic for every + bitfield-struct + ``kernel_outb`` site. Total kernel kasm reduction: 217 + bytes. + - **Switch libbboeos build to nasm-elf + ld + linker script.** vdso.asm is now `nasm -f elf32`-assembled with section directives (`.libbboeos.function_table`, `.libbboeos.text`, `.libbboeos.rodata`, diff --git a/tests/test_cc_casts.py b/tests/test_cc_casts.py index 3771bc45..05c70159 100755 --- a/tests/test_cc_casts.py +++ b/tests/test_cc_casts.py @@ -77,14 +77,21 @@ def test_byte_dereference_after_cast(*, work: Path) -> None: The ``AddressOf(Var)`` shortcut folds ``*(uint8_t *)&local`` into a direct frame-relative byte load, skipping the intermediate ``lea``. + + Uses a runtime-sourced value (``source()`` call) to keep the byte + load present — the ``peephole_fold_byte_immediate_through_local`` + pass folds the const-fold immediate path into a direct ``mov reg, + imm`` and is covered separately. """ asm = compile_snippet( name="byte_dereference_after_cast", source=( "struct foo { uint8_t a : 1; uint8_t b : 7; };\n" + "uint8_t source() { return 11; }\n" "uint8_t leak(uint8_t value) { return value; }\n" "void caller() {\n" - " struct foo s = { .a = 1, .b = 5 };\n" + " struct foo s;\n" + " *(uint8_t *)&s = source();\n" " leak(*(uint8_t *)&s);\n" "}\n" ), diff --git a/tests/unit/test_cc_codegen.py b/tests/unit/test_cc_codegen.py index 038b836d..e7bea1f4 100644 --- a/tests/unit/test_cc_codegen.py +++ b/tests/unit/test_cc_codegen.py @@ -2589,6 +2589,42 @@ def test_peephole_dead_temp_slot_kept_when_read() -> None: assert " mov [bp-2], ax" in out, f"live temp-slot store dropped: {out}" +def test_peephole_fold_byte_immediate_through_local() -> None: + """``mov byte [ebp-N], ; movzx eax, byte [ebp-N]`` folds to ``mov eax, ``. + + Motivating idiom: a bitfield-struct local whose only use is + ``*(uint8_t *)&local`` (typical of the driver port-I/O sites). + cc.py emits the byte store from the const-folded designated init, + then the pointer-deref load. The peephole observes that the + movzx reads exactly the value just stored and rewrites it to a + direct immediate load; ``peephole_dead_temp_slots`` then reclaims + the now-unreferenced store. + """ + asm = _kernel( + """ + struct cr { + uint8_t start: 1; + uint8_t stop: 1; + uint8_t txp: 1; + uint8_t reserved: 1; + uint8_t page: 2; + uint8_t rd: 2; + }; + void probe() { + struct cr c = { .stop = 1, .rd = 4, .page = 1 }; + kernel_outb(0x300, *(uint8_t *)&c); + } + """, + bits=32, + ) + # The store + load through [ebp-N] is gone; the immediate flows + # straight to AX/AL. + assert "mov byte [ebp-1]" not in asm, f"byte store to local should be elided:\n{asm}" + assert "movzx eax, byte [ebp-1]" not in asm, f"movzx reload should be folded:\n{asm}" + # The byte value (0b00010010 = 18) reaches AL one way or another. + assert "mov eax, 18" in asm or "mov al, 18" in asm, f"folded immediate load missing:\n{asm}" + + def test_peephole_redundant_register_swap_drops_second_mov() -> None: """``mov A, B`` followed by ``mov B, A`` drops the second.""" out = _peephole_run([