Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion cc/codegen/x86/peephole.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def peephole_dead_temp_slots(self) -> None:
"""
base_register = self.target.base_register
slot_pattern = re.compile(rf"\[{base_register}-(\d+)(?:[+\-][^\]]+)?\]")
store_pattern = re.compile(rf"^mov \[{base_register}-(\d+)\],")
store_pattern = re.compile(rf"^mov (?:byte |word |dword )?\[{base_register}-(\d+)\],")
result: list[str] = []
for function_lines in self._iter_function_chunks():
# Collect every slot READ within this function. For ``mov
Expand Down Expand Up @@ -579,6 +579,57 @@ def peephole_dx_to_memory(self) -> None:
continue
i += 1

def peephole_fold_byte_immediate_through_local(self) -> None:
"""Fold ``mov byte [bp-N], <imm>`` + later ``movzx <reg>, byte [bp-N]`` to ``mov <reg>, <imm>``.

Motivating idiom: a bitfield-struct local whose only use is
``*(uint8_t *)&local`` (the driver port-I/O pattern). cc.py's
const-fold emits a byte store of the designated-init constant,
followed eventually by the pointer-deref load. Intervening
instructions are common — sibling designated-init stores, the
rest of ``kernel_outb``'s codegen — but as long as none of
them write to ``[bp-N]`` or transfer control out of the basic
block, the movzx is provably reading the immediate just stored.

Scan stops at:
- a write to ``[bp-N]`` (the slot may now hold a different value);
- a control-flow instruction (``ret``, ``jmp``, ``jcc``, ``call``);
- a label (potential branch target — predecessors might have
written ``[bp-N]`` after our store).

Only the first matching movzx is folded; later reads of
``[bp-N]`` are left alone so a downstream write+read pair
outside our window stays correct. ``peephole_dead_temp_slots``
runs after this and reclaims the store if no other reader of
``[bp-N]`` remains.
"""
base_register = self.target.base_register
accumulator = self.target.acc
store_pattern = re.compile(rf"^\s*mov byte \[{base_register}-(\d+)\], (\d+)\s*$")
label_pattern = re.compile(r"^\s*\.?\w+:\s*$")
flow_prefixes = ("ret", "jmp ", "call ", "j")
i = 0
while i < len(self.lines):
store_match = store_pattern.match(self.lines[i])
if store_match is None:
i += 1
continue
slot, immediate = store_match.group(1), int(store_match.group(2))
slot_address = f"[{base_register}-{slot}]"
target_load = f"movzx {accumulator}, byte {slot_address}"
write_signature = f"{slot_address},"
for j in range(i + 1, len(self.lines)):
candidate = self.lines[j].strip()
if candidate == target_load:
indent = self.lines[j][: len(self.lines[j]) - len(self.lines[j].lstrip())]
self.lines[j] = f"{indent}mov {accumulator}, {immediate}"
break
if write_signature in candidate:
break
if label_pattern.match(self.lines[j]) or any(candidate.startswith(prefix) for prefix in flow_prefixes):
break
i += 1

def peephole_fold_zero_save(self) -> None:
"""Fuse ``xor reg, reg / push reg`` into ``push 0``.

Expand Down Expand Up @@ -1349,6 +1400,7 @@ def run(self) -> list[str]:
self.peephole_memory_arithmetic_byte()
self.peephole_dx_to_memory()
self.peephole_store_reload()
self.peephole_fold_byte_immediate_through_local()
self.peephole_dead_temp_slots()
self.peephole_constant_to_register()
self.peephole_register_arithmetic()
Expand Down
12 changes: 12 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@ time.

## [Unreleased](https://github.com/bboe/BBoeOS/compare/0.11.0...main)

- **cc.py: fold byte-immediate store + movzx reload through local.** When a
one-shot struct literal local is read via ``*(uint8_t *)&local`` — the driver
port-I/O idiom — the compiler was emitting ``mov byte [ebp-N],
<imm>; movzx eax, byte [ebp-N]``. A new peephole observes that the movzx
reads exactly the value just stored (or stored earlier in the same basic
block, as long as nothing else writes ``[ebp-N]`` in between) and rewrites the
load to ``mov eax, <imm>``; ``peephole_dead_temp_slots`` (now extended to
recognise byte stores) reclaims the unreferenced byte store afterwards. Saves
~5 bytes per fold and trims the kernel's per-function frame traffic for every
bitfield-struct + ``kernel_outb`` site. Total kernel kasm reduction: 217
bytes.

- **Switch libbboeos build to nasm-elf + ld + linker script.** vdso.asm is now
`nasm -f elf32`-assembled with section directives
(`.libbboeos.function_table`, `.libbboeos.text`, `.libbboeos.rodata`,
Expand Down
9 changes: 8 additions & 1 deletion tests/test_cc_casts.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,21 @@ def test_byte_dereference_after_cast(*, work: Path) -> None:

The ``AddressOf(Var)`` shortcut folds ``*(uint8_t *)&local`` into a
direct frame-relative byte load, skipping the intermediate ``lea``.

Uses a runtime-sourced value (``source()`` call) to keep the byte
load present — the ``peephole_fold_byte_immediate_through_local``
pass folds the const-fold immediate path into a direct ``mov reg,
imm`` and is covered separately.
"""
asm = compile_snippet(
name="byte_dereference_after_cast",
source=(
"struct foo { uint8_t a : 1; uint8_t b : 7; };\n"
"uint8_t source() { return 11; }\n"
"uint8_t leak(uint8_t value) { return value; }\n"
"void caller() {\n"
" struct foo s = { .a = 1, .b = 5 };\n"
" struct foo s;\n"
" *(uint8_t *)&s = source();\n"
" leak(*(uint8_t *)&s);\n"
"}\n"
),
Expand Down
36 changes: 36 additions & 0 deletions tests/unit/test_cc_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2589,6 +2589,42 @@ def test_peephole_dead_temp_slot_kept_when_read() -> None:
assert " mov [bp-2], ax" in out, f"live temp-slot store dropped: {out}"


def test_peephole_fold_byte_immediate_through_local() -> None:
"""``mov byte [ebp-N], <imm>; movzx eax, byte [ebp-N]`` folds to ``mov eax, <imm>``.

Motivating idiom: a bitfield-struct local whose only use is
``*(uint8_t *)&local`` (typical of the driver port-I/O sites).
cc.py emits the byte store from the const-folded designated init,
then the pointer-deref load. The peephole observes that the
movzx reads exactly the value just stored and rewrites it to a
direct immediate load; ``peephole_dead_temp_slots`` then reclaims
the now-unreferenced store.
"""
asm = _kernel(
"""
struct cr {
uint8_t start: 1;
uint8_t stop: 1;
uint8_t txp: 1;
uint8_t reserved: 1;
uint8_t page: 2;
uint8_t rd: 2;
};
void probe() {
struct cr c = { .stop = 1, .rd = 4, .page = 1 };
kernel_outb(0x300, *(uint8_t *)&c);
}
""",
bits=32,
)
# The store + load through [ebp-N] is gone; the immediate flows
# straight to AX/AL.
assert "mov byte [ebp-1]" not in asm, f"byte store to local should be elided:\n{asm}"
assert "movzx eax, byte [ebp-1]" not in asm, f"movzx reload should be folded:\n{asm}"
# The byte value (0b00010010 = 18) reaches AL one way or another.
assert "mov eax, 18" in asm or "mov al, 18" in asm, f"folded immediate load missing:\n{asm}"


def test_peephole_redundant_register_swap_drops_second_mov() -> None:
"""``mov A, B`` followed by ``mov B, A`` drops the second."""
out = _peephole_run([
Expand Down