From 6d304968f48f5d3e107b47d00ce260a9f901ced7 Mon Sep 17 00:00:00 2001 From: davidmaamoaix Date: Fri, 20 Mar 2026 14:31:19 -0400 Subject: [PATCH] [stack-switching] Larger stack cache --- src/engine/Tuning.v3 | 5 ++++ src/engine/x86-64/Mmap.v3 | 3 ++ src/engine/x86-64/V3Offsets.v3 | 2 ++ src/engine/x86-64/X86_64Stack.v3 | 47 +++++++++++++++++++++++--------- 4 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3 index c1c12d3e1..c7a6f837b 100644 --- a/src/engine/Tuning.v3 +++ b/src/engine/Tuning.v3 @@ -69,3 +69,8 @@ component SpcTuning { var inlineGlobalAccess = true; // enable inline access of (primitive) globals var disableMemoryBoundsChecks = false; // unsafe! don't emit bounds checks } + +// Tuning settings for stack-switching (have no effect on correctness). +component StackTuning { + def stackCacheSize = 8; // number of stacks to allocate in a batch in the stack manager. +} diff --git a/src/engine/x86-64/Mmap.v3 b/src/engine/x86-64/Mmap.v3 index 5305ef0c6..672bba1c9 100644 --- a/src/engine/x86-64/Mmap.v3 +++ b/src/engine/x86-64/Mmap.v3 @@ -49,4 +49,7 @@ class MemoryRange { def range(offset: int, length: int) -> Range { return CiRuntime.forgeRange(this.start + offset, length); } + def render(buf: StringBuilder) -> StringBuilder { + return buf.put2("MemRange[0x%x, 0x%x)", start - Pointer.NULL, end - Pointer.NULL); + } } diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3 index 5de4559e9..a7c7860f8 100644 --- a/src/engine/x86-64/V3Offsets.v3 +++ b/src/engine/x86-64/V3Offsets.v3 @@ -63,6 +63,7 @@ class V3Offsets { def NativeWasmMemory_num_pages = int.view(Pointer.atField(mem.num_pages) - Pointer.atObject(mem)); def X86_64Runtime_curStack = Pointer.atField(RT.curStack); + def X86_64StackManager_cache = Pointer.atField(X86_64StackManager.cache); def Interpreter_dispatchTable = Pointer.atField(I.dispatchTable); def X86_64Stack_version = int.view(Pointer.atField(vs.version) - Pointer.atObject(vs)); def X86_64Stack_vsp = int.view(Pointer.atField(vs.vsp) - Pointer.atObject(vs)); @@ -73,6 +74,7 @@ class V3Offsets { def X86_64Stack_bottom = int.view(Pointer.atField(vs.cont_bottom) - Pointer.atObject(vs)); def X86_64Stack_state = int.view(Pointer.atField(vs.state_) - Pointer.atObject(vs)); def X86_64Stack_return_results = int.view(Pointer.atField(vs.return_results) - Pointer.atObject(vs)); + def X86_64Stack_next_stack = int.view(Pointer.atField(vs.next_stack) - Pointer.atObject(vs)); def WasmFunction_typeId = Pointer.atObject(wf).load(); diff --git a/src/engine/x86-64/X86_64Stack.v3 b/src/engine/x86-64/X86_64Stack.v3 index cb3e94c93..8c5ec701f 100644 --- a/src/engine/x86-64/X86_64Stack.v3 +++ b/src/engine/x86-64/X86_64Stack.v3 @@ -22,6 +22,9 @@ class X86_64Stack extends WasmStack { var return_results: Array; var state_: StackState; + // Single-linked list for stack manager cache. + var next_stack: X86_64Stack; + new(size) { mapping = Target.mmap_reserve(size, Mmap.PROT_READ | Mmap.PROT_WRITE); if (mapping == null) fatal("out of memory allocating value stack"); @@ -769,8 +772,18 @@ def genStackReturnParentStub(ic: X86_64InterpreterCode, w: DataWriter) { masm.emit_mov_m_l(MasmAddr(r_stack, masm.offsets.X86_64Stack_parent), 0); // mov [%stack.parent_rsp_ptr], nullptr masm.emit_mov_m_l(MasmAddr(r_stack, masm.offsets.X86_64Stack_parent_rsp_ptr), 0); + // l_return: masm.bindLabel(l_return); + + // recycle %stack (already set at this point) + masm.emit_mov_m_m( + ValueKind.REF, + MasmAddr(r_stack, masm.offsets.X86_64Stack_next_stack), + MasmAddr(Reg(0), int.!(masm.offsets.X86_64StackManager_cache - Pointer.NULL)) + ); + masm.emit_mov_m_r(ValueKind.REF, MasmAddr(Reg(0), int.!(masm.offsets.X86_64StackManager_cache - Pointer.NULL)), r_stack); + // mov [cur_stack], %parent masm.emit_set_curstack(r_parent); // pop %rsp @@ -1153,29 +1166,37 @@ private class X86_64FrameWriter extends FrameWriter { component X86_64StackManager { var cache: X86_64Stack; - def getFreshStack() -> X86_64Stack { - var result: X86_64Stack; - if (cache == null) { - result = X86_64Stack.new(EngineOptions.STACK_SIZE.get()); - } else { - result = cache; - cache = null; + def allocStackBatch() { + if (Trace.stack) Trace.OUT.put1("Batch allocating %d stacks", StackTuning.stackCacheSize).ln(); + for (i < StackTuning.stackCacheSize) { + var curr = X86_64Stack.new(EngineOptions.STACK_SIZE.get()); + curr.next_stack = cache; + cache = curr; } - if (Trace.stack) Trace.OUT.put1( - "Requested fresh x86_stack @ 0x%x", Pointer.atObject(result) - Pointer.NULL - ).ln(); - return result; } + + def getFreshStack() -> X86_64Stack { + if (cache == null) allocStackBatch(); + + var result = cache; + cache = cache.next_stack; + result.next_stack = null; + + if (Trace.stack) Trace.OUT.put1("Requested fresh x86_stack @ 0x%x", Pointer.atObject(result) - Pointer.NULL).ln(); + return result.clear(); + } + def recycleStack(stack: X86_64Stack) { - if (cache == null) cache = stack; // XXX: save the larger/smaller of the stacks? + stack.next_stack = cache; + cache = stack; } + def runOnFreshStack(f: Function, args: Range) -> Result { // Always run functions on a separate, fresh stack. var prev = X86_64Runtime.curStack; // handle reentrancy var stack = X86_64StackManager.getFreshStack(); var result = stack.reset(f).bind(args).resume(); X86_64Runtime.curStack = prev; - X86_64StackManager.recycleStack(stack.clear()); return result; } }