From 94855b19c669e65fc3aecfe2cdf11290928bf6e6 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Fri, 20 Mar 2026 12:06:47 -0400 Subject: [PATCH 01/55] Add SpcInlinedFrame --- src/engine/compiler/SinglePassCompiler.v3 | 1 + src/engine/x86-64/X86_64SinglePassCompiler.v3 | 15 +++++++++++++++ src/engine/x86-64/X86_64Stack.v3 | 3 +++ 3 files changed, 19 insertions(+) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 774443b17..a10ff3b60 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2082,6 +2082,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl trap_labels.put((reason, label, frames)); return label; } + def getSpcInlinedFrameIp() -> long; def unsupported() { success = false; // XXX: add opcode } diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 88f62602a..23c5fae6f 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -1130,6 +1130,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { state.push(b.kindFlagsMatching(kind, IN_REG), b.reg, 0); return true; } + def getSpcInlinedFrameIp() -> long { + return INLINED_FRAME_STUB.start - Pointer.NULL; + } } def ucontext_rip_offset = 168; @@ -1323,6 +1326,12 @@ class X86_64SpcTrapsStub extends X86_64SpcCode { } } +// Marker for reconstructed inlined frames in stack traces. +// Tells the stalk walker that it should look inside the frame to find the function's pc. +class X86_64SpcInlinedFrame extends X86_64SpcCode { + new() super("inlined-frame", Pointer.NULL, Pointer.NULL) { } +} + // The lazy-compile stub needs special handling in the Virgil runtime because it has // a frame that stores the function being compiled. 
class X86_64SpcCompileStub extends RiUserCode { @@ -1364,6 +1373,8 @@ def LAZY_COMPILE_STUB = X86_64PreGenStub.new("spc-lazy-compile", X86_64SpcCompil def TIERUP_COMPILE_STUB = X86_64PreGenStub.new("spc-tierup-compile", X86_64SpcCompileStub.new("tierup"), genTierUpCompileStub); def TRAPS_STUB = X86_64SpcTrapsStub.new(); def TRAPS_PREGEN = X86_64PreGenStub.new("spc-trap", TRAPS_STUB, genTrapsStub); +def INLINED_FRAME_STUB = X86_64SpcInlinedFrame.new(); +def INLINED_FRAME_PREGEN = X86_64PreGenStub.new("spc-inlined-frame", INLINED_FRAME_STUB, genSpcInlinedFrame); def genSpcEntryFunc(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; @@ -1468,6 +1479,10 @@ def genTrapsStub(ic: X86_64InterpreterCode, w: DataWriter) { w.skipN(skip); } } +def genSpcInlinedFrame(ic: X86_64InterpreterCode, w: DataWriter) { + var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); + masm.emit_intentional_crash(); // do not execute this +} def codePointer(f: P -> R) -> Pointer { return CiRuntime.unpackClosure(f).0; } diff --git a/src/engine/x86-64/X86_64Stack.v3 b/src/engine/x86-64/X86_64Stack.v3 index 5849cc754..cb3e94c93 100644 --- a/src/engine/x86-64/X86_64Stack.v3 +++ b/src/engine/x86-64/X86_64Stack.v3 @@ -144,6 +144,7 @@ class X86_64Stack extends WasmStack { null => break; x: X86_64InterpreterCode => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64SpcModuleCode => if (f != null && !f(retip, code, pos, param)) return (true, pos); + x: X86_64SpcInlinedFrame => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64SpcTrapsStub => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64ReturnParentStub => { if (stack.parent == null || !continue_to_parent) { @@ -955,6 +956,7 @@ class X86_64FrameAccessor(stack: X86_64Stack, sp: Pointer, decl: FuncDecl) exten var code = RiRuntime.findUserCode(ip); match (code) { x: X86_64SpcModuleCode => cached_pc = x.lookupTopPc(ip, true); + x: 
X86_64SpcInlinedFrame => cached_pc = (sp + X86_64InterpreterFrame.curpc.offset).load(); x: X86_64InterpreterCode => cached_pc = X86_64Interpreter.computePCFromFrame(sp); x: X86_64SpcTrapsStub => cached_pc = (sp + X86_64InterpreterFrame.curpc.offset).load(); _ => cached_pc = -1; @@ -982,6 +984,7 @@ class X86_64FrameAccessor(stack: X86_64Stack, sp: Pointer, decl: FuncDecl) exten match (code) { x: X86_64InterpreterCode => ; x: X86_64SpcCode => ; + x: X86_64SpcInlinedFrame => ; // in the future, we could indicate inlining depth in the frame _ => return depth; } depth++; From 9a38ea7e595a27eb33950baae14e742175631120 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 03:01:59 -0400 Subject: [PATCH 02/55] Add frame reconstructions methods and guard reconstruction points --- src/engine/compiler/SinglePassCompiler.v3 | 164 +++++++++++++++--- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 30 +++- 2 files changed, 166 insertions(+), 28 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index a10ff3b60..b1efa797a 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -182,7 +182,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Emit function entry probe, if any. 
if (!FeatureDisable.entryProbes && func.entry_probed) { var probe = Instrumentation.getLocalProbe(module, func.func_index, 0); - emitProbe0(0, probe); + withReconstructedInlinedFrames(fun => + emitProbe0(0, probe)); } masm.current_fid = func.func_index; @@ -214,8 +215,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(label); if (frames.length > 1) { - // no inlining yet: this should never happen - System.error("SpcError", "attempt to emit trap in inlined context"); + unrefRegs(); + emitReconstructStackFrames(frames); } else { masm.emit_mov_m_i(xenv.pc_slot, label.create_pos); } @@ -395,11 +396,24 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl last_probe = it.pc; if (orig_op != Opcode.LOOP && orig_op != Opcode.END) emitProbe(); } + // Guards compiler code with frame reconstruction (if necessary). + def withReconstructedInlinedFrames(emit: void -> void) { + if (isInlined()) { + unrefRegs(); + def space = emitReconstructStackFrames(snapshotFrames()); + emit(); + if (space > 0) masm.emit_addw_r_i(regs.sp, space); + } else { + emit(); + } + + } def emitProbe() { if (last_probe == 0) return; var probe = Instrumentation.getLocalProbe(module, func.func_index, last_probe); last_probe = 0; - emitProbe0(it.pc, probe); + withReconstructedInlinedFrames(fun => + emitProbe0(it.pc, probe)); if (Trace.compiler) traceOpcodeAndStack(true); } def emitProbe0(pc: int, probe: Probe) { @@ -859,18 +873,21 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } def visitCallDirect(op: Opcode, index: u31, tailCall: bool) { var func = module.functions[index]; - var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); - // Load the instance (which must happen before frame is unwound). 
- var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); - var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); - var tmp = allocTmp(ValueKind.REF); - emit_load_instance(tmp); - // Load the function, XXX: skip and compute function from instance + code on stack? - masm.emit_v3_Instance_functions_r_r(func_reg, tmp); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); + withReconstructedInlinedFrames(fun { + var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); + // Load the instance (which must happen before frame is unwound). + var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); + var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); + var tmp = allocTmp(ValueKind.REF); + emit_load_instance(tmp); + + // Load the function, XXX: skip and compute function from instance + code on stack? + masm.emit_v3_Instance_functions_r_r(func_reg, tmp); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); - emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); + emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); + }); } def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); @@ -1935,12 +1952,20 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); - masm.emit_get_curstack(regs.runtime_arg0); - masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); - masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); - emit_load_instance(regs.runtime_arg1); - masm.emit_mov_r_i(regs.runtime_arg2, arg1); - masm.emit_call_runtime_op(op); + + def emit = fun { + masm.emit_get_curstack(regs.runtime_arg0); + masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); 
+ masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); + emit_load_instance(regs.runtime_arg1); + masm.emit_mov_r_i(regs.runtime_arg2, arg1); + masm.emit_call_runtime_op(op); + }; + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + if (canTrap) + withReconstructedInlinedFrames(emit); + else + emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -1952,13 +1977,21 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); - masm.emit_get_curstack(regs.runtime_arg0); - masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); - masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); - emit_load_instance(regs.runtime_arg1); - masm.emit_mov_r_i(regs.runtime_arg2, arg1); - masm.emit_mov_r_i(regs.runtime_arg3, arg2); - masm.emit_call_runtime_op(op); + + def emit = fun { + masm.emit_get_curstack(regs.runtime_arg0); + masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); + masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); + emit_load_instance(regs.runtime_arg1); + masm.emit_mov_r_i(regs.runtime_arg2, arg1); + masm.emit_mov_r_i(regs.runtime_arg3, arg2); + masm.emit_call_runtime_op(op); + }; + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + if (canTrap) + withReconstructedInlinedFrames(emit); + else + emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -2083,6 +2116,83 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return label; } def getSpcInlinedFrameIp() -> long; + // Emit code to materialize stack frames for each inlined function. 
+ def emitReconstructStackFrames(frames: Array) -> int { + // Metrics.spc_static_reconst.val++; + // masm.emit_inc_metric(Metrics.spc_dynamic_reconst); + def real_frame = frames[0]; + masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + + // Use inlined frame stub IP as return address for all reconstructed frames + var return_addr = getSpcInlinedFrameIp(); + var total_space = 0; + + // load instance + var inst_reg = allocTmp(ValueKind.REF); + //emit_load_instance(inst_reg); + masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); + var mem_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); + // Load instance.functions + def func_reg = allocTmp(ValueKind.REF); + masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg); + // use same vfp for all frames + def vfp_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot); + var wasm_func_reg = allocTmp(ValueKind.REF); + + var inl_inst_reg: Reg, inl_mem0_reg: Reg; + if (is_inlined) { + inl_inst_reg = allocTmp(ValueKind.REF); + inl_mem0_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, inl_inst_reg, frame.inlined_instance_slot); + masm.emit_mov_r_m(ValueKind.REF, inl_mem0_reg, frame.inlined_mem0_base_slot); + } + + // Process the inlined frames (skip the outermost which already exists on native stack) + for (i = 1; i < frames.length; i++) { + var frame_info = frames[i]; + + // Push inlined frame stub IP as return address + masm.emit_subw_r_i(regs.sp, 8); + masm.emit_mov_m_l(MasmAddr(regs.sp, 0), return_addr); + total_space += 8; + + // Allocate concrete stack frame for inlined function + masm.emit_subw_r_i(regs.sp, frame.frameSize); + total_space += frame.frameSize; + + // get functions[func_index] and save into frame + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, wasm_func_reg); + + // Save instance to 
frame.instance_slot + masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, inst_reg); + + // Save mem0 base + masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, mem_reg); + + // use same vfp for all frames + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, vfp_reg); + + // Save PC into frame.pc_slot + masm.emit_mov_m_i(frame.pc_slot, frame_info.pc); + + // Clear FrameAccessor + masm.emit_mov_m_l(frame.accessor_slot, 0); + + // if an inlined whamm probe, also grab inlined slots + if (is_inlined) { + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, inl_inst_reg); + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, inl_mem0_reg); + } else { + masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + masm.emit_mov_m_l(frame.inlined_mem0_base_slot, 0); + } + } + + return total_space; + } def unsupported() { success = false; // XXX: add opcode } diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 23c5fae6f..b9d7728c3 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -1256,7 +1256,35 @@ class X86_64SpcModuleCode extends X86_64SpcCode { } // Reconstructs inlined interpreter frames for an inlined hardware trap context. // Returns the new rsp to write into the ucontext (top of stack). - private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List) -> Pointer; + private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List) -> Pointer { + def frames: Array = Lists.toArray(inline_ctx); + def outer = frames[frames.length - 1]; + def inlined = frames[0 ... 
(frames.length - 1)]; + def count = inlined.length; + + // set outermost pc in the real frame + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(outer.pc); + + // Read instance from the real outer frame (shared across all inlined frames) + var instance = (r_rsp + X86_64InterpreterFrame.instance.offset).load(); + + // Push inlined frames + for (i = count - 1; i >= 0; i--) { + var fid = inlined[i].func_index; + var pc = inlined[i].pc; + + r_rsp += -8; + r_rsp.store(INLINED_FRAME_STUB.start); + + r_rsp += -X86_64InterpreterFrame.size; // move rsp? + // write func, pc, frame accessor + var wasm_func = WasmFunction.!(instance.functions[fid]); + (r_rsp + X86_64InterpreterFrame.wasm_func.offset).store(wasm_func); + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(pc); + (r_rsp + X86_64InterpreterFrame.accessor.offset).store(null); + } + return r_rsp; + } // Look up the source {pc} of a location {i} in this code. Returns {-1} if no exact entry is found. // Return addresses are treated differently than other addresses in the code. def lookupPc(ip: Pointer, isRetAddr: bool) -> List { From 731c9e7c8187c1945ea7a4cfe47de513ac60e804 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 04:03:12 -0400 Subject: [PATCH 03/55] Move function to a more sensible spot --- src/engine/compiler/SinglePassCompiler.v3 | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 8b5792612..83c5def7a 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -396,18 +396,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl last_probe = it.pc; if (orig_op != Opcode.LOOP && orig_op != Opcode.END) emitProbe(); } - // Guards compiler code with frame reconstruction (if necessary). 
- def withReconstructedInlinedFrames(emit: void -> void) { - if (isInlined()) { - unrefRegs(); - def space = emitReconstructStackFrames(snapshotFrames()); - emit(); - if (space > 0) masm.emit_addw_r_i(regs.sp, space); - } else { - emit(); - } - - } def emitProbe() { if (last_probe == 0) return; var probe = Instrumentation.getLocalProbe(module, func.func_index, last_probe); @@ -2197,6 +2185,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return total_space; } + // Guards compiler code with frame reconstruction (if necessary). + def withReconstructedInlinedFrames(emit: void -> void) { + if (isInlined()) { + unrefRegs(); + def space = emitReconstructStackFrames(snapshotFrames()); + emit(); + if (space > 0) masm.emit_addw_r_i(regs.sp, space); + } else { + emit(); + } + + } def unsupported() { success = false; // XXX: add opcode } From 194f26d9e02b88f13c460faf419a379c60c5410c Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 11:20:47 -0400 Subject: [PATCH 04/55] Make the if statement one line --- src/engine/compiler/SinglePassCompiler.v3 | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 83c5def7a..5a39a058d 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -1954,10 +1954,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_call_runtime_op(op); }; // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. 
- if (canTrap) - withReconstructedInlinedFrames(emit); - else - emit(); + if (canTrap) withReconstructedInlinedFrames(emit); else emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -1980,10 +1977,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_call_runtime_op(op); }; // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. - if (canTrap) - withReconstructedInlinedFrames(emit); - else - emit(); + if (canTrap) withReconstructedInlinedFrames(emit); else emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); From f021600c7f639efb9f66f13fd830c7012f618cba Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 23:44:13 -0400 Subject: [PATCH 05/55] Compute vfp for every inlined frame --- src/engine/compiler/SinglePassCompiler.v3 | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 5a39a058d..ed0e6a1ea 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2122,9 +2122,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Load instance.functions def func_reg = allocTmp(ValueKind.REF); masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg); - // use same vfp for all frames - def vfp_reg = allocTmp(ValueKind.REF); - masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot); + // base vfp of the outermost frame + def base_vfp_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, base_vfp_reg, frame.vfp_slot); var wasm_func_reg = allocTmp(ValueKind.REF); var inl_inst_reg: Reg, inl_mem0_reg: Reg; @@ -2158,8 +2158,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Save mem0 base masm.emit_mov_m_r(ValueKind.REF, 
frame.mem0_base_slot, mem_reg); - // use same vfp for all frames - masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, vfp_reg); + // Save vfp for every inlined frame = base_vfp + local_base_sp * slot_size + masm.emit_mov_r_r(ValueKind.REF, wasm_func_reg, base_vfp_reg); // reusing wasm_func_reg as scratch + var offset = int.view(frame_info.local_base_sp) * masm.valuerep.slot_size; + if (offset != 0) masm.emit_addw_r_i(wasm_func_reg, offset); + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, wasm_func_reg); // Save PC into frame.pc_slot masm.emit_mov_m_i(frame.pc_slot, frame_info.pc); From 9bf2aebc8f5f1a43befa7486d3f753b3e244406f Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 23:44:59 -0400 Subject: [PATCH 06/55] Uncomment stack reconstruction metrics --- src/engine/compiler/SinglePassCompiler.v3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index ed0e6a1ea..5556b1302 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2104,8 +2104,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def getSpcInlinedFrameIp() -> long; // Emit code to materialize stack frames for each inlined function. 
def emitReconstructStackFrames(frames: Array) -> int { - // Metrics.spc_static_reconst.val++; - // masm.emit_inc_metric(Metrics.spc_dynamic_reconst); + Metrics.spc_static_reconst.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_reconst); def real_frame = frames[0]; masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); From e4b2d4a6dffb89b298d3bea6d8014e0fe40c7d75 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 26 Mar 2026 01:51:06 -0400 Subject: [PATCH 07/55] Use optimized vfp calculation --- src/engine/compiler/SinglePassCompiler.v3 | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 5556b1302..c8874cfb6 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2122,9 +2122,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Load instance.functions def func_reg = allocTmp(ValueKind.REF); masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg); - // base vfp of the outermost frame - def base_vfp_reg = allocTmp(ValueKind.REF); - masm.emit_mov_r_m(ValueKind.REF, base_vfp_reg, frame.vfp_slot); + def vfp_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot); + var prev_base_sp = int.view(frames[0].local_base_sp); var wasm_func_reg = allocTmp(ValueKind.REF); var inl_inst_reg: Reg, inl_mem0_reg: Reg; @@ -2158,11 +2158,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Save mem0 base masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, mem_reg); - // Save vfp for every inlined frame = base_vfp + local_base_sp * slot_size - masm.emit_mov_r_r(ValueKind.REF, wasm_func_reg, base_vfp_reg); // reusing wasm_func_reg as scratch - var offset = int.view(frame_info.local_base_sp) * masm.valuerep.slot_size; - if (offset != 0) masm.emit_addw_r_i(wasm_func_reg, offset); - 
masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, wasm_func_reg); + // Step vfp_reg by change in local_base_sp from previous frame + def cur_base_sp = int.view(frame_info.local_base_sp); + var delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; + if (delta != 0) masm.emit_addw_r_i(vfp_reg, delta); + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, vfp_reg); + prev_base_sp = cur_base_sp; // Save PC into frame.pc_slot masm.emit_mov_m_i(frame.pc_slot, frame_info.pc); From a7044bbd852e21263758be0c35936ba5f36056c5 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 10:22:27 -0400 Subject: [PATCH 08/55] Use a single `rsp` adjustment in reconstructing stack frames --- src/engine/compiler/SinglePassCompiler.v3 | 53 +++++++++++++---------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index c8874cfb6..ee0808866 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2111,7 +2111,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Use inlined frame stub IP as return address for all reconstructed frames var return_addr = getSpcInlinedFrameIp(); - var total_space = 0; // load instance var inst_reg = allocTmp(ValueKind.REF); @@ -2135,49 +2134,59 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_mov_r_m(ValueKind.REF, inl_mem0_reg, frame.inlined_mem0_base_slot); } + // Pre-allocate stack space for all reconstructed frames at once. 
+ def total_space = (frames.length - 1) * (frame.frameSize + 8); + masm.emit_subw_r_i(regs.sp, total_space); + // Process the inlined frames (skip the outermost which already exists on native stack) for (i = 1; i < frames.length; i++) { var frame_info = frames[i]; + def frame_offset = (frames.length - i) * (frame.frameSize + 8); - // Push inlined frame stub IP as return address - masm.emit_subw_r_i(regs.sp, 8); - masm.emit_mov_m_l(MasmAddr(regs.sp, 0), return_addr); - total_space += 8; - - // Allocate concrete stack frame for inlined function - masm.emit_subw_r_i(regs.sp, frame.frameSize); - total_space += frame.frameSize; + // Write inlined frame stub IP as return address + def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); + masm.emit_mov_m_l(retaddr_slot, return_addr); // get functions[func_index] and save into frame + def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); - masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, wasm_func_reg); + masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); - // Save instance to frame.instance_slot - masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, inst_reg); + // Save instance + def instance_slot = frame.instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); // Save mem0 base - masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, mem_reg); + def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); - // Step vfp_reg by change in local_base_sp from previous frame + // Step vfp_reg by change in local_base_sp from previous frame and save + def vfp_slot = frame.vfp_slot.plus(frame_offset); def cur_base_sp = int.view(frame_info.local_base_sp); var delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; if (delta != 0) masm.emit_addw_r_i(vfp_reg, delta); - 
masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, vfp_reg); + masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); prev_base_sp = cur_base_sp; - // Save PC into frame.pc_slot - masm.emit_mov_m_i(frame.pc_slot, frame_info.pc); + // Save PC + def pc_slot = frame.pc_slot.plus(frame_offset); + masm.emit_mov_m_i(pc_slot, frame_info.pc); // Clear FrameAccessor - masm.emit_mov_m_l(frame.accessor_slot, 0); + def accessor_slot = frame.accessor_slot.plus(frame_offset); + masm.emit_mov_m_l(accessor_slot, 0); // if an inlined whamm probe, also grab inlined slots if (is_inlined) { - masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, inl_inst_reg); - masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, inl_mem0_reg); + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); } else { - masm.emit_mov_m_l(frame.inlined_instance_slot, 0); - masm.emit_mov_m_l(frame.inlined_mem0_base_slot, 0); + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_instance_slot, 0); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_mem0_base_slot, 0); } } From 7b8ae6d24dff35a0648d57cd274cdffa9bef2be2 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 10:58:14 -0400 Subject: [PATCH 09/55] Separate single-frame reconstruction into its own method --- src/engine/compiler/SinglePassCompiler.v3 | 95 ++++++++++++----------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index ee0808866..c9a9a904d 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2109,12 +2109,8 @@ class SinglePassCompiler(xenv: 
SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def real_frame = frames[0]; masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); - // Use inlined frame stub IP as return address for all reconstructed frames - var return_addr = getSpcInlinedFrameIp(); - // load instance var inst_reg = allocTmp(ValueKind.REF); - //emit_load_instance(inst_reg); masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); var mem_reg = allocTmp(ValueKind.REF); masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); @@ -2140,57 +2136,64 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Process the inlined frames (skip the outermost which already exists on native stack) for (i = 1; i < frames.length; i++) { - var frame_info = frames[i]; - def frame_offset = (frames.length - i) * (frame.frameSize + 8); + def frame_info = frames[i]; + def cur_base_sp = int.view(frame_info.local_base_sp); + def delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; + emitReconstructStackFrame(frame_info, frames.length - 1, delta, + wasm_func_reg, func_reg, inst_reg, mem_reg, vfp_reg, inl_inst_reg, inl_mem0_reg); + prev_base_sp = cur_base_sp; + } - // Write inlined frame stub IP as return address - def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); - masm.emit_mov_m_l(retaddr_slot, return_addr); + return total_space; + } + def emitReconstructStackFrame(spcFrame: SpcFrame, offset: int, vfp_delta: int, + wasm_func_reg: Reg, func_reg: Reg, inst_reg: Reg, mem_reg: Reg, vfp_reg: Reg, inl_inst_reg: Reg, inl_mem0_reg: Reg) { + // Use inlined frame stub IP as return address for all reconstructed frames + def return_addr = getSpcInlinedFrameIp(); - // get functions[func_index] and save into frame - def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); - masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); + def frame_offset = 
offset * (frame.frameSize + 8); + // Write inlined frame stub IP as return address + def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); + masm.emit_mov_m_l(retaddr_slot, return_addr); - // Save instance - def instance_slot = frame.instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); + // get functions[func_index] and save into frame + def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, spcFrame.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); - // Save mem0 base - def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); + // Save instance + def instance_slot = frame.instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); - // Step vfp_reg by change in local_base_sp from previous frame and save - def vfp_slot = frame.vfp_slot.plus(frame_offset); - def cur_base_sp = int.view(frame_info.local_base_sp); - var delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; - if (delta != 0) masm.emit_addw_r_i(vfp_reg, delta); - masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); - prev_base_sp = cur_base_sp; + // Save mem0 base + def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); - // Save PC - def pc_slot = frame.pc_slot.plus(frame_offset); - masm.emit_mov_m_i(pc_slot, frame_info.pc); + // Step vfp_reg by change in local_base_sp from previous frame and save + if (vfp_delta != 0) masm.emit_addw_r_i(vfp_reg, vfp_delta); + def vfp_slot = frame.vfp_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); - // Clear FrameAccessor - def accessor_slot = frame.accessor_slot.plus(frame_offset); - masm.emit_mov_m_l(accessor_slot, 0); + // Save PC + def pc_slot = frame.pc_slot.plus(frame_offset); + 
masm.emit_mov_m_i(pc_slot, spcFrame.pc); - // if an inlined whamm probe, also grab inlined slots - if (is_inlined) { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); - } else { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_instance_slot, 0); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_mem0_base_slot, 0); - } - } + // Clear FrameAccessor + def accessor_slot = frame.accessor_slot.plus(frame_offset); + masm.emit_mov_m_l(accessor_slot, 0); - return total_space; + // if an inlined whamm probe, also grab inlined slots + if (is_inlined) { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); + } else { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_instance_slot, 0); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_mem0_base_slot, 0); + } } // Guards compiler code with frame reconstruction (if necessary). def withReconstructedInlinedFrames(emit: void -> void) { From 036753d8a37c19e04c2c0f06e14df1cb3b96f9ce Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 11:20:35 -0400 Subject: [PATCH 10/55] Revert "Separate single-frame reconstruction into its own method" This reverts commit 7b8ae6d24dff35a0648d57cd274cdffa9bef2be2. 
--- src/engine/compiler/SinglePassCompiler.v3 | 95 +++++++++++------------ 1 file changed, 46 insertions(+), 49 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index c9a9a904d..ee0808866 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2109,8 +2109,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def real_frame = frames[0]; masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + // Use inlined frame stub IP as return address for all reconstructed frames + var return_addr = getSpcInlinedFrameIp(); + // load instance var inst_reg = allocTmp(ValueKind.REF); + //emit_load_instance(inst_reg); masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); var mem_reg = allocTmp(ValueKind.REF); masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); @@ -2136,64 +2140,57 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Process the inlined frames (skip the outermost which already exists on native stack) for (i = 1; i < frames.length; i++) { - def frame_info = frames[i]; - def cur_base_sp = int.view(frame_info.local_base_sp); - def delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; - emitReconstructStackFrame(frame_info, frames.length - 1, delta, - wasm_func_reg, func_reg, inst_reg, mem_reg, vfp_reg, inl_inst_reg, inl_mem0_reg); - prev_base_sp = cur_base_sp; - } + var frame_info = frames[i]; + def frame_offset = (frames.length - i) * (frame.frameSize + 8); - return total_space; - } - def emitReconstructStackFrame(spcFrame: SpcFrame, offset: int, vfp_delta: int, - wasm_func_reg: Reg, func_reg: Reg, inst_reg: Reg, mem_reg: Reg, vfp_reg: Reg, inl_inst_reg: Reg, inl_mem0_reg: Reg) { - // Use inlined frame stub IP as return address for all reconstructed frames - def return_addr = getSpcInlinedFrameIp(); + // Write inlined frame stub IP as return address + def 
retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); + masm.emit_mov_m_l(retaddr_slot, return_addr); - def frame_offset = offset * (frame.frameSize + 8); - // Write inlined frame stub IP as return address - def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); - masm.emit_mov_m_l(retaddr_slot, return_addr); + // get functions[func_index] and save into frame + def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); - // get functions[func_index] and save into frame - def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, spcFrame.func.func_index); - masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); + // Save instance + def instance_slot = frame.instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); - // Save instance - def instance_slot = frame.instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); + // Save mem0 base + def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); - // Save mem0 base - def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); - - // Step vfp_reg by change in local_base_sp from previous frame and save - if (vfp_delta != 0) masm.emit_addw_r_i(vfp_reg, vfp_delta); - def vfp_slot = frame.vfp_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); + // Step vfp_reg by change in local_base_sp from previous frame and save + def vfp_slot = frame.vfp_slot.plus(frame_offset); + def cur_base_sp = int.view(frame_info.local_base_sp); + var delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; + if (delta != 0) 
masm.emit_addw_r_i(vfp_reg, delta); + masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); + prev_base_sp = cur_base_sp; - // Save PC - def pc_slot = frame.pc_slot.plus(frame_offset); - masm.emit_mov_m_i(pc_slot, spcFrame.pc); + // Save PC + def pc_slot = frame.pc_slot.plus(frame_offset); + masm.emit_mov_m_i(pc_slot, frame_info.pc); - // Clear FrameAccessor - def accessor_slot = frame.accessor_slot.plus(frame_offset); - masm.emit_mov_m_l(accessor_slot, 0); + // Clear FrameAccessor + def accessor_slot = frame.accessor_slot.plus(frame_offset); + masm.emit_mov_m_l(accessor_slot, 0); - // if an inlined whamm probe, also grab inlined slots - if (is_inlined) { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); - } else { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_instance_slot, 0); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_mem0_base_slot, 0); + // if an inlined whamm probe, also grab inlined slots + if (is_inlined) { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); + } else { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_instance_slot, 0); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_mem0_base_slot, 0); + } } + + return total_space; } // Guards compiler code with frame reconstruction (if necessary). 
def withReconstructedInlinedFrames(emit: void -> void) { From 296ea0f394388e9ee57942a67733f1fc21f154c7 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 11:26:01 -0400 Subject: [PATCH 11/55] Fix wrong way indexing --- src/engine/compiler/SinglePassCompiler.v3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index ee0808866..457b25cd0 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2141,7 +2141,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Process the inlined frames (skip the outermost which already exists on native stack) for (i = 1; i < frames.length; i++) { var frame_info = frames[i]; - def frame_offset = (frames.length - i) * (frame.frameSize + 8); + def frame_offset = (frames.length - 1 - i) * (frame.frameSize + 8); // Write inlined frame stub IP as return address def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); From 6b269728201cb5a294181b8a446709fca0e259df Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 11:27:51 -0400 Subject: [PATCH 12/55] Reapply "Separate single-frame reconstruction into its own method" This reverts commit 036753d8a37c19e04c2c0f06e14df1cb3b96f9ce. 
--- src/engine/compiler/SinglePassCompiler.v3 | 95 ++++++++++++----------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 457b25cd0..6373e3964 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2109,12 +2109,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def real_frame = frames[0]; masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); - // Use inlined frame stub IP as return address for all reconstructed frames - var return_addr = getSpcInlinedFrameIp(); - // load instance var inst_reg = allocTmp(ValueKind.REF); - //emit_load_instance(inst_reg); masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); var mem_reg = allocTmp(ValueKind.REF); masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); @@ -2140,57 +2136,64 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Process the inlined frames (skip the outermost which already exists on native stack) for (i = 1; i < frames.length; i++) { - var frame_info = frames[i]; - def frame_offset = (frames.length - 1 - i) * (frame.frameSize + 8); + def frame_info = frames[i]; + def cur_base_sp = int.view(frame_info.local_base_sp); + def delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; + emitReconstructStackFrame(frame_info, frames.length - i - 1, delta, + wasm_func_reg, func_reg, inst_reg, mem_reg, vfp_reg, inl_inst_reg, inl_mem0_reg); + prev_base_sp = cur_base_sp; + } - // Write inlined frame stub IP as return address - def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); - masm.emit_mov_m_l(retaddr_slot, return_addr); + return total_space; + } + def emitReconstructStackFrame(spcFrame: SpcFrame, offset: int, vfp_delta: int, + wasm_func_reg: Reg, func_reg: Reg, inst_reg: Reg, mem_reg: Reg, vfp_reg: Reg, inl_inst_reg: Reg, inl_mem0_reg: Reg) { + // Use 
inlined frame stub IP as return address for all reconstructed frames + def return_addr = getSpcInlinedFrameIp(); - // get functions[func_index] and save into frame - def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); - masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); + def frame_offset = offset * (frame.frameSize + 8); + // Write inlined frame stub IP as return address + def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); + masm.emit_mov_m_l(retaddr_slot, return_addr); - // Save instance - def instance_slot = frame.instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); + // get functions[func_index] and save into frame + def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, spcFrame.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); - // Save mem0 base - def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); + // Save instance + def instance_slot = frame.instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); - // Step vfp_reg by change in local_base_sp from previous frame and save - def vfp_slot = frame.vfp_slot.plus(frame_offset); - def cur_base_sp = int.view(frame_info.local_base_sp); - var delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; - if (delta != 0) masm.emit_addw_r_i(vfp_reg, delta); - masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); - prev_base_sp = cur_base_sp; + // Save mem0 base + def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); - // Save PC - def pc_slot = frame.pc_slot.plus(frame_offset); - masm.emit_mov_m_i(pc_slot, frame_info.pc); + // Step vfp_reg by change 
in local_base_sp from previous frame and save + if (vfp_delta != 0) masm.emit_addw_r_i(vfp_reg, vfp_delta); + def vfp_slot = frame.vfp_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); - // Clear FrameAccessor - def accessor_slot = frame.accessor_slot.plus(frame_offset); - masm.emit_mov_m_l(accessor_slot, 0); + // Save PC + def pc_slot = frame.pc_slot.plus(frame_offset); + masm.emit_mov_m_i(pc_slot, spcFrame.pc); - // if an inlined whamm probe, also grab inlined slots - if (is_inlined) { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); - } else { - def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_instance_slot, 0); - def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); - masm.emit_mov_m_l(inl_mem0_base_slot, 0); - } - } + // Clear FrameAccessor + def accessor_slot = frame.accessor_slot.plus(frame_offset); + masm.emit_mov_m_l(accessor_slot, 0); - return total_space; + // if an inlined whamm probe, also grab inlined slots + if (is_inlined) { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); + } else { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_instance_slot, 0); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_mem0_base_slot, 0); + } } // Guards compiler code with frame reconstruction (if necessary). 
def withReconstructedInlinedFrames(emit: void -> void) { From a96f168ebb80067ac5d206575479b5b53af4257e Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 12:37:56 -0400 Subject: [PATCH 13/55] Restore vfp after out call --- src/engine/compiler/SinglePassCompiler.v3 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 6373e3964..d9c81072e 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2201,7 +2201,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl unrefRegs(); def space = emitReconstructStackFrames(snapshotFrames()); emit(); - if (space > 0) masm.emit_addw_r_i(regs.sp, space); + if (space > 0) { + masm.emit_addw_r_i(regs.sp, space); + masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); + } } else { emit(); } From 3fb2fd1cec1b0e4c901c5f7d14f2713ea775d350 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 03:37:07 -0400 Subject: [PATCH 14/55] The first of the final inlinings --- src/engine/compiler/SinglePassCompiler.v3 | 417 ++++++++++++++-------- src/util/Whamm.v3 | 32 +- test/inline/failures.x86-64-linux | 3 - 3 files changed, 298 insertions(+), 154 deletions(-) delete mode 100644 test/inline/failures.x86-64-linux diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index d9c81072e..cd131af15 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -112,9 +112,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var ret_label: MasmLabel; var last_probe = 0; var skip_to_end: bool; - // this is Whamm probe inlining, not arbitrary function inlining (yet) - var is_inlined = false; - var whamm_probe_ctl_base: u31; // ctl_stack.top when Whamm probe compilation started + var whamm_config: 
WhammInlineConfig; // XXX: hack var handler_dest_info = Vector.new(); @@ -486,40 +484,33 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe def emitWhammProbe(probe: WhammProbe) { + if (Trace.compiler) Trace.OUT.puts("emitting whamm probe\n"); // set up args and push to frame slots. var whamm_sig = probe.sig; - var inline_config = InlineConfig(false, false, false); - var new_local_base_sp = 0; var orig_sp = state.sp; var callee_func = WasmFunction.!(probe.func); + def inline_decision = shouldInline(callee_func.decl) && SpcTuning.inlineWhammProbes; // TODO move to shouldInline + var swap_instance = false; + var swap_membase = false; - if (SpcTuning.inlineWhammProbes) { - inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func); - if (!probe.inline_heuristic_checked) { - inline_config = funcCanInline(callee_func.decl); - probe.inline_heuristic_checked = true; - probe.spc_swap_instance = inline_config.swap_instance; - probe.spc_swap_membase = inline_config.swap_membase; - probe.spc_inline_func = inline_config.can_inline; - } + if (inline_decision) { + probe.checkSwap(); + swap_instance = probe.swap_instance; + swap_membase = probe.swap_membase; - if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly + if (swap_instance) { masm.emit_mov_r_Instance(regs.scratch, callee_func.instance); masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, regs.scratch); } - - // overwrite mem0_base with whamm instance's memory base, restore from frame slot later - if (inline_config.swap_membase) { - var membase = callee_func.instance.memories[0].getMemBase64(); - masm.emit_mov_r_l(regs.mem0_base, i64.view(membase)); + if (swap_membase) { + if (callee_func.instance.memories.length > 0) { + var membase = callee_func.instance.memories[0].getMemBase64(); + 
masm.emit_mov_r_l(regs.mem0_base, i64.view(membase)); + } masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base); } - } - - if (!inline_config.can_inline) { - state.emitSaveAll(resolver, probeSpillMode); } else { - new_local_base_sp = int.view(state.sp); + state.emitSaveAll(resolver, probeSpillMode); } for (i < whamm_sig.length) { @@ -528,13 +519,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var kind: byte; match(whamm_sig[i]) { FrameAccessor => { - if (inline_config.can_inline) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack. + if (inline_decision) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack. masm.emit_call_runtime_getFrameAccessorMetaRef(); emit_reload_regs(); - if (inline_config.can_inline && !probeSpillMode.free_regs) state.emitRestoreAll(resolver); + if (inline_decision && !probeSpillMode.free_regs) state.emitRestoreAll(resolver); // move result to mem slot or reg, depending on inlining - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_r(ValueKind.REF, reg, xenv.runtime_ret0); state.push(KIND_REF | IN_REG, reg, 0); @@ -546,7 +537,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl Val(val) => { match (val) { I31(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_i(reg, i32.view(v) << 1); state.push(KIND_REF | IN_REG, reg, 0); @@ -556,7 +547,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.REF.code; } I32(v) => { - if (inline_config.can_inline) { + if (inline_decision) { state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v)); } else { masm.emit_mov_m_d(slot_addr, v); @@ -564,7 +555,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.I32.code; } I64(v) => { - if 
(inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.I64); masm.emit_mov_r_l(reg, i64.view(v)); state.push(KIND_I64 | IN_REG, reg, 0); @@ -574,7 +565,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.I64.code; } F32(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.F32); masm.emit_mov_r_f32(reg, v); state.push(KIND_F32 | IN_REG, reg, 0); @@ -584,7 +575,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.F32.code; } F64(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.F64); masm.emit_mov_r_d64(reg, v); state.push(KIND_F64 | IN_REG, reg, 0); @@ -594,7 +585,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.F64.code; } V128(l, h) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.V128); masm.emit_mov_r_q(reg, l, h); state.push(KIND_V128 | IN_REG, reg, 0); @@ -605,7 +596,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.V128.code; } Ref(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_Object(reg, v); state.push(KIND_REF | IN_REG, reg, 0); @@ -616,7 +607,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.REF.code; } Cont(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF_U64); masm.emit_mov_r_Cont(reg, v); state.push(KIND_REF_U64 | IN_REG, reg, 0); @@ -631,15 +622,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } Operand(_, i) => { var index = orig_sp + u32.view(i) - 1; - if (inline_config.can_inline) { - visit_LOCAL_GET(u31.view(index)); + if (inline_decision) { + 
visit_LOCAL_GET(u31.view(index - local_base_sp)); } else { masm.emit_mov_m_m(state.state[index].kind(), slot_addr, masm.slotAddr(index)); } kind = state.state[index].kind().code; } Local(_, i) => { - if (inline_config.can_inline) { + if (inline_decision) { visit_LOCAL_GET(u31.view(i)); } else { masm.emit_mov_m_m(state.state[u31.view(i)].kind(), slot_addr, masm.slotAddr(u32.view(i))); @@ -648,7 +639,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } Null => System.error("whamm", "null whamm arg!"); } - if (!inline_config.can_inline) { + if (!inline_decision) { masm.emit_mov_m_i(slot_tag_addr, kind); } } @@ -656,49 +647,14 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var func_id = callee_func.decl.func_index; var whamm_module = whamm_instance.module; var whamm_func_decl = callee_func.decl; - if (inline_config.can_inline) { - var prev_it = it; - it = BytecodeIterator.new().reset(whamm_func_decl); - var orig_module = module; - - // prepare spc for inlining - this.local_base_sp = u31.view(new_local_base_sp); - this.module = whamm_module; - this.func = whamm_func_decl; - this.sig = whamm_func_decl.sig; - - // inline codegen - it.dispatchLocalDecls(this); - this.is_inlined = true; - if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln(); - while (it.more() && success) { - if (Trace.compiler) traceOpcodeAndStack(false); - last_probe = 0; - masm.source_loc = it.pc; - it.dispatch(this); - if (Trace.compiler && Trace.asm) { - OUT.puts("JIT code: "); - masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); - codegen_offset = masm.curCodeBytes(); - OUT.ln(); - } - unrefRegs(); - if (Debug.compiler) checkRegAlloc(); - it.next(); + if (inline_decision) { + whamm_config = WhammInlineConfig(swap_membase, swap_instance, true); + emitInlinedCall(whamm_func_decl, probe); + whamm_config = WhammInlineConfig(false, false, false); + // Restore mem0_base after probe + if 
(module.memories.length > 0) { + masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); } - if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln(); - - // restore spc after inlining - it = prev_it; - this.local_base_sp = 0; - this.is_inlined = false; - this.module = orig_module; - this.func = it.func; - this.sig = it.func.sig; - masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); - - // clear callee params/locals from abstract state - dropN(state.sp - orig_sp); } else { var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); @@ -794,37 +750,38 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - if (!this.is_inlined) { - var ctl_top = state.ctl_stack.peek(); - if (ctl_top.opcode == Opcode.LOOP.code) { - state.ctl_stack.pop(); - if (!ctl_top.reachable) setUnreachable(); - } else if (ctl_top.opcode == Opcode.IF.code) { - // simulate empty if-true block - state.emitFallthru(resolver); - masm.emit_br(ctl_top.label); - masm.bindLabel(ctl_top.else_label); - state.doElse(); - ctl_top.opcode = Opcode.ELSE.code; - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.RETURN.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - emitProbe(); - if (ctl_top.merge_count > 1) emitReturn(ctl_top); - state.ctl_stack.pop(); - } + var frame = state.frame_stack.peek(); + var is_implicit_function_block = isInlined() && state.ctl_stack.top == frame.ctl_base_sp + 1; + + var ctl_top = state.ctl_stack.peek(); + if (ctl_top.opcode == 
Opcode.LOOP.code) { + state.ctl_stack.pop(); + if (!ctl_top.reachable) setUnreachable(); + } else if (ctl_top.opcode == Opcode.IF.code) { + // simulate empty if-true block + state.emitFallthru(resolver); + masm.emit_br(ctl_top.label); + masm.bindLabel(ctl_top.else_label); + state.doElse(); + ctl_top.opcode = Opcode.ELSE.code; + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.RETURN.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); emitProbe(); + if (ctl_top.merge_count > 1) emitReturn(ctl_top); + state.ctl_stack.pop(); } + emitProbe(); } def visit_BR(depth: u31) { var target = state.getControl(depth); @@ -866,6 +823,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } var func = module.functions[index]; + // Try inlining for intra-module, non-tail calls + if (!tailCall && shouldInline(func)) { + if (Trace.compiler) Trace.OUT.put2("Inlining call to func #%d (%d bytes)", index, func.orig_bytecode.length).ln(); + if (op == Opcode.CALL) { + Metrics.spc_static_inlined_calls.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_inlined_calls); + masm.emit_inc_metric(Metrics.spc_dynamic_calls); + } + emitInlinedCall(func, null); + return; + } + withReconstructedInlinedFrames(fun { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); // Load the instance (which must happen before frame is unwound). 
@@ -881,6 +850,160 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); }); } + def emitInlinedCall(callee_func: FuncDecl, whamm: WhammProbe) { + var sig = callee_func.sig; + var params_count = u32.view(sig.params.length); + var results_count = u32.view(sig.results.length); + var orig_sp = state.sp; + + // Arguments are already on stack + // Stack: [..., arg0, arg1, ..., argN] <- sp + // We want callee's local 0 = arg0, so: + var new_local_base_sp: u31 = u31.view(orig_sp - params_count); + var new_ctl_base_sp = u31.view(state.ctl_stack.top); + + var num_locals = callee_func.num_slots(); + + // Push an implicit block for the head of the function + var end_label = masm.newLabel(callee_func.cur_bytecode.length); + state.pushBlock(sig.params, sig.results, end_label); + + var m: Module = module; + + // Whamm probe configuration + if (whamm != null) { + def whamm_sig = whamm.sig; + def whamm_wf = WasmFunction.!(whamm.func); + def whamm_instance = whamm_wf.instance; + def whamm_func_decl = whamm_wf.decl; + + m = whamm_instance.module; + new_local_base_sp = u31.view(state.sp) - u31.view(whamm_sig.length); // XXX + } + + // Create and push frame for inlined function + var callee_frame = SpcFrame.new(callee_func, + m, new_local_base_sp, new_ctl_base_sp, num_locals, 0); + + pushSpcFrame(callee_frame); + + // Emit function entry probe, if any. 
+ // XXX expensive because frame materialization required + if (whamm == null && !FeatureDisable.entryProbes && func.entry_probed) { + var probe = Instrumentation.getLocalProbe(module, callee_func.func_index, 0); + + // Reconstruct inlined frames before emitting probe + var reconstructed_space = 0; + if (isInlined()) { + var frames = snapshotFrames(); + unrefRegs(); + reconstructed_space = emitReconstructStackFrames(frames); + } + emitProbe0(0, probe); + // Clean up reconstructed frames after the call returns + if (reconstructed_space > 0) { + masm.emit_addw_r_i(regs.sp, reconstructed_space); + } + } + + // Allocate callee's non-parameter locals + it.dispatchLocalDecls(this); + + // Compile callee's bytecode + if (Trace.compiler) Trace.OUT.puts(" Start inlined function body").ln(); + while (it.more() && success) { + if (Trace.compiler) traceOpcodeAndStack(false); + last_probe = 0; + masm.source_loc = it.pc; + masm.current_fid = func.func_index; + it.dispatch(this); + if (Trace.compiler && Trace.asm) { + OUT.puts("JIT code: "); + masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); + codegen_offset = masm.curCodeBytes(); + OUT.ln(); + } + unrefRegs(); + if (Debug.compiler) checkRegAlloc(); + it.next(); + if (skip_to_end) doSkipToEndOfBlock(); + } + if (Trace.compiler) Trace.OUT.puts(" End inlined function body").ln(); + + // Check if the inlined function is unreachable (e.g., ended with UNREACHABLE, RETURN, THROW) + var inlined_reachable = state.ctl_stack.peek().reachable; + + // Restore caller context by popping frame + popSpcFrame(); // Automatically restores cached fields + + // Note: Control stack cleanup (popping implicit BLOCK) is handled by visit_END + + // If inlined function is unreachable, no results to clean up + if (!inlined_reachable) { + if (Trace.compiler) { + Trace.OUT.puts(" Inlined function unreachable, skipping result cleanup").ln(); + Trace.OUT.put3(" state.sp=%d, new_local_base_sp=%d, callee_slots=%d", + state.sp, new_local_base_sp, 
state.sp - new_local_base_sp).ln(); + } + // Drop all callee state (params + locals, no results) + var callee_slots = state.sp - new_local_base_sp; + if (callee_slots > 0) dropN(u32.view(callee_slots)); + if (Trace.compiler) Trace.OUT.put1(" After dropN: state.sp=%d", state.sp).ln(); + setUnreachable(); + return; + } + + // Clean up stack: + // Before: [..., arg0, arg1, ..., argN, local0, local1, ..., localM, result0, ..., resultK] + // After: [..., result0, ..., resultK] + + var total_callee_slots = state.sp - new_local_base_sp; // All callee state + var slots_to_drop = total_callee_slots - results_count; + + // for whamm probes, results_count SHOULD be zero + if (slots_to_drop > 0 && results_count > 0) { + // Need to move results down over parameters and locals + for (i < results_count) { + var result_slot = state.sp - results_count + u32.view(i); + var target_slot = new_local_base_sp + u32.view(i); + if (Trace.compiler) { + Trace.OUT.put3(" Moving result %d: slot %d -> slot %d", i, result_slot, target_slot).ln(); + } + if (result_slot != target_slot) { + var rv = state.state[result_slot]; + if (Trace.compiler) { + Trace.OUT.put2(" rv: flags=%x, const=%d", rv.flags, rv.const).ln(); + } + if (rv.inReg()) { + regAlloc.reassign(rv.reg, int.!(result_slot), int.!(target_slot)); + } else { + // Move in memory (rarely needed if results are in regs) + resolver.addMove((target_slot, rv), (result_slot, rv)); + } + state.state[target_slot] = rv; + } else { + // Result already in the right place + if (Trace.compiler) Trace.OUT.puts(" (already in place)").ln(); + } + } + resolver.emitMoves(); + + // Drop everything above results + for (slot = new_local_base_sp + results_count; slot < state.sp; slot++) { + unrefSlot(slot); + } + state.sp = new_local_base_sp + results_count; + } else if (slots_to_drop > 0) { + // No results, just drop everything + if (Trace.compiler) Trace.OUT.put1("dropping %d slots\n", slots_to_drop); + dropN(u32.view(slots_to_drop)); + } + // If 
slots_to_drop <= 0, results are already in the right place + + if (Trace.compiler) { + Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); + } + } def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); // Handle the current stack state. @@ -2123,7 +2246,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var wasm_func_reg = allocTmp(ValueKind.REF); var inl_inst_reg: Reg, inl_mem0_reg: Reg; - if (is_inlined) { + if (whamm_config.is_inlined) { // TODO investigate, check individual configs? inl_inst_reg = allocTmp(ValueKind.REF); inl_mem0_reg = allocTmp(ValueKind.REF); masm.emit_mov_r_m(ValueKind.REF, inl_inst_reg, frame.inlined_instance_slot); @@ -2183,7 +2306,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_mov_m_l(accessor_slot, 0); // if an inlined whamm probe, also grab inlined slots - if (is_inlined) { + if (whamm_config.is_inlined) { def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); @@ -2304,7 +2427,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // XXX: recompute VFP from VSP - #slots? 
masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); if (module.memories.length > 0) { - if (is_inlined) { + if (whamm_config.is_inlined) { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.inlined_mem0_base_slot); } else { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); @@ -2312,7 +2435,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } } def emit_load_instance(reg: Reg) { - if (is_inlined) { // inline compilation + if (whamm_config.is_inlined) { // inline compilation masm.emit_mov_r_m(ValueKind.REF, reg, frame.inlined_instance_slot); } else { masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot); @@ -2680,6 +2803,37 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } return frames; } + // Determine if a regular function call should be inlined + def shouldInline(func: FuncDecl) -> bool { + if (Trace.compiler) OUT.put1("deciding on inlining call to func #%d: ", func.func_index); + + if (func.imp != null) return no("imported"); + if (inlineDepth() >= SpcTuning.maxInlineDepth) return no("max inline depth exceeded"); + if (func.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize) return no("func too large"); + if (func.sig.params.length > SpcTuning.maxInlineParams) return no("too many parameters"); + + // Scan bytecode for unsupported instructions + var bi = BytecodeIterator.new().reset(func); + while (bi.more()) { + match (bi.current()) { + RETURN, RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF => + return no("uses return instruction"); + TRY, CATCH, THROW, RETHROW, THROW_REF, DELEGATE, CATCH_ALL, TRY_TABLE => + return no("uses exception handling instruction"); + CONT_NEW, CONT_BIND, SUSPEND, RESUME, RESUME_THROW, RESUME_THROW_REF, SWITCH => + return no("uses stack switching instruction"); + _ => ; + } + bi.next(); + } + + if (Trace.compiler) OUT.puts("YES\n"); + return true; + } + private def no(reason: string) -> bool { + if (Trace.compiler) 
OUT.puts("NO (").puts(reason).putc(')').ln(); + return false; + } } // Different branch instructions have different repush enum BrRepush(taken: bool, not_taken: bool) { @@ -3386,38 +3540,7 @@ class MoveNode { var dstNext: MoveNode; // next in a list of successors } -// checks function bytecode to see if it can be inlined based on -// simple heuristics: length <= maxInlineBytecodeSize and straightline code. -def funcCanInline(decl: FuncDecl) -> InlineConfig { - var default = InlineConfig(false, false, false); - if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; - var bi = BytecodeIterator.new().reset(decl); - var swap_instance = false; - var swap_membase = false; - while (bi.more()) { - var op = bi.current(); - match (op) { - // Cannot handle control flow yet. - IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default; - // These opcodes require swapping the instance. - THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, - ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; - // Load/store opcodes require either the memory base or the instance. 
- I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, - V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, - I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { - var memarg = bi.immptr().read_MemArg(); - if (memarg.memory_index == 0) swap_membase = true; - else swap_instance = true; - } - _ => ; - } - bi.next(); - } - return InlineConfig(swap_membase, swap_instance, true); -} - -type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); +type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool); // Used to record the entry point of exception/suspension handlers. Jumping to {stub_label} allows // control transfer to its corresponding handler without falling back to fast-int. diff --git a/src/util/Whamm.v3 b/src/util/Whamm.v3 index 9b93b746d..ae1649d8b 100644 --- a/src/util/Whamm.v3 +++ b/src/util/Whamm.v3 @@ -175,10 +175,9 @@ component Whamm { class WhammProbe(func: Function, sig: Array) extends Probe { var trampoline: TargetCode; // properties set by the spc to make inlining optimization decisions. - var inline_heuristic_checked = false; - var spc_inline_func = false; - var spc_swap_instance = false; - var spc_swap_membase = false; + var swap_checked = false; + var swap_instance = false; + var swap_membase = false; private def args = if(sig.length == 0, Values.NONE, Array.new(sig.length)); @@ -203,6 +202,31 @@ class WhammProbe(func: Function, sig: Array) extends Probe { } return ProbeAction.Continue; } + + // If function is to be inlined, check to see if instance or mem0_base need to be swapped. + def checkSwap() { + if (swap_checked) return; + var bi = BytecodeIterator.new().reset(WasmFunction.!(func).decl); + while (bi.more()) { + var op = bi.current(); + match (op) { + // These opcodes require swapping the instance. 
+ THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, + ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; + // Load/store opcodes require either the memory base or the instance. + I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, + V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, + I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { + var memarg = bi.immptr().read_MemArg(); + if (memarg.memory_index == 0) swap_membase = true; + else swap_instance = true; + } + _ => ; + } + bi.next(); + } + swap_checked = true; + } } def parseParam0(r: TextReader) -> WhammParam { diff --git a/test/inline/failures.x86-64-linux b/test/inline/failures.x86-64-linux deleted file mode 100644 index 925e70891..000000000 --- a/test/inline/failures.x86-64-linux +++ /dev/null @@ -1,3 +0,0 @@ -inline_test_arithmetic.wasm -inline_test_locals_control.wasm -inline_test_nesting.wasm From 4ad0cbd2c63cbf4ef4019294c3b175a4436262ef Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 03:38:34 -0400 Subject: [PATCH 15/55] Remove dead lines --- src/engine/compiler/SinglePassCompiler.v3 | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index cd131af15..45da336e0 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -750,9 +750,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - var frame = state.frame_stack.peek(); - var is_implicit_function_block = isInlined() && state.ctl_stack.top == frame.ctl_base_sp + 1; - var ctl_top = state.ctl_stack.peek(); if (ctl_top.opcode == 
Opcode.LOOP.code) { state.ctl_stack.pop(); From 3f9b030cc5086154dac9ccbc4f0e028d8b3e9eb7 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 03:43:24 -0400 Subject: [PATCH 16/55] Remove extra metric increment --- src/engine/compiler/SinglePassCompiler.v3 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 45da336e0..e4bf4f6a2 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -826,7 +826,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (op == Opcode.CALL) { Metrics.spc_static_inlined_calls.val++; masm.emit_inc_metric(Metrics.spc_dynamic_inlined_calls); - masm.emit_inc_metric(Metrics.spc_dynamic_calls); } emitInlinedCall(func, null); return; From c0ce246425b35408caa448e893b86d3be95fb526 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 23 Mar 2026 03:49:25 -0400 Subject: [PATCH 17/55] Fix osr bug on dyn --- src/engine/compiler/SinglePassCompiler.v3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index e4bf4f6a2..7bd635556 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -697,7 +697,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.prepareLoop(resolver); masm.bindLabel(ctl_top.label); emitProbe(); - if (it.pc == osr_pc) { + if (it.pc == osr_pc && !isInlined()) { osr_state = state.ctl_stack.peek().copyMerge(); osr_loop_label = masm.newLabel(it.pc); masm.bindLabel(osr_loop_label); From 364664a1840fa20db4cb20f0240ff75f7a45496b Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 01:35:09 -0400 Subject: [PATCH 18/55] Support RETURN instruction * we now use RETURN for SpcControl for inlined functions instead of BLOCK * RETURN instructions are 
now supported * ret_labels are now eagerly instantiated and added to SpcState * added pushFuncBody to abstract control push of RETURN --- src/engine/compiler/SinglePassCompiler.v3 | 47 +++++++---- test/inline/inline_test_return.wasm | Bin 0 -> 182 bytes test/inline/inline_test_return.wasm.exit | 1 + test/inline/inline_test_return.wasm.flags | 1 + test/inline/inline_test_return.wasm.out | 4 + test/inline/inline_test_return.wat | 97 ++++++++++++++++++++++ 6 files changed, 134 insertions(+), 16 deletions(-) create mode 100644 test/inline/inline_test_return.wasm create mode 100644 test/inline/inline_test_return.wasm.exit create mode 100644 test/inline/inline_test_return.wasm.flags create mode 100644 test/inline/inline_test_return.wasm.out create mode 100644 test/inline/inline_test_return.wat diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 7bd635556..a840a7b98 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -102,6 +102,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var sig: SigDecl; var num_locals: int; var local_base_sp: u31; // can use a Range for 0-indexing instead of from offset + var ctl_base_sp: u31; // index of the RETURN control in ctl_stack for the current frame var success = true; var osr_pc: int; @@ -164,7 +165,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Push initial frame for top-level function state.frame_stack.clear(); - var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0); + var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0, masm.newLabel(func.cur_bytecode.length)); pushSpcFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. 
@@ -777,6 +778,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl emitProbe(); if (ctl_top.merge_count > 1) emitReturn(ctl_top); state.ctl_stack.pop(); + return; } emitProbe(); } @@ -807,9 +809,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_RETURN() { - var target = state.ctl_stack.elems[0]; + var target = state.ctl_stack.elems[ctl_base_sp]; state.emitTransfer(target, resolver); - if (ret_label == null) ret_label = masm.newLabel(func.cur_bytecode.length); masm.emit_br(ret_label); setUnreachable(); } @@ -860,9 +861,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var num_locals = callee_func.num_slots(); - // Push an implicit block for the head of the function + // Push a RETURN control for the inlined callee's function body. var end_label = masm.newLabel(callee_func.cur_bytecode.length); - state.pushBlock(sig.params, sig.results, end_label); + var func_body_ctl = state.pushFuncBody(sig.params, sig.results, end_label); + func_body_ctl.merge_state = state.getInMemoryMergeWithArgs(int.view(func_body_ctl.val_stack_top), sig.results); // preserve outer frame state below callee's results + func_body_ctl.merge_count = 1; var m: Module = module; @@ -878,8 +881,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } // Create and push frame for inlined function - var callee_frame = SpcFrame.new(callee_func, - m, new_local_base_sp, new_ctl_base_sp, num_locals, 0); + var callee_frame = SpcFrame.new(callee_func, + m, new_local_base_sp, new_ctl_base_sp, num_locals, 0, masm.newLabel(callee_func.cur_bytecode.length)); pushSpcFrame(callee_frame); @@ -2170,10 +2173,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } def emitReturn(ctl: SpcControl) { // All explicit RETURN instructions branch here. 
- if (ret_label != null) { - masm.bindLabel(ret_label); - ret_label = null; - } + masm.bindLabel(ret_label); + var results = sig.results; if (masm.valuerep.tagged) { // update mismatched value tags @@ -2184,6 +2185,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_mov_m_i(masm.tagAddr(state.sp - u32.view(results.length) + u32.view(i)), rtag.code); } } + + if (isInlined()) return; + // Compute VSP = VFP + state.sp emit_compute_vsp(regs.vsp, state.sp); // Return to caller @@ -2759,7 +2763,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (func != null) masm.pushInlineContext(func.func_index); def current = state.frame_stack.peek(); - if (current != null) current.pc = it.pc; + if (current != null) { + current.pc = it.pc; + current.ret_label = ret_label; + } state.frame_stack.push(frame); // Update cached copies from new top frame it.reset(frame.func).at(frame.pc, -1); @@ -2768,6 +2775,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl sig = func.sig; num_locals = frame.num_locals; local_base_sp = frame.local_base_sp; + ctl_base_sp = frame.ctl_base_sp; + ret_label = frame.ret_label; } def popSpcFrame() -> SpcFrame { masm.popInlineContext(); @@ -2781,6 +2790,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl sig = func.sig; num_locals = current.num_locals; local_base_sp = current.local_base_sp; + ctl_base_sp = current.ctl_base_sp; + ret_label = current.ret_label; return frame; } @@ -2795,7 +2806,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl for (i < state.frame_stack.top) { var f = state.frame_stack.elems[i]; var pc = if(i == state.frame_stack.top - 1, it.pc, f.pc); - frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc); + frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc, null); } return frames; 
} @@ -2812,7 +2823,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var bi = BytecodeIterator.new().reset(func); while (bi.more()) { match (bi.current()) { - RETURN, RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF => + RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF => return no("uses return instruction"); TRY, CATCH, THROW, RETHROW, THROW_REF, DELEGATE, CATCH_ALL, TRY_TABLE => return no("uses exception handling instruction"); @@ -2983,8 +2994,9 @@ class SpcFrame { var ctl_base_sp: u31; // Base index into SpcState.ctl_stack var num_locals: int; var pc: int; + var ret_label: MasmLabel; - new(func, module, local_base_sp, ctl_base_sp, num_locals, pc) {} + new(func, module, local_base_sp, ctl_base_sp, num_locals, pc, ret_label) {} } class SpcState(regAlloc: RegAlloc) { @@ -3001,7 +3013,7 @@ class SpcState(regAlloc: RegAlloc) { ctl_stack.clear(); // manually set up first control entry and return merge state var results = sig.results; - var ctl = pushControl(Opcode.RETURN.code, ValueTypes.NONE, results, ret_label); + var ctl = pushFuncBody(ValueTypes.NONE, results, ret_label); var merge_state = Array.new(results.length); for (i < results.length) { // request the merged values be stored to the stack, but don't require tags @@ -3033,6 +3045,9 @@ class SpcState(regAlloc: RegAlloc) { def pushBlock(params: Array, results: Array, end_label: MasmLabel) -> SpcControl { return pushControl(Opcode.BLOCK.code, params, results, end_label); } + def pushFuncBody(params: Array, results: Array, end_label: MasmLabel) -> SpcControl { + return pushControl(Opcode.RETURN.code, params, results, end_label); + } def pushLoop(params: Array, results: Array, start_label: MasmLabel) -> SpcControl { var ctl = pushControl(Opcode.LOOP.code, params, results, start_label); return ctl; diff --git a/test/inline/inline_test_return.wasm b/test/inline/inline_test_return.wasm new file mode 100644 index 
0000000000000000000000000000000000000000..d7bcbbaa0b658f86faa93be8c09ba06cbf0b59e7 GIT binary patch literal 182 zcmXZSF%H5o429umCsk2Aq+(>s#5rm)1aXRX1W|@EBad7gr*!(iJO}f8Apmp?6PuAu z!M1=b!~o*{KyjJxFL3%&ID@S~`N^XPw>TF12ZbJ4L_8uV6n|gaHH(wmM{bl0G-w>4 beRY_ltE<{Jv8z*P8faTTyWxA@o$4w-Cl(w8 literal 0 HcmV?d00001 diff --git a/test/inline/inline_test_return.wasm.exit b/test/inline/inline_test_return.wasm.exit new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/inline/inline_test_return.wasm.exit @@ -0,0 +1 @@ +0 diff --git a/test/inline/inline_test_return.wasm.flags b/test/inline/inline_test_return.wasm.flags new file mode 100644 index 000000000..0c2fe67af --- /dev/null +++ b/test/inline/inline_test_return.wasm.flags @@ -0,0 +1 @@ +--metrics=spc*calls --inline-max-depth=1 diff --git a/test/inline/inline_test_return.wasm.out b/test/inline/inline_test_return.wasm.out new file mode 100644 index 000000000..79d1497bf --- /dev/null +++ b/test/inline/inline_test_return.wasm.out @@ -0,0 +1,4 @@ +spc:static_calls : 6 calls +spc:static_inlined_calls : 6 calls +spc:dynamic_calls : 6 calls +spc:dynamic_inlined_calls : 6 calls diff --git a/test/inline/inline_test_return.wat b/test/inline/inline_test_return.wat new file mode 100644 index 000000000..c1dd8b196 --- /dev/null +++ b/test/inline/inline_test_return.wat @@ -0,0 +1,97 @@ +;; Test inlined functions with explicit RETURN, including nested control flow +;; and paths where extra values are on the stack at the time of return. +(module + ;; Two levels of nested ifs; in the early-return path, 2*a is an extra value + ;; on the value stack below the returned a+b. 
+ (func $weighted (param i32) (param i32) (result i32) + block (result i32) + local.get 0 + i32.const 2 + i32.mul ;; [2a] -- extra below when early return fires + block + local.get 0 + i32.const 0 + i32.gt_s + if + local.get 1 + i32.const 0 + i32.gt_s + if + ;; both positive: return a+b; 2a is extra on stack + local.get 0 + local.get 1 + i32.add + return + end + end + end + local.get 1 + i32.add ;; fallthrough: 2a+b + end + ) + + ;; Clamp x to [lo, hi]; two levels of nesting, returns on multiple paths. + (func $clamp (param i32) (param i32) (param i32) (result i32) + local.get 0 + local.get 1 + i32.lt_s + if + local.get 1 + return + end + local.get 0 + local.get 2 + i32.gt_s + if + local.get 2 + return + end + local.get 0 + ) + + (func (export "main") (result i32) + i32.const 3 + i32.const 4 + call $weighted + i32.const 7 ;; both positive: 3+4=7 + i32.ne + + i32.const 3 + i32.const -1 + call $weighted + i32.const 5 ;; b<=0: 2*3+(-1)=5 + i32.ne + i32.or + + i32.const -1 + i32.const 4 + call $weighted + i32.const 2 ;; a<=0: 2*(-1)+4=2 + i32.ne + i32.or + + i32.const 5 + i32.const 0 + i32.const 10 + call $clamp + i32.const 5 + i32.ne + i32.or + + i32.const -3 + i32.const 0 + i32.const 10 + call $clamp + i32.const 0 + i32.ne + i32.or + + i32.const 15 + i32.const 0 + i32.const 10 + call $clamp + i32.const 10 + i32.ne + i32.or + ) +) From 437da05222311b17a9efd4e77297ba3c93198a5b Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 01:39:31 -0400 Subject: [PATCH 19/55] Remove manual result slot cleanup (using merge state transfer instead) q --- src/engine/compiler/SinglePassCompiler.v3 | 47 ----------------------- 1 file changed, 47 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index a840a7b98..d52c43af0 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -952,53 +952,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: 
MacroAssembler, regAlloc: RegAl return; } - // Clean up stack: - // Before: [..., arg0, arg1, ..., argN, local0, local1, ..., localM, result0, ..., resultK] - // After: [..., result0, ..., resultK] - - var total_callee_slots = state.sp - new_local_base_sp; // All callee state - var slots_to_drop = total_callee_slots - results_count; - - // for whamm probes, results_count SHOULD be zero - if (slots_to_drop > 0 && results_count > 0) { - // Need to move results down over parameters and locals - for (i < results_count) { - var result_slot = state.sp - results_count + u32.view(i); - var target_slot = new_local_base_sp + u32.view(i); - if (Trace.compiler) { - Trace.OUT.put3(" Moving result %d: slot %d -> slot %d", i, result_slot, target_slot).ln(); - } - if (result_slot != target_slot) { - var rv = state.state[result_slot]; - if (Trace.compiler) { - Trace.OUT.put2(" rv: flags=%x, const=%d", rv.flags, rv.const).ln(); - } - if (rv.inReg()) { - regAlloc.reassign(rv.reg, int.!(result_slot), int.!(target_slot)); - } else { - // Move in memory (rarely needed if results are in regs) - resolver.addMove((target_slot, rv), (result_slot, rv)); - } - state.state[target_slot] = rv; - } else { - // Result already in the right place - if (Trace.compiler) Trace.OUT.puts(" (already in place)").ln(); - } - } - resolver.emitMoves(); - - // Drop everything above results - for (slot = new_local_base_sp + results_count; slot < state.sp; slot++) { - unrefSlot(slot); - } - state.sp = new_local_base_sp + results_count; - } else if (slots_to_drop > 0) { - // No results, just drop everything - if (Trace.compiler) Trace.OUT.put1("dropping %d slots\n", slots_to_drop); - dropN(u32.view(slots_to_drop)); - } - // If slots_to_drop <= 0, results are already in the right place - if (Trace.compiler) { Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); } From 644ba206d54f49978274336b02647a9e3818758b Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 02:06:12 -0400 Subject: 
[PATCH 20/55] Revert "Remove manual result slot cleanup (using merge state transfer instead)" This reverts commit 112716e1942feda187004249639502131730f592. --- src/engine/compiler/SinglePassCompiler.v3 | 47 +++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index d52c43af0..a840a7b98 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -952,6 +952,53 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return; } + // Clean up stack: + // Before: [..., arg0, arg1, ..., argN, local0, local1, ..., localM, result0, ..., resultK] + // After: [..., result0, ..., resultK] + + var total_callee_slots = state.sp - new_local_base_sp; // All callee state + var slots_to_drop = total_callee_slots - results_count; + + // for whamm probes, results_count SHOULD be zero + if (slots_to_drop > 0 && results_count > 0) { + // Need to move results down over parameters and locals + for (i < results_count) { + var result_slot = state.sp - results_count + u32.view(i); + var target_slot = new_local_base_sp + u32.view(i); + if (Trace.compiler) { + Trace.OUT.put3(" Moving result %d: slot %d -> slot %d", i, result_slot, target_slot).ln(); + } + if (result_slot != target_slot) { + var rv = state.state[result_slot]; + if (Trace.compiler) { + Trace.OUT.put2(" rv: flags=%x, const=%d", rv.flags, rv.const).ln(); + } + if (rv.inReg()) { + regAlloc.reassign(rv.reg, int.!(result_slot), int.!(target_slot)); + } else { + // Move in memory (rarely needed if results are in regs) + resolver.addMove((target_slot, rv), (result_slot, rv)); + } + state.state[target_slot] = rv; + } else { + // Result already in the right place + if (Trace.compiler) Trace.OUT.puts(" (already in place)").ln(); + } + } + resolver.emitMoves(); + + // Drop everything above results + for (slot = new_local_base_sp + results_count; slot < state.sp; 
slot++) { + unrefSlot(slot); + } + state.sp = new_local_base_sp + results_count; + } else if (slots_to_drop > 0) { + // No results, just drop everything + if (Trace.compiler) Trace.OUT.put1("dropping %d slots\n", slots_to_drop); + dropN(u32.view(slots_to_drop)); + } + // If slots_to_drop <= 0, results are already in the right place + if (Trace.compiler) { Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); } From 9a1af469a0e9b13a2971f35ba5ceef2f85abc873 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 02:21:30 -0400 Subject: [PATCH 21/55] Fix whamm arg for merge state --- src/engine/compiler/SinglePassCompiler.v3 | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index a840a7b98..5c58f883c 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -864,8 +864,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Push a RETURN control for the inlined callee's function body. 
var end_label = masm.newLabel(callee_func.cur_bytecode.length); var func_body_ctl = state.pushFuncBody(sig.params, sig.results, end_label); - func_body_ctl.merge_state = state.getInMemoryMergeWithArgs(int.view(func_body_ctl.val_stack_top), sig.results); // preserve outer frame state below callee's results - func_body_ctl.merge_count = 1; var m: Module = module; @@ -878,8 +876,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl m = whamm_instance.module; new_local_base_sp = u31.view(state.sp) - u31.view(whamm_sig.length); // XXX + func_body_ctl.val_stack_top = new_local_base_sp; // correct val_stack_top for whamm arg count } + // create merge state based on outer function's base sp given inlined function's results + func_body_ctl.merge_state = state.getInMemoryMergeWithArgs(int.view(new_local_base_sp), sig.results); + func_body_ctl.merge_count = 1; + // Create and push frame for inlined function var callee_frame = SpcFrame.new(callee_func, m, new_local_base_sp, new_ctl_base_sp, num_locals, 0, masm.newLabel(callee_func.cur_bytecode.length)); From e1eb4cddb7d63f64c8edf0928d2c0e10e38edad0 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 02:27:55 -0400 Subject: [PATCH 22/55] Reapply "Remove manual result slot cleanup (using merge state transfer instead)" This reverts commit 7c09bfc5ea27573da05b85a996cffd36a82170bd. 
--- src/engine/compiler/SinglePassCompiler.v3 | 47 ----------------------- 1 file changed, 47 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 5c58f883c..e2cad3243 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -955,53 +955,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return; } - // Clean up stack: - // Before: [..., arg0, arg1, ..., argN, local0, local1, ..., localM, result0, ..., resultK] - // After: [..., result0, ..., resultK] - - var total_callee_slots = state.sp - new_local_base_sp; // All callee state - var slots_to_drop = total_callee_slots - results_count; - - // for whamm probes, results_count SHOULD be zero - if (slots_to_drop > 0 && results_count > 0) { - // Need to move results down over parameters and locals - for (i < results_count) { - var result_slot = state.sp - results_count + u32.view(i); - var target_slot = new_local_base_sp + u32.view(i); - if (Trace.compiler) { - Trace.OUT.put3(" Moving result %d: slot %d -> slot %d", i, result_slot, target_slot).ln(); - } - if (result_slot != target_slot) { - var rv = state.state[result_slot]; - if (Trace.compiler) { - Trace.OUT.put2(" rv: flags=%x, const=%d", rv.flags, rv.const).ln(); - } - if (rv.inReg()) { - regAlloc.reassign(rv.reg, int.!(result_slot), int.!(target_slot)); - } else { - // Move in memory (rarely needed if results are in regs) - resolver.addMove((target_slot, rv), (result_slot, rv)); - } - state.state[target_slot] = rv; - } else { - // Result already in the right place - if (Trace.compiler) Trace.OUT.puts(" (already in place)").ln(); - } - } - resolver.emitMoves(); - - // Drop everything above results - for (slot = new_local_base_sp + results_count; slot < state.sp; slot++) { - unrefSlot(slot); - } - state.sp = new_local_base_sp + results_count; - } else if (slots_to_drop > 0) { - // No results, just drop everything - if 
(Trace.compiler) Trace.OUT.put1("dropping %d slots\n", slots_to_drop); - dropN(u32.view(slots_to_drop)); - } - // If slots_to_drop <= 0, results are already in the right place - if (Trace.compiler) { Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); } From 072d51cbee1657f578df420183ad4ab99a937d37 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 25 Mar 2026 11:58:02 -0400 Subject: [PATCH 23/55] Add return test to failures on dyn --- test/inline/failures.x86-64-linux.dyn | 1 + 1 file changed, 1 insertion(+) diff --git a/test/inline/failures.x86-64-linux.dyn b/test/inline/failures.x86-64-linux.dyn index da02fa079..50325688b 100644 --- a/test/inline/failures.x86-64-linux.dyn +++ b/test/inline/failures.x86-64-linux.dyn @@ -1,4 +1,5 @@ inline_test_arithmetic.wasm inline_test_locals_control.wasm inline_test_nesting.wasm +inline_test_return.wasm From 8df10deb9c29a83e1c9639edf8cf617637e591d5 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 14:38:13 -0400 Subject: [PATCH 24/55] Use withReconstruct on inlined function entry probes --- src/engine/compiler/SinglePassCompiler.v3 | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index e2cad3243..956a81fa4 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -893,19 +893,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // XXX expensive because frame materialization required if (whamm == null && !FeatureDisable.entryProbes && func.entry_probed) { var probe = Instrumentation.getLocalProbe(module, callee_func.func_index, 0); - - // Reconstruct inlined frames before emitting probe - var reconstructed_space = 0; - if (isInlined()) { - var frames = snapshotFrames(); - unrefRegs(); - reconstructed_space = emitReconstructStackFrames(frames); - } - emitProbe0(0, probe); - // Clean up 
reconstructed frames after the call returns - if (reconstructed_space > 0) { - masm.emit_addw_r_i(regs.sp, reconstructed_space); - } + withReconstructedInlinedFrames(fun => + emitProbe0(0, probe)); } // Allocate callee's non-parameter locals From 6fa3289fb16292f05dbc64429300ecbe40f2cb63 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 23:30:47 -0400 Subject: [PATCH 25/55] Prohibit nested frame reconstruction (Whamm probe hackfix when nesting depth > 1) --- src/engine/compiler/SinglePassCompiler.v3 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 956a81fa4..a2b50ce84 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -114,6 +114,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var last_probe = 0; var skip_to_end: bool; var whamm_config: WhammInlineConfig; + var frames_reconstructed = false; // XXX: hack var handler_dest_info = Vector.new(); @@ -2266,9 +2267,20 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Guards compiler code with frame reconstruction (if necessary). def withReconstructedInlinedFrames(emit: void -> void) { if (isInlined()) { + if (frames_reconstructed) { + // XXX this should not happen (but does), in the case of deep nesting + // when one layer is a Whamm probe + if (Trace.compiler) Trace.OUT.puts(" nested frame reconstruction inhibited\n"); + // need to save vfp into the frame (because Whamm probe doesn't?) 
+ masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); + emit(); + return; + } unrefRegs(); + frames_reconstructed = true; def space = emitReconstructStackFrames(snapshotFrames()); emit(); + frames_reconstructed = false; if (space > 0) { masm.emit_addw_r_i(regs.sp, space); masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); From 44e8babe69bae3fb3e0c123b6a2948f2dfd54214 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 31 Mar 2026 23:40:47 -0400 Subject: [PATCH 26/55] Increase code size estimate when inlining is enabled --- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index c277b9b98..20fbc427b 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -1535,7 +1535,7 @@ component X86_64Spc { return addr; } def estimateCodeSizeFor(decl: FuncDecl) -> int { - return 60 + decl.orig_bytecode.length * 20; // TODO: huge overestimate + return 60 + decl.orig_bytecode.length * 20 * (2 << byte.view(SpcTuning.maxInlineDepth)); // TODO: huge overestimate } private def lazyCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. 
From 884b8cc928472088eab37c2ed6931ec571dffa49 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 1 Apr 2026 11:18:04 -0400 Subject: [PATCH 27/55] Move vfp fixing to emitWhammProbe --- src/engine/compiler/SinglePassCompiler.v3 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index a2b50ce84..280b86e60 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -651,6 +651,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var whamm_func_decl = callee_func.decl; if (inline_decision) { whamm_config = WhammInlineConfig(swap_membase, swap_instance, true); + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); emitInlinedCall(whamm_func_decl, probe); whamm_config = WhammInlineConfig(false, false, false); // Restore mem0_base after probe @@ -2268,11 +2269,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def withReconstructedInlinedFrames(emit: void -> void) { if (isInlined()) { if (frames_reconstructed) { - // XXX this should not happen (but does), in the case of deep nesting - // when one layer is a Whamm probe + // FIXME this should not happen (but does): + // - in the case of deep nesting when one layer is a Whamm probe + // - when refactoring to avoid `with` clause, GC test fails (inlining depth 2) if (Trace.compiler) Trace.OUT.puts(" nested frame reconstruction inhibited\n"); - // need to save vfp into the frame (because Whamm probe doesn't?) 
- masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); emit(); return; } From 3a66aaa1b345cd51d6b7898babe5bac53e9a6d25 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 6 Oct 2025 10:09:57 -0400 Subject: [PATCH 28/55] Add FAST_CALL instruction --- fast_call.wasm | Bin 0 -> 51 bytes fast_call.wat | 10 ++++++++++ src/engine/BytecodeIterator.v3 | 2 +- src/engine/CodeValidator.v3 | 2 +- src/engine/Opcodes.v3 | 2 ++ src/engine/v3/V3Interpreter.v3 | 4 ++-- src/engine/x86-64/X86_64Interpreter.v3 | 1 + 7 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 fast_call.wasm create mode 100644 fast_call.wat diff --git a/fast_call.wasm b/fast_call.wasm new file mode 100644 index 0000000000000000000000000000000000000000..671de0268347c954bd8763413398f56d6c56e687 GIT binary patch literal 51 zcmZQbEY4+QU|?WmWlUgTtY>CsVqjqBU}VWn%* v.visit_BR_IF(read_LABEL()); BR_TABLE => v.visit_BR_TABLE(cp.read_labels()); RETURN => v.visit_RETURN(); - CALL => v.visit_CALL(read_FUNC()); + CALL, FAST_CALL => v.visit_CALL(read_FUNC()); CALL_INDIRECT => v.visit_CALL_INDIRECT(read_SIG(), read_TABLE()); RETURN_CALL => v.visit_RETURN_CALL(read_FUNC()); RETURN_CALL_INDIRECT => v.visit_RETURN_CALL_INDIRECT(read_SIG(), read_TABLE()); diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index bf3511288..2372ca1a7 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -416,7 +416,7 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e checkAndPopArgs(sig.results); setUnreachable(); } - CALL => { + CALL, FAST_CALL => { var func = parser.readFuncRef(); if (func == null) return; checkSignature(func.sig); diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index b0a78d735..f3beb943c 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -7,6 +7,8 @@ def imm: ImmSigs; // An enumeration of the WebAssembly opcodes, including their mnenomic names, // the kind of immediate expected, and the 
(monomorphic) operator signature. enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: SigDecl) { + // Fast handler + FAST_CALL (0x00, 0xF0, "fast_call", imm.FUNC, null), // Default, invalid opcode. INVALID (0x00, 0xFF, "", imm.NONE, null), // Control and calls. diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index ff4ae0d13..edd48abc0 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack { RETURN => { doReturn(frame.fp, frame.func.sig); } - CALL => { + CALL, FAST_CALL => { var func_index = codeptr.read_uleb32(); var f = frame.func.instance.functions[func_index]; return doCallFunction(f); @@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack { // XXX: use read_opcode_and_skip() var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl); match (opcode) { - CALL, CALL_REF => { + CALL, CALL_REF, FAST_CALL => { codeptr.skip_leb(); frame.pc = codeptr.pos; } diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 29307ca98..fe47c2ca3 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1308,6 +1308,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { genPopFrameAndRet(); bindHandler(Opcode.CALL); + bindHandler(Opcode.FAST_CALL); computeCurIpForTrap(-1); genReadUleb32(r_tmp1); From a8c9753b3c535525ade36a1b4f3d97ded7d108b7 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 6 Oct 2025 13:41:00 -0400 Subject: [PATCH 29/55] Update tag for fast_call --- fast_call.wasm | Bin 51 -> 51 bytes src/engine/Opcodes.v3 | 5 +++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fast_call.wasm b/fast_call.wasm index 671de0268347c954bd8763413398f56d6c56e687..f6dc1a5e098fa11b854805bf9bb3dad3057cd83e 100644 GIT binary patch delta 16 UcmXpuo}j@k&fxeDgtD@@0VH??o&W#< delta 16 VcmXpuo}j_~fx+<~2xVn)0{|_{1$O`d diff --git 
a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index f3beb943c..2b44e01ae 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -7,8 +7,6 @@ def imm: ImmSigs; // An enumeration of the WebAssembly opcodes, including their mnenomic names, // the kind of immediate expected, and the (monomorphic) operator signature. enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: SigDecl) { - // Fast handler - FAST_CALL (0x00, 0xF0, "fast_call", imm.FUNC, null), // Default, invalid opcode. INVALID (0x00, 0xFF, "", imm.NONE, null), // Control and calls. @@ -34,6 +32,8 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: RETURN_CALL_INDIRECT (0x00, 0x13, "return_call_indirect", imm.SIG_TABLE, null), CALL_REF (0x00, 0x14, "call_ref", imm.SIG, null), RETURN_CALL_REF (0x00, 0x15, "return_call_ref", imm.SIG, null), + // Fast handler custom instruction + FAST_CALL (0x00, 0x17, "fast_call", imm.FUNC, null), DELEGATE (0x00, 0x18, "delegate", imm.LABEL, null), CATCH_ALL (0x00, 0x19, "catch_all", imm.NONE, null), DROP (0x00, 0x1A, "drop", imm.NONE, null), @@ -811,6 +811,7 @@ component Opcodes { attributes[InternalOpcode.PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.WHAMM_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.BREAK_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; + attributes[Opcode.FAST_CALL.tag] = OpcodeAttribute.INTERNAL; for (op in [Opcode.END, Opcode.I32_CONST, Opcode.I64_CONST, Opcode.F32_CONST, Opcode.F64_CONST, Opcode.GLOBAL_GET, Opcode.REF_NULL, Opcode.REF_FUNC, Opcode.STRUCT_NEW, Opcode.STRUCT_NEW_DEFAULT, From 7e102d86001e578d1a34fec01882610f7a23d33a Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 8 Oct 2025 13:02:45 -0400 Subject: [PATCH 30/55] Separate more fast call behavior --- fast_call.wasm | Bin 51 -> 68 bytes fast_call.wat | 12 ++++---- src/engine/BytecodeIterator.v3 | 3 +- 
src/engine/compiler/SinglePassCompiler.v3 | 33 ++++++++++++++++++++++ src/util/BytecodeVisitor.v3 | 1 + 5 files changed, 42 insertions(+), 7 deletions(-) diff --git a/fast_call.wasm b/fast_call.wasm index f6dc1a5e098fa11b854805bf9bb3dad3057cd83e..36f858295824074b732c768e522caf4cc349e47b 100644 GIT binary patch literal 68 zcmV~$F%Ezr5Cp*8CsVqjqBU}VWn%* v.visit_BR_IF(read_LABEL()); BR_TABLE => v.visit_BR_TABLE(cp.read_labels()); RETURN => v.visit_RETURN(); - CALL, FAST_CALL => v.visit_CALL(read_FUNC()); + CALL => v.visit_CALL(read_FUNC()); + FAST_CALL => v.visit_FAST_CALL(read_FUNC()); CALL_INDIRECT => v.visit_CALL_INDIRECT(read_SIG(), read_TABLE()); RETURN_CALL => v.visit_RETURN_CALL(read_FUNC()); RETURN_CALL_INDIRECT => v.visit_RETURN_CALL_INDIRECT(read_SIG(), read_TABLE()); diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 280b86e60..8eb035dee 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -816,6 +816,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_br(ret_label); setUnreachable(); } + // for CALL, FAST_CALL, and RETURN_CALL def visitCallDirect(op: Opcode, index: u31, tailCall: bool) { if (op == Opcode.CALL) { Metrics.spc_static_calls.val++; @@ -3508,6 +3509,38 @@ class MoveNode { var dstNext: MoveNode; // next in a list of successors } +// checks function bytecode to see if it can be inlined based on +// simple heuristics: length <= maxInlineBytecodeSize and straightline code. 
+def funcCanInline(decl: FuncDecl) -> InlineConfig { + var default = InlineConfig(false, false, false); + if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; + var bi = BytecodeIterator.new().reset(decl); + var swap_instance = false; + var swap_membase = false; + while (bi.more()) { + var op = bi.current(); + match (op) { + // Cannot handle control flow yet. + IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default; + // These opcodes require swapping the instance. + THROW, CALL, FAST_CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, + ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; + // Load/store opcodes require either the memory base or the instance. + I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, + V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, + I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { + var memarg = bi.immptr().read_MemArg(); + if (memarg.memory_index == 0) swap_membase = true; + else swap_instance = true; + } + _ => ; + } + bi.next(); + } + return InlineConfig(swap_membase, swap_instance, true); +} + +type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool); // Used to record the entry point of exception/suspension handlers. 
Jumping to {stub_label} allows diff --git a/src/util/BytecodeVisitor.v3 b/src/util/BytecodeVisitor.v3 index fbef4b056..da03cc410 100644 --- a/src/util/BytecodeVisitor.v3 +++ b/src/util/BytecodeVisitor.v3 @@ -70,6 +70,7 @@ class BytecodeVisitor { def visit_BR_TABLE (labels: Range) { visitControl(Opcode.BR_TABLE); } def visit_RETURN () { visitControl(Opcode.RETURN); } def visit_CALL (func_index: u31) { visitCallDirect(Opcode.CALL, func_index, false); } + def visit_FAST_CALL (func_index: u31) { visitCallDirect(Opcode.FAST_CALL, func_index, false); } def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, false); } def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, true); } def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, true); } From 6a771fff5c9302574ab5e17840fcc2cd98bd5f1c Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Tue, 14 Oct 2025 14:56:21 -0400 Subject: [PATCH 31/55] Add CallProperty to distinguish between tail call and fast call --- src/engine/compiler/SinglePassCompiler.v3 | 22 +++++++++--------- src/util/BytecodeVisitor.v3 | 28 +++++++++++++++-------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 8eb035dee..616fde5f2 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -817,7 +817,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } // for CALL, FAST_CALL, and RETURN_CALL - def visitCallDirect(op: Opcode, index: u31, tailCall: bool) { + def visitCallDirect(op: Opcode, index: u31, prop: CallProperty) { if (op == Opcode.CALL) { Metrics.spc_static_calls.val++; masm.emit_inc_metric(Metrics.spc_dynamic_calls); @@ -825,7 +825,7 @@ class 
SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var func = module.functions[index]; // Try inlining for intra-module, non-tail calls - if (!tailCall && shouldInline(func)) { + if (prop != CallProperty.TAIL && shouldInline(func)) { if (Trace.compiler) Trace.OUT.put2("Inlining call to func #%d (%d bytes)", index, func.orig_bytecode.length).ln(); if (op == Opcode.CALL) { Metrics.spc_static_inlined_calls.val++; @@ -847,7 +847,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_v3_Instance_functions_r_r(func_reg, tmp); masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); - emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); + emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, prop); }); } def emitInlinedCall(callee_func: FuncDecl, whamm: WhammProbe) { @@ -951,10 +951,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); } } - def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) { + def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, prop: CallProperty) { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); // Handle the current stack state. - if (tailCall) emitMoveTailCallArgs(sig); // transfer tail call args + if (prop == CallProperty.TAIL) emitMoveTailCallArgs(sig); // transfer tail call args else state.emitSaveAll(resolver, SpillMode.SAVE_AND_FREE_REGS); // spill entire value stack // Compute the value stack pointer. @@ -962,7 +962,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (checkHostCall) { // A call to imported function must first check for WasmFunction. 
masm.emit_br_r(func_reg, MasmBrCond.IS_WASM_FUNC, wasmcall_label); - if (tailCall) { + if (prop == CallProperty.TAIL) { masm.emit_jump_HostCallStub(); // XXX: stub relies on func_arg and VSP } else { masm.emit_call_HostCallStub(); // XXX: stub relies on func_arg and VSP @@ -975,7 +975,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_v3_FuncDecl_target_code_r_r(tmp, tmp); // Call or jump to the entrypoint. - if (tailCall) { + if (prop == CallProperty.TAIL) { masm.emit_jump_r(tmp); setUnreachable(); } else { @@ -999,7 +999,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // adjust frame masm.emit_addw_r_i(regs.sp, frame.frameSize); } - def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool) { + def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty) { var sig = SigDecl.!(module.heaptypes[sig_index]); var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); @@ -1056,9 +1056,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(end); } - emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, tailCall); + emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, prop); } - def visitCallRef(op: Opcode, index: u31, tailCall: bool) { + def visitCallRef(op: Opcode, index: u31, prop: CallProperty) { var sig = SigDecl.!(module.heaptypes[index]); var sv = state.peek(); if (sv.isConst() && sv.const == 0) { @@ -1071,7 +1071,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var tmp = allocTmp(ValueKind.REF); var func_reg = sv.reg; - emitCallToReg(sig, func_reg, vsp_reg, tmp, true, tailCall); + emitCallToReg(sig, func_reg, vsp_reg, tmp, true, prop); } def visit_DROP() { dropN(1); diff --git a/src/util/BytecodeVisitor.v3 b/src/util/BytecodeVisitor.v3 index da03cc410..adc822bfe 100644 --- 
a/src/util/BytecodeVisitor.v3 +++ b/src/util/BytecodeVisitor.v3 @@ -20,9 +20,9 @@ class BytecodeVisitor { def visitMisc(op: Opcode) { visitOp(op); } def visitControl(op: Opcode) { visitOp(op); } def visitCall(op: Opcode) { visitOp(op); } - def visitCallDirect(op: Opcode, func_index: u31, tailCall: bool) { visitCall(op); } - def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool) { visitCall(op); } - def visitCallRef(op: Opcode, sig_index: u31, tailCall: bool) { visitCall(op); } + def visitCallDirect(op: Opcode, func_index: u31, prop: CallProperty) { visitCall(op); } + def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty) { visitCall(op); } + def visitCallRef(op: Opcode, sig_index: u31, prop: CallProperty) { visitCall(op); } def visitLocal(op: Opcode, local_index: u31) { visitOp(op); } def visitGlobal(op: Opcode, local_index: u31) { visitOp(op); } def visitTable(op: Opcode, table_index: u31) { visitOp(op); } @@ -69,13 +69,13 @@ class BytecodeVisitor { def visit_BR_IF (depth: u31) { visitControl(Opcode.BR_IF); } def visit_BR_TABLE (labels: Range) { visitControl(Opcode.BR_TABLE); } def visit_RETURN () { visitControl(Opcode.RETURN); } - def visit_CALL (func_index: u31) { visitCallDirect(Opcode.CALL, func_index, false); } - def visit_FAST_CALL (func_index: u31) { visitCallDirect(Opcode.FAST_CALL, func_index, false); } - def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, false); } - def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, true); } - def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, true); } - def visit_CALL_REF (sig_index: u31) { visitCallRef(Opcode.CALL_REF, sig_index, false); } - def visit_RETURN_CALL_REF(sig_index: u31) { visitCallRef(Opcode.RETURN_CALL_REF, sig_index, true); } + def visit_CALL 
(func_index: u31) { visitCallDirect(Opcode.CALL, func_index, SLOW); } + def visit_FAST_CALL (func_index: u31) { visitCallDirect(Opcode.FAST_CALL, func_index, FAST); } + def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, SLOW); } + def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, TAIL); } + def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, TAIL); } + def visit_CALL_REF (sig_index: u31) { visitCallRef(Opcode.CALL_REF, sig_index, SLOW); } + def visit_RETURN_CALL_REF(sig_index: u31) { visitCallRef(Opcode.RETURN_CALL_REF, sig_index, TAIL); } def visit_DELEGATE (depth: u31) { visitControl(Opcode.DELEGATE); } def visit_CATCH_ALL () { visitControl(Opcode.CATCH_ALL); } def visit_DROP () { visitMisc(Opcode.DROP); } @@ -654,3 +654,11 @@ class BytecodeVisitor { def visit_SUSPEND (tag: u31) { visitOp(Opcode.SUSPEND); } def visit_SWITCH (cont: u31, tag: u31) { visitOp(Opcode.SWITCH); } } + +enum CallProperty { + SLOW, TAIL, FAST +} + +def SLOW = CallProperty.SLOW; +def TAIL = CallProperty.TAIL; +def FAST = CallProperty.FAST; From f80bb2b98447582e95e8a9402927744135895652 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 5 Jan 2026 15:56:01 -0500 Subject: [PATCH 32/55] Add stubs/initialization for fast call entries and no-op FAST_CALL in interpreter --- src/engine/Module.v3 | 3 + src/engine/compiler/SinglePassCompiler.v3 | 68 ++++++++------- src/engine/x86-64/V3Offsets.v3 | 1 + src/engine/x86-64/X86_64Interpreter.v3 | 23 ++++- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 84 +++++++++++++++++++ src/engine/x86-64/X86_64Target.v3 | 70 +++++++++++++--- 6 files changed, 203 insertions(+), 46 deletions(-) diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index 90d85a4ae..f1598f07c 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -141,6 +141,7 @@ class 
FuncDecl(sig_index: int) extends Decl { var cbd_sidetable: Array; // CBD u8 sidetable var frame_var_tags: Array; // value tags for frame variables var target_code: TargetCode; + var fast_target_code: TargetCode; var tierup_trigger: int = int.max; var handlers = FuncHandlerInfo.new(); @@ -154,6 +155,7 @@ class FuncDecl(sig_index: int) extends Decl { var tc: TargetCode; var tr: TargetCode; target_code = tc; // reset target code as well + fast_target_code = tc; sidetable = Sidetables.NO_SIDETABLE; cbd_sidetable = null; } @@ -183,6 +185,7 @@ class FuncDecl(sig_index: int) extends Decl { n.sidetable = this.sidetable; n.num_locals = this.num_locals; n.target_code = this.target_code; + n.fast_target_code = this.fast_target_code; return n; } def findExHandler(instance: Instance, tag: Tag, throw_pc: int) -> ExHandler { diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 616fde5f2..81ec89c18 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -131,15 +131,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def gen(module: Module, func: FuncDecl, err: ErrorGen) -> bool { this.osr_pc = -1; this.err = err; - return Metrics.spc_time_us.run(gen0, (module, func)); + return Metrics.spc_time_us.run(gen0(_, _, false), (module, func)); } def genOsr(module: Module, func: FuncDecl, pc: int, err: ErrorGen) -> MasmLabel { this.osr_pc = pc; this.err = err; - var ok = Metrics.spc_time_us.run(gen0, (module, func)); + var ok = Metrics.spc_time_us.run(gen0(_, _, false), (module, func)); return if(ok, osr_entry_label); } - private def gen0(module: Module, func: FuncDecl) -> bool { + private def gen0(module: Module, func: FuncDecl, fast: bool) -> bool { if (Trace.compiler) OUT.put1("==== begin compile: %q ========================", func.render(module.names, _)).ln(); var before_code_bytes = masm.curCodeBytes(); var before_data_bytes = masm.curDataBytes(); 
@@ -170,7 +170,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl pushSpcFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. - emitPrologue(); + if (!fast) emitPrologue(); // Visit all local declarations. it.dispatchLocalDecls(this); @@ -753,34 +753,36 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - var ctl_top = state.ctl_stack.peek(); - if (ctl_top.opcode == Opcode.LOOP.code) { - state.ctl_stack.pop(); - if (!ctl_top.reachable) setUnreachable(); - } else if (ctl_top.opcode == Opcode.IF.code) { - // simulate empty if-true block - state.emitFallthru(resolver); - masm.emit_br(ctl_top.label); - masm.bindLabel(ctl_top.else_label); - state.doElse(); - ctl_top.opcode = Opcode.ELSE.code; - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.RETURN.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - emitProbe(); - if (ctl_top.merge_count > 1) emitReturn(ctl_top); - state.ctl_stack.pop(); - return; + if (!isInlined()) { + var ctl_top = state.ctl_stack.peek(); + if (ctl_top.opcode == Opcode.LOOP.code) { + state.ctl_stack.pop(); + if (!ctl_top.reachable) setUnreachable(); + } else if (ctl_top.opcode == Opcode.IF.code) { + // simulate empty if-true block + state.emitFallthru(resolver); + masm.emit_br(ctl_top.label); + masm.bindLabel(ctl_top.else_label); + state.doElse(); + ctl_top.opcode = Opcode.ELSE.code; + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == 
Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + // case for END for fallthrough at end of function? + } else if (ctl_top.opcode == Opcode.RETURN.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + emitProbe(); + if (ctl_top.merge_count > 1) emitReturn(ctl_top); + state.ctl_stack.pop(); + } } emitProbe(); } @@ -2119,11 +2121,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_br(target.label); } } + // Return includes epilogue? def emitReturn(ctl: SpcControl) { // All explicit RETURN instructions branch here. masm.bindLabel(ret_label); var results = sig.results; + // fix values? if (masm.valuerep.tagged) { // update mismatched value tags var params = sig.params; diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3 index 5de4559e9..2761ab15a 100644 --- a/src/engine/x86-64/V3Offsets.v3 +++ b/src/engine/x86-64/V3Offsets.v3 @@ -31,6 +31,7 @@ class V3Offsets { def FuncDecl_orig_bytecode = int.view(Pointer.atField(decl.orig_bytecode) - Pointer.atObject(decl)); def FuncDecl_sidetable = int.view(Pointer.atField(decl.sidetable.entries) - Pointer.atObject(decl)); def FuncDecl_target_code = int.view(Pointer.atField(decl.target_code.spc_entry) - Pointer.atObject(decl)); + def FuncDecl_fast_target_code = int.view(Pointer.atField(decl.fast_target_code.spc_entry) - Pointer.atObject(decl)); def FuncDecl_tierup_trigger = int.view(Pointer.atField(decl.tierup_trigger) - Pointer.atObject(decl)); def FuncDecl_entry_probed = int.view(Pointer.atField(decl.entry_probed) - Pointer.atObject(decl)); def FuncDecl_frame_var_tags = int.view(Pointer.atField(decl.frame_var_tags) - Pointer.atObject(decl)); diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index fe47c2ca3..1f4c510da 100644 --- 
a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1307,8 +1307,27 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movd_r_i(G(Target.V3_RET_GPRS[0]), 0); genPopFrameAndRet(); - bindHandler(Opcode.CALL); + // FAST_CALL bindHandler(Opcode.FAST_CALL); + var dispatchLabel = X86_64Label.new(); + // genTagPush(BpTypeCode.I32.code); + // asm.movq_m_i(vsph[0].value, 770); + // incrementVsp(); + + genReadUleb32(r_tmp1); + asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_functions)); + asm.movq_r_m(func_arg, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); + + var tmp = r_tmp2; + asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl)); + asm.ijmp_m(tmp.plus(offsets.FuncDecl_fast_target_code)); + asm.invalid(); + + // don't go here + asm.bind(dispatchLabel); + endHandler(); + + bindHandler(Opcode.CALL); computeCurIpForTrap(-1); genReadUleb32(r_tmp1); @@ -1327,6 +1346,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var tmp = r_tmp2; asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl)); asm.icall_m(tmp.plus(offsets.FuncDecl_target_code)); + // assembly call to target function + // if not compiled, interpreter's entry point } else { asm.call_rel_far(callReentryLabel); } diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 20fbc427b..e8881e849 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -34,6 +34,50 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.trap_stubs = TRAPS_STUB; } + var dispatchJmpOffset: int = -1; + // Generate a load of the next bytecode and a dispatch through the dispatch table. 
+ def genDispatch0(ic: X86_64InterpreterCode, ptr: X86_64Addr, table: IcCodeRef, increment: bool) { + def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + def r_ip = G(xenv.ip); + def r_dispatch = G(xenv.dispatch); + def r_tmp0 = G(xenv.tmp0); // RCX + def r_tmp1 = G(xenv.tmp1); // RDX + + var opcode = r_tmp0; + var base = r_tmp1; + if (ptr != null) asm.movbzx_r_m(opcode, ptr); + if (increment) asm.inc_r(r_ip); + match (FastIntTuning.dispatchEntrySize) { + 2 => { + if (table == null) asm.movq_r_r(base, r_dispatch); + else asm.lea(base, table); // RIP-relative LEA + asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset + asm.add_r_r(base, opcode); + if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 4 => { + if (table == null) { + asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0)); + } else { + var addr = ic.start + table.offset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + } + if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 8 => { + if (table == null) { + if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0)); + } else { + var addr = ic.start + table.offset; + if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL))); + } + } + } + } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); if (b.isConst()) asm.cmp_r_i(G(a.reg), b.const); @@ -1396,6 +1440,7 @@ class X86_64SpcCompileStub extends RiUserCode { def V3_SPC_ENTRY_FUNC = X86_64PreGenFunc<(WasmFunction, Pointer, Pointer), Throwable>.new("v3-spc-entry", null, genSpcEntryFunc); def LAZY_COMPILE_STUB = X86_64PreGenStub.new("spc-lazy-compile", X86_64SpcCompileStub.new("lazy"), genLazyCompileStub); +def FAST_COMPILE_STUB = X86_64PreGenStub.new("spc-fast-compile", X86_64SpcCompileStub.new("fast"), genFastCompileStub); def 
TIERUP_COMPILE_STUB = X86_64PreGenStub.new("spc-tierup-compile", X86_64SpcCompileStub.new("tierup"), genTierUpCompileStub); def TRAPS_STUB = X86_64SpcTrapsStub.new(); def TRAPS_PREGEN = X86_64PreGenStub.new("spc-trap", TRAPS_STUB, genTrapsStub); @@ -1451,6 +1496,36 @@ def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_r(G(Target.V3_RET_GPRS[0]), G(Target.V3_RET_GPRS[2])); asm.ret(); } +// XXX the stub must also respect the register usage of the fast interpreter +def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { + if (SpcTuning.disable) return; + var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); + var asm = X86_64Assembler.!(masm.asm); + var regs = X86_64MasmRegs.SPC_EXEC_ENV; + var func_arg = G(regs.func_arg); + + def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + def r_ip = G(xenv.ip); + def r_dispatch = G(xenv.dispatch); + def r_tmp0 = G(xenv.tmp0); // RCX + def r_tmp1 = G(xenv.tmp1); // RDX + def ip_ptr = r_ip.plus(0); + + // simplified dispatch sequence + + var opcode = r_tmp0; + var base = r_tmp1; + + asm.movbzx_r_m(opcode, ip_ptr); + asm.inc_r(r_ip); + + // flattened 4 case + var addr = ic.start + ic.header.fastDispatchTableOffset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + asm.ijmp_r(base); + + asm.invalid(); +} def genTierUpCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); @@ -1519,6 +1594,10 @@ component X86_64Spc { def invoke(wf: WasmFunction, sp: Pointer) -> Throwable { return V3_SPC_ENTRY_FUNC.get()(wf, sp, wf.decl.target_code.spc_entry); } + def setFastCompileFor(module: Module, decl: FuncDecl) { + if (Debug.runtime) Trace.OUT.put1("setFastCompile %q", decl.render(module.names, _)).ln(); + decl.fast_target_code = TargetCode(FAST_COMPILE_STUB.getEntry()); + } def setLazyCompileFor(module: Module, decl: FuncDecl) { if (Debug.runtime) 
Trace.OUT.put1("setLazyCompile %q", decl.render(module.names, _)).ln(); decl.target_code = TargetCode(LAZY_COMPILE_STUB.getEntry()); @@ -1542,6 +1621,11 @@ component X86_64Spc { var result = X86_64SpcStrategy.!(Execute.tiering).lazyCompile(wf); return (result.wf, result.entrypoint, result.thrown); } + private def fastCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { + // The global stub simply consults the execution strategy. + var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf); // no condition that tiering uses SPC (int => fast SPC) + return (result.wf, result.entrypoint, result.thrown); + } private def tierupCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. var result = X86_64SpcStrategy.!(Execute.tiering).tierupCompile(wf); diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index 015db9508..a413b1ba0 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -188,6 +188,7 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { } // Compilation methods called directly by stubs. def lazyCompile(wf: WasmFunction) -> SpcResultForStub; + def fastCompile(wf: WasmFunction) -> SpcResultForStub; def tierupCompile(wf: WasmFunction) -> SpcResultForStub; // Tiering may require setting up the whole module. def onTestModule(module: Module) { @@ -196,6 +197,20 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { def disableLazyNameDecodingDuringGC(module: Module) { if (module.names != null) module.names.lazyDecodeDisabled = RiGc.inGC; } + + def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) { + // ensure entrypoint and lazy compile stubs are generated + X86_64PreGenStubs.gen(); + // Set all functions to refer to the tier-up compile stub. 
+ var codeSize = MINIMUM_CODE_SIZE; + for (i < module.functions.length) { + var f = module.functions[i]; + if (f.imported()) continue; + set(module, f); + codeSize += X86_64Spc.estimateCodeSizeFor(f); + } + allocateCodeForModule(module, codeSize); + } } // One tier: fast-int, modules require no pre-processing. @@ -219,6 +234,48 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { if (FastIntTuning.enableWhammProbeTrampoline && WhammProbe.?(p)) X86_64WhammTrampoline.makeTrampoline(WhammProbe.!(p), X86_64PreGenStubs.getInterpreterCode()); } + + // TODO avoid duplicated function here + def fastCompile(wf: WasmFunction) -> SpcResultForStub { + // Check the JIT filter, if there is one + if (!applyJitFilter(wf.instance.module, wf.decl, "fast")) return SpcResultForStub(wf, X86_64Spc.setInterpreterFallback(wf.decl), null); + + var module = wf.instance.module; + var code = module.target_module.spc_code; + var compiler = newCompiler(module.filename); // XXX: cache per-thread + var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; + + // generate code for the function + var success = compiler.gen(module, wf.decl, null); + + // Check for remaining code space + var regionSize = code.mapping.range.size(); + var remaining = regionSize - u64.!(code.codeEnd); + var codeSize = w.atEnd().pos; + if (codeSize > remaining) { + if (Trace.compiler) Trace.OUT.put3("exhausted code space for module (%d of %d bytes remaining, need %d)", + remaining, regionSize, codeSize).ln(); + success = false; + } + + var entrypoint: Pointer; + if (success) { + // Copy code into end of region + entrypoint = code.appendCode(masm); + Target.setTargetCode(wf.decl, entrypoint, entrypoint + codeSize); + } else { + // Failed, enter interpreter + var f = wf.decl; + if (Trace.compiler) Trace.OUT.put1("func[%d] FAST compile failed", f.func_index).ln(); + entrypoint = X86_64Spc.setInterpreterFallback(f); + } + return SpcResultForStub(wf, entrypoint, null); + } + + // XXX not an 
exhaustive way to add stubs. but what is? + def onModuleFinish(module: Module, size: u32, err: ErrorGen) { + installStubForModule(module, X86_64Spc.setFastCompileFor); + } } // Base class of all strategies that use SPC. @@ -271,19 +328,6 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy { } return SpcResultForStub(wf, entrypoint, null); } - def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) { - // ensure entrypoint and lazy compile stubs are generated - X86_64PreGenStubs.gen(); - // Set all functions to refer to the tier-up compile stub. - var codeSize = MINIMUM_CODE_SIZE; - for (i < module.functions.length) { - var f = module.functions[i]; - if (f.imported()) continue; - set(module, f); - codeSize += X86_64Spc.estimateCodeSizeFor(f); - } - allocateCodeForModule(module, codeSize); - } } // One tier: SPC, modules are eagerly compiled. From d82b2037ddff40d0682db46272b40537f90c5247 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 5 Jan 2026 16:01:03 -0500 Subject: [PATCH 33/55] Clean up unused dispatch function and specify goals --- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 52 +++---------------- 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index e8881e849..632c70837 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -34,50 +34,6 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.trap_stubs = TRAPS_STUB; } - var dispatchJmpOffset: int = -1; - // Generate a load of the next bytecode and a dispatch through the dispatch table. 
- def genDispatch0(ic: X86_64InterpreterCode, ptr: X86_64Addr, table: IcCodeRef, increment: bool) { - def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; - def r_ip = G(xenv.ip); - def r_dispatch = G(xenv.dispatch); - def r_tmp0 = G(xenv.tmp0); // RCX - def r_tmp1 = G(xenv.tmp1); // RDX - - var opcode = r_tmp0; - var base = r_tmp1; - if (ptr != null) asm.movbzx_r_m(opcode, ptr); - if (increment) asm.inc_r(r_ip); - match (FastIntTuning.dispatchEntrySize) { - 2 => { - if (table == null) asm.movq_r_r(base, r_dispatch); - else asm.lea(base, table); // RIP-relative LEA - asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset - asm.add_r_r(base, opcode); - if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; - asm.ijmp_r(base); - } - 4 => { - if (table == null) { - asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0)); - } else { - var addr = ic.start + table.offset; - asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); - } - if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; - asm.ijmp_r(base); - } - 8 => { - if (table == null) { - if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; - asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0)); - } else { - var addr = ic.start + table.offset; - if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; - asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL))); - } - } - } - } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); if (b.isConst()) asm.cmp_r_i(G(a.reg), b.const); @@ -1496,7 +1452,13 @@ def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_r(G(Target.V3_RET_GPRS[0]), G(Target.V3_RET_GPRS[2])); asm.ret(); } -// XXX the stub must also respect the register usage of the fast interpreter +/* This stub should: + * - save program state (i.e. 
an epilogue as if it was a call/new frame) + * - compile the function (given register constraints imposed by fast int) + * - rewrite the `fast_target_code` field with this new function + * - restore program state + * - jump into the new `fast_target_code` (or re-dispatch on itself) + */ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); From 4cc8807b4d1deb084a5fdcfab05b0318b866e923 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Mon, 5 Jan 2026 17:35:50 -0500 Subject: [PATCH 34/55] Work on SPC accepting fast mode and emitting dispatch sequence --- src/engine/compiler/SinglePassCompiler.v3 | 27 +++--- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 87 ++++++++++++++++++- src/engine/x86-64/X86_64Target.v3 | 16 ++-- 3 files changed, 108 insertions(+), 22 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 81ec89c18..0184eb102 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -77,7 +77,7 @@ def KIND_REF = SpcConsts.KIND_REF; def KIND_REF_U64 = SpcConsts.KIND_REF_U64; // Compiles Wasm bytecode to machine code in a single pass via a MacroAssembler. 
-class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits) extends BytecodeVisitor { +class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits, fast: bool) extends BytecodeVisitor { def instrTracer = if(Trace.compiler, InstrTracer.new()); def config = masm.regConfig; def regs = xenv; @@ -131,15 +131,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def gen(module: Module, func: FuncDecl, err: ErrorGen) -> bool { this.osr_pc = -1; this.err = err; - return Metrics.spc_time_us.run(gen0(_, _, false), (module, func)); + return Metrics.spc_time_us.run(gen0(_, _), (module, func)); } def genOsr(module: Module, func: FuncDecl, pc: int, err: ErrorGen) -> MasmLabel { this.osr_pc = pc; this.err = err; - var ok = Metrics.spc_time_us.run(gen0(_, _, false), (module, func)); + var ok = Metrics.spc_time_us.run(gen0(_, _), (module, func)); return if(ok, osr_entry_label); } - private def gen0(module: Module, func: FuncDecl, fast: bool) -> bool { + private def gen0(module: Module, func: FuncDecl) -> bool { if (Trace.compiler) OUT.put1("==== begin compile: %q ========================", func.render(module.names, _)).ln(); var before_code_bytes = masm.curCodeBytes(); var before_data_bytes = masm.curDataBytes(); @@ -2140,14 +2140,19 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (isInlined()) return; - // Compute VSP = VFP + state.sp - emit_compute_vsp(regs.vsp, state.sp); - // Return to caller - masm.emit_mov_r_i(regs.ret_throw, 0); - // Deallocate stack frame - masm.emit_addw_r_i(regs.sp, frame.frameSize); - masm.emit_ret(); + if (!fast) { + // Compute VSP = VFP + state.sp // \ + emit_compute_vsp(regs.vsp, state.sp); // | + // Return to caller // | fast context: do not emit these instructions + masm.emit_mov_r_i(regs.ret_throw, 0); // | instead, emit the dispatch sequence 
from the interpreter + // Deallocate stack frame // | + masm.emit_addw_r_i(regs.sp, frame.frameSize); // | + masm.emit_ret(); // / + } else { + emitFastDispatch(); + } } + def emitFastDispatch() -> void; def emitOsrEntry(osr_entry_label: MasmLabel, state: Array) { if (Trace.compiler) Trace.OUT.put1(" OSR (+%d)", osr_entry_label.create_pos).ln(); masm.bindLabel(osr_entry_label); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 632c70837..9c1d373cb 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -28,12 +28,37 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def w = DataWriter.new(); def mmasm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); def asm = mmasm.asm; + var ic: X86_64InterpreterCode; - new(extensions: Extension.set, limits: Limits, config: RegConfig) - super(X86_64MasmRegs.SPC_EXEC_ENV, mmasm, X86_64MasmRegs.SPC_ALLOC.copy(), extensions, limits) { + new(ic, extensions: Extension.set, limits: Limits, config: RegConfig, fast: bool) + super(X86_64MasmRegs.SPC_EXEC_ENV, mmasm, + if(fast, X86_64MasmRegs.INT_ALLOC.copy(), X86_64MasmRegs.SPC_ALLOC.copy()), + extensions, limits, fast) { mmasm.trap_stubs = TRAPS_STUB; } + def emitFastDispatch() { + ic = X86_64Spc.ic; + + def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + def r_ip = G(xenv.ip); + def r_dispatch = G(xenv.dispatch); + def r_tmp0 = G(xenv.tmp0); // RCX + def r_tmp1 = G(xenv.tmp1); // RDX + def ip_ptr = r_ip.plus(0); + + // simplified dispatch sequence + var opcode = r_tmp0; + var base = r_tmp1; + asm.movbzx_r_m(opcode, ip_ptr); + asm.inc_r(r_ip); + var addr = ic.start + ic.header.fastDispatchTableOffset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + asm.ijmp_r(base); + + asm.invalid(); + } + private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); if (b.isConst()) 
asm.cmp_r_i(G(a.reg), b.const); @@ -1423,6 +1448,8 @@ def genSpcEntryFunc(ic: X86_64InterpreterCode, w: DataWriter) { asm.invalid(); } def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { + + if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); var asm = X86_64Assembler.!(masm.asm); @@ -1460,6 +1487,59 @@ def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { * - jump into the new `fast_target_code` (or re-dispatch on itself) */ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { + X86_64Spc.ic = ic; + // TODO: figure out this ic thing, because this sucks (and doesn't even work) + + if (SpcTuning.disable) return; + var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); + var asm = X86_64Assembler.!(masm.asm); + var regs = X86_64MasmRegs.SPC_EXEC_ENV; + var func_arg = G(regs.func_arg); + // TODO ensure that register use is compatible with fast-int usage + asm.pushq_r(G(regs.func_arg)); // push function onto stack + masm.emit_store_curstack_vsp(regs.vsp); + asm.movq_r_r(Target.V3_PARAM_GPRS[1], G(regs.func_arg)); // function + // Load {null} for the receiver. + asm.movq_r_i(Target.V3_PARAM_GPRS[0], 0); + // Call {X86_64Spc.fastCompile} directly. + masm.emit_call_abs(codePointer(X86_64Spc.fastCompile)); + asm.q.add_r_i(R.RSP, Pointer.SIZE); // pop function off stack + // Check for non-null abrupt return. + var unwind = X86_64Label.new(); + asm.q.cmp_r_i(Target.V3_RET_GPRS[2], 0); + asm.jc_rel_near(C.NZ, unwind); + // Tail-call the result of the compile. + var scratch = X86_64Regs.R15; +// asm.movq_r_r(scratch, Target.V3_RET_GPRS[1]); // entrypoint +// asm.movq_r_r(G(regs.func_arg), Target.V3_RET_GPRS[0]); // function +// masm.emit_load_curstack_vsp(regs.vsp); +// asm.ijmp_r(scratch); // jump to entrypoint + asm.invalid(); + // Simply return the {Throwable} object. 
+ asm.bind(unwind); + asm.movq_r_r(Target.V3_RET_GPRS[0], Target.V3_RET_GPRS[2]); + + // DISPATCH + + def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + def r_ip = G(xenv.ip); + def r_dispatch = G(xenv.dispatch); + def r_tmp0 = G(xenv.tmp0); // RCX + def r_tmp1 = G(xenv.tmp1); // RDX + def ip_ptr = r_ip.plus(0); + + // simplified dispatch sequence + var opcode = r_tmp0; + var base = r_tmp1; + asm.movbzx_r_m(opcode, ip_ptr); + asm.inc_r(r_ip); + var addr = ic.start + ic.header.fastDispatchTableOffset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + asm.ijmp_r(base); + + asm.invalid(); +} +def genFastNopStub(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); var asm = X86_64Assembler.!(masm.asm); @@ -1552,6 +1632,7 @@ def codePointer(f: P -> R) -> Pointer { // Global functionality associated with the single-pass compiler for X86-64. component X86_64Spc { + var ic: X86_64InterpreterCode; // A handy chokepoint for entering JIT code from V3. def invoke(wf: WasmFunction, sp: Pointer) -> Throwable { return V3_SPC_ENTRY_FUNC.get()(wf, sp, wf.decl.target_code.spc_entry); @@ -1585,7 +1666,7 @@ component X86_64Spc { } private def fastCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. 
- var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf); // no condition that tiering uses SPC (int => fast SPC) + var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf, ic); // no condition that tiering uses SPC (int => fast SPC) return (result.wf, result.entrypoint, result.thrown); } private def tierupCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index a413b1ba0..690703b68 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -188,7 +188,7 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { } // Compilation methods called directly by stubs. def lazyCompile(wf: WasmFunction) -> SpcResultForStub; - def fastCompile(wf: WasmFunction) -> SpcResultForStub; + def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub; def tierupCompile(wf: WasmFunction) -> SpcResultForStub; // Tiering may require setting up the whole module. 
def onTestModule(module: Module) { @@ -236,13 +236,13 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { } // TODO avoid duplicated function here - def fastCompile(wf: WasmFunction) -> SpcResultForStub { + def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub { // Check the JIT filter, if there is one if (!applyJitFilter(wf.instance.module, wf.decl, "fast")) return SpcResultForStub(wf, X86_64Spc.setInterpreterFallback(wf.decl), null); var module = wf.instance.module; var code = module.target_module.spc_code; - var compiler = newCompiler(module.filename); // XXX: cache per-thread + var compiler = newCompiler(module.filename, true, ic); var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; // generate code for the function @@ -299,7 +299,7 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy { var module = wf.instance.module; var code = module.target_module.spc_code; - var compiler = newCompiler(module.filename); // XXX: cache per-thread + var compiler = newCompiler(module.filename, false, null); // XXX: cache per-thread var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; // generate code for the function @@ -363,7 +363,7 @@ class X86_64SpcAotStrategy(interpreter_fallback: bool) extends X86_64SpcStrategy // ensure entrypoint and lazy compile stubs are generated X86_64PreGenStubs.gen(); - var compiler = newCompiler(module.filename); + var compiler = newCompiler(module.filename, false, null); var w = compiler.w; // generate code for all functions @@ -456,7 +456,7 @@ class X86_64DynamicStrategy extends X86_64SpcStrategy { } def onTierUp(wf: WasmFunction, pc: int) -> TargetOsrInfo { var module = wf.instance.module; - var compiler = newCompiler(module.filename); + var compiler = newCompiler(module.filename, false, null); if (!applyJitFilter(wf.instance.module, wf.decl, "osr")) { // OSR compile suppressed wf.decl.tierup_trigger = int.max; // no point in trying for a while @@ -485,10 +485,10 @@ 
class X86_64DynamicStrategy extends X86_64SpcStrategy { } } -def newCompiler(filename: string) -> X86_64SinglePassCompiler { +def newCompiler(filename: string, fast: bool, ic: X86_64InterpreterCode) -> X86_64SinglePassCompiler { var extensions = Extension.set.all; // TODO: all extensions enabled for compilation var limits = Limits.new(); - var compiler = X86_64SinglePassCompiler.new(extensions, limits, X86_64MasmRegs.CONFIG); + var compiler = X86_64SinglePassCompiler.new(ic, extensions, limits, X86_64MasmRegs.CONFIG, fast); return compiler; } def MINIMUM_CODE_SIZE = PAGE_SIZE_i; From 4e194c705ac05656095545dc04ea83ed9607f978 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 14 Jan 2026 17:39:15 -0500 Subject: [PATCH 35/55] macro dispatch, add fast_target_code correctly, better stub caller discipline --- src/engine/Debug.v3 | 6 +- src/engine/Opcodes.v3 | 51 ++++++- src/engine/Trace.v3 | 2 +- src/engine/compiler/MacroAssembler.v3 | 2 + src/engine/x86-64/X86_64Interpreter.v3 | 18 ++- src/engine/x86-64/X86_64MacroAssembler.v3 | 71 ++++++++++ src/engine/x86-64/X86_64PreGenStubs.v3 | 2 +- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 127 +++++++++++------- src/engine/x86-64/X86_64Target.v3 | 23 +++- 9 files changed, 243 insertions(+), 59 deletions(-) diff --git a/src/engine/Debug.v3 b/src/engine/Debug.v3 index 29b91af78..55a445978 100644 --- a/src/engine/Debug.v3 +++ b/src/engine/Debug.v3 @@ -6,9 +6,9 @@ component Debug { // Debug tracing options. 
def paranoid = false; def verbose = false; - def interpreter = false; - def runtime = false; - def compiler = false; + def interpreter = true; + def runtime = true; + def compiler = true; def pregen = false; def stack = false; def memory = false; diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index 2b44e01ae..4204e3bf1 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -33,7 +33,6 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: CALL_REF (0x00, 0x14, "call_ref", imm.SIG, null), RETURN_CALL_REF (0x00, 0x15, "return_call_ref", imm.SIG, null), // Fast handler custom instruction - FAST_CALL (0x00, 0x17, "fast_call", imm.FUNC, null), DELEGATE (0x00, 0x18, "delegate", imm.LABEL, null), CATCH_ALL (0x00, 0x19, "catch_all", imm.NONE, null), DROP (0x00, 0x1A, "drop", imm.NONE, null), @@ -611,6 +610,56 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: RESUME_THROW (0x00, 0xE4, "resume_throw", imm.CONT_TAG_HANDLE, null), RESUME_THROW_REF (0x00, 0xE5, "resume_throw_ref", imm.CONT_HANDLE, null), SWITCH (0x00, 0xE6, "switch", imm.CONT_TAG, null) + + // fast call instructions + FAST_CALL (0x00, 0x17, "fast_call", imm.FUNC, null) + //FAST_CALL0 (0x00, 0x27, "fast_call0", imm.NONE, null), + //FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.NONE, null), + //FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.NONE, null), + //FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.NONE, null), + //FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.NONE, null), + //FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.NONE, null), + //FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.NONE, null), + //FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.NONE, null), + //FAST_CALL8 (0x00, 0xCC, "fast_call8", imm.NONE, null), + //FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.NONE, null), + //FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.NONE, null), + //FAST_CALL11 (0x00, 0xCF, "fast_call11", imm.NONE, null), + //FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.NONE, null), + //FAST_CALL13 
(0x00, 0xD8, "fast_call13", imm.NONE, null), + //FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.NONE, null), + //FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.NONE, null), + //FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.NONE, null), + //FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.NONE, null), + //FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.NONE, null), + //FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.NONE, null), + //FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.NONE, null), + //FAST_CALL21 (0x00, 0xE0, "fast_call21", imm.NONE, null), + //FAST_CALL22 (0x00, 0xE1, "fast_call22", imm.NONE, null), + //FAST_CALL23 (0x00, 0xE2, "fast_call23", imm.NONE, null), + //FAST_CALL24 (0x00, 0xE3, "fast_call24", imm.NONE, null), + //FAST_CALL25 (0x00, 0xE4, "fast_call25", imm.NONE, null), + //FAST_CALL26 (0x00, 0xE5, "fast_call26", imm.NONE, null), + //FAST_CALL27 (0x00, 0xE6, "fast_call27", imm.NONE, null), + //FAST_CALL28 (0x00, 0xE7, "fast_call28", imm.NONE, null), + //FAST_CALL29 (0x00, 0xE8, "fast_call29", imm.NONE, null), + //FAST_CALL30 (0x00, 0xE9, "fast_call30", imm.NONE, null), + //FAST_CALL31 (0x00, 0xEA, "fast_call31", imm.NONE, null), + //FAST_CALL32 (0x00, 0xEB, "fast_call32", imm.NONE, null), + //FAST_CALL33 (0x00, 0xEC, "fast_call33", imm.NONE, null), + //FAST_CALL34 (0x00, 0xED, "fast_call34", imm.NONE, null), + //FAST_CALL35 (0x00, 0xEE, "fast_call35", imm.NONE, null), + //FAST_CALL36 (0x00, 0xEF, "fast_call36", imm.NONE, null), + //FAST_CALL37 (0x00, 0xF2, "fast_call37", imm.NONE, null), + //FAST_CALL38 (0x00, 0xF3, "fast_call38", imm.NONE, null), + //FAST_CALL39 (0x00, 0xF4, "fast_call39", imm.NONE, null), + //FAST_CALL40 (0x00, 0xF5, "fast_call40", imm.NONE, null), + //FAST_CALL41 (0x00, 0xF6, "fast_call41", imm.NONE, null), + //FAST_CALL42 (0x00, 0xF7, "fast_call42", imm.NONE, null), + //FAST_CALL43 (0x00, 0xF8, "fast_call43", imm.NONE, null), + //FAST_CALL44 (0x00, 0xF9, "fast_call44", imm.NONE, null), + //FAST_CALL45 (0x00, 0xFA, "fast_call45", imm.NONE, null), + 
//FAST_CALL46 (0x00, 0xFF, "fast_call46", imm.NONE, null) } // Enumeration of the different kinds of immediates to opcodes. diff --git a/src/engine/Trace.v3 b/src/engine/Trace.v3 index 22624e980..9ccc5a514 100644 --- a/src/engine/Trace.v3 +++ b/src/engine/Trace.v3 @@ -8,7 +8,7 @@ component Trace { var binparse = false; var canon = false; - var compiler = false; + var compiler = true; var asm = false; var exception = false; var fatal = false; diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3 index a3eb8110b..294c47b8d 100644 --- a/src/engine/compiler/MacroAssembler.v3 +++ b/src/engine/compiler/MacroAssembler.v3 @@ -368,6 +368,8 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { // Destructive on {parent}. def emit_cont_mv(from_vsp: Reg, contStack: Reg, n_vals: Reg, tmp1: Reg, tmp2: Reg, xmm0: Reg); + def emit_dispatchSequence(); + // Validates {cont} and: // - Mark {cont} as used // - Move {cont.stack} to {destContStack} diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 1f4c510da..780f479f5 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -536,6 +536,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var tmp = r_scratch; { // Entrypoint for calls coming from V3 ic.header.intV3EntryOffset = w.pos; + //masm.emit_debugger_breakpoint(); // Allocate and initialize interpreter stack frame from incoming V3 args. 
asm.q.sub_r_i(r_sp, k_frame_size); @@ -1244,7 +1245,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } def genLocals() { bindHandler(Opcode.DROP); + //masm.emit_debugger_breakpoint(); decrementVsp(); + //masm.emit_debugger_breakpoint(); endHandler(); bindHandler(Opcode.LOCAL_GET); @@ -1309,6 +1312,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // FAST_CALL bindHandler(Opcode.FAST_CALL); + //masm.emit_debugger_breakpoint(); var dispatchLabel = X86_64Label.new(); // genTagPush(BpTypeCode.I32.code); // asm.movq_m_i(vsph[0].value, 770); @@ -1320,11 +1324,20 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var tmp = r_tmp2; asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl)); + + //masm.emit_debugger_breakpoint(); asm.ijmp_m(tmp.plus(offsets.FuncDecl_fast_target_code)); - asm.invalid(); + //asm.icall_m(tmp.plus(offsets.FuncDecl_fast_target_code)); + //asm.invalid(); // don't go here asm.bind(dispatchLabel); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); endHandler(); bindHandler(Opcode.CALL); @@ -2722,7 +2735,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { if (FastIntTuning.enableWhammProbeTrampoline) { var pos = w.atEnd().pos; writeDispatchEntry(dispatchTables[0].1, InternalOpcode.BREAK_PROBE.code, pos); - masm.emit_debugger_breakpoint(); + //masm.emit_debugger_breakpoint(); // Compute a pointer to the original code at this pc offset var pc = r_tmp1; // = IP - CODE asm.movq_r_r(pc, r_ip); @@ -4022,6 +4035,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // Generate a dispatch from the main dispatch table. 
def genDispatch() { genDispatch0(ip_ptr, if (FeatureDisable.globalProbes, dispatchTables[0].1), true); + //masm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, dispatchTables[0].1, true, ic); } // Generate a load of the next bytecode and a dispatch through the dispatch table. def genDispatch0(ptr: X86_64Addr, table: IcCodeRef, increment: bool) { diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 20a76af68..71ed05b91 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -47,6 +47,34 @@ class X86_64MacroAssembler extends MacroAssembler { } } + def saveIVar(r: X86_64Gpr, ivars: Array<(X86_64Gpr, X86_64Addr)>) { + for (t in ivars) { + if (t.0 == r) asm.movq_m_r(t.1, r); + } + } + def saveCallerIVars(r_ip: X86_64Gpr, r_stp: X86_64Gpr, r_curpc: X86_64Gpr, + ivars: Array<(X86_64Gpr, X86_64Addr)>) { + saveIVar(r_ip, ivars); + saveIVar(r_stp, ivars); + if (!FeatureDisable.stacktraces) saveIVar(r_curpc, ivars); + } + def restoreReg(r: X86_64Gpr, ivars: Array<(X86_64Gpr, X86_64Addr)>) { + for (t in ivars) { + if (t.0 == r) asm.movq_r_m(r, t.1); + } + } + def restoreCallerIVars(r_ip: X86_64Gpr, r_stp: X86_64Gpr, r_eip: X86_64Gpr, + r_instance: X86_64Gpr, r_func_decl: X86_64Gpr, r_mem0_base: X86_64Gpr, r_vfp: X86_64Gpr, + ivars: Array<(X86_64Gpr, X86_64Addr)>) { + restoreReg(r_ip, ivars); + restoreReg(r_stp, ivars); + restoreReg(r_eip, ivars); + restoreReg(r_instance, ivars); + restoreReg(r_func_decl, ivars); + restoreReg(r_mem0_base, ivars); + restoreReg(r_vfp, ivars); + } + // Label operations def newLabel(create_pos: int) -> X86_64MasmLabel { return X86_64MasmLabel.new(create_pos, asm.newLabel()); @@ -1592,6 +1620,49 @@ class X86_64MacroAssembler extends MacroAssembler { asm.pextrq_r_s_i(G(to), X(from), 1); } + // xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + // r_ip rax + // ip_ptr + // r_dispatch r14 + // r_tmp0 rcx + // r_tmp1 rdx + def 
emit_int_dispatch(opcode: X86_64Gpr, base: X86_64Gpr, r_ip: X86_64Gpr, r_dispatch: X86_64Gpr, + ptr: X86_64Addr, table: IcCodeRef, increment: bool, ic: X86_64InterpreterCode) { + if (ptr != null) asm.movbzx_r_m(opcode, ptr); + if (increment) asm.inc_r(r_ip); + match (FastIntTuning.dispatchEntrySize) { + 2 => { + if (table == null) asm.movq_r_r(base, r_dispatch); + else asm.lea(base, table); // RIP-relative LEA + asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset + asm.add_r_r(base, opcode); + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 4 => { + if (table == null) { + asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0)); + } else { + var addr = ic.start + table.offset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + } + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 8 => { + if (table == null) { + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0)); + } else { + var addr = ic.start + table.offset; + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL))); + } + } + } + + } + // Reads a 32- or 64-bit unsigned LEB from {rw_ptr} into {w_dest}. 
def emit_read_uleb(w_dest: X86_64Gpr, rw_ptr: X86_64Gpr, w_scratch1: X86_64Gpr, w_scratch2: X86_64Gpr) -> this { // TODO: handle w_dest = rcx diff --git a/src/engine/x86-64/X86_64PreGenStubs.v3 b/src/engine/x86-64/X86_64PreGenStubs.v3 index f8a780792..4252307d2 100644 --- a/src/engine/x86-64/X86_64PreGenStubs.v3 +++ b/src/engine/x86-64/X86_64PreGenStubs.v3 @@ -25,7 +25,7 @@ layout X86_64PreGenHeader { +24 intV3EntryOffset: i32; // entry into interpreter from V3 caller +28 intSpcEntryOffset: i32; // entry into interpreter from SPC caller +32 intIntEntryOffset: i32; // entry into interpreter from interpreter caller - +36 intSuspendEntryOffset: i32; // entry into interpreter from a suspended child stack + +36 intSuspendEntryOffset: i32; // entry into interpreter from a suspended child stack +40 deoptReentryOffset: i32; // re-enter interpreter from optimized code +44 oobMemoryHandlerOffset: i32; // handler for signals caused by OOB memory access +48 divZeroHandlerOffset: i32; // handler for signals caused by divide by zero diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 9c1d373cb..2a00de951 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -36,27 +36,16 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { extensions, limits, fast) { mmasm.trap_stubs = TRAPS_STUB; } - def emitFastDispatch() { - ic = X86_64Spc.ic; - + // DISPATCH def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; def r_ip = G(xenv.ip); + def ip_ptr = r_ip.plus(0); def r_dispatch = G(xenv.dispatch); def r_tmp0 = G(xenv.tmp0); // RCX def r_tmp1 = G(xenv.tmp1); // RDX - def ip_ptr = r_ip.plus(0); - - // simplified dispatch sequence - var opcode = r_tmp0; - var base = r_tmp1; - asm.movbzx_r_m(opcode, ip_ptr); - asm.inc_r(r_ip); - var addr = ic.start + ic.header.fastDispatchTableOffset; - asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); - 
asm.ijmp_r(base); - - asm.invalid(); + mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, + IcCodeRef.new(ic.header.fastDispatchTableOffset), true, ic); } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { @@ -1422,6 +1411,7 @@ class X86_64SpcCompileStub extends RiUserCode { def V3_SPC_ENTRY_FUNC = X86_64PreGenFunc<(WasmFunction, Pointer, Pointer), Throwable>.new("v3-spc-entry", null, genSpcEntryFunc); def LAZY_COMPILE_STUB = X86_64PreGenStub.new("spc-lazy-compile", X86_64SpcCompileStub.new("lazy"), genLazyCompileStub); def FAST_COMPILE_STUB = X86_64PreGenStub.new("spc-fast-compile", X86_64SpcCompileStub.new("fast"), genFastCompileStub); +def FAST_CALL_NOP = X86_64PreGenStub.new("spc-fast-nop", X86_64SpcCompileStub.new("fast"), genFastNopStub); def TIERUP_COMPILE_STUB = X86_64PreGenStub.new("spc-tierup-compile", X86_64SpcCompileStub.new("tierup"), genTierUpCompileStub); def TRAPS_STUB = X86_64SpcTrapsStub.new(); def TRAPS_PREGEN = X86_64PreGenStub.new("spc-trap", TRAPS_STUB, genTrapsStub); @@ -1448,8 +1438,6 @@ def genSpcEntryFunc(ic: X86_64InterpreterCode, w: DataWriter) { asm.invalid(); } def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { - - if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); var asm = X86_64Assembler.!(masm.asm); @@ -1487,57 +1475,102 @@ def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { * - jump into the new `fast_target_code` (or re-dispatch on itself) */ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { - X86_64Spc.ic = ic; - // TODO: figure out this ic thing, because this sucks (and doesn't even work) - if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); var asm = X86_64Assembler.!(masm.asm); var regs = X86_64MasmRegs.SPC_EXEC_ENV; var func_arg = G(regs.func_arg); + + var xenv = X86_64MasmRegs.INT_EXEC_ENV; + // TODO ensure that register use is compatible with fast-int 
usage + def r_mem0_base = G(xenv.mem0_base); + def r_vfp = G(xenv.vfp); + def r_vsp = G(xenv.vsp); + def r_stp = G(xenv.stp); + def r_ip = G(xenv.ip); + def r_eip = G(xenv.eip); + def r_func_decl = G(xenv.func_decl); + def r_instance = G(xenv.instance); + def r_curpc = G(xenv.curpc); + + def m_mem0_base = R.RSP.plus(X86_64InterpreterFrame.mem0_base.offset); + def m_vfp = R.RSP.plus(X86_64InterpreterFrame.vfp.offset); + def m_vsp = R.RSP.plus(X86_64InterpreterFrame.vsp.offset); + def m_stp = R.RSP.plus(X86_64InterpreterFrame.stp.offset); + def m_ip = R.RSP.plus(X86_64InterpreterFrame.ip.offset); + def m_eip = R.RSP.plus(X86_64InterpreterFrame.eip.offset); + def m_func_decl = R.RSP.plus(X86_64InterpreterFrame.func_decl.offset); + def m_instance = R.RSP.plus(X86_64InterpreterFrame.instance.offset); + def m_curpc = R.RSP.plus(X86_64InterpreterFrame.curpc.offset); + + def ivar_MEM0_BASE = (r_mem0_base, m_mem0_base); + def ivar_VFP = (r_vfp, m_vfp); + def ivar_VSP = (r_vsp, m_vsp); + def ivar_STP = (r_stp, m_stp); + def ivar_IP = (r_ip, m_ip); + def ivar_EIP = (r_eip, m_eip); + def ivar_FUNC_DECL = (r_func_decl, m_func_decl); + def ivar_INSTANCE = (r_instance, m_instance); + def ivar_CURPC = (r_curpc, m_curpc); + + def all_ivars = [ + ivar_MEM0_BASE, + ivar_VFP, + ivar_VSP, + ivar_STP, + ivar_IP, + ivar_EIP, + ivar_FUNC_DECL, + ivar_INSTANCE, + ivar_CURPC + ]; + + //masm.emit_debugger_breakpoint(); + for (i < all_ivars.length) { + asm.pushq_r(all_ivars[i].0); + } asm.pushq_r(G(regs.func_arg)); // push function onto stack + //masm.saveCallerIVars(r_ip, r_stp, r_curpc, all_ivars); + // saveCallerIVars (move to macro assembler) + // look at runtime calls in int, and int->spc calls masm.emit_store_curstack_vsp(regs.vsp); - asm.movq_r_r(Target.V3_PARAM_GPRS[1], G(regs.func_arg)); // function + //masm.emit_debugger_breakpoint(); + + asm.movq_r_r(Target.V3_PARAM_GPRS[1], G(regs.func_arg)); // function (rdx) + asm.movq_r_i(Target.V3_PARAM_GPRS[2], int.!(Pointer.atObject(ic) - 
Pointer.NULL)); // load into rdx // Load {null} for the receiver. asm.movq_r_i(Target.V3_PARAM_GPRS[0], 0); // Call {X86_64Spc.fastCompile} directly. + //masm.emit_debugger_breakpoint(); masm.emit_call_abs(codePointer(X86_64Spc.fastCompile)); + //masm.emit_debugger_breakpoint(); asm.q.add_r_i(R.RSP, Pointer.SIZE); // pop function off stack // Check for non-null abrupt return. var unwind = X86_64Label.new(); asm.q.cmp_r_i(Target.V3_RET_GPRS[2], 0); asm.jc_rel_near(C.NZ, unwind); // Tail-call the result of the compile. - var scratch = X86_64Regs.R15; -// asm.movq_r_r(scratch, Target.V3_RET_GPRS[1]); // entrypoint -// asm.movq_r_r(G(regs.func_arg), Target.V3_RET_GPRS[0]); // function -// masm.emit_load_curstack_vsp(regs.vsp); -// asm.ijmp_r(scratch); // jump to entrypoint - asm.invalid(); - // Simply return the {Throwable} object. - asm.bind(unwind); - asm.movq_r_r(Target.V3_RET_GPRS[0], Target.V3_RET_GPRS[2]); + var scratch = X86_64Regs.R9; + asm.movq_r_r(scratch, Target.V3_RET_GPRS[1]); // entrypoint + asm.movq_r_r(G(regs.func_arg), Target.V3_RET_GPRS[0]); // function - // DISPATCH - - def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; - def r_ip = G(xenv.ip); - def r_dispatch = G(xenv.dispatch); - def r_tmp0 = G(xenv.tmp0); // RCX - def r_tmp1 = G(xenv.tmp1); // RDX - def ip_ptr = r_ip.plus(0); + for (i < all_ivars.length) { + asm.popq_r(all_ivars[all_ivars.length - i - 1].0); + } + //masm.restoreCallerIVars(r_ip, r_stp, r_eip, r_instance, r_func_decl, r_mem0_base, r_vfp, all_ivars); + masm.emit_load_curstack_vsp(regs.vsp); - // simplified dispatch sequence - var opcode = r_tmp0; - var base = r_tmp1; - asm.movbzx_r_m(opcode, ip_ptr); - asm.inc_r(r_ip); - var addr = ic.start + ic.header.fastDispatchTableOffset; - asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); - asm.ijmp_r(base); + //masm.emit_debugger_breakpoint(); + asm.ijmp_r(scratch); // jump to entrypoint asm.invalid(); + asm.ret(); + + // Simply return the {Throwable} object. 
?? + asm.bind(unwind); + asm.movq_r_r(Target.V3_RET_GPRS[0], Target.V3_RET_GPRS[2]); + asm.ret(); } def genFastNopStub(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; @@ -1664,7 +1697,7 @@ component X86_64Spc { var result = X86_64SpcStrategy.!(Execute.tiering).lazyCompile(wf); return (result.wf, result.entrypoint, result.thrown); } - private def fastCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { + private def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf, ic); // no condition that tiering uses SPC (int => fast SPC) return (result.wf, result.entrypoint, result.thrown); diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index 690703b68..5dcae5d53 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -65,6 +65,23 @@ component Target { f.target_code = TargetCode(addr); Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); } + def setFastTargetCode(f: FuncDecl, addr: Pointer, end: Pointer) { + if (Trace.compiler) { + Trace.OUT.put2("func[%d].fast_target_code: break *0x%x", f.func_index, addr - Pointer.NULL) + .put2(" disass 0x%x, 0x%x", addr - Pointer.NULL, end - Pointer.NULL).ln(); + if (Trace.asm) { + var cur_byte = addr; + Trace.OUT.puts("JIT code: "); + while (cur_byte < end) { + Trace.OUT.put1("%x ", cur_byte.load()); + cur_byte++; + } + Trace.OUT.ln(); + } + } + f.fast_target_code = TargetCode(addr); + Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); + } def pregenIntoFile(filename: string) -> ErrorBuilder { var data = System.fileLoad(filename); var err = ErrorBuilder.new().puts("interpreter generator: "); @@ -228,6 +245,7 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { } def onNewFunction(wf: WasmFunction, err: ErrorGen) { 
Target.setUnconditionalInterpreterEntryIfMultiTier(wf.decl); + X86_64Spc.setFastCompileFor(wf.instance.module, wf.decl); } def onFuncProbeInsert1(module: Module, func: FuncDecl, offset: int, p: Probe) { @@ -237,9 +255,6 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { // TODO avoid duplicated function here def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub { - // Check the JIT filter, if there is one - if (!applyJitFilter(wf.instance.module, wf.decl, "fast")) return SpcResultForStub(wf, X86_64Spc.setInterpreterFallback(wf.decl), null); - var module = wf.instance.module; var code = module.target_module.spc_code; var compiler = newCompiler(module.filename, true, ic); @@ -262,7 +277,7 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { if (success) { // Copy code into end of region entrypoint = code.appendCode(masm); - Target.setTargetCode(wf.decl, entrypoint, entrypoint + codeSize); + Target.setFastTargetCode(wf.decl, entrypoint, entrypoint + codeSize); } else { // Failed, enter interpreter var f = wf.decl; From 4c2e31ce0b1aeb1df7b7057fd3db2a0dff1833f9 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 15 Jan 2026 11:13:20 -0500 Subject: [PATCH 36/55] Fix vsp in fast spc --- src/engine/Trace.v3 | 2 +- src/engine/compiler/SinglePassCompiler.v3 | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/engine/Trace.v3 b/src/engine/Trace.v3 index 9ccc5a514..22624e980 100644 --- a/src/engine/Trace.v3 +++ b/src/engine/Trace.v3 @@ -8,7 +8,7 @@ component Trace { var binparse = false; var canon = false; - var compiler = true; + var compiler = false; var asm = false; var exception = false; var fatal = false; diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 0184eb102..10e52ccc4 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2121,7 +2121,7 @@ class 
SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_br(target.label); } } - // Return includes epilogue? + // Return includes epilogue def emitReturn(ctl: SpcControl) { // All explicit RETURN instructions branch here. masm.bindLabel(ret_label); @@ -2140,10 +2140,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (isInlined()) return; + // Compute VSP = VFP + state.sp + emit_compute_vsp(regs.vsp, state.sp); if (!fast) { - // Compute VSP = VFP + state.sp // \ - emit_compute_vsp(regs.vsp, state.sp); // | - // Return to caller // | fast context: do not emit these instructions + // Return to caller // \ fast context: do not emit these instructions masm.emit_mov_r_i(regs.ret_throw, 0); // | instead, emit the dispatch sequence from the interpreter // Deallocate stack frame // | masm.emit_addw_r_i(regs.sp, frame.frameSize); // | From e302f07b9a6fdedf3d9f70f6d72a3c6b7b86e19b Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 15 Jan 2026 11:52:43 -0500 Subject: [PATCH 37/55] Replace CALL with FAST_CALL when function is exported with "fast:" prefix --- src/engine/CodeValidator.v3 | 16 +++++++++++++++- src/engine/Module.v3 | 9 +++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index 2372ca1a7..24b25887c 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -416,7 +416,21 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e checkAndPopArgs(sig.results); setUnreachable(); } - CALL, FAST_CALL => { + CALL => { + var func = parser.readFuncRef(); + if (func == null) return; + checkSignature(func.sig); + + // fast call: if function is exported with fast name, replace the bytecode with FAST_CALL + for (i < module.exports.length) { + def ex = module.exports[i]; + if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { + this.func.replaceCall(opcode_pos); + } + } + + } + 
FAST_CALL => { var func = parser.readFuncRef(); if (func == null) return; checkSignature(func.sig); diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index f1598f07c..4b37e48f0 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -170,6 +170,15 @@ class FuncDecl(sig_index: int) extends Decl { if (cur_bytecode == orig_bytecode) return; cur_bytecode[pc] = orig_bytecode[pc]; } + def replaceCall(pc: int) { + // "orig" will become a copy of the original code, to allow in-place modification of old code + if (cur_bytecode == orig_bytecode) orig_bytecode = Arrays.dup(orig_bytecode); + if (cur_bytecode[pc] != Opcode.CALL.code) { + def realOp = Opcodes.find(0, cur_bytecode[pc]); + System.error("replace bytecode", Strings.format1("not replacing call (got %s)", realOp.mnemonic)); + } + cur_bytecode[pc] = byte.!(Opcode.FAST_CALL.code); + } def reset() -> this { if (cur_bytecode == orig_bytecode) return; ArrayUtil.copyInto(cur_bytecode, 0, orig_bytecode, 0, orig_bytecode.length); From 2e3f2712b7b2bacce5969c206ef45ce8876462aa Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 21 Jan 2026 15:30:27 -0500 Subject: [PATCH 38/55] AOT compile fast functions declared in export name --- src/engine/compiler/SinglePassCompiler.v3 | 15 +++- src/engine/x86-64/X86_64MasmRegs.v3 | 13 ++-- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 30 ++++---- src/engine/x86-64/X86_64Target.v3 | 72 ++++++++++++++++++- 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 10e52ccc4..c39ba9385 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -170,7 +170,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl pushSpcFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. 
- if (!fast) emitPrologue(); + if (fast) { + masm.emit_nop(); + masm.emit_nop(); + } else { + emitPrologue(); + + } // Visit all local declarations. it.dispatchLocalDecls(this); @@ -2124,7 +2130,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Return includes epilogue def emitReturn(ctl: SpcControl) { // All explicit RETURN instructions branch here. - masm.bindLabel(ret_label); + if (ret_label != null) { + masm.bindLabel(ret_label); + ret_label = null; + } + masm.emit_nop(); + masm.emit_nop(); var results = sig.results; // fix values? diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3 index 0668c35a5..9f789233c 100644 --- a/src/engine/x86-64/X86_64MasmRegs.v3 +++ b/src/engine/x86-64/X86_64MasmRegs.v3 @@ -94,11 +94,11 @@ component X86_64MasmRegs { var xint = IntExecEnv.new(); xint.sp = xspc.sp = RSP; - xint.func_arg = xspc.func_arg = RDX; - xint.vsp = xspc.vsp = RSI; + xint.func_arg = xspc.func_arg = RDX; // cache of frame (callee-restore) + xint.vsp = xspc.vsp = RSI; xint.vfp = xspc.vfp = R11; - xint.mem0_base = xspc.mem0_base = R10; - xint.instance = xspc.instance = RDI; + xint.mem0_base = xspc.mem0_base = R10; // cache of frame (callee-restore) + xint.instance = xspc.instance = RDI; // cache of frame (callee-restore) xint.runtime_arg0 = xspc.runtime_arg0 = RSI; xint.runtime_arg1 = xspc.runtime_arg1 = RDX; xint.runtime_arg2 = xspc.runtime_arg2 = RCX; @@ -114,7 +114,7 @@ component X86_64MasmRegs { xint.ip = RAX; xint.func_decl = R12; xint.eip = R13; - xint.dispatch = R14; + xint.dispatch = R14; // cache of field (see how it is saved/stored in interpreter) xint.xmm0 = XMM0; xint.xmm1 = XMM1; xint.xmm2 = XMM2; @@ -163,7 +163,8 @@ component X86_64MasmRegs { // A register allocator for interpreter contexts. 
def INT_ALLOC = (fun -> RegAlloc { var pools = [ - RegPool32.new([RCX, RDX, R8, R9]), + RegPool32.new([RCX, RDX, R8, R9]), // could use callee-restore (but put at end) + // if callee-restore registers are used, have to emit a restore at the end RegPool32.new([XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14]) ]; return RegAlloc.new(CONFIG.poolMap, pools, null); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 2a00de951..6111a86c4 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -45,7 +45,7 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def r_tmp0 = G(xenv.tmp0); // RCX def r_tmp1 = G(xenv.tmp1); // RDX mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, - IcCodeRef.new(ic.header.fastDispatchTableOffset), true, ic); + if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { @@ -1526,25 +1526,26 @@ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { ivar_CURPC ]; - //masm.emit_debugger_breakpoint(); - for (i < all_ivars.length) { - asm.pushq_r(all_ivars[i].0); - } + masm.emit_debugger_breakpoint(); + //for (i < all_ivars.length) { + // asm.pushq_r(all_ivars[i].0); + //} + masm.saveCallerIVars(r_ip, r_stp, r_curpc, all_ivars); asm.pushq_r(G(regs.func_arg)); // push function onto stack - //masm.saveCallerIVars(r_ip, r_stp, r_curpc, all_ivars); // saveCallerIVars (move to macro assembler) // look at runtime calls in int, and int->spc calls masm.emit_store_curstack_vsp(regs.vsp); - //masm.emit_debugger_breakpoint(); + masm.emit_debugger_breakpoint(); asm.movq_r_r(Target.V3_PARAM_GPRS[1], G(regs.func_arg)); // function (rdx) asm.movq_r_i(Target.V3_PARAM_GPRS[2], int.!(Pointer.atObject(ic) - Pointer.NULL)); // load into rdx + // dispatch is in r14 (don't overwrite, just 
access directly) // Load {null} for the receiver. asm.movq_r_i(Target.V3_PARAM_GPRS[0], 0); // Call {X86_64Spc.fastCompile} directly. //masm.emit_debugger_breakpoint(); masm.emit_call_abs(codePointer(X86_64Spc.fastCompile)); - //masm.emit_debugger_breakpoint(); + masm.emit_debugger_breakpoint(); asm.q.add_r_i(R.RSP, Pointer.SIZE); // pop function off stack // Check for non-null abrupt return. var unwind = X86_64Label.new(); @@ -1555,13 +1556,13 @@ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_r(scratch, Target.V3_RET_GPRS[1]); // entrypoint asm.movq_r_r(G(regs.func_arg), Target.V3_RET_GPRS[0]); // function - for (i < all_ivars.length) { - asm.popq_r(all_ivars[all_ivars.length - i - 1].0); - } - //masm.restoreCallerIVars(r_ip, r_stp, r_eip, r_instance, r_func_decl, r_mem0_base, r_vfp, all_ivars); + //for (i < all_ivars.length) { + // asm.popq_r(all_ivars[all_ivars.length - i - 1].0); + //} + masm.restoreCallerIVars(r_ip, r_stp, r_eip, r_instance, r_func_decl, r_mem0_base, r_vfp, all_ivars); masm.emit_load_curstack_vsp(regs.vsp); - //masm.emit_debugger_breakpoint(); + masm.emit_debugger_breakpoint(); asm.ijmp_r(scratch); // jump to entrypoint asm.invalid(); @@ -1701,6 +1702,9 @@ component X86_64Spc { // The global stub simply consults the execution strategy. var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf, ic); // no condition that tiering uses SPC (int => fast SPC) return (result.wf, result.entrypoint, result.thrown); + // need to compute _a_ new vfp + // bump the stack pointer? + // goal: avoid having to make new frame entirely } private def tierupCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. 
diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index 5dcae5d53..c24553c74 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -242,10 +242,16 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def onFuncValidationFinish(module: Module, func: FuncDecl, err: ErrorGen) { if (err != null && !err.ok()) return; Target.setUnconditionalInterpreterEntryIfMultiTier(func); + + for (i < module.exports.length) { + def ex = module.exports[i]; + if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { + System.puts(Strings.format1("fast function %s\n", ex.0)); + } + } } def onNewFunction(wf: WasmFunction, err: ErrorGen) { Target.setUnconditionalInterpreterEntryIfMultiTier(wf.decl); - X86_64Spc.setFastCompileFor(wf.instance.module, wf.decl); } def onFuncProbeInsert1(module: Module, func: FuncDecl, offset: int, p: Probe) { @@ -257,7 +263,7 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub { var module = wf.instance.module; var code = module.target_module.spc_code; - var compiler = newCompiler(module.filename, true, ic); + var compiler = newCompiler(module.filename, true, null); var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; // generate code for the function @@ -286,10 +292,70 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { } return SpcResultForStub(wf, entrypoint, null); } + def fastCompileEntireModule(module: Module, size: u32, interpreter_fallback: bool, err: ErrorGen, ballast: u32) { + // ensure entrypoint and lazy compile stubs are generated + X86_64PreGenStubs.gen(); + + var compiler = newCompiler(module.filename, true, null); + var w = compiler.w; + + // generate code for all functions + var bounds = Array<(int, int)>.new(module.functions.length); + var suberr = if(!interpreter_fallback, err); + for (i = 0; err.ok() && i < 
module.functions.length; i++) { + var f = module.functions[i]; + if (f.imported()) continue; + for (j < module.exports.length) { + def ex = module.exports[j]; + if (ex.1 == f && Strings.startsWith(ex.0, "fast:")) { + var start = w.atEnd().pos; + var compiled = compiler.gen(module, f, suberr); + if (compiled) bounds[i] = (start, w.end()); + else bounds[i] = (-1, -1); + } + } + } + + // copy and map code + var length = u64.view(w.atEnd().pos) + ballast; + var mapping = Mmap.reserve(length, Mmap.PROT_WRITE), range = mapping.range; // TODO: handle failure + var masm = X86_64MacroAssembler.!(compiler.masm); + masm.setTargetAddress(u64.view(range.start - Pointer.NULL)); + Target.copyInto(mapping.range, 0, w); + // TODO: for security, move embedded references out of the code region and make it non-writable + Mmap.protect(range.start, u64.!(range.end - range.start), Mmap.PROT_WRITE | Mmap.PROT_READ | Mmap.PROT_EXEC); + for (i < bounds.length) { + var b = bounds[i]; + if (b.0 >= 0) { + var addr = mapping.range.start; + var f = module.functions[i]; + Target.setFastTargetCode(f, addr + b.0, addr + b.1); + } else { + var f = module.functions[i]; + if (Trace.compiler) Trace.OUT.put1("func[%d] initial compile failed", f.func_index).ln(); + X86_64Spc.setInterpreterFallback(f); + } + } + // XXX: reduce duplication with {X86_64SpcModuleCode.appendCode}. + var code = X86_64SpcModuleCode.new(mapping); + if (masm.source_locs != null) { + code.sourcePcs = Vector.new(); + code.sourcePcs.putv(masm.source_locs); + } + if (masm.embeddedRefOffsets != null) { + if (code.embeddedRefOffsets == null) code.embeddedRefOffsets = Vector.new(); + code.embeddedRefOffsets.putv(masm.embeddedRefOffsets); + } + + module.target_module = TargetModule(code); + RiRuntime.registerUserCode(code); + module.target_module.spc_code.keepAlive(); + Debug.afterCompileModule(module); + } // XXX not an exhaustive way to add stubs. but what is? 
def onModuleFinish(module: Module, size: u32, err: ErrorGen) { - installStubForModule(module, X86_64Spc.setFastCompileFor); + fastCompileEntireModule(module, size, false, err, 1024); } } From b792c67a75c005a42f0e7ace3442b2e380ba236c Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 21 Jan 2026 16:06:06 -0500 Subject: [PATCH 39/55] Add fast prologue/epilogue stubs --- src/engine/compiler/SinglePassCompiler.v3 | 11 ++++++----- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 9 +++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index c39ba9385..b8775c266 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -171,11 +171,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Emit prologue, which allocates the frame and initializes various registers. if (fast) { - masm.emit_nop(); - masm.emit_nop(); + emitFastPrologue(); } else { emitPrologue(); - } // Visit all local declarations. @@ -2134,8 +2132,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(ret_label); ret_label = null; } - masm.emit_nop(); - masm.emit_nop(); + emitFastEpilogue1(); var results = sig.results; // fix values? 
@@ -2160,10 +2157,14 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_addw_r_i(regs.sp, frame.frameSize); // | masm.emit_ret(); // / } else { + emitFastEpilogue2(); emitFastDispatch(); } } def emitFastDispatch() -> void; + def emitFastPrologue() -> void; + def emitFastEpilogue1() -> void; + def emitFastEpilogue2() -> void; def emitOsrEntry(osr_entry_label: MasmLabel, state: Array) { if (Trace.compiler) Trace.OUT.put1(" OSR (+%d)", osr_entry_label.create_pos).ln(); masm.bindLabel(osr_entry_label); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 6111a86c4..b46cf8b10 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -47,6 +47,15 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } + def emitFastPrologue() { + asm.nop1(); + } + def emitFastEpilogue1() { + asm.nop1(); + } + def emitFastEpilogue2() { + asm.nop1(); + } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); From 71c737dfddd8f914427618a889ee7e891b739de2 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 22 Jan 2026 14:59:45 -0500 Subject: [PATCH 40/55] Add small frame in fast call to save vfp (for local variables) --- src/engine/compiler/SinglePassCompiler.v3 | 33 +++++++++++++++++++ src/engine/x86-64/X86_64MasmRegs.v3 | 5 +++ src/engine/x86-64/X86_64SinglePassCompiler.v3 | 17 +++++++++- src/engine/x86-64/X86_64Stack.v3 | 6 ++++ 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index b8775c266..bb6f1587d 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -31,6 +31,39 @@ class 
SpcExecEnv { var runtime_ret1: Reg; var ret_throw: Reg; var scratch: Reg; + + def dup() -> SpcExecEnv { + def env = SpcExecEnv.new(); + + env.frameSize = this.frameSize; + env.vsp_slot = this.vsp_slot; + env.vfp_slot = this.vfp_slot; + env.pc_slot = this.pc_slot; + env.instance_slot = this.instance_slot; + env.inlined_instance_slot = this.inlined_instance_slot; + env.wasm_func_slot = this.wasm_func_slot; + env.mem0_base_slot = this.mem0_base_slot; + env.inlined_mem0_base_slot = this.inlined_mem0_base_slot; + env.accessor_slot = this.accessor_slot; + + env.sp = this.sp; + env.func_arg = this.func_arg; + env.vsp = this.vsp; + env.vfp = this.vfp; + env.mem0_base = this.mem0_base; + env.instance = this.instance; + env.runtime_arg0 = this.runtime_arg0; + env.runtime_arg1 = this.runtime_arg1; + env.runtime_arg2 = this.runtime_arg2; + env.runtime_arg3 = this.runtime_arg3; + env.runtime_arg4 = this.runtime_arg4; + env.runtime_ret0 = this.runtime_ret0; + env.runtime_ret1 = this.runtime_ret1; + env.ret_throw = this.ret_throw; + env.scratch = this.scratch; + + return env; + } } def INITIAL_VALUE_STACK_SIZE = 16; diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3 index 9f789233c..53f9ef1b1 100644 --- a/src/engine/x86-64/X86_64MasmRegs.v3 +++ b/src/engine/x86-64/X86_64MasmRegs.v3 @@ -151,6 +151,8 @@ component X86_64MasmRegs { // The execution environment for interpreter compilation contexts. def INT_EXEC_ENV = t.1; + def FAST_SPC_EXEC_ENV = SPC_EXEC_ENV.dup(); + // A register allocator for single-pass compilation contexts. 
def SPC_ALLOC = (fun -> RegAlloc { var pools = [ @@ -190,3 +192,6 @@ component X86_64MasmRegs { return reg; } } + +def nothing = X86_64MasmRegs.FAST_SPC_EXEC_ENV.frameSize = X86_64InterpreterFastCallFrame.size; +def nothing2 = X86_64MasmRegs.FAST_SPC_EXEC_ENV.vfp_slot = MasmAddr(X86_64MasmRegs.FAST_SPC_EXEC_ENV.sp, X86_64InterpreterFastCallFrame.vfp.offset); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index b46cf8b10..e7b61881c 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -31,7 +31,7 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { var ic: X86_64InterpreterCode; new(ic, extensions: Extension.set, limits: Limits, config: RegConfig, fast: bool) - super(X86_64MasmRegs.SPC_EXEC_ENV, mmasm, + super(if(fast, X86_64MasmRegs.FAST_SPC_EXEC_ENV, X86_64MasmRegs.SPC_EXEC_ENV), mmasm, if(fast, X86_64MasmRegs.INT_ALLOC.copy(), X86_64MasmRegs.SPC_ALLOC.copy()), extensions, limits, fast) { mmasm.trap_stubs = TRAPS_STUB; @@ -47,7 +47,16 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } + // TODO do we need to spill VFP if the function has no locals? when exactly is spilling VFP needed? + // and can it be determined independent of call site? 
def emitFastPrologue() { + // Allocate (very cheap) stack frame + masm.emit_subw_r_i(regs.sp, frame.frameSize); + // spill VFP + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); + // Compute VFP = VSP - sig.params.length * SLOT_SIZE + masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); // XXX: use 3-addr adjustment of VFP + masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); asm.nop1(); } def emitFastEpilogue1() { @@ -55,6 +64,12 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { } def emitFastEpilogue2() { asm.nop1(); + asm.nop1(); + // restore VFP + masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); + masm.emit_addw_r_i(regs.sp, frame.frameSize); + asm.nop1(); + asm.nop1(); } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { diff --git a/src/engine/x86-64/X86_64Stack.v3 b/src/engine/x86-64/X86_64Stack.v3 index cb3e94c93..b2717e4a8 100644 --- a/src/engine/x86-64/X86_64Stack.v3 +++ b/src/engine/x86-64/X86_64Stack.v3 @@ -926,6 +926,12 @@ layout X86_64InterpreterFrame { =104; } +// XXX: this frame may be differently sized depending on other touched registers +layout X86_64InterpreterFastCallFrame { + +0 vfp : i64; // Pointer + =8; +} + // Native frame states used in the implementation of {FrameStateAccessor}. Since a frame // can be optimized or deoptimized in place, the frame state accessor has to check the // state for every call. 
From 5ce31ecaa9ab0818d3362d3a104cd23b1dcb2f14 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Fri, 23 Jan 2026 16:09:35 -0500 Subject: [PATCH 41/55] Add test files/programs --- fast_call2.wasm | Bin 0 -> 43 bytes fast_call2.wat | 7 + fast_call_export.wasm | Bin 0 -> 54 bytes fast_call_export.wat | 10 ++ fast_call_nop.wasm | Bin 0 -> 53 bytes fast_call_nop.wat | 10 ++ fast_call_param.wasm | Bin 0 -> 91 bytes fast_call_param.wat | 18 +++ int/Export | 9 ++ int/Export.v3 | 12 ++ int/Export.wasm | Bin 0 -> 8230 bytes int/Interpreter | 9 ++ int/Interpreter.v3 | 293 ++++++++++++++++++++++++++++++++++++++++ int/Interpreter.wasm | Bin 0 -> 13303 bytes int/InterpreterBug | Bin 0 -> 14264 bytes int/InterpreterBug.v3 | 25 ++++ int/InterpreterBug.wasm | Bin 0 -> 5307 bytes int/RiRuntime | 9 ++ int/RiRuntime.wasm | Bin 0 -> 4434 bytes slow_call.wasm | Bin 0 -> 68 bytes slow_call_nop.wasm | Bin 0 -> 46 bytes slow_call_nop.wat | 7 + 22 files changed, 409 insertions(+) create mode 100644 fast_call2.wasm create mode 100644 fast_call2.wat create mode 100644 fast_call_export.wasm create mode 100644 fast_call_export.wat create mode 100644 fast_call_nop.wasm create mode 100644 fast_call_nop.wat create mode 100644 fast_call_param.wasm create mode 100644 fast_call_param.wat create mode 100755 int/Export create mode 100644 int/Export.v3 create mode 100644 int/Export.wasm create mode 100755 int/Interpreter create mode 100644 int/Interpreter.v3 create mode 100644 int/Interpreter.wasm create mode 100755 int/InterpreterBug create mode 100644 int/InterpreterBug.v3 create mode 100644 int/InterpreterBug.wasm create mode 100755 int/RiRuntime create mode 100644 int/RiRuntime.wasm create mode 100644 slow_call.wasm create mode 100644 slow_call_nop.wasm create mode 100644 slow_call_nop.wat diff --git a/fast_call2.wasm b/fast_call2.wasm new file mode 100644 index 0000000000000000000000000000000000000000..b3dcbf2bfd73fa58e127a050a851e7c035a258d2 GIT binary patch literal 43 
ycmZQbEY4+QU|?WmWlUgTtY>CsVqjqBU}VWn%*L#;06G4wFH9z literal 0 HcmV?d00001 diff --git a/fast_call2.wat b/fast_call2.wat new file mode 100644 index 000000000..3dd58686b --- /dev/null +++ b/fast_call2.wat @@ -0,0 +1,7 @@ +(module + (func $f (result i32) + i32.const 10) + (func (export "main") (result i32) + call $f + ) +) diff --git a/fast_call_export.wasm b/fast_call_export.wasm new file mode 100644 index 0000000000000000000000000000000000000000..de5abe4d8a74f8a24c8d888a332931be286cbba9 GIT binary patch literal 54 zcmZQbEY4+QU|?WmWlUgTtY>CsVqjnwX5vUoEH1H1%g<+EV98C)%wu5W;$~uDaAe|U JVGv;81_0r`2ebeH literal 0 HcmV?d00001 diff --git a/fast_call_export.wat b/fast_call_export.wat new file mode 100644 index 000000000..20c428045 --- /dev/null +++ b/fast_call_export.wat @@ -0,0 +1,10 @@ +;; export name holds fast information, we don't modify binary ahead of time + +(module + (func $fast (export "fast:foo") (result i32) + (i32.const 2) + ) + (func (export "main") (result i32) + (call $fast) + ) +) diff --git a/fast_call_nop.wasm b/fast_call_nop.wasm new file mode 100644 index 0000000000000000000000000000000000000000..403dc7cf1dac9ad70c13cfaf0ac80a170e8d6a3b GIT binary patch literal 53 zcmZQbEY4+QU|?Y6U`k+MNMK;BXJ%mra@jc;S#lFI^B9=81euu_xPge1!HHW+oY9ei F8vvyN1z-RG literal 0 HcmV?d00001 diff --git a/fast_call_nop.wat b/fast_call_nop.wat new file mode 100644 index 000000000..c406ac91a --- /dev/null +++ b/fast_call_nop.wat @@ -0,0 +1,10 @@ +(module + (func $f) + (func $g) + (func (export "main") (result i32) + i64.const 11 + drop + call $g + i32.const 0 + ) +) diff --git a/fast_call_param.wasm b/fast_call_param.wasm new file mode 100644 index 0000000000000000000000000000000000000000..c70071c25eaade813ac8547a90cd3b2846efe266 GIT binary patch literal 91 zcmZQbEY4+QU|?Y6V@_bKX8>Zx`UD2XdM18Gw(`uX)Vy?-g3^*q1_owkCPpT94n~&T n#LPSfCN4!LJ_ZE_mU_qM?5vI}>bSWD7#tbJ8G$4NgyaSQ2wV?q literal 0 HcmV?d00001 diff --git a/fast_call_param.wat b/fast_call_param.wat new file mode 100644 index 
000000000..b3f4ad728 --- /dev/null +++ b/fast_call_param.wat @@ -0,0 +1,18 @@ +(module + (import "wizeng" "puti" (func $puti (param i32))) + (func $f (param i32) (result i32) + local.get 0 + if (result i32) + i32.const 999 + else + i32.const -216 + end + ) + (func (export "main") (result i32) + (call $f (i32.const 1)) + call $puti + (call $f (i32.const 0)) + call $puti + i32.const 0 + ) +) diff --git a/int/Export b/int/Export new file mode 100755 index 000000000..df2a0f18a --- /dev/null +++ b/int/Export @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do + DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S +done +DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) +wizeng $DIR/Export.wasm "$@" diff --git a/int/Export.v3 b/int/Export.v3 new file mode 100644 index 000000000..2409223df --- /dev/null +++ b/int/Export.v3 @@ -0,0 +1,12 @@ +export "fast:foo" def foo(x: int, y: int) -> int { + var val = y; + for (i < x) { + val += i * y; + } + return val; +} + +export "main" def main() -> int { + System.puts(Strings.format1("%d\n", foo(11, 2))); + return 0; +} diff --git a/int/Export.wasm b/int/Export.wasm new file mode 100644 index 0000000000000000000000000000000000000000..65fa630c5756a892cb837eeb1aa7b0398e3baab3 GIT binary patch literal 8230 zcmc&(Ta08!d9JE+?XI5bTI_|my?E=iy~ZG|S=RP=J*@4iUGLgUxW?puvD@PwJUz2B zJG1REmWX?B96&&X6-<-|BCR5#<-rz+06|d{d5IHQ0VzBXiHArb!a|X9QKFy-MIh(< z{yN>$y}L%vQ)Z_7)T#5=fB(Pw>x}B1KdY5e`tNslch&sEdZb4WC!-NBDJ~=Ru)-hi zjLd0F@UemqPP8_ht~TWO37`>+vUx3PbwYn zSI$55#P;g2KfWJ6w!QhpL;Vjcll(Ke{1B?9nbKLVv?|I(Yu(7QOl6r#bVK1c!EF$WtQWS=8C!W8A{XV)ZN*~WTY>( zF13p1InJ{F#&G*dmE~vqXE(Q>G%6eR9$D{eRj%~T4;NQ9H&v0I?X7O8yzxz0t%A14 z7fzkN|ADuhdDqXs`#pEQ_kHjGz%M-b#t;7Dn||q+f8|$y?bm?xBaTZ}o0{ z+o8j6eZ<|n+#|Js$ecRqCU9gnRxfBI)){c){F;a{Pgy2YqCnB00b=<4tv^}~0i zOI`*1x<=c2yBdMF1jg^t%i z8?Sx7VA23V5MpW}PIa^KiY(Yc_{bJkBq8I9#N(3s1R}sNxy7U%ss*k-bGieLh9Xn1 
z@yMs|Az73PwaEaF2T^k^ntxK<;HisoZ?K;in%On853IG4wh%6MUK_ye+vM8`qtl31kje@_;gs7C8`}17 zB%~A(moSX=qAHE65YSf?mV+d5*LRo>dH4ZmLKzOi_YBNVRUn*I!E`82)vm&WSM`>A zW2a@f=EoB*olYy^%ztqPR_Eb=CZqops>EGaf#EIYJ5&~;vwfNBwDK%mpvK{={DuuV zjOLF|?Z?Fw1XYC}u{8Mom3q7u^yA^&)}RYd#-|JB%jCRZzRC+?gW-j};TPg?Oe=nk zOTPlbFU~%dGL0;xDBJeyzXPT^gD!{FH%JU;eODBJh2P@q0~xajwX+yWXsvLf;k>!P zg$qVxySru0@yUFZnI%^sDD*SO&-`(hoDj(0f zH*=9#=JPdH={PBFkU-QfxW$NT+(FojvaU^^EL||-fr*GJnj&Ofl;(ts2toxXSBN4G z4Q4otBO;?ZX!Q(XVjL+bFQKfXQ^yxtrjr;`%a_G=4FPckPuKo;H~9qK`CUn)Yy`jo z#K4mg)ru=Isvt(`Dn%GlZwQ?U28628WfZGJ!OoN8G}s&lVM*1GG}s zS0bSG?@u#{+JNYBJrNJO`@C6mhk#R6YL@sb^7oydn}Ln7SUnL7ygkG+PW?WBr;v-t zu8T10Ch9=PZYvVQV`X;^%43jr+t3h3M|I{fxw_#c}@8|sdz_+3;hPD`_4ulx2 z`ZgW!TWy;~)N72H0!jCnHptXQrj|^bzTumfIf(TxiowxQv2vP6P9A0CQIbdDb6-sJ zNLm_*Q6*A<+%ZJCFtSfe@N;B7Y$TpR+1mMF8sUwoUtt@Yn#YD2=~F;+9OZFTgE-2J zBd$zk6p1nrkuQ-z_T3cw8-oAW^p#Es{Fr%I4Z5;Exv zut>}9a&spOF787*xE-^Gb^&A=_#0xbnlM;7kBJBJY z{R8}rD9LXzsXG^vt@#ulNKzaAkgEB)0c-`l;b!;4TNH>k%^huL;R0q9@aHE|?uZzdmqBXgKRx<{s|?;#eq6VQYkKv%;2NwFIHpphW?cXOlg`xCMx8GG72ZZX2K_0Gk|`Qg8P@ilWPRvc<;=tWbQb?A%1&O^RK4}oCfvKJFlUS1>-3jfjZWB`5 zTR=+cLr)9eZUYhO@`OW&>_U0vn*yw3$So$WfDGS=pxa3=&ddzLrx5`u&>$%h;rleaRNjC2`)kJ7k{V zxj^8y_ItttZIiwzJhB(O{(!WQ*_a6)kV`UolGJSU#jcr58sV71SK8efc~4q^RNuTHu6x+&jTQ~tR)0XrSF>N1rAp%vK&RTmXbaE!xL zrlE^NG_c7VS~>{>$$*J&g1Jd-NfN`2iX1Oy-WPsk3MUF0D zVlTs-ir%1?x=?~Am8+DwiDeE?Nr}|GP8Lf#8P^8F!I&iDFv)|y8GN{_qR!L}Wz1~doACu{#!%9bF;trO>9hymfW1a^XZz6yuwD})gnCd&F4=owXb%pJ4# zPbD9+QHFfD*ezc0irWHD7C4@2 ziOzdDtD00*$;&a`m$#^s1(+z*wg&D#r^ZtjOM|u*jh4=#K)UhNZmVApb1#25;mGOJ(`dNXew)CW99-D8ot6s zaN_#*_(%fX6HhcR1Q9QKq%^iPwq}mjy?I*vB%v9 z-l75aY}5@8J=yf3$H-ha5nl1Mlem+3bsjE&IgR5PDjs3neeoT7$(ThZQ^yjXQgGgg z-4xz}fQFkPoaIsWNL|1e^9DO7l8cdtr}z&GR)~0txhntX8oy+RuNz+8bC_^R4cMZF z<{I890N#$sZS1QOylJRkU(0q}M+_M%Sjw%qZNEl)X1gQ}?F=+Cu}jjjQKKc&)QP1e zoAuQlGkHtjbN(Oseth-P5x-&holpPg_5xfAci`?X&dP?9pF0 zO;*^C*E+cu)!z88rp7O+y|qU!H{pP_|4nSsIH+3zuTm++4}W&AQ2@yJk%|A~R|fh7 
zpF@u}703MH)#7t{bNg&>cxrolb6e&*m>X_x`m?=_C;f^%^ykm@mseL-``hPVd)?1r z-Rj0pZ+-RAE0$4KfzSQRdvb2X{dS-KYVB-8-48eYgZ;Cs51#8S_itU^Twm`m z4@IXGw8QPk`{cvVN^kvqA6L#Bd~U)Q=Q&>VaZG|l9KG%KI~MLdc2_uY^1f52x%P7X_har%ZSUqI z1E}gB>-Qe*Z)2W%X?z~2^|}G$sy2SnkG-_rU+Hi6H=qEv?uniL@IC%&@;JBB8}`<5 U<@h>$e%8(^b literal 0 HcmV?d00001 diff --git a/int/Interpreter b/int/Interpreter new file mode 100755 index 000000000..55ced8cc9 --- /dev/null +++ b/int/Interpreter @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do + DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S +done +DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) +wizeng $DIR/Interpreter.wasm "$@" diff --git a/int/Interpreter.v3 b/int/Interpreter.v3 new file mode 100644 index 000000000..9cc0be2dd --- /dev/null +++ b/int/Interpreter.v3 @@ -0,0 +1,293 @@ +export "fast:const0" def const0() -> int { return 0; } +export "fast:const1" def const1() -> int { return 1; } +export "fast:constN" def constN(n: int) -> int { return n; } +export "fast:add" def add(l: int, r: int) -> int { return l + r; } +export "fast:sub" def sub(l: int, r: int) -> int { return l - r; } +export "fast:seq" def seq(f: int, s: int) -> int { return s; } +export "fast:select" def select(c: int, t: int, f: int) -> int { return if(c != 0, t, f); } +export "fast:nop" def nop() -> void {} + +def HANDLER_CONST0 = CiWasmTarget.functionId(const0); +def HANDLER_CONST1 = CiWasmTarget.functionId(const1); +def HANDLER_CONSTN = CiWasmTarget.functionId(constN); +def HANDLER_ADD = CiWasmTarget.functionId(add); +def HANDLER_SUB = CiWasmTarget.functionId(sub); +def HANDLER_SEQ = CiWasmTarget.functionId(seq); +def HANDLER_SELECT = CiWasmTarget.functionId(select); +def HANDLER_NOP = CiWasmTarget.functionId(nop); + +export "main" def main() -> int { + //def prog = Select(Sub(ConstN(1), Const1), Add(Const1, ConstN(100)), Seq(Sub(Add(Const1, ConstN(2)), Const0), ConstN(15))); + def prog = AST.If(Const1, 
ConstN(2), ConstN(3)); + + //def prog = Const1; + def bytecode = compile(prog); + def val = eval(bytecode); + + def f: Func.F = wasmCompile(bytecode); + def val_ = f.f(); + + def buf = StringBuilder.new(); + prog.display(buf); + buf.ln(); + buf.put1("=> %d", val); + buf.ln(); + buf.put1("=> %d", val_); + buf.ln(); + System.puts(buf.extract()); + + call(); + return 0; +} + +def CALL_FUNC = CiWasmTarget.functionId(call); + +def call() { + System.puts(Strings.format1("call=%d\n", CALL_FUNC)); +} + +def eval(bytecode: Array) -> int { + def vstk = ArrayStack.new(); + var pc = 0; + while (pc < bytecode.length) { + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; + + match (opcode) { + CONST0 => vstk.push(0); + CONST1 => vstk.push(1); + CONSTN => vstk.push(operand); + ADD => { + def right = vstk.pop(); + def left = vstk.pop(); + vstk.push(left + right); + } + SUB => { + def right = vstk.pop(); + def left = vstk.pop(); + vstk.push(left - right); + } + SEQ => { + def snd = vstk.pop(); + def fst = vstk.pop(); + vstk.push(snd); + } + SELECT => { + def snd = vstk.pop(); + def fst = vstk.pop(); + def cond = vstk.pop(); + vstk.push(if(cond != 0, fst, snd)); + } + IF => { + def cond = vstk.pop(); + if (cond != 0) pc += operand; + } + ELSE => { + pc += operand; + } + END => {} // nop + } + } + return vstk.peek(); +} + +enum Opcode(handler: int) { + CONST0 (HANDLER_CONST0) + CONST1 (HANDLER_CONST1) + CONSTN (HANDLER_CONSTN) + ADD (HANDLER_ADD) + SUB (HANDLER_SUB) + SEQ (HANDLER_SEQ) + SELECT (HANDLER_SELECT) + IF (HANDLER_NOP) + ELSE (HANDLER_NOP) + END (HANDLER_NOP) +} + +layout Instruction { + +0 opcode: Opcode; + +1 operand: byte; + =2; +} + +type AST { + case Const0 { + def compile(w: DataWriter) { + w.putb(Opcode.CONST0.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('0'); + } + } + case Const1 { + def compile(w: DataWriter) { + w.putb(Opcode.CONST1.tag).putb(0); + } + def 
display(s: StringBuilder) { + s.putc('1'); + } + } + case ConstN(n: byte) { + def compile(w: DataWriter) { + w.putb(Opcode.CONSTN.tag).putb(n); + } + def display(s: StringBuilder) { + s.putd(n); + } + } + case Add(left: AST, right: AST) { + def compile(w: DataWriter) { + left.compile(w); + right.compile(w); + w.putb(Opcode.ADD.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('('); + left.display(s); + s.puts(" + "); + right.display(s); + s.putc(')'); + } + } + case Sub(left: AST, right: AST) { + def compile(w: DataWriter) { + left.compile(w); + right.compile(w); + w.putb(Opcode.SUB.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('('); + left.display(s); + s.puts(" - "); + right.display(s); + s.putc(')'); + } + } + case Seq(fst: AST, snd: AST) { + def compile(w: DataWriter) { + fst.compile(w); + snd.compile(w); + w.putb(Opcode.SEQ.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('('); + fst.display(s); + s.puts(" ; "); + snd.display(s); + s.putc(')'); + } + } + // eager evaluation of branches + case Select(cond: AST, left: AST, right: AST) { + def compile(w: DataWriter) { + cond.compile(w); + left.compile(w); + right.compile(w); + w.putb(Opcode.SELECT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(select "); + cond.display(s); + s.putc(' '); + left.display(s); + s.putc(' '); + right.display(s); + s.putc(')'); + } + } + // lazy evaluation of branches + case If(cond: AST, left: AST, right: AST) { + def compile(w: DataWriter) { + cond.compile(w); + w.putb(Opcode.IF.tag); + def hole1 = w.pos; + left.compile(w); + w.data[hole1] = byte.!(w.pos - hole1 + 1); + w.putb(Opcode.ELSE.tag); + def hole2 = w.pos; + right.compile(w); + w.data[hole2] = byte.!(w.pos - hole2 + 1); + w.putb(Opcode.END.tag).putb(0); + + } + def display(s: StringBuilder) { + s.puts("(if "); + cond.display(s); + s.putc(' '); + left.display(s); + s.putc(' '); + right.display(s); + s.putc(')'); + } + } + + def compile(w: DataWriter); + def display(s: 
StringBuilder); +} + +def Const0 = AST.Const0; +def Const1 = AST.Const1; +def ConstN = AST.ConstN; +def Add = AST.Add; +def Sub = AST.Sub; +def Seq = AST.Seq; +def Select = AST.Select; +def If = AST.If; + +def compile(prog: AST) -> Array { + def w = DataWriter.new(); + + prog.compile(w); + + return w.extract(); +} + +type Func { + case F(f: () -> int); +} + +def wasmCompile(bytecode: Array) -> Func.F { + def w = DataWriter.new(); + + w.put_uleb32(0); // 0 locals + + var pc = 0; + while (pc < bytecode.length) { + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; + + // setup for handler, if necessary (guest-level operands) + match (opcode) { + CONSTN => { + w.putb(I32_CONST); + w.put_sleb32(operand); + } + _ => ; + } + // call handler function + w.putb(CALL); + w.put_uleb32(u32.!(opcode.handler)); + } + w.putb(END); + + // create wasm function + def sig = CiWasmTarget.functionTypeId(); + def wasm = w.extract(); + def fid = wave.new_func(sig, Pointer.atContents(wasm), wasm.length); + if (fid < 0) { + System.puts("failed to compile wasm function\n"); + System.error("error", "failed to compile"); + } + def func = CiRuntime.forgeClosure(Pointer.NULL + fid, void); + + return Func.F(func); +} + +def END: byte = 0x0B; +def CALL: byte = 0x10; +def DROP: byte = 0x1A; +def I32_CONST: byte = 0x41; diff --git a/int/Interpreter.wasm b/int/Interpreter.wasm new file mode 100644 index 0000000000000000000000000000000000000000..efa0a14d209036a0fbc675b325e7378e090c282a GIT binary patch literal 13303 zcmc&*dvIJ=dB5jA_H|dK4)*NT6<1emZko2b#>LMJZPzv2h|F`>=k}HEAp_c z2VJyQEP^c{STd481^XdlKP1@hig|jR0#x%&&UyxM?Zr7TBGf zU$|g7rr(Uuotj>1PNx0o_NjBNdM8f%owKd^bEo3-mgjuo{h<0e<)(kwjBVSF>ndd_ zM=6!VcizT>O$_DiLeWvSa)@Tz0J&_I@L$PuogA=oZV^4l@qLUdVB}m-C=`?`mkWhl z!2yY4exT?S^Nw8{#O#55u9(Lp`E#6tS19C*MfVYISuI!--iM^6E|xEr^B*M3WiM`a zS{JOXxw&|5zIDO2yiV=(Y^*GQy4LROpPX;DJ3FnxOmCM}$@K29f`VYyrlzbR(`+xD 
zwuX~tyl8DQ=_isF9?TUMm}le2G{d)EWg&HDVi7ZzLX z&eHO^^B1n&F?#*j%}1`={)X4T@jbuw-uFGUvZ~&%9#$VvkErm2>d=SOi4UvekE)NT zkE&ZfrjCAG-SY|c_TN_HpH!bx_kLRK{ERyAS@oFGpHtP(tG!=P`yW@ke@E>q{n2+t zYmR04zwRS4v?9w-8Y*dM%l|OBx3r~IBh#Kq5tRQ0BlQ(+jk|}m9Xg&?p;NYe?f4&% z$rb;T@{B%3r9_tn*92Er`~ym@M1KsOS=(*PG`d}FHL5ClIvIQm=z_V(3SLw3wHmW6 zzX*uGPh?S?M-Nnl_QutLJ4c88P| zs;C@XUbP|@#3KdT+wjpNSoY^+UE4v3j)AOlMXh>}pc-5qTn6cVRd~s*L{D)NIPijq z3xniLH5XYY+(YQ)B6l__G_anEHeo%jbZ*RcBNf^X1gMFI1+A)=&hR3?5xsX2Gn~ld z3`gVPY6nYD&e-7yCOV+1y+DnCqRk~;tuUnB+lgjYZD@xCFx&|3vVyKEzHj*-6$Met zSoyI3MRt|F5^cqrb`)?5g>Y?E1+T-_uk*hM)mmzJMdUUdZ6?NWupNT5LI>O6?#Y^l zc3o~L0kEqbVF>)%jc}-MWaw6zW<>?SONE|U%hlzGO-1Q^g6jw_$K5dJX;*uV zFh`Ue*tLBFfC|w90C2dTC^UGzsPsP`PBcUv4jUZ~LxXwj`T7wE5Yp=@3P!qtqI{z$ z|B{OGUtLi}Dym54)~Tedg=$F`Bl{ROnU5}7!HyA~*X3Yz#N(WN6aWIpe?mO2m{>5v zMYV97(QaNBuo+zneQX_4@j1)qEET#C<#lNwfCTCOpepe|1PPIf$MoQV4G*LLMc?O= z!PU{rd`6E_mcb3=DuVaHb;RyrKD2xI482kLpEh2k3_%s`3U-cE{EsER6_wD{mj77K z*Dx3K>|l4W3j;q$ypF0UR6%7W`VtrRL^oAir-Y%YV=e+NmoJzHmWMA3u|53*kjdAC zureVU8Ubp?9Ye)!Da=Cn$iM>!%>Q08RrYI3d}N4%=^`N7!QM0vSpMT?TPcr-$IvDn zYzN5vII}fZ_Q$U>_GATP3>wDoyAAABAd%G|jhW!4beq{tmkn^Uc?ACNr_64a;3}a~ zMI|Z+9LvN1bHVXeASw~FC&3941uq_gX6PF2*$h-Ba%vZn2^~h*2I3IJM;X#WmH*R( zTz5~dSX^Rf)f`RR(jJ1o-3j)A1+i@U8uxBkK`YmeGj`GVUW6)p$05?b+f5^$$2h@FYgl)aRJ{vEZ zh6E7k5ZuGUe?_2jD13Dr)_{{T2u06W+GQV)?3!+rUqb3mJsWXOY72>5PLx}UR*nt8 zZV}uwj5?ge`UBa=1jb}1!nFS|9a`nOzJSo1v3V1O=?SPKvT#S74|;dO0yKpdLP~1v zP^XE_iZi9yNRT`raa7a}17g??+=~D0L=qyo{g5VIKCuY5YAoW?mYB-)%cKP1YUnL^ z6Gi_LAtMkD3Xtf_j3Ci7CV)XI75~pf0*oJK3M-RjQs|mlFxM9?W4caoXoP7c7yX2g zs1R+VbvRLB8EFe*khGN_H`Xhq!v9 zeqWvaS7cS%M50FqD46aEfDnI;SXGx1+mxY@x?#itBdAHueqb6#bPzN}msESEJ_T=Bm>bvm@kN~*Uxan*oOVtOSS*4- zGq`btL77t!VDLb0WRd}O(5t#8$3PCI0nM6*i=tS&GxT^?$EbE+pg2GSa+Wye|GUTs zS|Qb{4IynL7T1Ucc13H0ksuU%n%H2AvZ$I3&7`V_0I{^wXfQPtXawm7d{e$vApX>| z7)Po@+MWpzFKk9m{u2`zIyL}O1~F6egL-PmI80nihj#T4l7yg(4Oby&ia6-nH_}73 zY>;KM1K0=}F3NAFsU#>3a8gAk=?WAc4(b6^-MVa?1BJbd2mlefs2#^lNYdqsFS;mH 
ziz!hQ9zCK(67ANUAd2cxm)3Y3Jzp4u=NiTeSsFRgC>X{HWGoLO$!BT6>{v51MSwpj zT;gT4;u|N(uNAo+><?{TI)M3};jSg62>bzkhPbP92wz|&t4gZ<~ zg1JZ>=>`ggSo=5+Sx9=(mrO&915y3}%SeD$Ez9zMDW-|DS`wL&f|hXzC&R?&84-d~ zaxytO;zfZ8;3^7`&xvLZ4_&lZoW)?AAs@n#B#j3Fi05fbRA3H?^pY;t@ezY_!uVlw zHpemFioT8jDc@g7+B1<_iWU|dICLkFG5c9=Wz2q#570&P>0VRHlpgLgW`u;l0>p3j zx+Ed@3JD3WT=Bm~5h7JY)(#67BdZqV`!r$Ev;2k68NF)ongSVAcsW$!36G*eE(0R zV1Zu(8MTpkf9UZc>A*|V0gSxVsKQlEx+-+jRo=v992V(ldE=v&&eKcr5bujHhX2aU zDM?~RsfG^zp?+-ycazr_$ZP1Z`Jy!CPS?Md7;$z9Yntk?7~f&?0v^ zsw`GL z0mhXcnWTgrV0&SrP z)r0oEhIXd3!U3C=w2>oIDY5Kvt5u**#V?@Phf~dMp;FwgPf*UIS6Naz^D3SD-Eo;%w$83RA z{LdStCuH@f`7P1wt+=)9rW~XK=MYP2hA7+IZQ#gnE((#Ix~F*BlsK@qU^X7?8-QoI z!mR_AjdX)6nh*q%fN9Q?(tk#fNFE1h+e0QcAvZZr<9gTf|A#noNZf9I&oCR2pEgVq zhOjD9mlsg8!61zZTR1KtIvDSuPqS1>x)#vMj}YmiAXG5_3y=bp8#L_mJ3MKq3M$d` zpKPf*5IKv3u3RAc(c@%O4>WikRf*79J-^5WE6`}I;zbe=Q5(I2zx*PO|F~QUPD6}h zpWz^`I0qtoSr5)Of>Yc$+z>7d*b>VlI$}x4+CmmrT*%dBNhW1oV5#k+I`V@WiY~Z% zSkBym`8cK~3>1X{B@7^Tl$grWOl8P;NJn09x`87Z$mDQTO%L3jIm#kP&X7S=7!rKl zS=`P`aZVa!Fl* z&?5!zq8D*hibM)H>I9t4qEeHy2QM{yP%?Q`S|DcG9QX!g0U^#aTO>z;97CYSm3_4O z*w23G@PYvr$MZ5mjoYj;;01rC?DSWUjAhC>T>W8Kn^*|L4J!U94@UQ z`s1Pyf_3ndP^n)IKVn#56$Y-|FB`6L&j7C9f!!UpCVpelU?bN1L^StjL~}nx{S8^e zTvFkZLqd`6krnn76`(2CjY`)?(d4Q8F2(01*P^@4-DWm_sQBF)<4_ zBr-phJV274B}a&DxUz^=+AskZtIaI>pE2;~AHi_+62F5RP{~M5B0}PCCk&jt(=iz) zLV1cv&AK5{%&1@sC+{Gp-T+$0UYNOfD`Akyml7U?%WLUyA317%kfnyeRZ=PKS8V1mkIgi-IzYo{r`qi8`EZuE)Aq!5u8Y z{za%Dm41qi(84^+lp>g$qd%A@(qM{@LxqD}C39JqLN;eH5g8CG$^-K&X%z|HK21i9)C zM~{9GEJTm-TPkTbP@YVqzLt@1_KzrzJSQX1%Lqzs{}YmMArr`9hJq9WA#57=xR+R|avZw#;=SC}ljZB<>;+ z@>@v>U_<`qpV61QzE*XmR?S6FMN+Hgf=0Je&1EXpfYH}j5Ke^P4RxxQty6K`3|Q3> zTBbmCf(CyXGm6JAGWau9XD}nxJ?KDZhDfH8WMPRC(Zn8HumXpr280EtGZ)+*?FL>2 zO$Z!YsOI$?x@1X@-HwMtb52G`SkXOF7D#1sX$Kyw+>;I(um6F|N#JhAGPD5Nlh{*egH92Q{U ziP(b%Q~^4SC-llJsgIG8m^?h-AFT8jWrzy>zqkBHyW_#@vYl-{ZhmBDaCH`bL$-5G z56SXB(xdc~?pq;d9hJT$V!DnGrvUBXMO1y=SyWTt_G8-F)%Nv|?MZ$=4lc_g-PmL4 zwf*=v`Jd=t>jzd3_sU-LV6S=A+9}_$Qr0haUvE;uKd%w%x4Kw=vu5NkGPAyyY5!}c 
z{nJeQXPNeoGVQ<0v|s2G{g3;4|J2v}%Qf7*)HnM5zTRJV_wd~{v;J;P|9M3JZ}6H> z;Sl0W7%fvl*W)nGb1ce5Pyl*?3BK=Jn+K+ydJA9Nx^-d<^?@1gSx#CPu8dFajX66 z^R{AMy}4YQtxv6+M!GKPwwB_Q*Xi19I|hvXJU+)Kdpul?ekq?E_+0AmH^JEl z{`NVCa@I#sl38xEHW#a3;e|)*ca>G|DIbI@||nFP+}h4~Q2x_mMI0wr=S|!W*ua z1zEZ@u*-9!_@1BKnj4iDZ?EXX;Onp>gD2}JmztgWTs&HD);q5$_xfihuiVr}(YWbr z`-sGFj;k)uqS1?w7}RaU(Zt=OM<|_Lqf_>Jn9SeRe9 zeoJOGFn(%jHa@*)_l` z9JS7ffcFf`#@eF@NoQiS;k}pjqj%;_|HiEuC0}!9cK&p2c64rR=iZygvcis=Q#S=C zNv=IQwX|Sp?i)(hIfGZ&4d~{q^lOugOZ8T~e~rEq@SC>wbA;XAvvBu3H?dw0?KiK7 zz31Rf{S+qgvN;8Cz3ziEO7#H_-m*rhSo*w3?$G*yDH(_c8PO%wibEY9>h+Tde~lIQ0rHbZX!D8eI^&d}`0`V;jQ9_8wmkpE|X)XZMMX;PpMbZ`}yKG`9CG z8^P;id*8aX7ab&QkDs567nrSY+jO#id~(zn-NaxYoM!@*Zj{M#wsXz;zm4c-OGtwU(O4>P_U~_kix(DgtX`nL8y(oeWZ> zbm@tpN6#>I_fTS$JsZ4djlGG`ca6P$Bk-Qx_ih9p+k4*@_ys%&h99@?nr|OPRWVna zeFr0U3Ql16Uv}Gk^Gpit;l6WI7M>tKu;q?gr@kB?LuCY`jB66_Cf~Ioxc%-;>#%2= zr;J!2wKIvJ9yHQSk~Jy@@LcOj^JF}Idu`!#4={ed1Hj~!6K88v^XDWiCK4W{^aU`r z%`AOfLPB=fK-%$ePZu(4cD^&c;Wie|n59kWF#%s#0D@@#zm$k|$} UVJ4#bn5!*t;*0?xt|x&11xP$it^fc4 literal 0 HcmV?d00001 diff --git a/int/InterpreterBug b/int/InterpreterBug new file mode 100755 index 0000000000000000000000000000000000000000..a88079884ac045c9524394129b06524883b48ba5 GIT binary patch literal 14264 zcmcgydw5jUwclsvOy(qv%m7hGMLmIm4%)zkhX$|c009ndfJm^WT&j5uNhB{P14gga zfdj~N)FSFF)_MiC)q2~4qBfu%G6@F7))KT9#d>)to*1w^RDxLM{?^{-Odi_W-aqbq z-HL?d|m%8YN`FY zXq}^9kDzCDZJTJ;f>zJpDTDpjrDu84HC{>c88G`x`>$(pn*E-rtMAIZlop@p)9zj1 zQ}SLjmSISXb!hjF+Hp3OGOp##oh?_RbUS9pPHFZ@#fupWpRfJfr1_}aI#`QMnu~ zRhzQGtt#%7v=het=U{D|(Cc3?_bO5dX6QTe+v7wyUO*$QKwIGR>Yqs9M%B+U_dQ#u zFt+4#Mek6I)6fdY`{IQ@btfAM3^7g4YsCr$%N6RL{HpujSr{OEUUVqQ19mNWK-H3` zRj67qo&jfCtX+${d|*advWG9#+dz2u5WQ6bG7r{z2x<2(wYco#JOU+spr;~m<{>@h zYq3IKPy_ctqd+FCSR$HJcoR!8s2Eq8gPNxIF!#!_geQInAM}t$@-;9TEF=*M!Ip6= z=3kA8q&5&QA_aN$vx2HkUI3)=-5VPtR;#j3jAU`mlkMJhv5g&SdtqU7b`y#lVcr5BepV; z!b;6qs&4jt|9lRu=gb2~E5n&DO42dA+t6x}@mRR49q?gpK8rf-@+l$O`Em3OLjKScYRH*e1CJ)1u&ZKn3vt!D3bEWJuGIjy5--sdu>b!#zg) 
z3lVJyi+?9SPQEtWUO0Ci%YSfhzQlKnD(7zCx%}N#qdTUb7D&Le0Kj8BkhcJ(Sg>B zMN8~AH{N)ZukGh?My%gt~krOy#4x{G2jP8OUsq;WVM+W5d85pk~e@jZ#plA#pHi5*x)#~we5 zyj@&m3Iy?)G(pAQt0=wb;_)7PhT{gkN?enTOnxqMD*kqD`H?7*F-bHE++ugH_9X%s z6D^#he9TWP%S=7(Yqut>Wvr4 zlXtmS7Z58+r9)P7vA_#FN*4t>G* z3Z^Uoxl$oXW z`U&)mye^(W3*&R;O|a%%Xcq zYX-${jbb;Fo(aMzydpD9yK?vMFqe+QU^{5?V4qm~X z(T*M0636`J3B8St5j7(sv}z1TIzw9rzEs*bZ`>xfX%yWAh|((9JU1S1+~tG6VkbS| zbri`NgD#9rk?28sg*@FDBBWJGLBJvlEIccvWhp!9LT!iY(+MZXhk*fmu<36UcX(*0 z6exCrTuN#){)lwbv^TS0T`%N;A8>SBYpEP7giId5E*#)DfOzZ~cND(MmN~dzlrtzeoC^XpAZ&sg6KSCP4`mnNVCo2H$L2_C3a5`%ol)=H=ZB61Af!(T7HO|)ge#vuXIUKGS6s;CuS^x zPRZnO-RU*o-(!D4KWStO7P=mx;G0$((^^`p1R-r}OVtKCxtO-Ri#oN=)0%$Z7?ec0 z*)g1wycf5>9y&2m?l#N4RAP={%|U%XuW!hv{-=S>TDuu_C>{F72^2xZhbF4E<)>9V z2)OmWd2S;PHZise`J{Wrz%g139aL+%w3gyisWu9}&qEkOVj%=?A~mq&KzLB05JXlkjllBZ@$ma|8mVhfuEkA@9Kk6JI>{^-xXi%C%@g z@_tBG;+709DDsOXKu0U6iwAGcJ$xIv4MHTI9e9YxEWPzh*am4Eb}6nH)F5_f3Jn3X zm4|9LidW(O!fBBe__}b#;mH84AbdNJO=3O<5pxR6pr~!L!lS{=Tf8=fy4*T^gx2eb zbtVE-$blE-nx?tDi)NC$0@01F6hW(L{kJh<+2k`hz-5u{QKyKi9P znRYqSR3EoFdM1@h@Oygi`Enaw2hvT#>@{A&JiL}HHu+4C8gG3@#A!DY7LDGu@u1w7 zSQJRem?-zTubfny#-k-XCT?Q|FoLGiXVAKf?!-{ys(uTN*H2?QvgBS=*6fFgwchT8 zYEe*WK;Pk~m9O)Jzd@Buz;KEV;J@ts3M|>1ORWww+9+I>ZVI z+scE028$$DNJkEe7)BJHO|}OA!JCf}qi2QmoC0!5`ohux)$rxwk%s%;&BO@r#klFJ zeaD5rQ2y|o9S)Dz3z}x01{B(`EJ&U3Ym9XN4!e22*v-5>Mh8q=tRU*$QR5-XMvw;* zyI@3wIc?1DG)8h~p7fFKFiGWs!T6y#3yWaj69x0}>`@dpjW~^v0CWyoq%{Or3YuvO zXj9-crcgNN^s`NcX2Z?1O_U}(;8po2cAY&^r3?M+-0)j61x^3 z&d&n6&x`mk1koUnI2W>c?5A~fW{@b<4_&D+9y&5b&9n_>t?L8??LYBe~Oc)yueT7JXGS?p*F209J6EC>P~Ij8p}`-PCygaEeeFpY*w~w#p^!j+hH^gM~0^2`+ z5_yGdjp83vMh#%+770C=g7Z6Iwb>1x?;zX8Q&app9)=U z`#kT2CHsuu12-;YL>oH+Cri9#CeOggm*Nb)Ye0VgEy{q+Me*l3uO#eNbTcbL#APM< ziOs#~n2z`Cv3;nBmojod!SYLDmB{Ym!!JMnzb>p!X1l2&4&ubD1{sZ0C#v>A>|O#u*- zc_Ll}d5RG9INvh}4H(BHdR%@%aE%ERXM_FtEu_?jp7Toxp{Ai0M5P2}QTKmSUy%Cp zX(hgJR*vCsgh;Cns5Fd~*ep+(qd4M?2o`pBj&wa+(y&(I?-oz6B4t7 zs``;YI<65rB1(d~pZuOeCT4MB;E5a*!2Q12>;}0zNRY4kJr{v{(Zzf&t#+CCg?khF 
zuAKBB-F9Lc!iLL?*7+jO-*o!Z2jRZt(ZIbk-Tb6DTAGVLwTN(xdlC?v-;uhd1_1ToDu^ z$n;cFO`lxxSx`Y2tjN>Z1$V?#fdWJd)+#ZMmmY{b-M^r9mO$V~qy2yh;`blx!b#Rh zb7^O}aYEnxSVnk`2aGGB5)22pHDExI+G5$hC9hoykG`jXvqy29IKe2)w_lj>!X0En z?*}6&_;Drdhv-2ySloRd2EnB!)J}F&LVOkxpBl{r+n9k0x`@@phS8?&Y};(rhy&w- z6WBofJd{KtNK=Etp7?Vvc^gm!okCphYZ0zW^TZa`eA2u~to!yUhaMtJPA5;i1a;^I%H z0-aOzMp)%JF=c7|J|;G`PnPE6c=%Q$kYXPHnN}#DPmX1DUC-wrhW{33fnD{|uhkF~ z*y+AR8x8BjGcsKS_%rfeZbp!UPcSI%2?;3Da<#>fN77y16m1IKF5EgjWbuhp$gbTh zdgb4K%-Z<`!&J;;2Ky*&Cf2CIA8-?XbBdN;X1saWGL!UNm8fmgthtGN3-L|jeUC1l z*cW&Yy~7>TbOX5sxB6N1LkV4vSAT*drX#2Xcw2I>{(!eLo5}64p|AiROZl6e-s5rK zJDZ5|M|@ZmRN>V-@EZ8>)%roVQAM{YiU-gnT_+G7p)ip&U^7Y0gpu|IrMLSTLbR2e zV!t0V`uI#fDPGs*V&HD`OK;djOCC6#G;q3;?~rR}xkP z6L^D}tx~QJS@MlsNC2Y2_I_TF4=?yia$0Up#liqF`}fn*pY(A5!qYazGWdG`sjk7K zrldi+s7g$U+>?H!0&dk45AxX<9p$(Sb>uMXK^QZN?xuu@$(f2Dh=&u`7ss z=!*wuEj~nxA&Mp8hU9+wz`#q3MTqYHE%*_?j?gCY{KEq-KynsVLqh$FO!s|snZz1Z zZ=K5D(sd&XV}5k`V~FGDr-uX9Q@eO@791;d)Z*DO*{vW%=x1>A`ZG2nULsF^-MxAb zs$g|ZHn~2}q`-$C>KKp$`O_&78G3IDM4JoAiN-%nZQj5nP=EuIoLt{Lry^xQx!lF@ z6{l#Xq|KQB$j0>*clr3Kw`9+yQYRUgm8z&f;DH)wmdX)zzd&}e9ML6pmoXOKu_TD& zA%_IKe*X7v$V=V0@cYJs_fn}Jw71|j7)J}f*5iLjDA-!FR2`!2L7xYPw-+!R9#Vsj zy8yq&!Lt+43D^ht1dt~wZ0Qh%y)FgWy8r{=wFOxrU=v_FU?-px@FCzRz-e1R_EW%b0BwMM0H@8yase7(5nvhM$AG^;Mki!B4^RgnIiCZ3 z43Om@8xOb>kl_fj)i`c;y=Y@c0gpIYbocozD*0LTo_rRaHk(Cr7P9D~n_0B&DhDgQ z2yuRp%mOYIJV)7KfO1ZhEdpc?_OLZ~GWN3@8G8Zn>RiU&1N;q;Kaa60fXZyF1h5Iv z4tNjnSAYUpqC;f%8ep2u!|DL*0e=PzwnK-2g@CCJy4&G{Pz$r*kb#nbvEbhYvZoCa zV{c*60z2#}n8WVcA7sDYAC`6opOZR+ZPLEr9_f?KC#0Ot`L;YjUEWgL(r3fA*QcJa zy?gwm%}Cnp-nI$$!WB*SP3vB;zO(!-d*`}C_7B&|^3msp$vC@Ze?@_;AJXNG zcAMknEq2H26|&=hp2%_Z3@djG_KlDp{qzI-xvM^7`48=JMjeNoKmPCo=VO2a zKmXL}Sv*1VE^d;ppE=Tb^Xg&Fwv2M+?O&8DpI)&-8G3$`GJ0o|GHF&snf}o_WzJI@ zm6EEH()&C2NU0m%lrpqV>B_wu*|4qCq%mFBNzcz-A$|PXIw|8j_e*0I=E&tAA7asG zbZPB{4@imIlhT1*gdDH&B>yX_)r`R5xbHH9y_vASZxi30?b#aCL z;GD4i%iM|#$Ka-n(RW5NDiXBKiy8EsZS&!ai8+7ER;z6f^ zdopjl{}yLuneP1QkT&OUhO|4|?myt%xBQ^UxFYAh(hrzdoo8qe=u|W 
z#PL~oJ{ZZ$Sh{gg{tFXbs~=k7`kiCl;7dP0=-PF~Aa{GjsWI|r3rlh4h}x%rCB zMGM23ZM)VDegCrY!+I`j8!BJEXK3lZjhP+s2L@gCt72E+<*@7hMNdeFT@OmBFAv$2 zKb*8pnX}Tq=wjX8a`7-}*Toa1%=(kkn*Yjk{_L9Z&KKSo?|k*n3C{N}ndto6HPfB> zo91U#Z5ola{C{lrS5n1xHQ~s7zI?ADR<~hce z-D10@Y`HBuwcVCT9<;qS^?+^v@@cja>prwid%W0Q_xMQrn(+_W*N=a~zGeKM?0-H{ zEDt_8QqG;wBo|EhxxBFWpuD+wn0w2Fq3-J{huRla4zVA7@{nWdxt}3A!q))|e&q#} zqPm7Pvdz*4M*lszgUd1JBZ&sA`AXfTa9#)NiwybppS>PO;*?=NI31H1RL3Rw#hB?`Kal zmF>kJ?uf5|RD@Wvfq8J?_yHQA8n70zZ@(kJ8Z(NR>MWJ2vr3sCfBFc}A3p%qgGxbv zE!*JQ3_4L(Jy>NPx5^sbpydX?QYp@-C@Zx$GQ(cV8s#FkTP7`#O3nw=1C}C2H8VSl z0(PPO4$98}XHb_UnGKe3sz`P=3FS=5!LA4V9I#!=K$N%R^vhs5Hai<@bFiyyPIfKY z#i&=>WL5*X747wCKZ){}HU+dY0a>8$WT!FTVRtf*J%e2Y7-!F7SJ?-#Y3S4KE_M&v zyHM^!`2ork04BTGAQ|gXUIMrbkPnzHpTp+LgV|HEn>idVHWZOK$C1S@cR1KYv?l`! zP%lKe0C1C|uB@h>&90Ax!j0h&zNam%8oO))tEsPvuyNH5b)j)}O%>Jkp_=jI7Bn=} zHjP_W6RxVM9T$#_t1pYxEDMdBUo(GceWa!?MC;NGRv4vXYGw76wV^PJG!#d|HT6|&!EKG90c=!-BC{JAzKa{#;+p!(>zhL1=?#@3 zR?Qa|3ohr^r27|_0d=n!SUI(#BGlA0wX!md0<`+5Uk8f!y!Auv+ePsKBu%^k%0+XdD+p-6Q@rNBBB_EYjrB3D(6D-PAw6gQStaQmc2 z6E#>W$-<%9hKlLfx^nm#3oosoSpzSuxt)kKf!e(Cn@JE1q-hIQ7-m&j!!`BG z(5%!#WtDI*S~owmcurX(ug++O{zUcvXo@E30j6m7avHO08_LUSzfmv!9Lrqu738Ad z!X65T8^TQN2q(p55Zl}YL3?Y(wGD8IrIAL~z?D`EEpkOSG~ULlL(S8xxxN?Il{NAq z*pOA_M-Ic0#^wQVrC@gesQn7ye!xXI2dV!-lnViw$b=K*A&SgG)d)ayA z63byTw@B<=HuNO*kIunuhUFf&vHRG0ZFcr8{_SU@kK0)uJMRR2e;*ti1Uv(7mzx|B zSN}gB#Uw+zp;x(rnq@VWp=q}*422tdBMR~wj1hAs=Gz&~rw6I)5sez*0ij9}`L9yZ zi4Tnz6d|@)0ZB$i)Gf-t0*{q zYWv5i|8BLhjZ6)y)a2u|{s;6Jjt}8Os>Wtj%-{x1?U8_R2*EX6Rv%F*eyfoNwKpbO zQ4G`gdgK-iF5cKjT;7i>C7rNK7cZ`P-;$s92L{BVefH-Pv~x^>u2Rhyq=rL_L*Y<;MTo0{ int { + def x = A.Z(ay); + + x.foo(); + + return 0; +} + +def ay = A.Y; + +type A { + case X { + def foo() {} + } + case Y { + def foo() {} + } + case Z(a: A) { + def foo() { a.foo(); } + } + + def foo(); +} + + diff --git a/int/InterpreterBug.wasm b/int/InterpreterBug.wasm new file mode 100644 index 0000000000000000000000000000000000000000..dc25a1547290bd15158034dc6f6fc172e5b574a2 GIT binary 
patch literal 5307 zcma)ATWlLy89wLC&CWRSB~Ysax2+euS| zl(H0d#YG?x2q7WyutMM4OP*L~O*w0!@6W;{+?p^Y=1 z|D5yRuj3KPU|k9!tw1ot+)Jxw!3!s{;R6Nq~yh_{wm^RgX}x$Eb2g zF&Edpm29O!(!X(~opyxaWr>!JZtu2`c0Ih=@7}tSZVP4CtAi`I`psb~r1KW(@&H=7 zLVBJfl_PzDj}rvAi6HPDFA${?)b>iMT=M*qH$hk62SG4_PdNx=$-PT_WKSIPzRi$) zSb10peonCCrJZ5_w($J*biLcZt%NsBuC-Gs%J&#o7#Y?+bn3$&`RK<^*FS#dxlerZ zQ~N)?Z{ahajX!tx_~)NL@P!w?xM*Hnnz;81*7bQQcg)-01bj*f3yhXpv})2k$_~Hm zE^1-$iAvQ_%LOIEvd|(5<|03Qm1GN}qOuX;8{fb}`a9+o+Ff=Ry(p}R5Y%$5<{R;n zyNH!<+_ovTA{R1d!A(o;FDTc@SUHPYX+qN~5;ZTU@J!e;U*7$uvWAYMxzn;NsMsAwkMyn?B(*^ZAqFYdJ(#^nk3yNcA z!C%{5%V|OVrdYlX#QVjPSa8HF9ml5{{+1Mw9&|Y^9uTs-X#UMa@6k6~H0dD>P)DA% zC}b6Lis55gusWIS=qS@18STfuT0~?Jz*APc3kZ-O2fQ2X+6{CN&9>qIo~@`b49$;N z6A|svLG0(d(G(G}W!hh(fbV3kuR@4IVpj6}>>Qy8s=C})n$ z1R)umr$ixqQ zoyUYA%xjcQCXV*aSrE0gA32QVw(ID?oR=ZpCsdqfh;jr1hbv9X=wLPcrDe>aeVgC~ z^=G<~hhQ#&)$pel=}ngYh`w#Eppbt>M`#dQWHl)%wuUem zJIGDy1XYhZK`BNffFaZrw3rSA1)lo^bqp6!wiG}o`OIX5QbH7wJG@EqRqQ~cF#OBQ zkSeP?YBB$?NwrDiY)rWV9p4Sj&qy^r*@CULs=?yU_67;8BBJw1ZB8X!HSgmu*f46? 
z!ogvaiAxt3FI|y*?2lYBR$+g&4Y~}BPzC@wHLeO7Pxke z6f`9R5NftDFd#Iu!nd@=e-^j-@7p&xq)*cybIWbd3FE z=y(&c<(MNCP?N|2Lg&1H5}h~4Bc1p7-%~KT29Q)W1se^eHsypyA1$%jT3Io4j1dIm zjf|W*r0gG_*GS4~h}5bTX%dO8 ztt@wJ5ek$cdlgR@ut7`8H%>NRBLFDeXugKt`>oBH}JqG$BH6qg8yPJNai0B5DU-=+YCUPRXr#K z%AFjF2OX|ixjZT1_RMf)t)}iun_GkiH)J6dkV~R|zExeG^DduY*oEKc zx`0pMhc$9cspi4YdAWM#43#Tyk#MAzXWqjoE|XWXMC9%%5M$MXZ|1j5bt5_s$Q@#j zIvYv@kq2|`cW4N3J=Fy^vcjL>s)arxe2X>mpPCl#)6>G`Y2h2nK07V^=)`DN_+zso zItku3gFH+MmnMZbJ}DIM)g0z!M}!ldh-KdQow2?jo#gonJ}5ZLWUQA{Ye$o|Ezp5u zQIs*PN5RS1MS7zZP~M=bkX#gbcnXi!Tp{c!%GJMeS=bg>g0IV3@uR(YC|c%y%%6!h z>W9jl;iYYcjwx*|Ti=2=64OQu0QwU|oTcaQH%6RCJw-x01+k_UJ^5`A|QhsvKwaDsd9zn&@zNmlx5|?Dr7wM)|J_ zpiq&7-y2bohe=x`2yBoh+;7=D|xd3Yg zC<^ezZZ5JoIl249DNg2|v{)5qvjuo)T8DYwjOr%O1j=$=_QS{-sra5 zX=8}CCxpUvS6V&+ZD{#Oj-Aay{*1pJYI^xR=R40yxeLIjfr)RX<%#`)LP)7PZyQ z1)e_rf_|3mw6~fJlQ!-IAH?-$Dhson$?g?-2MUQK9!R_3S;2 zoE)v_HH@;${4I=Qm-$N=*RLZm-G=Aa`8|n0d!YkA_WCKlU);P=Ur(ByVCr)7@@8k) zTu1${d#lIYbUoSW@x?C?F@#`<$K{2#hnAi94gbrI?IE} z0|ird-ST!L?G2mV&eZJX=J`f_6`}_>R~kvDllBMIxq<_uU)mgATD{2cc%n%rzzd(H zcAE63_ZO)B6vi&WN$tlS<9)Af2v1vF=cLZ=&<8i%Z| zSvsO7ki7GSu9wr*i%IWrv9Y{8#Kw{Nm7B@+?yZ#}%+2(#lWn|l^r%(je7k!sX^$V< zoq?VyEM=;8cNjm@s9w#Aw#BsqR~|!>AZio@Wa04xPuYAk-Cnwx^iNnq)Yp2_Bf^t* KBi$Zm8~+2|nOxHV literal 0 HcmV?d00001 diff --git a/int/RiRuntime b/int/RiRuntime new file mode 100755 index 000000000..716d1b2e8 --- /dev/null +++ b/int/RiRuntime @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do + DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S +done +DIR=$(cd -P $(dirname "$S") >/dev/null 2>&1 && pwd) +wizeng $DIR/RiRuntime.wasm "$@" diff --git a/int/RiRuntime.wasm b/int/RiRuntime.wasm new file mode 100644 index 0000000000000000000000000000000000000000..049ef40aa3e69f8aebd5db3e2624642b3499f831 GIT binary patch literal 4434 zcmb7HOOISf6|SmVb#Hf9kGmYlPGFL{t{#G5XR^vH6qJE)=aH#jFFj4GnJ(}OKbQy*wZ)354k6D^JAvuz2?1U_A8=Q 
zdaby&J9t?MGwi+4FQlmdk$FUr)_*F0;?tiwzWUkEef|sQpZwy5FMatdU;SF=>)-h1 zqie@5KJ~3LPd7e%4LUA9CFMB0@gtC{NxblKIu7jFy6zfRYYGbix!q2Kcvg3@N`vl)TIn>@9)kieecDkv$Xq2|uRO7N zS4p>`rCdono1j)BQYET|^!UT8-XaLITn}t4~D{PmcY>L@Qo}+1kir z70AjElASzc3WXxd1*;J+}Z7f!-fL*}P3;PjUN!ZHUM8}IT znJ1`CLVC*((rY<@j*AfJy*O$FC7$~Lbp#EnS_-g}eO5AMsUeG~9o{7SDo@~%wSRww zBGMYG?&OcJ)Lsh7{t+$Ex3vwwBiH;=iMzI20gJE}b6rU&b31{Yx4E7;P-h1uJCJ9-K}}$S@uY8&5@OeXK!*%VL@oO$-z(y zAmL^oL!it7Cvs9DBgg{Rj5|Fm7;dJJa*x~JP`$~R(YexO@)I(;kWbDbrH;8DOdW3` zx14j70x*dRAojaR2(rru9aUwTH|H~ro^mKe%l?I)i~$r?Tj7p@N}FoJqmS13+FnI7 zbc_*$%LnO_6_o6|EUo|{N+LwSh%hIZC{ct)4~i@5qscV4A65K}JwZqtXbn*L3dbyk z%BU+dOhE(N$VgX=|H3}d5czeqP03#LA|**lnd9VE>;_EaBf*i_gDqH)EKw(I~*X2|hsu;6~ z=K4L>7uy#|IF3*{6*29T{# z4+#OdQ$X=WhA}JGmn6boU0lUmQ+K7!5wRf*RZ0cylBr*EAEJ3lO@=as%SrNwClc8M z&5hSfl&k7e5=D;UowQ#1y5o7x6Oyi{Jte|;o?rr2XN?#GvA|{B#HU*~EHi+{Ct|nX{xbhYmM`@YxZ;awPMWrf4?w+Wipi0E%)F`y}T@FeQ zu}7T^&_L#)ockRb0zyx?z(xi92BQ}K2>Tv;ls+^q^uyCa^R!3<6dXtm|I!SNCvFl;LB{pT;)z zLlrLYGB(2`)JE$uw$P2jj2Q!g{sa?i)SKU$YM%BK3GWm05I(A)#RQd_c(f<$F2&*42Nik;!0w=?t`y?);hcm4ClwXNs(dK<-)8@v5}u`vXf5DM?B5zkWf8(UEC4@jNwZ4Qfpf9Bh)<3V4JL*~-XsMp`R z>UUo#Vb;G`^sW{I?9&UT#jBP5GobV8B5QuDk{T47#h}=Mp{|d Date: Fri, 23 Jan 2026 16:09:50 -0500 Subject: [PATCH 42/55] Add logic for specialized FAST_CALL bytecodes --- src/engine/Opcodes.v3 | 56 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index 4204e3bf1..f05052280 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -662,6 +662,62 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: //FAST_CALL46 (0x00, 0xFF, "fast_call46", imm.NONE, null) } +//def indexToFastCall(index: int) -> Opcode { +// var op: Opcode; +// match (index) { +// 0 => op = Opcode.FAST_CALL0; +// 1 => op = Opcode.FAST_CALL1; +// 2 => op = Opcode.FAST_CALL2; +// 3 => op = Opcode.FAST_CALL3; +// 4 => op = Opcode.FAST_CALL4; +// 5 => op = Opcode.FAST_CALL5; +// 6 => op = 
Opcode.FAST_CALL6; +// 7 => op = Opcode.FAST_CALL7; +// 8 => op = Opcode.FAST_CALL8; +// 9 => op = Opcode.FAST_CALL9; +// 10 => op = Opcode.FAST_CALL10; +// 11 => op = Opcode.FAST_CALL11; +// 12 => op = Opcode.FAST_CALL12; +// 13 => op = Opcode.FAST_CALL13; +// 14 => op = Opcode.FAST_CALL14; +// 15 => op = Opcode.FAST_CALL15; +// 16 => op = Opcode.FAST_CALL16; +// 17 => op = Opcode.FAST_CALL17; +// 18 => op = Opcode.FAST_CALL18; +// 19 => op = Opcode.FAST_CALL19; +// 20 => op = Opcode.FAST_CALL20; +// 21 => op = Opcode.FAST_CALL21; +// 22 => op = Opcode.FAST_CALL22; +// 23 => op = Opcode.FAST_CALL23; +// 24 => op = Opcode.FAST_CALL24; +// 25 => op = Opcode.FAST_CALL25; +// 26 => op = Opcode.FAST_CALL26; +// 27 => op = Opcode.FAST_CALL27; +// 28 => op = Opcode.FAST_CALL28; +// 29 => op = Opcode.FAST_CALL29; +// 30 => op = Opcode.FAST_CALL30; +// 31 => op = Opcode.FAST_CALL31; +// 32 => op = Opcode.FAST_CALL32; +// 33 => op = Opcode.FAST_CALL33; +// 34 => op = Opcode.FAST_CALL34; +// 35 => op = Opcode.FAST_CALL35; +// 36 => op = Opcode.FAST_CALL36; +// 37 => op = Opcode.FAST_CALL37; +// 38 => op = Opcode.FAST_CALL38; +// 39 => op = Opcode.FAST_CALL39; +// 40 => op = Opcode.FAST_CALL40; +// 41 => op = Opcode.FAST_CALL41; +// 42 => op = Opcode.FAST_CALL42; +// 43 => op = Opcode.FAST_CALL43; +// 44 => op = Opcode.FAST_CALL44; +// 45 => op = Opcode.FAST_CALL45; +// 46 => op = Opcode.FAST_CALL46; +// 47 => op = Opcode.FAST_CALL47; +// _ => System.error("indexToFastCall", "out of range"); +// } +// return op; +//} + // Enumeration of the different kinds of immediates to opcodes. 
enum ImmKind { ARRAY_TYPE_INDEX, // ARRAYT From 04f887836a275fca8d3c9548597a83cd74721a76 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 11 Mar 2026 13:42:18 -0400 Subject: [PATCH 43/55] Add more instructions --- int/Interpreter.v3 | 174 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 142 insertions(+), 32 deletions(-) diff --git a/int/Interpreter.v3 b/int/Interpreter.v3 index 9cc0be2dd..13f0a1b9e 100644 --- a/int/Interpreter.v3 +++ b/int/Interpreter.v3 @@ -1,55 +1,92 @@ -export "fast:const0" def const0() -> int { return 0; } -export "fast:const1" def const1() -> int { return 1; } -export "fast:constN" def constN(n: int) -> int { return n; } -export "fast:add" def add(l: int, r: int) -> int { return l + r; } -export "fast:sub" def sub(l: int, r: int) -> int { return l - r; } -export "fast:seq" def seq(f: int, s: int) -> int { return s; } -export "fast:select" def select(c: int, t: int, f: int) -> int { return if(c != 0, t, f); } -export "fast:nop" def nop() -> void {} +export "fast:const0" def const0() -> long { return 0; } +export "fast:const1" def const1() -> long { return 1; } +export "fast:constN" def constN(n: int) -> long { return n; } +export "fast:add" def add(l: long, r: long) -> long { return l + r; } +export "fast:sub" def sub(l: long, r: long) -> long { return l - r; } +export "fast:fact" def fact(n: long) -> long { + var v: long = 1; + for (i < n) { + v *= i + 1; + } + return v; + } +export "fast:seq" def seq(f: long, s: long) -> long { return s; } +export "fast:select" def select(c: long, t: long, f: long) -> long { return if(c != 0, t, f); } +export "fast:if" def if_(c: long) -> bool { return c != 0; } +export "fast:nop" def nop() -> void {} +export "fast:print" def print(n: long) -> long { + System.puts(Strings.format1("%d\n", n)); + return 0; + } +export "fast:double" def double(n: long) -> long { return add(n, n); } def HANDLER_CONST0 = CiWasmTarget.functionId(const0); def HANDLER_CONST1 = CiWasmTarget.functionId(const1); 
def HANDLER_CONSTN = CiWasmTarget.functionId(constN); def HANDLER_ADD = CiWasmTarget.functionId(add); def HANDLER_SUB = CiWasmTarget.functionId(sub); +def HANDLER_FACT = CiWasmTarget.functionId(fact); def HANDLER_SEQ = CiWasmTarget.functionId(seq); def HANDLER_SELECT = CiWasmTarget.functionId(select); +def HANDLER_IF = CiWasmTarget.functionId(if_); def HANDLER_NOP = CiWasmTarget.functionId(nop); +def HANDLER_PRINT = CiWasmTarget.functionId(print); +def HANDLER_DOUBLE = CiWasmTarget.functionId(double); export "main" def main() -> int { + def buf = StringBuilder.new(); + //def prog = Select(Sub(ConstN(1), Const1), Add(Const1, ConstN(100)), Seq(Sub(Add(Const1, ConstN(2)), Const0), ConstN(15))); - def prog = AST.If(Const1, ConstN(2), ConstN(3)); + + //def prog = AST.If(Const1, ConstN(2), ConstN(3)); + def prog = Double(Fact(ConstN(13))); //def prog = Const1; def bytecode = compile(prog); def val = eval(bytecode); - def f: Func.F = wasmCompile(bytecode); - def val_ = f.f(); - - def buf = StringBuilder.new(); prog.display(buf); buf.ln(); buf.put1("=> %d", val); buf.ln(); + System.puts(buf.extract()); + + def f: Func.F = wasmCompile(bytecode); + def val_ = f.f(); + buf.put1("=> %d", val_); buf.ln(); System.puts(buf.extract()); - call(); return 0; } -def CALL_FUNC = CiWasmTarget.functionId(call); +def eval(bytecode: Array) -> long { + def vstk = ArrayStack.new(); + var pc = 0; -def call() { - System.puts(Strings.format1("call=%d\n", CALL_FUNC)); -} + // print out bytecode + def b = StringBuilder.new(); + while (pc < bytecode.length) { + b.put1("+%d ", pc); -def eval(bytecode: Array) -> int { - def vstk = ArrayStack.new(); - var pc = 0; + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; + + b.puts(opcode.name); + match (opcode) { + CONSTN, IF, ELSE => b.put1(" %d", operand); + _ => ; + } + b.ln(); + } + System.puts(b.extract()); + + pc = 0; while (pc < bytecode.length) { + 
System.puts(Strings.format1("pc=%d\n", pc)); def instruction = Ref.at(bytecode, pc); def opcode = instruction.opcode; def operand = instruction.operand; @@ -69,6 +106,23 @@ def eval(bytecode: Array) -> int { def left = vstk.pop(); vstk.push(left - right); } + FACT => { + def arg = vstk.pop(); + var val: long = 1; + for (i < arg) { + val *= i + 1; + } + vstk.push(val); + } + PRINT => { + def arg = vstk.pop(); + System.puts(Strings.format1("%d\n", arg)); + vstk.push(0); + } + DOUBLE => { + def arg = vstk.pop(); + vstk.push(arg + arg); + } SEQ => { def snd = vstk.pop(); def fst = vstk.pop(); @@ -82,7 +136,7 @@ def eval(bytecode: Array) -> int { } IF => { def cond = vstk.pop(); - if (cond != 0) pc += operand; + if (cond == 0) pc += operand; } ELSE => { pc += operand; @@ -99,11 +153,14 @@ enum Opcode(handler: int) { CONSTN (HANDLER_CONSTN) ADD (HANDLER_ADD) SUB (HANDLER_SUB) + FACT (HANDLER_FACT) SEQ (HANDLER_SEQ) SELECT (HANDLER_SELECT) - IF (HANDLER_NOP) + IF (HANDLER_IF) ELSE (HANDLER_NOP) END (HANDLER_NOP) + PRINT (HANDLER_PRINT) + DOUBLE (HANDLER_DOUBLE) } layout Instruction { @@ -151,6 +208,39 @@ type AST { s.putc(')'); } } + case Fact(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.FACT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(fact "); + arg.display(s); + s.putc(')'); + } + } + case Print(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.PRINT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(print "); + arg.display(s); + s.putc(')'); + } + } + case Double(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.DOUBLE.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(double "); + arg.display(s); + s.putc(')'); + } + } case Sub(left: AST, right: AST) { def compile(w: DataWriter) { left.compile(w); @@ -201,14 +291,14 @@ type AST { case If(cond: AST, left: AST, right: AST) { def compile(w: DataWriter) { cond.compile(w); - 
w.putb(Opcode.IF.tag); + w.putb(Opcode.IF.tag).putb(0); def hole1 = w.pos; left.compile(w); - w.data[hole1] = byte.!(w.pos - hole1 + 1); - w.putb(Opcode.ELSE.tag); + w.putb(Opcode.ELSE.tag).putb(0); + w.data[hole1 - 1] = byte.!(w.pos - hole1); def hole2 = w.pos; right.compile(w); - w.data[hole2] = byte.!(w.pos - hole2 + 1); + w.data[hole2 - 1] = byte.!(w.pos - hole2); w.putb(Opcode.END.tag).putb(0); } @@ -232,9 +322,12 @@ def Const1 = AST.Const1; def ConstN = AST.ConstN; def Add = AST.Add; def Sub = AST.Sub; +def Fact= AST.Fact; +def Print = AST.Print; def Seq = AST.Seq; def Select = AST.Select; def If = AST.If; +def Double = AST.Double; def compile(prog: AST) -> Array { def w = DataWriter.new(); @@ -245,7 +338,7 @@ def compile(prog: AST) -> Array { } type Func { - case F(f: () -> int); + case F(f: () -> long); } def wasmCompile(bytecode: Array) -> Func.F { @@ -269,25 +362,42 @@ def wasmCompile(bytecode: Array) -> Func.F { _ => ; } // call handler function - w.putb(CALL); - w.put_uleb32(u32.!(opcode.handler)); + if (opcode.handler != HANDLER_NOP) { + w.putb(CALL); + w.put_uleb32(u32.!(opcode.handler)); + } + // post-handler wasm bytecodes + match (opcode) { + IF => { + w.putb(IF); + w.putb(RESULT_I64); + } + ELSE => w.putb(ELSE); // didn't emit handler anyway + END => w.putb(END); // didn't emit handler anyway + _ => ; + } } w.putb(END); // create wasm function - def sig = CiWasmTarget.functionTypeId(); + def sig = CiWasmTarget.functionTypeId(); def wasm = w.extract(); def fid = wave.new_func(sig, Pointer.atContents(wasm), wasm.length); if (fid < 0) { System.puts("failed to compile wasm function\n"); System.error("error", "failed to compile"); } - def func = CiRuntime.forgeClosure(Pointer.NULL + fid, void); + def func = CiRuntime.forgeClosure(Pointer.NULL + fid, void); return Func.F(func); } +def IF: byte = 0x04; +def ELSE: byte = 0x05; def END: byte = 0x0B; def CALL: byte = 0x10; def DROP: byte = 0x1A; def I32_CONST: byte = 0x41; +def I64_CONST: byte = 0x42; + 
+def RESULT_I64: byte = 0x7E; From 13799d0e922716506946d444fdd01133bfea5b56 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 18 Mar 2026 16:46:27 -0400 Subject: [PATCH 44/55] Remove use of fast frame --- src/engine/x86-64/X86_64MasmRegs.v3 | 4 ++-- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 19 +++---------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3 index 53f9ef1b1..d1fe06a79 100644 --- a/src/engine/x86-64/X86_64MasmRegs.v3 +++ b/src/engine/x86-64/X86_64MasmRegs.v3 @@ -193,5 +193,5 @@ component X86_64MasmRegs { } } -def nothing = X86_64MasmRegs.FAST_SPC_EXEC_ENV.frameSize = X86_64InterpreterFastCallFrame.size; -def nothing2 = X86_64MasmRegs.FAST_SPC_EXEC_ENV.vfp_slot = MasmAddr(X86_64MasmRegs.FAST_SPC_EXEC_ENV.sp, X86_64InterpreterFastCallFrame.vfp.offset); +def nothing = X86_64MasmRegs.FAST_SPC_EXEC_ENV.frameSize = 0; +def nothing2 = X86_64MasmRegs.FAST_SPC_EXEC_ENV.vfp_slot = MasmAddr(X86_64MasmRegs.FAST_SPC_EXEC_ENV.sp, X86_64InterpreterFrame.vfp.offset); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index e7b61881c..6249270b4 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -47,29 +47,16 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } - // TODO do we need to spill VFP if the function has no locals? when exactly is spilling VFP needed? - // and can it be determined independent of call site? 
def emitFastPrologue() { - // Allocate (very cheap) stack frame - masm.emit_subw_r_i(regs.sp, frame.frameSize); - // spill VFP - masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); - // Compute VFP = VSP - sig.params.length * SLOT_SIZE - masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); // XXX: use 3-addr adjustment of VFP + // Compute VFP = VSP - sig.params.length * SLOT_SIZE (no native stack frame needed) + masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); - asm.nop1(); } def emitFastEpilogue1() { - asm.nop1(); } def emitFastEpilogue2() { - asm.nop1(); - asm.nop1(); - // restore VFP + // Restore VFP from interpreter frame (always in sync; no fast frame was allocated) masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); - masm.emit_addw_r_i(regs.sp, frame.frameSize); - asm.nop1(); - asm.nop1(); } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { From 9fbccb5b0d4fa6a5e1067a96214afcc31fdb2dea Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 1 Apr 2026 12:32:03 -0400 Subject: [PATCH 45/55] Add FAST_CALL 0-47 for each unused bytecode, and minor merge fixes --- src/engine/BytecodeIterator.v3 | 56 +++- src/engine/CodeValidator.v3 | 86 +++++- src/engine/Module.v3 | 7 +- src/engine/Opcodes.v3 | 269 +++++++++++------- src/engine/compiler/SinglePassCompiler.v3 | 3 +- src/engine/v3/V3Interpreter.v3 | 4 +- src/engine/x86-64/X86_64Interpreter.v3 | 51 +++- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 14 +- src/engine/x86-64/X86_64Target.v3 | 6 +- src/util/BytecodeVisitor.v3 | 2 +- 10 files changed, 368 insertions(+), 130 deletions(-) diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3 index 1f1b58c4e..1a1c9845f 100644 --- a/src/engine/BytecodeIterator.v3 +++ b/src/engine/BytecodeIterator.v3 @@ -217,7 +217,6 @@ class BytecodeIterator { BR_TABLE => v.visit_BR_TABLE(cp.read_labels()); RETURN => v.visit_RETURN(); CALL => 
v.visit_CALL(read_FUNC()); - FAST_CALL => v.visit_FAST_CALL(read_FUNC()); CALL_INDIRECT => v.visit_CALL_INDIRECT(read_SIG(), read_TABLE()); RETURN_CALL => v.visit_RETURN_CALL(read_FUNC()); RETURN_CALL_INDIRECT => v.visit_RETURN_CALL_INDIRECT(read_SIG(), read_TABLE()); @@ -780,6 +779,61 @@ class BytecodeIterator { RESUME_THROW => v.visit_RESUME_THROW(read_CONT(), read_TAG(), read_HANDLERS()); RESUME_THROW_REF => v.visit_RESUME_THROW_REF(read_CONT(), read_HANDLERS()); SWITCH => v.visit_SWITCH(read_CONT(), read_TAG()); + + /* here, we require that replacing CALL with FAST_CALL does not touch the + * operand, so that the original function can still be recovered from the bytecode itself + * + * in other places, where we have the module, we can go direct from bytecode to func + */ + // FIXME wrap into _ clause + FAST_CALL0 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL1 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL2 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL3 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL4 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL5 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL6 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL7 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL8 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL9 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL10 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL11 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL12 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL13 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL14 => 
v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL15 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL16 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL17 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL18 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL19 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL20 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL21 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL22 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL23 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL24 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL25 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL26 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL27 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL28 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL29 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL30 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL31 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL32 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL33 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL34 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL35 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL36 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL37 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL38 => 
v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL39 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL40 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL41 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL42 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL43 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL44 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL45 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL46 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL47 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); } } def trace(out: StringBuilder, module: Module, tracer: InstrTracer) { diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index 24b25887c..281b14a01 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -425,16 +425,90 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e for (i < module.exports.length) { def ex = module.exports[i]; if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { - this.func.replaceCall(opcode_pos); + if (Trace.validation) Trace.OUT.puts(" function declared as fast: "); + + var fast_idx = -1; + def fast_funcs = module.fast_funcs; + // look for existing FAST_CALL instruction allocated for this function + for (i < fast_funcs.length) { + if (func == fast_funcs[i]) { + fast_idx = i; + if (Trace.validation) Trace.OUT.put1("allocated as FAST_CALL%d, ", fast_idx); + break; + } + } + // not found? 
allocate FAST_CALL instruction, if there's space + if (fast_idx < 0) { + if (fast_funcs.length < 48) { + fast_idx = fast_funcs.length; + if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx); + fast_funcs.put(func); + } else { + if (Trace.validation) Trace.OUT.puts("not found, FAST_CALL table is full, "); + } + } + // replace the bytecode, if it's found or allocated + if (fast_idx >= 0) { + //if (Trace.validation) Trace.OUT.put2("replaceCall(opcode_pos, fast_idx)\n", opcode_pos, fast_idx); + if (Trace.validation) Trace.OUT.puts("replacing call\n"); + this.func.replaceCall(opcode_pos, fast_idx); + } else { + if (Trace.validation) Trace.OUT.puts("not replacing\n"); + } } } } - FAST_CALL => { - var func = parser.readFuncRef(); - if (func == null) return; - checkSignature(func.sig); - } + // code should have FAST_CALL replaced after CALL + + FAST_CALL0 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL1 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL2 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL3 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL4 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL5 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL6 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL7 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL8 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL9 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL10 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL11 => 
System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL12 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL13 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL14 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL15 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL16 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL17 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL18 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL19 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL20 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL21 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL22 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL23 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL24 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL25 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL26 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL27 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL28 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL29 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL30 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL31 => 
System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL32 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL33 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL34 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL35 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL36 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL37 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL38 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL39 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL40 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL41 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL42 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL43 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL44 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL45 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL46 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL47 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); CALL_INDIRECT => { var sig = parser.readSigRef(); var table = parser.readTableRef(); diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index 4b37e48f0..c7438ecba 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -17,6 +17,7 @@ class Module(filename: string) { def exports = Vector<(string, Decl)>.new(); def 
elems = Vector.new(); def data = Vector.new(); + def fast_funcs = Vector.new(); def custom_sections = Vector.new(); var probes: Array>; var dyn_probes: Vector<(int, int, Probe)>; @@ -170,14 +171,16 @@ class FuncDecl(sig_index: int) extends Decl { if (cur_bytecode == orig_bytecode) return; cur_bytecode[pc] = orig_bytecode[pc]; } - def replaceCall(pc: int) { + def replaceCall(pc: int, idx: int) { // "orig" will become a copy of the original code, to allow in-place modification of old code if (cur_bytecode == orig_bytecode) orig_bytecode = Arrays.dup(orig_bytecode); + // sanity check if (cur_bytecode[pc] != Opcode.CALL.code) { def realOp = Opcodes.find(0, cur_bytecode[pc]); System.error("replace bytecode", Strings.format1("not replacing call (got %s)", realOp.mnemonic)); } - cur_bytecode[pc] = byte.!(Opcode.FAST_CALL.code); + cur_bytecode[pc] = byte.!(Opcodes.indexToFastCall(idx).code); + // do NOT replace the operands, as a convenience for BytecodeIterator } def reset() -> this { if (cur_bytecode == orig_bytecode) return; diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index f05052280..db9c56009 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -612,111 +612,56 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: SWITCH (0x00, 0xE6, "switch", imm.CONT_TAG, null) // fast call instructions - FAST_CALL (0x00, 0x17, "fast_call", imm.FUNC, null) - //FAST_CALL0 (0x00, 0x27, "fast_call0", imm.NONE, null), - //FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.NONE, null), - //FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.NONE, null), - //FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.NONE, null), - //FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.NONE, null), - //FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.NONE, null), - //FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.NONE, null), - //FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.NONE, null), - //FAST_CALL8 (0x00, 0xCC, "fast_call8", imm.NONE, null), - //FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.NONE, null), 
- //FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.NONE, null), - //FAST_CALL11 (0x00, 0xCF, "fast_call11", imm.NONE, null), - //FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.NONE, null), - //FAST_CALL13 (0x00, 0xD8, "fast_call13", imm.NONE, null), - //FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.NONE, null), - //FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.NONE, null), - //FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.NONE, null), - //FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.NONE, null), - //FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.NONE, null), - //FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.NONE, null), - //FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.NONE, null), - //FAST_CALL21 (0x00, 0xE0, "fast_call21", imm.NONE, null), - //FAST_CALL22 (0x00, 0xE1, "fast_call22", imm.NONE, null), - //FAST_CALL23 (0x00, 0xE2, "fast_call23", imm.NONE, null), - //FAST_CALL24 (0x00, 0xE3, "fast_call24", imm.NONE, null), - //FAST_CALL25 (0x00, 0xE4, "fast_call25", imm.NONE, null), - //FAST_CALL26 (0x00, 0xE5, "fast_call26", imm.NONE, null), - //FAST_CALL27 (0x00, 0xE6, "fast_call27", imm.NONE, null), - //FAST_CALL28 (0x00, 0xE7, "fast_call28", imm.NONE, null), - //FAST_CALL29 (0x00, 0xE8, "fast_call29", imm.NONE, null), - //FAST_CALL30 (0x00, 0xE9, "fast_call30", imm.NONE, null), - //FAST_CALL31 (0x00, 0xEA, "fast_call31", imm.NONE, null), - //FAST_CALL32 (0x00, 0xEB, "fast_call32", imm.NONE, null), - //FAST_CALL33 (0x00, 0xEC, "fast_call33", imm.NONE, null), - //FAST_CALL34 (0x00, 0xED, "fast_call34", imm.NONE, null), - //FAST_CALL35 (0x00, 0xEE, "fast_call35", imm.NONE, null), - //FAST_CALL36 (0x00, 0xEF, "fast_call36", imm.NONE, null), - //FAST_CALL37 (0x00, 0xF2, "fast_call37", imm.NONE, null), - //FAST_CALL38 (0x00, 0xF3, "fast_call38", imm.NONE, null), - //FAST_CALL39 (0x00, 0xF4, "fast_call39", imm.NONE, null), - //FAST_CALL40 (0x00, 0xF5, "fast_call40", imm.NONE, null), - //FAST_CALL41 (0x00, 0xF6, "fast_call41", imm.NONE, null), - //FAST_CALL42 (0x00, 0xF7, "fast_call42", 
imm.NONE, null), - //FAST_CALL43 (0x00, 0xF8, "fast_call43", imm.NONE, null), - //FAST_CALL44 (0x00, 0xF9, "fast_call44", imm.NONE, null), - //FAST_CALL45 (0x00, 0xFA, "fast_call45", imm.NONE, null), - //FAST_CALL46 (0x00, 0xFF, "fast_call46", imm.NONE, null) + FAST_CALL0 (0x00, 0x27, "fast_call0", imm.NONE, null), + FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.NONE, null), + FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.NONE, null), + FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.NONE, null), + FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.NONE, null), + FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.NONE, null), + FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.NONE, null), + FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.NONE, null), + FAST_CALL8 (0x00, 0xCC, "fast_call8", imm.NONE, null), + FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.NONE, null), + FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.NONE, null), + FAST_CALL11 (0x00, 0xCF, "fast_call11", imm.NONE, null), + FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.NONE, null), + FAST_CALL13 (0x00, 0xD8, "fast_call13", imm.NONE, null), + FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.NONE, null), + FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.NONE, null), + FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.NONE, null), + FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.NONE, null), + FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.NONE, null), + FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.NONE, null), + FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.NONE, null), + FAST_CALL21 (0x00, 0xE0, "fast_call21", imm.NONE, null), + FAST_CALL22 (0x00, 0xE1, "fast_call22", imm.NONE, null), + FAST_CALL23 (0x00, 0xE2, "fast_call23", imm.NONE, null), + FAST_CALL24 (0x00, 0xE3, "fast_call24", imm.NONE, null), + FAST_CALL25 (0x00, 0xE4, "fast_call25", imm.NONE, null), + FAST_CALL26 (0x00, 0xE5, "fast_call26", imm.NONE, null), + FAST_CALL27 (0x00, 0xE6, "fast_call27", imm.NONE, null), + FAST_CALL28 (0x00, 0xE7, "fast_call28", imm.NONE, null), + FAST_CALL29 (0x00, 0xE8, "fast_call29", 
imm.NONE, null), + FAST_CALL30 (0x00, 0xE9, "fast_call30", imm.NONE, null), + FAST_CALL31 (0x00, 0xEA, "fast_call31", imm.NONE, null), + FAST_CALL32 (0x00, 0xEB, "fast_call32", imm.NONE, null), + FAST_CALL33 (0x00, 0xEC, "fast_call33", imm.NONE, null), + FAST_CALL34 (0x00, 0xED, "fast_call34", imm.NONE, null), + FAST_CALL35 (0x00, 0xEE, "fast_call35", imm.NONE, null), + FAST_CALL36 (0x00, 0xEF, "fast_call36", imm.NONE, null), + FAST_CALL37 (0x00, 0xF2, "fast_call37", imm.NONE, null), + FAST_CALL38 (0x00, 0xF3, "fast_call38", imm.NONE, null), + FAST_CALL39 (0x00, 0xF4, "fast_call39", imm.NONE, null), + FAST_CALL40 (0x00, 0xF5, "fast_call40", imm.NONE, null), + FAST_CALL41 (0x00, 0xF6, "fast_call41", imm.NONE, null), + FAST_CALL42 (0x00, 0xF7, "fast_call42", imm.NONE, null), + FAST_CALL43 (0x00, 0xF8, "fast_call43", imm.NONE, null), + FAST_CALL44 (0x00, 0xF9, "fast_call44", imm.NONE, null), + FAST_CALL45 (0x00, 0xFA, "fast_call45", imm.NONE, null), + FAST_CALL46 (0x00, 0xFF, "fast_call46", imm.NONE, null), + FAST_CALL47 (0x00, 0x17, "fast_call47", imm.FUNC, null) } -//def indexToFastCall(index: int) -> Opcode { -// var op: Opcode; -// match (index) { -// 0 => op = Opcode.FAST_CALL0; -// 1 => op = Opcode.FAST_CALL1; -// 2 => op = Opcode.FAST_CALL2; -// 3 => op = Opcode.FAST_CALL3; -// 4 => op = Opcode.FAST_CALL4; -// 5 => op = Opcode.FAST_CALL5; -// 6 => op = Opcode.FAST_CALL6; -// 7 => op = Opcode.FAST_CALL7; -// 8 => op = Opcode.FAST_CALL8; -// 9 => op = Opcode.FAST_CALL9; -// 10 => op = Opcode.FAST_CALL10; -// 11 => op = Opcode.FAST_CALL11; -// 12 => op = Opcode.FAST_CALL12; -// 13 => op = Opcode.FAST_CALL13; -// 14 => op = Opcode.FAST_CALL14; -// 15 => op = Opcode.FAST_CALL15; -// 16 => op = Opcode.FAST_CALL16; -// 17 => op = Opcode.FAST_CALL17; -// 18 => op = Opcode.FAST_CALL18; -// 19 => op = Opcode.FAST_CALL19; -// 20 => op = Opcode.FAST_CALL20; -// 21 => op = Opcode.FAST_CALL21; -// 22 => op = Opcode.FAST_CALL22; -// 23 => op = Opcode.FAST_CALL23; -// 24 => op 
= Opcode.FAST_CALL24; -// 25 => op = Opcode.FAST_CALL25; -// 26 => op = Opcode.FAST_CALL26; -// 27 => op = Opcode.FAST_CALL27; -// 28 => op = Opcode.FAST_CALL28; -// 29 => op = Opcode.FAST_CALL29; -// 30 => op = Opcode.FAST_CALL30; -// 31 => op = Opcode.FAST_CALL31; -// 32 => op = Opcode.FAST_CALL32; -// 33 => op = Opcode.FAST_CALL33; -// 34 => op = Opcode.FAST_CALL34; -// 35 => op = Opcode.FAST_CALL35; -// 36 => op = Opcode.FAST_CALL36; -// 37 => op = Opcode.FAST_CALL37; -// 38 => op = Opcode.FAST_CALL38; -// 39 => op = Opcode.FAST_CALL39; -// 40 => op = Opcode.FAST_CALL40; -// 41 => op = Opcode.FAST_CALL41; -// 42 => op = Opcode.FAST_CALL42; -// 43 => op = Opcode.FAST_CALL43; -// 44 => op = Opcode.FAST_CALL44; -// 45 => op = Opcode.FAST_CALL45; -// 46 => op = Opcode.FAST_CALL46; -// 47 => op = Opcode.FAST_CALL47; -// _ => System.error("indexToFastCall", "out of range"); -// } -// return op; -//} // Enumeration of the different kinds of immediates to opcodes. enum ImmKind { @@ -905,9 +850,14 @@ component Opcodes { def code_pages = [page_FB, page_FC, page_FD, page_FE]; def var longestName: int; def var num_subpages: int; + def var fast_calls: Array; private var nameMap: HashMap; new() { + fast_calls = Array.new(48); + for (i < 48) { + fast_calls[i] = indexToFastCall(i); + } for (op in Opcode) { if (op == Opcode.INVALID) continue; init(op); @@ -916,7 +866,8 @@ component Opcodes { attributes[InternalOpcode.PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.WHAMM_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.BREAK_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; - attributes[Opcode.FAST_CALL.tag] = OpcodeAttribute.INTERNAL; + for (op in fast_calls) + attributes[op.tag] = OpcodeAttribute.INTERNAL; for (op in [Opcode.END, Opcode.I32_CONST, Opcode.I64_CONST, Opcode.F32_CONST, Opcode.F64_CONST, Opcode.GLOBAL_GET, Opcode.REF_NULL, Opcode.REF_FUNC, Opcode.STRUCT_NEW, 
Opcode.STRUCT_NEW_DEFAULT, @@ -1229,6 +1180,116 @@ component Opcodes { } } } + def indexToFastCall(index: int) -> Opcode { + var op: Opcode; + match (index) { + 0 => op = Opcode.FAST_CALL0; + 1 => op = Opcode.FAST_CALL1; + 2 => op = Opcode.FAST_CALL2; + 3 => op = Opcode.FAST_CALL3; + 4 => op = Opcode.FAST_CALL4; + 5 => op = Opcode.FAST_CALL5; + 6 => op = Opcode.FAST_CALL6; + 7 => op = Opcode.FAST_CALL7; + 8 => op = Opcode.FAST_CALL8; + 9 => op = Opcode.FAST_CALL9; + 10 => op = Opcode.FAST_CALL10; + 11 => op = Opcode.FAST_CALL11; + 12 => op = Opcode.FAST_CALL12; + 13 => op = Opcode.FAST_CALL13; + 14 => op = Opcode.FAST_CALL14; + 15 => op = Opcode.FAST_CALL15; + 16 => op = Opcode.FAST_CALL16; + 17 => op = Opcode.FAST_CALL17; + 18 => op = Opcode.FAST_CALL18; + 19 => op = Opcode.FAST_CALL19; + 20 => op = Opcode.FAST_CALL20; + 21 => op = Opcode.FAST_CALL21; + 22 => op = Opcode.FAST_CALL22; + 23 => op = Opcode.FAST_CALL23; + 24 => op = Opcode.FAST_CALL24; + 25 => op = Opcode.FAST_CALL25; + 26 => op = Opcode.FAST_CALL26; + 27 => op = Opcode.FAST_CALL27; + 28 => op = Opcode.FAST_CALL28; + 29 => op = Opcode.FAST_CALL29; + 30 => op = Opcode.FAST_CALL30; + 31 => op = Opcode.FAST_CALL31; + 32 => op = Opcode.FAST_CALL32; + 33 => op = Opcode.FAST_CALL33; + 34 => op = Opcode.FAST_CALL34; + 35 => op = Opcode.FAST_CALL35; + 36 => op = Opcode.FAST_CALL36; + 37 => op = Opcode.FAST_CALL37; + 38 => op = Opcode.FAST_CALL38; + 39 => op = Opcode.FAST_CALL39; + 40 => op = Opcode.FAST_CALL40; + 41 => op = Opcode.FAST_CALL41; + 42 => op = Opcode.FAST_CALL42; + 43 => op = Opcode.FAST_CALL43; + 44 => op = Opcode.FAST_CALL44; + 45 => op = Opcode.FAST_CALL45; + 46 => op = Opcode.FAST_CALL46; + 47 => op = Opcode.FAST_CALL47; + _ => System.error("indexToFastCall", "out of range"); + } + return op; + } + def fastCallToIndex(op: Opcode) -> int { + var idx: int; + match (op) { + FAST_CALL0 => idx = 0; + FAST_CALL1 => idx = 1; + FAST_CALL2 => idx = 2; + FAST_CALL3 => idx = 3; + FAST_CALL4 => idx = 4; 
+ FAST_CALL5 => idx = 5; + FAST_CALL6 => idx = 6; + FAST_CALL7 => idx = 7; + FAST_CALL8 => idx = 8; + FAST_CALL9 => idx = 9; + FAST_CALL10 => idx = 10; + FAST_CALL11 => idx = 11; + FAST_CALL12 => idx = 12; + FAST_CALL13 => idx = 13; + FAST_CALL14 => idx = 14; + FAST_CALL15 => idx = 15; + FAST_CALL16 => idx = 16; + FAST_CALL17 => idx = 17; + FAST_CALL18 => idx = 18; + FAST_CALL19 => idx = 19; + FAST_CALL20 => idx = 20; + FAST_CALL21 => idx = 21; + FAST_CALL22 => idx = 22; + FAST_CALL23 => idx = 23; + FAST_CALL24 => idx = 24; + FAST_CALL25 => idx = 25; + FAST_CALL26 => idx = 26; + FAST_CALL27 => idx = 27; + FAST_CALL28 => idx = 28; + FAST_CALL29 => idx = 29; + FAST_CALL30 => idx = 30; + FAST_CALL31 => idx = 31; + FAST_CALL32 => idx = 32; + FAST_CALL33 => idx = 33; + FAST_CALL34 => idx = 34; + FAST_CALL35 => idx = 35; + FAST_CALL36 => idx = 36; + FAST_CALL37 => idx = 37; + FAST_CALL38 => idx = 38; + FAST_CALL39 => idx = 39; + FAST_CALL40 => idx = 40; + FAST_CALL41 => idx = 41; + FAST_CALL42 => idx = 42; + FAST_CALL43 => idx = 43; + FAST_CALL44 => idx = 44; + FAST_CALL45 => idx = 45; + FAST_CALL46 => idx = 46; + FAST_CALL47 => idx = 47; + _ => System.error("fastCallToIndex", "not a FAST_CALL instruction"); + } + return idx; + } } // Renders instructions as text. diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index bb6f1587d..b7447dfb2 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -3577,7 +3577,8 @@ def funcCanInline(decl: FuncDecl) -> InlineConfig { // Cannot handle control flow yet. IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default; // These opcodes require swapping the instance. 
- THROW, CALL, FAST_CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, + // XXX should include FAST_CALL here, except that this code is superseded by inlining PR + THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; // Load/store opcodes require either the memory base or the instance. I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index edd48abc0..3d8b33368 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack { RETURN => { doReturn(frame.fp, frame.func.sig); } - CALL, FAST_CALL => { + CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46, FAST_CALL47 => { var func_index = codeptr.read_uleb32(); var f = frame.func.instance.functions[func_index]; return doCallFunction(f); @@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack { // XXX: use read_opcode_and_skip() var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl); match (opcode) { - CALL, CALL_REF, FAST_CALL => { + CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, 
FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46, FAST_CALL47 => { codeptr.skip_leb(); frame.pc = codeptr.pos; } diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 780f479f5..eb723331b 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1311,7 +1311,56 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { genPopFrameAndRet(); // FAST_CALL - bindHandler(Opcode.FAST_CALL); + // TODO patch the dispatch table so it goes to the code directly, + // instead of this fast function lookup + bindHandler(Opcode.FAST_CALL0); + bindHandler(Opcode.FAST_CALL1); + bindHandler(Opcode.FAST_CALL2); + bindHandler(Opcode.FAST_CALL3); + bindHandler(Opcode.FAST_CALL4); + bindHandler(Opcode.FAST_CALL5); + bindHandler(Opcode.FAST_CALL6); + bindHandler(Opcode.FAST_CALL7); + bindHandler(Opcode.FAST_CALL8); + bindHandler(Opcode.FAST_CALL9); + bindHandler(Opcode.FAST_CALL10); + bindHandler(Opcode.FAST_CALL11); + bindHandler(Opcode.FAST_CALL12); + bindHandler(Opcode.FAST_CALL13); + bindHandler(Opcode.FAST_CALL14); + bindHandler(Opcode.FAST_CALL15); + bindHandler(Opcode.FAST_CALL16); + bindHandler(Opcode.FAST_CALL17); + bindHandler(Opcode.FAST_CALL18); + bindHandler(Opcode.FAST_CALL19); + bindHandler(Opcode.FAST_CALL20); + bindHandler(Opcode.FAST_CALL21); + bindHandler(Opcode.FAST_CALL22); + bindHandler(Opcode.FAST_CALL23); + bindHandler(Opcode.FAST_CALL24); + bindHandler(Opcode.FAST_CALL25); + 
bindHandler(Opcode.FAST_CALL26); + bindHandler(Opcode.FAST_CALL27); + bindHandler(Opcode.FAST_CALL28); + bindHandler(Opcode.FAST_CALL29); + bindHandler(Opcode.FAST_CALL30); + bindHandler(Opcode.FAST_CALL31); + bindHandler(Opcode.FAST_CALL32); + bindHandler(Opcode.FAST_CALL33); + bindHandler(Opcode.FAST_CALL34); + bindHandler(Opcode.FAST_CALL35); + bindHandler(Opcode.FAST_CALL36); + bindHandler(Opcode.FAST_CALL37); + bindHandler(Opcode.FAST_CALL38); + bindHandler(Opcode.FAST_CALL39); + bindHandler(Opcode.FAST_CALL40); + bindHandler(Opcode.FAST_CALL41); + bindHandler(Opcode.FAST_CALL42); + bindHandler(Opcode.FAST_CALL43); + bindHandler(Opcode.FAST_CALL44); + bindHandler(Opcode.FAST_CALL45); + bindHandler(Opcode.FAST_CALL46); + bindHandler(Opcode.FAST_CALL47); //masm.emit_debugger_breakpoint(); var dispatchLabel = X86_64Label.new(); // genTagPush(BpTypeCode.I32.code); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 6249270b4..16813646f 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -1548,11 +1548,11 @@ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { masm.emit_store_curstack_vsp(regs.vsp); masm.emit_debugger_breakpoint(); - asm.movq_r_r(Target.V3_PARAM_GPRS[1], G(regs.func_arg)); // function (rdx) - asm.movq_r_i(Target.V3_PARAM_GPRS[2], int.!(Pointer.atObject(ic) - Pointer.NULL)); // load into rdx + asm.movq_r_r(G(Target.V3_PARAM_GPRS[1]), G(regs.func_arg)); // function (rdx) + asm.movq_r_i(G(Target.V3_PARAM_GPRS[2]), int.!(Pointer.atObject(ic) - Pointer.NULL)); // load into rdx // dispatch is in r14 (don't overwrite, just access directly) // Load {null} for the receiver. - asm.movq_r_i(Target.V3_PARAM_GPRS[0], 0); + asm.movq_r_i(G(Target.V3_PARAM_GPRS[0]), 0); // Call {X86_64Spc.fastCompile} directly. 
//masm.emit_debugger_breakpoint(); masm.emit_call_abs(codePointer(X86_64Spc.fastCompile)); @@ -1560,12 +1560,12 @@ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { asm.q.add_r_i(R.RSP, Pointer.SIZE); // pop function off stack // Check for non-null abrupt return. var unwind = X86_64Label.new(); - asm.q.cmp_r_i(Target.V3_RET_GPRS[2], 0); + asm.q.cmp_r_i(G(Target.V3_RET_GPRS[2]), 0); asm.jc_rel_near(C.NZ, unwind); // Tail-call the result of the compile. var scratch = X86_64Regs.R9; - asm.movq_r_r(scratch, Target.V3_RET_GPRS[1]); // entrypoint - asm.movq_r_r(G(regs.func_arg), Target.V3_RET_GPRS[0]); // function + asm.movq_r_r(scratch, G(Target.V3_RET_GPRS[1])); // entrypoint + asm.movq_r_r(G(regs.func_arg), G(Target.V3_RET_GPRS[0])); // function //for (i < all_ivars.length) { // asm.popq_r(all_ivars[all_ivars.length - i - 1].0); @@ -1581,7 +1581,7 @@ def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { // Simply return the {Throwable} object. ?? asm.bind(unwind); - asm.movq_r_r(Target.V3_RET_GPRS[0], Target.V3_RET_GPRS[2]); + asm.movq_r_r(G(Target.V3_RET_GPRS[0]), G(Target.V3_RET_GPRS[2])); asm.ret(); } def genFastNopStub(ic: X86_64InterpreterCode, w: DataWriter) { diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index c24553c74..bbb836cbb 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -238,6 +238,7 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def onModuleFinish(module: Module, size: u32, err: ErrorGen) { disableLazyNameDecodingDuringGC(module); + fastCompileEntireModule(module, size, false, err, 1024); } def onFuncValidationFinish(module: Module, func: FuncDecl, err: ErrorGen) { if (err != null && !err.ok()) return; @@ -352,11 +353,6 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { module.target_module.spc_code.keepAlive(); Debug.afterCompileModule(module); } - - // XXX not an exhaustive way to add stubs. 
but what is? - def onModuleFinish(module: Module, size: u32, err: ErrorGen) { - fastCompileEntireModule(module, size, false, err, 1024); - } } // Base class of all strategies that use SPC. diff --git a/src/util/BytecodeVisitor.v3 b/src/util/BytecodeVisitor.v3 index adc822bfe..cacd6c738 100644 --- a/src/util/BytecodeVisitor.v3 +++ b/src/util/BytecodeVisitor.v3 @@ -70,7 +70,7 @@ class BytecodeVisitor { def visit_BR_TABLE (labels: Range) { visitControl(Opcode.BR_TABLE); } def visit_RETURN () { visitControl(Opcode.RETURN); } def visit_CALL (func_index: u31) { visitCallDirect(Opcode.CALL, func_index, SLOW); } - def visit_FAST_CALL (func_index: u31) { visitCallDirect(Opcode.FAST_CALL, func_index, FAST); } + def visit_FAST_CALL (fast_index: int, func_index: u31) { visitCallDirect(Opcodes.indexToFastCall(fast_index), func_index, FAST); } def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, SLOW); } def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, TAIL); } def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, TAIL); } From 15f87ea03aed7b57649f269af903542f8f14e344 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Wed, 1 Apr 2026 18:21:42 -0400 Subject: [PATCH 46/55] Patch the dispatch table with fast-compiled functions (and use them) + others * added FastIntTuning.useFastFunctions which could control use of fast functions * added Mmap.reserve32 to use MAP_32BIT to force 4-byte addresses in dispatch table * allow PROT_WRITE in executable code for the dispatch table patching - probably needs to be refined to be just the table region * added u32leb_size helper (should really be removed) * add fast_call_idx so that we can get opcode from function later for patching * advance PC beyond operands (which remain in place, orig function index) --- src/engine/CodeValidator.v3 | 1 + 
src/engine/Module.v3 | 1 + src/engine/Tuning.v3 | 1 + src/engine/x86-64/Mmap.v3 | 10 +++++++ src/engine/x86-64/X86_64Interpreter.v3 | 12 ++++++++ src/engine/x86-64/X86_64PreGenStubs.v3 | 4 +-- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 7 +++++ src/engine/x86-64/X86_64Target.v3 | 29 +++++++++++++++++-- 8 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index 281b14a01..d7b939791 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -441,6 +441,7 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e if (fast_idx < 0) { if (fast_funcs.length < 48) { fast_idx = fast_funcs.length; + func.fast_call_idx = fast_idx; if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx); fast_funcs.put(func); } else { diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index c7438ecba..b19b3f0b9 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -143,6 +143,7 @@ class FuncDecl(sig_index: int) extends Decl { var frame_var_tags: Array; // value tags for frame variables var target_code: TargetCode; var fast_target_code: TargetCode; + var fast_call_idx: int = -1; var tierup_trigger: int = int.max; var handlers = FuncHandlerInfo.new(); diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3 index c1c12d3e1..2ca212cdc 100644 --- a/src/engine/Tuning.v3 +++ b/src/engine/Tuning.v3 @@ -42,6 +42,7 @@ component FastIntTuning { def inlineGlobalAccess = true; // enable inline access of (primitive) globals def stealFlagBitForMemory64 = true; // use a bit in the memarg flags for memory64 def whammProbeTrampolineNumPages = 1024; + def useFastFunctions = true; // treat functions exported with `fast:` in the name as fast functions } // Tuning settings for the single-pass compiler that have no effect on correctness. 
diff --git a/src/engine/x86-64/Mmap.v3 b/src/engine/x86-64/Mmap.v3 index 5305ef0c6..25621ab12 100644 --- a/src/engine/x86-64/Mmap.v3 +++ b/src/engine/x86-64/Mmap.v3 @@ -18,6 +18,16 @@ component Mmap { RiGc.registerFinalizer(mapping, range.unmap); return mapping; } + def reserve32(size: u64, prot: int) -> Mapping { + var flags = LinuxConst.MAP_PRIVATE | LinuxConst.MAP_ANONYMOUS | 0x40; // 0x40 = MAP_32BIT + var r = Linux.syscall(LinuxConst.SYS_mmap, (Pointer.NULL, size, prot, flags, 0, 0)); + if (r.0 == -1) return null; + var start = Pointer.NULL + r.0, end = start + i64.view(size); + var range = MemoryRange.new(start, end); + var mapping = Mapping.new(range); + RiGc.registerFinalizer(mapping, range.unmap); + return mapping; + } def protect(start: Pointer, size: u64, prot: int) -> bool { var r = Linux.syscall(LinuxConst.SYS_mprotect, (start, size, prot)); return r.0 == 0; diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index eb723331b..ac028fe36 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1361,12 +1361,24 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.FAST_CALL45); bindHandler(Opcode.FAST_CALL46); bindHandler(Opcode.FAST_CALL47); + masm.emit_intentional_crash(); //masm.emit_debugger_breakpoint(); var dispatchLabel = X86_64Label.new(); // genTagPush(BpTypeCode.I32.code); // asm.movq_m_i(vsph[0].value, 770); // incrementVsp(); + /* TODO What should happen in a FAST_CALL? + * + * Ideally, we've patched the dispatch table with exactly what appears in fast_target_code + * so it instantly jumps there and so we don't have to set up the jump first. + * + * Fast function implementation should include code to skip the original operand as + * part of incrementing pc (will be done over in SPC). + * + * But, we could keep this for a quasi-fast call? 
+ */ + genReadUleb32(r_tmp1); asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_functions)); asm.movq_r_m(func_arg, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); diff --git a/src/engine/x86-64/X86_64PreGenStubs.v3 b/src/engine/x86-64/X86_64PreGenStubs.v3 index 4252307d2..d836db938 100644 --- a/src/engine/x86-64/X86_64PreGenStubs.v3 +++ b/src/engine/x86-64/X86_64PreGenStubs.v3 @@ -222,8 +222,8 @@ component X86_64PreGenStubs { ic.header.probedDispatchTableOffset, ic.header.fastDispatchTableOffset); - // Write-protect the executable code for security and debugging - Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_EXEC); + // XXX: PROT_WRITE included to allow runtime dispatch table patching + Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_WRITE | Mmap.PROT_EXEC); // The host call stub is part of interpreter code (TODO: does it need to be?) 
hostCallStub.start = ic.start + ic.header.hostCallStubOffset; diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 16813646f..cf5c5a715 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -3,6 +3,11 @@ // XXX: reduce duplication with MacroAssembler def G = X86_64MasmRegs.toGpr, X = X86_64MasmRegs.toXmmr; +def u32leb_size(v: int) -> int { + var n = 1, data = u32.view(v); + while (data >= 0x80) { data = data >> 7; n++; } + return n; +} def R: X86_64Regs; def C: X86_64Conds; def A(ma: MasmAddr) -> X86_64Addr { @@ -48,6 +53,8 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } def emitFastPrologue() { + // Advance r_ip past the ULEB32 operand (original CALL function index) + asm.add_r_i(G(X86_64MasmRegs.INT_EXEC_ENV.ip), u32leb_size(func.func_index)); // Compute VFP = VSP - sig.params.length * SLOT_SIZE (no native stack frame needed) masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index bbb836cbb..02ddf51d3 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -79,9 +79,32 @@ component Target { Trace.OUT.ln(); } } - f.fast_target_code = TargetCode(addr); + f.fast_target_code = TargetCode(addr); + patchFastCallDispatch(f, addr); Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); } + def patchFastCallDispatch(f: FuncDecl, addr: Pointer) { + if (f.fast_call_idx < 0) return; + def opcode = Opcodes.indexToFastCall(f.fast_call_idx); + def ic = X86_64PreGenStubs.getInterpreterCode(); + // XXX Patch only fast dispatch tables + def fast_offset = ic.header.fastDispatchTableOffset; + def entry = ic.start + fast_offset + opcode.code * FastIntTuning.dispatchEntrySize; + if 
(Trace.compiler) { + Trace.OUT.puts("patching dispatch type\n"); + Trace.OUT.put1("start 0x%x\n", u64.view(ic.start)); + Trace.OUT.put1("entry 0x%x\n", u64.view(entry)); + Trace.OUT.put1("addr 0x%x\n", u64.view(addr)); + } + // XXX we require 8 entry size because of `addr` position + match (FastIntTuning.dispatchEntrySize) { + 4 => entry.store(u32.view(addr)); + 8 => entry.store(long.view(addr)); + // 2-byte relative case would need a relative offset + } + if (Trace.compiler) Trace.OUT.puts("patched successfully\n"); + } + def pregenIntoFile(filename: string) -> ErrorBuilder { var data = System.fileLoad(filename); var err = ErrorBuilder.new().puts("interpreter generator: "); @@ -317,9 +340,9 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { } } - // copy and map code + // copy and map code (reserve32 ensures address fits in 32 bits for dispatch table patching) var length = u64.view(w.atEnd().pos) + ballast; - var mapping = Mmap.reserve(length, Mmap.PROT_WRITE), range = mapping.range; // TODO: handle failure + var mapping = Mmap.reserve32(length, Mmap.PROT_WRITE), range = mapping.range; // TODO: handle failure var masm = X86_64MacroAssembler.!(compiler.masm); masm.setTargetAddress(u64.view(range.start - Pointer.NULL)); Target.copyInto(mapping.range, 0, w); From e0a2adb9a2a8a1c4c5724295204044e53a665aec Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 11:56:02 -0400 Subject: [PATCH 47/55] Cleaned up some stuff --- src/engine/CodeValidator.v3 | 4 +- src/engine/Module.v3 | 2 +- src/engine/Opcodes.v3 | 4 +- src/engine/compiler/SinglePassCompiler.v3 | 74 ++++++++++--------- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 18 ----- 5 files changed, 44 insertions(+), 58 deletions(-) diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index d7b939791..0815a7714 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -422,7 +422,7 @@ class CodeValidator(extensions: Extension.set, limits: 
Limits, module: Module, e checkSignature(func.sig); // fast call: if function is exported with fast name, replace the bytecode with FAST_CALL - for (i < module.exports.length) { + if (FastIntTuning.useFastFunctions) for (i < module.exports.length) { def ex = module.exports[i]; if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { if (Trace.validation) Trace.OUT.puts(" function declared as fast: "); @@ -434,7 +434,7 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e if (func == fast_funcs[i]) { fast_idx = i; if (Trace.validation) Trace.OUT.put1("allocated as FAST_CALL%d, ", fast_idx); - break; + break; } } // not found? allocate FAST_CALL instruction, if there's space diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index b19b3f0b9..ecdabfcdf 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -179,7 +179,7 @@ class FuncDecl(sig_index: int) extends Decl { if (cur_bytecode[pc] != Opcode.CALL.code) { def realOp = Opcodes.find(0, cur_bytecode[pc]); System.error("replace bytecode", Strings.format1("not replacing call (got %s)", realOp.mnemonic)); - } + } cur_bytecode[pc] = byte.!(Opcodes.indexToFastCall(idx).code); // do NOT replace the operands, as a convenience for BytecodeIterator } diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index db9c56009..0998599ea 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -32,7 +32,6 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: RETURN_CALL_INDIRECT (0x00, 0x13, "return_call_indirect", imm.SIG_TABLE, null), CALL_REF (0x00, 0x14, "call_ref", imm.SIG, null), RETURN_CALL_REF (0x00, 0x15, "return_call_ref", imm.SIG, null), - // Fast handler custom instruction DELEGATE (0x00, 0x18, "delegate", imm.LABEL, null), CATCH_ALL (0x00, 0x19, "catch_all", imm.NONE, null), DROP (0x00, 0x1A, "drop", imm.NONE, null), @@ -866,8 +865,7 @@ component Opcodes { attributes[InternalOpcode.PROBE.code] = OpcodeAttribute.INTERNAL | 
OpcodeAttribute.PROBE; attributes[InternalOpcode.WHAMM_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.BREAK_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; - for (op in fast_calls) - attributes[op.tag] = OpcodeAttribute.INTERNAL; + for (op in fast_calls) attributes[op.tag] = OpcodeAttribute.INTERNAL; for (op in [Opcode.END, Opcode.I32_CONST, Opcode.I64_CONST, Opcode.F32_CONST, Opcode.F64_CONST, Opcode.GLOBAL_GET, Opcode.REF_NULL, Opcode.REF_FUNC, Opcode.STRUCT_NEW, Opcode.STRUCT_NEW_DEFAULT, diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index b7447dfb2..501f3851a 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -203,11 +203,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl pushSpcFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. - if (fast) { - emitFastPrologue(); - } else { - emitPrologue(); - } + emitPrologue(); // Visit all local declarations. 
it.dispatchLocalDecls(this); @@ -384,36 +380,43 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (!cond) bailout(Strings.format3(msg, p1, p2, p3)); } def emitPrologue() { - // Allocate stack frame - masm.emit_subw_r_i(regs.sp, frame.frameSize); - - // Spill VSP - emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state - // Spill wf: WasmFunction - masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg); - // Load wf.instance and spill - masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg); - masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance); - // Clear FrameAccessor - masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind - // Clear inlined whamm instance - if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) { - masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + if (!fast) { + // Allocate stack frame + masm.emit_subw_r_i(regs.sp, frame.frameSize); + + // Spill VSP + emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state + // Spill wf: WasmFunction + masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg); + // Load wf.instance and spill + masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg); + masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance); + // Clear FrameAccessor + masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind + // Clear inlined whamm instance + if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) { + masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + } + } else { + masm.emit_addw_r_i(X86_64MasmRegs.INT_EXEC_ENV.ip, uleb_size(func.func_index)); } // Compute VFP = VSP - sig.params.length * SLOT_SIZE masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); // XXX: use 3-addr adjustment of VFP masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); - // XXX: skip spilling of VFP - masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); - // Load 
instance.memories[0].start into MEM0_BASE and spill - if (module.memories.length > 0) { - // XXX: skip loading memory base if function doesn't access memory - masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0); - masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base); - masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base); + if (!fast) { + // XXX: skip spilling of VFP + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); + + // Load instance.memories[0].start into MEM0_BASE and spill + if (module.memories.length > 0) { + // XXX: skip loading memory base if function doesn't access memory + masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0); + masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base); + masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base); + } } } def visitLocalDecl(count: u32, vtc: ValueTypeCode) { @@ -2165,7 +2168,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(ret_label); ret_label = null; } - emitFastEpilogue1(); var results = sig.results; // fix values? 
@@ -2190,14 +2192,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_addw_r_i(regs.sp, frame.frameSize); // | masm.emit_ret(); // / } else { - emitFastEpilogue2(); + // Restore VFP from interpreter frame + masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); emitFastDispatch(); } } def emitFastDispatch() -> void; - def emitFastPrologue() -> void; - def emitFastEpilogue1() -> void; - def emitFastEpilogue2() -> void; def emitOsrEntry(osr_entry_label: MasmLabel, state: Array) { if (Trace.compiler) Trace.OUT.put1(" OSR (+%d)", osr_entry_label.create_pos).ln(); masm.bindLabel(osr_entry_label); @@ -3604,3 +3604,9 @@ type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool // The SPC emits a stub at {stub_label} for each handler in the function. The stub restores the // expected state of the environment, then jumps to {dest_label} to continue execution at handler. type SpcHandlerInfo(is_dummy: bool, func_end: bool, dest_label: MasmLabel, stub_label: MasmLabel, merge_state: Array); + +def uleb_size(v: int) -> int { + var n = 1, data = u32.view(v); + while (data >= 0x80) { data = data >> 7; n++; } + return n; +} diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index cf5c5a715..f6662b7e0 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -3,11 +3,6 @@ // XXX: reduce duplication with MacroAssembler def G = X86_64MasmRegs.toGpr, X = X86_64MasmRegs.toXmmr; -def u32leb_size(v: int) -> int { - var n = 1, data = u32.view(v); - while (data >= 0x80) { data = data >> 7; n++; } - return n; -} def R: X86_64Regs; def C: X86_64Conds; def A(ma: MasmAddr) -> X86_64Addr { @@ -52,19 +47,6 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } - 
def emitFastPrologue() { - // Advance r_ip past the ULEB32 operand (original CALL function index) - asm.add_r_i(G(X86_64MasmRegs.INT_EXEC_ENV.ip), u32leb_size(func.func_index)); - // Compute VFP = VSP - sig.params.length * SLOT_SIZE (no native stack frame needed) - masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); - masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); - } - def emitFastEpilogue1() { - } - def emitFastEpilogue2() { - // Restore VFP from interpreter frame (always in sync; no fast frame was allocated) - masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); - } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); From cd2e243dffb8df14454529fa74ec1ab0615c3049 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 12:10:17 -0400 Subject: [PATCH 48/55] Remove unused stubs and deps --- src/engine/x86-64/X86_64MacroAssembler.v3 | 28 ---- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 149 ------------------ 2 files changed, 177 deletions(-) diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 71ed05b91..8e50d5668 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -47,34 +47,6 @@ class X86_64MacroAssembler extends MacroAssembler { } } - def saveIVar(r: X86_64Gpr, ivars: Array<(X86_64Gpr, X86_64Addr)>) { - for (t in ivars) { - if (t.0 == r) asm.movq_m_r(t.1, r); - } - } - def saveCallerIVars(r_ip: X86_64Gpr, r_stp: X86_64Gpr, r_curpc: X86_64Gpr, - ivars: Array<(X86_64Gpr, X86_64Addr)>) { - saveIVar(r_ip, ivars); - saveIVar(r_stp, ivars); - if (!FeatureDisable.stacktraces) saveIVar(r_curpc, ivars); - } - def restoreReg(r: X86_64Gpr, ivars: Array<(X86_64Gpr, X86_64Addr)>) { - for (t in ivars) { - if (t.0 == r) asm.movq_r_m(r, t.1); - } - } - def restoreCallerIVars(r_ip: X86_64Gpr, r_stp: X86_64Gpr, r_eip: X86_64Gpr, - r_instance: X86_64Gpr, r_func_decl: 
X86_64Gpr, r_mem0_base: X86_64Gpr, r_vfp: X86_64Gpr, - ivars: Array<(X86_64Gpr, X86_64Addr)>) { - restoreReg(r_ip, ivars); - restoreReg(r_stp, ivars); - restoreReg(r_eip, ivars); - restoreReg(r_instance, ivars); - restoreReg(r_func_decl, ivars); - restoreReg(r_mem0_base, ivars); - restoreReg(r_vfp, ivars); - } - // Label operations def newLabel(create_pos: int) -> X86_64MasmLabel { return X86_64MasmLabel.new(create_pos, asm.newLabel()); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index f6662b7e0..a9f9dfbff 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -1410,8 +1410,6 @@ class X86_64SpcCompileStub extends RiUserCode { def V3_SPC_ENTRY_FUNC = X86_64PreGenFunc<(WasmFunction, Pointer, Pointer), Throwable>.new("v3-spc-entry", null, genSpcEntryFunc); def LAZY_COMPILE_STUB = X86_64PreGenStub.new("spc-lazy-compile", X86_64SpcCompileStub.new("lazy"), genLazyCompileStub); -def FAST_COMPILE_STUB = X86_64PreGenStub.new("spc-fast-compile", X86_64SpcCompileStub.new("fast"), genFastCompileStub); -def FAST_CALL_NOP = X86_64PreGenStub.new("spc-fast-nop", X86_64SpcCompileStub.new("fast"), genFastNopStub); def TIERUP_COMPILE_STUB = X86_64PreGenStub.new("spc-tierup-compile", X86_64SpcCompileStub.new("tierup"), genTierUpCompileStub); def TRAPS_STUB = X86_64SpcTrapsStub.new(); def TRAPS_PREGEN = X86_64PreGenStub.new("spc-trap", TRAPS_STUB, genTrapsStub); @@ -1467,141 +1465,6 @@ def genLazyCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_r(G(Target.V3_RET_GPRS[0]), G(Target.V3_RET_GPRS[2])); asm.ret(); } -/* This stub should: - * - save program state (i.e. 
an epilogue as if it was a call/new frame) - * - compile the function (given register constraints imposed by fast int) - * - rewrite the `fast_target_code` field with this new function - * - restore program state - * - jump into the new `fast_target_code` (or re-dispatch on itself) - */ -def genFastCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { - if (SpcTuning.disable) return; - var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); - var asm = X86_64Assembler.!(masm.asm); - var regs = X86_64MasmRegs.SPC_EXEC_ENV; - var func_arg = G(regs.func_arg); - - var xenv = X86_64MasmRegs.INT_EXEC_ENV; - - // TODO ensure that register use is compatible with fast-int usage - def r_mem0_base = G(xenv.mem0_base); - def r_vfp = G(xenv.vfp); - def r_vsp = G(xenv.vsp); - def r_stp = G(xenv.stp); - def r_ip = G(xenv.ip); - def r_eip = G(xenv.eip); - def r_func_decl = G(xenv.func_decl); - def r_instance = G(xenv.instance); - def r_curpc = G(xenv.curpc); - - def m_mem0_base = R.RSP.plus(X86_64InterpreterFrame.mem0_base.offset); - def m_vfp = R.RSP.plus(X86_64InterpreterFrame.vfp.offset); - def m_vsp = R.RSP.plus(X86_64InterpreterFrame.vsp.offset); - def m_stp = R.RSP.plus(X86_64InterpreterFrame.stp.offset); - def m_ip = R.RSP.plus(X86_64InterpreterFrame.ip.offset); - def m_eip = R.RSP.plus(X86_64InterpreterFrame.eip.offset); - def m_func_decl = R.RSP.plus(X86_64InterpreterFrame.func_decl.offset); - def m_instance = R.RSP.plus(X86_64InterpreterFrame.instance.offset); - def m_curpc = R.RSP.plus(X86_64InterpreterFrame.curpc.offset); - - def ivar_MEM0_BASE = (r_mem0_base, m_mem0_base); - def ivar_VFP = (r_vfp, m_vfp); - def ivar_VSP = (r_vsp, m_vsp); - def ivar_STP = (r_stp, m_stp); - def ivar_IP = (r_ip, m_ip); - def ivar_EIP = (r_eip, m_eip); - def ivar_FUNC_DECL = (r_func_decl, m_func_decl); - def ivar_INSTANCE = (r_instance, m_instance); - def ivar_CURPC = (r_curpc, m_curpc); - - def all_ivars = [ - ivar_MEM0_BASE, - ivar_VFP, - ivar_VSP, - ivar_STP, - ivar_IP, - 
ivar_EIP, - ivar_FUNC_DECL, - ivar_INSTANCE, - ivar_CURPC - ]; - - masm.emit_debugger_breakpoint(); - //for (i < all_ivars.length) { - // asm.pushq_r(all_ivars[i].0); - //} - masm.saveCallerIVars(r_ip, r_stp, r_curpc, all_ivars); - asm.pushq_r(G(regs.func_arg)); // push function onto stack - // saveCallerIVars (move to macro assembler) - // look at runtime calls in int, and int->spc calls - masm.emit_store_curstack_vsp(regs.vsp); - masm.emit_debugger_breakpoint(); - - asm.movq_r_r(G(Target.V3_PARAM_GPRS[1]), G(regs.func_arg)); // function (rdx) - asm.movq_r_i(G(Target.V3_PARAM_GPRS[2]), int.!(Pointer.atObject(ic) - Pointer.NULL)); // load into rdx - // dispatch is in r14 (don't overwrite, just access directly) - // Load {null} for the receiver. - asm.movq_r_i(G(Target.V3_PARAM_GPRS[0]), 0); - // Call {X86_64Spc.fastCompile} directly. - //masm.emit_debugger_breakpoint(); - masm.emit_call_abs(codePointer(X86_64Spc.fastCompile)); - masm.emit_debugger_breakpoint(); - asm.q.add_r_i(R.RSP, Pointer.SIZE); // pop function off stack - // Check for non-null abrupt return. - var unwind = X86_64Label.new(); - asm.q.cmp_r_i(G(Target.V3_RET_GPRS[2]), 0); - asm.jc_rel_near(C.NZ, unwind); - // Tail-call the result of the compile. - var scratch = X86_64Regs.R9; - asm.movq_r_r(scratch, G(Target.V3_RET_GPRS[1])); // entrypoint - asm.movq_r_r(G(regs.func_arg), G(Target.V3_RET_GPRS[0])); // function - - //for (i < all_ivars.length) { - // asm.popq_r(all_ivars[all_ivars.length - i - 1].0); - //} - masm.restoreCallerIVars(r_ip, r_stp, r_eip, r_instance, r_func_decl, r_mem0_base, r_vfp, all_ivars); - masm.emit_load_curstack_vsp(regs.vsp); - - masm.emit_debugger_breakpoint(); - asm.ijmp_r(scratch); // jump to entrypoint - - asm.invalid(); - asm.ret(); - - // Simply return the {Throwable} object. ?? 
- asm.bind(unwind); - asm.movq_r_r(G(Target.V3_RET_GPRS[0]), G(Target.V3_RET_GPRS[2])); - asm.ret(); -} -def genFastNopStub(ic: X86_64InterpreterCode, w: DataWriter) { - if (SpcTuning.disable) return; - var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); - var asm = X86_64Assembler.!(masm.asm); - var regs = X86_64MasmRegs.SPC_EXEC_ENV; - var func_arg = G(regs.func_arg); - - def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; - def r_ip = G(xenv.ip); - def r_dispatch = G(xenv.dispatch); - def r_tmp0 = G(xenv.tmp0); // RCX - def r_tmp1 = G(xenv.tmp1); // RDX - def ip_ptr = r_ip.plus(0); - - // simplified dispatch sequence - - var opcode = r_tmp0; - var base = r_tmp1; - - asm.movbzx_r_m(opcode, ip_ptr); - asm.inc_r(r_ip); - - // flattened 4 case - var addr = ic.start + ic.header.fastDispatchTableOffset; - asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); - asm.ijmp_r(base); - - asm.invalid(); -} def genTierUpCompileStub(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); @@ -1671,10 +1534,6 @@ component X86_64Spc { def invoke(wf: WasmFunction, sp: Pointer) -> Throwable { return V3_SPC_ENTRY_FUNC.get()(wf, sp, wf.decl.target_code.spc_entry); } - def setFastCompileFor(module: Module, decl: FuncDecl) { - if (Debug.runtime) Trace.OUT.put1("setFastCompile %q", decl.render(module.names, _)).ln(); - decl.fast_target_code = TargetCode(FAST_COMPILE_STUB.getEntry()); - } def setLazyCompileFor(module: Module, decl: FuncDecl) { if (Debug.runtime) Trace.OUT.put1("setLazyCompile %q", decl.render(module.names, _)).ln(); decl.target_code = TargetCode(LAZY_COMPILE_STUB.getEntry()); @@ -1698,14 +1557,6 @@ component X86_64Spc { var result = X86_64SpcStrategy.!(Execute.tiering).lazyCompile(wf); return (result.wf, result.entrypoint, result.thrown); } - private def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> (WasmFunction, Pointer, Throwable) { - 
// The global stub simply consults the execution strategy. - var result = X86_64ExecutionStrategy.!(Execute.tiering).fastCompile(wf, ic); // no condition that tiering uses SPC (int => fast SPC) - return (result.wf, result.entrypoint, result.thrown); - // need to compute _a_ new vfp - // bump the stack pointer? - // goal: avoid having to make new frame entirely - } private def tierupCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. var result = X86_64SpcStrategy.!(Execute.tiering).tierupCompile(wf); From db986c2f6df357b6f66d9a9a6b5151c272882682 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 12:22:34 -0400 Subject: [PATCH 49/55] Build fast SPC exec env at the same time as other exec envs, remove dup --- src/engine/compiler/SinglePassCompiler.v3 | 33 ------------ src/engine/x86-64/X86_64MasmRegs.v3 | 64 +++++++++++------------ 2 files changed, 32 insertions(+), 65 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 501f3851a..d602570ae 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -31,39 +31,6 @@ class SpcExecEnv { var runtime_ret1: Reg; var ret_throw: Reg; var scratch: Reg; - - def dup() -> SpcExecEnv { - def env = SpcExecEnv.new(); - - env.frameSize = this.frameSize; - env.vsp_slot = this.vsp_slot; - env.vfp_slot = this.vfp_slot; - env.pc_slot = this.pc_slot; - env.instance_slot = this.instance_slot; - env.inlined_instance_slot = this.inlined_instance_slot; - env.wasm_func_slot = this.wasm_func_slot; - env.mem0_base_slot = this.mem0_base_slot; - env.inlined_mem0_base_slot = this.inlined_mem0_base_slot; - env.accessor_slot = this.accessor_slot; - - env.sp = this.sp; - env.func_arg = this.func_arg; - env.vsp = this.vsp; - env.vfp = this.vfp; - env.mem0_base = this.mem0_base; - env.instance = this.instance; - env.runtime_arg0 = this.runtime_arg0; - 
env.runtime_arg1 = this.runtime_arg1; - env.runtime_arg2 = this.runtime_arg2; - env.runtime_arg3 = this.runtime_arg3; - env.runtime_arg4 = this.runtime_arg4; - env.runtime_ret0 = this.runtime_ret0; - env.runtime_ret1 = this.runtime_ret1; - env.ret_throw = this.ret_throw; - env.scratch = this.scratch; - - return env; - } } def INITIAL_VALUE_STACK_SIZE = 16; diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3 index d1fe06a79..73af95b67 100644 --- a/src/engine/x86-64/X86_64MasmRegs.v3 +++ b/src/engine/x86-64/X86_64MasmRegs.v3 @@ -88,26 +88,27 @@ component X86_64MasmRegs { return config; })(); - // Build both the SPC and INT execution environments together. - private def t = (fun -> (SpcExecEnv, IntExecEnv) { + // Build the SPC, fast-SPC, and INT execution environments together. + private def t = (fun -> (SpcExecEnv, SpcExecEnv, IntExecEnv) { var xspc = SpcExecEnv.new(); + var xfast = SpcExecEnv.new(); var xint = IntExecEnv.new(); - xint.sp = xspc.sp = RSP; - xint.func_arg = xspc.func_arg = RDX; // cache of frame (callee-restore) - xint.vsp = xspc.vsp = RSI; - xint.vfp = xspc.vfp = R11; - xint.mem0_base = xspc.mem0_base = R10; // cache of frame (callee-restore) - xint.instance = xspc.instance = RDI; // cache of frame (callee-restore) - xint.runtime_arg0 = xspc.runtime_arg0 = RSI; - xint.runtime_arg1 = xspc.runtime_arg1 = RDX; - xint.runtime_arg2 = xspc.runtime_arg2 = RCX; - xint.runtime_arg3 = xspc.runtime_arg3 = R8; - xint.runtime_arg4 = xspc.runtime_arg4 = R9; - xint.ret_throw = xspc.ret_throw = RAX; - xint.runtime_ret0 = xspc.runtime_ret0 = RAX; - xint.runtime_ret1 = xspc.runtime_ret1 = RDX; - xint.scratch = xspc.scratch = RBP; + xint.sp = xspc.sp = xfast.sp = RSP; + xint.func_arg = xspc.func_arg = xfast.func_arg = RDX; // cache of frame (callee-restore) + xint.vsp = xspc.vsp = xfast.vsp = RSI; + xint.vfp = xspc.vfp = xfast.vfp = R11; + xint.mem0_base = xspc.mem0_base = xfast.mem0_base = R10; // cache of frame (callee-restore) + 
xint.instance = xspc.instance = xfast.instance = RDI; // cache of frame (callee-restore) + xint.runtime_arg0 = xspc.runtime_arg0 = xfast.runtime_arg0 = RSI; + xint.runtime_arg1 = xspc.runtime_arg1 = xfast.runtime_arg1 = RDX; + xint.runtime_arg2 = xspc.runtime_arg2 = xfast.runtime_arg2 = RCX; + xint.runtime_arg3 = xspc.runtime_arg3 = xfast.runtime_arg3 = R8; + xint.runtime_arg4 = xspc.runtime_arg4 = xfast.runtime_arg4 = R9; + xint.ret_throw = xspc.ret_throw = xfast.ret_throw = RAX; + xint.runtime_ret0 = xspc.runtime_ret0 = xfast.runtime_ret0 = RAX; + xint.runtime_ret1 = xspc.runtime_ret1 = xfast.runtime_ret1 = RDX; + xint.scratch = xspc.scratch = xfast.scratch = RBP; xint.curpc = R15; xint.stp = RBX; @@ -127,31 +128,32 @@ component X86_64MasmRegs { def m = MasmAddr(xspc.sp, _); - xint.accessor_slot = xspc.accessor_slot = m(X86_64InterpreterFrame.accessor.offset); - xint.instance_slot = xspc.instance_slot = m(X86_64InterpreterFrame.instance.offset); - xint.mem0_base_slot = xspc.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset); - xint.pc_slot = xspc.pc_slot = m(X86_64InterpreterFrame.curpc.offset); - xint.vfp_slot = xspc.vfp_slot = m(X86_64InterpreterFrame.vfp.offset); - xint.vsp_slot = xspc.vsp_slot = m(X86_64InterpreterFrame.vsp.offset); - xint.wasm_func_slot = xspc.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset); - xint.ip_slot = xspc.inlined_mem0_base_slot = m(X86_64InterpreterFrame.ip.offset); - xint.stp_slot = xspc.inlined_instance_slot = m(X86_64InterpreterFrame.stp.offset); + xint.accessor_slot = xspc.accessor_slot = xfast.accessor_slot = m(X86_64InterpreterFrame.accessor.offset); + xint.instance_slot = xspc.instance_slot = xfast.instance_slot = m(X86_64InterpreterFrame.instance.offset); + xint.mem0_base_slot = xspc.mem0_base_slot = xfast.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset); + xint.pc_slot = xspc.pc_slot = xfast.pc_slot = m(X86_64InterpreterFrame.curpc.offset); + xint.vfp_slot = xspc.vfp_slot = xfast.vfp_slot = 
m(X86_64InterpreterFrame.vfp.offset); + xint.vsp_slot = xspc.vsp_slot = xfast.vsp_slot = m(X86_64InterpreterFrame.vsp.offset); + xint.wasm_func_slot = xspc.wasm_func_slot = xfast.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset); + xint.ip_slot = xspc.inlined_mem0_base_slot = xfast.inlined_mem0_base_slot = m(X86_64InterpreterFrame.ip.offset); + xint.stp_slot = xspc.inlined_instance_slot = xfast.inlined_instance_slot = m(X86_64InterpreterFrame.stp.offset); xint.func_decl_slot = m(X86_64InterpreterFrame.func_decl.offset); xint.code_slot = m(X86_64InterpreterFrame.code.offset); xint.eip_slot = m(X86_64InterpreterFrame.eip.offset); xint.frameSize = xspc.frameSize = X86_64InterpreterFrame.size; + xfast.frameSize = 0; - return (xspc, xint); + return (xspc, xfast, xint); })(); // The execution environment for single-pass compilation contexts. def SPC_EXEC_ENV = t.0; + // The execution environment for fast single-pass compilation contexts. + def FAST_SPC_EXEC_ENV = t.1; // The execution environment for interpreter compilation contexts. - def INT_EXEC_ENV = t.1; - - def FAST_SPC_EXEC_ENV = SPC_EXEC_ENV.dup(); + def INT_EXEC_ENV = t.2; // A register allocator for single-pass compilation contexts. 
def SPC_ALLOC = (fun -> RegAlloc { @@ -193,5 +195,3 @@ component X86_64MasmRegs { } } -def nothing = X86_64MasmRegs.FAST_SPC_EXEC_ENV.frameSize = 0; -def nothing2 = X86_64MasmRegs.FAST_SPC_EXEC_ENV.vfp_slot = MasmAddr(X86_64MasmRegs.FAST_SPC_EXEC_ENV.sp, X86_64InterpreterFrame.vfp.offset); From c6e6a5dda0f87d7a6b12af6f7c40f652fb23d2ef Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 12:45:04 -0400 Subject: [PATCH 50/55] Remove unused functions --- src/engine/compiler/SinglePassCompiler.v3 | 33 ----------------------- 1 file changed, 33 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index d602570ae..deac910f0 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -3530,39 +3530,6 @@ class MoveNode { var dstNext: MoveNode; // next in a list of successors } -// checks function bytecode to see if it can be inlined based on -// simple heuristics: length <= maxInlineBytecodeSize and straightline code. -def funcCanInline(decl: FuncDecl) -> InlineConfig { - var default = InlineConfig(false, false, false); - if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; - var bi = BytecodeIterator.new().reset(decl); - var swap_instance = false; - var swap_membase = false; - while (bi.more()) { - var op = bi.current(); - match (op) { - // Cannot handle control flow yet. - IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default; - // These opcodes require swapping the instance. 
- // XXX should include FAST_CALL here, except that this code is superceded by inlining PR - THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, - ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; - // Load/store opcodes require either the memory base or the instance. - I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, - V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, - I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { - var memarg = bi.immptr().read_MemArg(); - if (memarg.memory_index == 0) swap_membase = true; - else swap_instance = true; - } - _ => ; - } - bi.next(); - } - return InlineConfig(swap_membase, swap_instance, true); -} - -type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool); // Used to record the entry point of exception/suspension handlers. 
Jumping to {stub_label} allows From 2b4e4d59fbe2818888a347b8c3afbd4fe4d766ee Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 18:59:52 -0400 Subject: [PATCH 51/55] save/restore caller IVars in SPC and across frame reconstruction --- src/engine/compiler/SinglePassCompiler.v3 | 18 +++- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 82 +++++++++++++++++-- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index deac910f0..36e693e5e 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -2194,13 +2194,22 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return label; } def getSpcInlinedFrameIp() -> long; + def saveCallerIVars(); + def restoreDispatchTableReg(); + def restoreCallerIVars(); // Emit code to materialize stack frames for each inlined function. def emitReconstructStackFrames(frames: Array) -> int { Metrics.spc_static_reconst.val++; masm.emit_inc_metric(Metrics.spc_dynamic_reconst); - def real_frame = frames[0]; - masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + if (fast) { + // pc already saved + saveCallerIVars(); + } else { + def real_frame = frames[0]; + masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + } + // NOTE we could use interpreter-backed registers for these instead of allocating new regs // load instance var inst_reg = allocTmp(ValueKind.REF); masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); @@ -2306,11 +2315,14 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (space > 0) { masm.emit_addw_r_i(regs.sp, space); masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); + if (fast) { + restoreCallerIVars(); + restoreDispatchTableReg(); + } } } else { emit(); } - } def unsupported() { success = false; // XXX: add opcode diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 
b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index a9f9dfbff..bc5a02fd8 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -23,6 +23,54 @@ def KIND_F64 = SpcConsts.KIND_F64; def KIND_V128 = SpcConsts.KIND_V128; def KIND_REF = SpcConsts.KIND_REF; +def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + +def r_mem0_base = G(xenv.mem0_base); +def r_vfp = G(xenv.vfp); +def r_vsp = G(xenv.vsp); +def r_stp = G(xenv.stp); +def r_ip = G(xenv.ip); +def r_eip = G(xenv.eip); +def r_func_decl = G(xenv.func_decl); +def r_instance = G(xenv.instance); +def r_curpc = G(xenv.curpc); +def ip_ptr = r_ip.plus(0); +def r_dispatch = G(xenv.dispatch); +def r_tmp0 = G(xenv.tmp0); // RCX +def r_tmp1 = G(xenv.tmp1); // RDX + +def m_mem0_base = R.RSP.plus(X86_64InterpreterFrame.mem0_base.offset); +def m_vfp = R.RSP.plus(X86_64InterpreterFrame.vfp.offset); +def m_vsp = R.RSP.plus(X86_64InterpreterFrame.vsp.offset); +def m_stp = R.RSP.plus(X86_64InterpreterFrame.stp.offset); +def m_ip = R.RSP.plus(X86_64InterpreterFrame.ip.offset); +def m_eip = R.RSP.plus(X86_64InterpreterFrame.eip.offset); +def m_func_decl = R.RSP.plus(X86_64InterpreterFrame.func_decl.offset); +def m_instance = R.RSP.plus(X86_64InterpreterFrame.instance.offset); +def m_curpc = R.RSP.plus(X86_64InterpreterFrame.curpc.offset); + +def ivar_MEM0_BASE = (r_mem0_base, m_mem0_base); +def ivar_VFP = (r_vfp, m_vfp); +def ivar_VSP = (r_vsp, m_vsp); +def ivar_STP = (r_stp, m_stp); +def ivar_IP = (r_ip, m_ip); +def ivar_EIP = (r_eip, m_eip); +def ivar_FUNC_DECL = (r_func_decl, m_func_decl); +def ivar_INSTANCE = (r_instance, m_instance); +def ivar_CURPC = (r_curpc, m_curpc); + +def all_ivars = [ + ivar_MEM0_BASE, + ivar_VFP, + ivar_VSP, + ivar_STP, + ivar_IP, + ivar_EIP, + ivar_FUNC_DECL, + ivar_INSTANCE, + ivar_CURPC +]; + // Implements the target-specific parts of the single-pass compiler for X86-64. 
class X86_64SinglePassCompiler extends SinglePassCompiler { def w = DataWriter.new(); @@ -37,16 +85,36 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.trap_stubs = TRAPS_STUB; } def emitFastDispatch() { - // DISPATCH - def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; - def r_ip = G(xenv.ip); - def ip_ptr = r_ip.plus(0); - def r_dispatch = G(xenv.dispatch); - def r_tmp0 = G(xenv.tmp0); // RCX - def r_tmp1 = G(xenv.tmp1); // RDX mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); } + private def saveIVar(r: X86_64Gpr) { + for (t in all_ivars) { + if (t.0 == r) asm.movq_m_r(t.1, r); + } + } + def saveCallerIVars() { + saveIVar(r_ip); + saveIVar(r_stp); + if (!FeatureDisable.stacktraces) saveIVar(r_curpc); + } + def restoreCurPcFromFrame() { + if (!FeatureDisable.stacktraces) restoreReg(r_curpc); + } + private def restoreReg(r: X86_64Gpr) { + for (t in all_ivars) { + if (t.0 == r) asm.movq_r_m(r, t.1); + } + } + def restoreCallerIVars() { + restoreReg(r_ip); + restoreReg(r_stp); + restoreReg(r_eip); + restoreReg(r_instance); + restoreReg(r_func_decl); + restoreReg(r_mem0_base); + restoreReg(r_vfp); + } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); From 84e8b991431e23d2906335a90abe138f6352e204 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 21:26:28 -0400 Subject: [PATCH 52/55] Complete stack reconstruction around fast compilation --- src/engine/Debug.v3 | 6 ++-- src/engine/compiler/SinglePassCompiler.v3 | 28 ++++++++++++++----- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 8 ++++-- src/engine/x86-64/X86_64Stack.v3 | 6 ---- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/engine/Debug.v3 b/src/engine/Debug.v3 index 55a445978..29b91af78 100644 --- a/src/engine/Debug.v3 +++ b/src/engine/Debug.v3 @@ -6,9 +6,9 @@ component Debug { // Debug tracing options. 
def paranoid = false; def verbose = false; - def interpreter = true; - def runtime = true; - def compiler = true; + def interpreter = false; + def runtime = false; + def compiler = false; def pregen = false; def stack = false; def memory = false; diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 36e693e5e..f3e657766 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -76,6 +76,10 @@ def KIND_V128 = SpcConsts.KIND_V128; def KIND_REF = SpcConsts.KIND_REF; def KIND_REF_U64 = SpcConsts.KIND_REF_U64; +// Unlike frame.frameSize, which is 0 for fast contexts, this is always the +// true frame size (used by the stack reconstruction methods). +def FRAME_SIZE = X86_64InterpreterFrame.size; + // Compiles Wasm bytecode to machine code in a single pass via a MacroAssembler. class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits, fast: bool) extends BytecodeVisitor { def instrTracer = if(Trace.compiler, InstrTracer.new()); @@ -166,6 +170,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Push initial frame for top-level function state.frame_stack.clear(); + if (fast) { + // push a SpcFrame representing the interpreter frame already on the stack + var interp_frame = SpcFrame.new(null, module, 0, 0, 0, -1, null); + pushSpcFrame(interp_frame); + } var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0, masm.newLabel(func.cur_bytecode.length)); pushSpcFrame(initial_frame); @@ -760,7 +769,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - if (!isInlined()) { + if (needsEpilogue()) { var ctl_top = state.ctl_stack.peek(); if (ctl_top.opcode == Opcode.LOOP.code) { state.ctl_stack.pop(); @@ -2148,7 +2157,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc:
RegAl } } - if (isInlined()) return; + if (!needsEpilogue()) return; // Compute VSP = VFP + state.sp emit_compute_vsp(regs.vsp, state.sp); @@ -2232,7 +2241,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } // Pre-allocate stack space for all reconstructed frames at once. - def total_space = (frames.length - 1) * (frame.frameSize + 8); + def total_space = (frames.length - 1) * (FRAME_SIZE + 8); masm.emit_subw_r_i(regs.sp, total_space); // Process the inlined frames (skip the outermost which already exists on native stack) @@ -2252,9 +2261,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Use inlined frame stub IP as return address for all reconstructed frames def return_addr = getSpcInlinedFrameIp(); - def frame_offset = offset * (frame.frameSize + 8); + def frame_offset = offset * (FRAME_SIZE + 8); // Write inlined frame stub IP as return address - def retaddr_slot = MasmAddr(regs.sp, frame_offset + frame.frameSize); + def retaddr_slot = MasmAddr(regs.sp, frame_offset + FRAME_SIZE); masm.emit_mov_m_l(retaddr_slot, return_addr); // get functions[func_index] and save into frame @@ -2309,6 +2318,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } unrefRegs(); frames_reconstructed = true; + if (Trace.compiler) Trace.OUT.puts("performing frame reconstruction\n"); def space = emitReconstructStackFrames(snapshotFrames()); emit(); frames_reconstructed = false; @@ -2760,10 +2770,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } state.frame_stack.push(frame); // Update cached copies from new top frame - it.reset(frame.func).at(frame.pc, -1); + if (frame.func != null) it.reset(frame.func).at(frame.pc, -1); module = frame.module; func = frame.func; - sig = func.sig; + sig = if(func != null, func.sig); num_locals = frame.num_locals; local_base_sp = frame.local_base_sp; ctl_base_sp = frame.ctl_base_sp; @@ -2789,6 +2799,10 @@ 
class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def isInlined() -> bool { return state.frame_stack.top > 1; } + def needsEpilogue() -> bool { + // inlined callees will fallthrough and don't need epilogue to be emitted + return !isInlined() || ctl_base_sp == 0; + } def inlineDepth() -> int { return state.frame_stack.top - 1; } diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index bc5a02fd8..3bdbbf6e2 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -98,8 +98,12 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { saveIVar(r_stp); if (!FeatureDisable.stacktraces) saveIVar(r_curpc); } - def restoreCurPcFromFrame() { - if (!FeatureDisable.stacktraces) restoreReg(r_curpc); + def restoreDispatchTableReg() { + if (!FeatureDisable.globalProbes) { + // restore dispatch table from Interpreter.dispatchTable + def offsets = masm.getOffsets(); + asm.movq_r_m(r_dispatch, mmasm.absPointer(offsets.Interpreter_dispatchTable)); + } } private def restoreReg(r: X86_64Gpr) { for (t in all_ivars) { diff --git a/src/engine/x86-64/X86_64Stack.v3 b/src/engine/x86-64/X86_64Stack.v3 index b2717e4a8..cb3e94c93 100644 --- a/src/engine/x86-64/X86_64Stack.v3 +++ b/src/engine/x86-64/X86_64Stack.v3 @@ -926,12 +926,6 @@ layout X86_64InterpreterFrame { =104; } -// XXX: this frame may be differently sized depending on other touched registers -layout X86_64InterpreterFastCallFrame { - +0 vfp : i64; // Pointer - =8; -} - // Native frame states used in the implementation of {FrameStateAccessor}. Since a frame // can be optimized or deoptimized in place, the frame state accessor has to check the // state for every call. 
From 62cb3ab2ee67369f181ee85b7b373fb85542c8a0 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 21:42:27 -0400 Subject: [PATCH 53/55] Remove FAST_CALL47 and opcode mapped to 0xFF --- src/engine/BytecodeIterator.v3 | 1 - src/engine/CodeValidator.v3 | 1 - src/engine/Opcodes.v3 | 11 +++++------ src/engine/v3/V3Interpreter.v3 | 4 ++-- src/engine/x86-64/X86_64Interpreter.v3 | 1 - 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3 index 1a1c9845f..6048b21e6 100644 --- a/src/engine/BytecodeIterator.v3 +++ b/src/engine/BytecodeIterator.v3 @@ -833,7 +833,6 @@ class BytecodeIterator { FAST_CALL44 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); FAST_CALL45 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); FAST_CALL46 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL47 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); } } def trace(out: StringBuilder, module: Module, tracer: InstrTracer) { diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index 0815a7714..a40970cab 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -509,7 +509,6 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e FAST_CALL44 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL45 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL46 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL47 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); CALL_INDIRECT => { var sig = parser.readSigRef(); var table = parser.readTableRef(); diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index 0998599ea..ccb857342 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -657,8 +657,7 @@ enum 
Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: FAST_CALL43 (0x00, 0xF8, "fast_call43", imm.NONE, null), FAST_CALL44 (0x00, 0xF9, "fast_call44", imm.NONE, null), FAST_CALL45 (0x00, 0xFA, "fast_call45", imm.NONE, null), - FAST_CALL46 (0x00, 0xFF, "fast_call46", imm.NONE, null), - FAST_CALL47 (0x00, 0x17, "fast_call47", imm.FUNC, null) + FAST_CALL46 (0x00, 0x17, "fast_call46", imm.FUNC, null) } @@ -849,12 +848,14 @@ component Opcodes { def code_pages = [page_FB, page_FC, page_FD, page_FE]; def var longestName: int; def var num_subpages: int; + def FAST_CALL_OPCODES = 47; def var fast_calls: Array; private var nameMap: HashMap; new() { - fast_calls = Array.new(48); - for (i < 48) { + + fast_calls = Array.new(FAST_CALL_OPCODES); + for (i < FAST_CALL_OPCODES) { fast_calls[i] = indexToFastCall(i); } for (op in Opcode) { @@ -1228,7 +1229,6 @@ component Opcodes { 44 => op = Opcode.FAST_CALL44; 45 => op = Opcode.FAST_CALL45; 46 => op = Opcode.FAST_CALL46; - 47 => op = Opcode.FAST_CALL47; _ => System.error("indexToFastCall", "out of range"); } return op; @@ -1283,7 +1283,6 @@ component Opcodes { FAST_CALL44 => idx = 44; FAST_CALL45 => idx = 45; FAST_CALL46 => idx = 46; - FAST_CALL47 => idx = 47; _ => System.error("fastCallToIndex", "not a FAST_CALL instruction"); } return idx; diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index 3d8b33368..7e762e800 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack { RETURN => { doReturn(frame.fp, frame.func.sig); } - CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, 
FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46, FAST_CALL47 => { + CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46 => { var func_index = codeptr.read_uleb32(); var f = frame.func.instance.functions[func_index]; return doCallFunction(f); @@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack { // XXX: use read_opcode_and_skip() var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl); match (opcode) { - CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46, FAST_CALL47 => { + CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, 
FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46 => { codeptr.skip_leb(); frame.pc = codeptr.pos; } diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index ac028fe36..001e03aed 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1360,7 +1360,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.FAST_CALL44); bindHandler(Opcode.FAST_CALL45); bindHandler(Opcode.FAST_CALL46); - bindHandler(Opcode.FAST_CALL47); masm.emit_intentional_crash(); //masm.emit_debugger_breakpoint(); var dispatchLabel = X86_64Label.new(); From b487e6b54e6295ef2b429ac62c0ddb679218e2d1 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 2 Apr 2026 21:55:19 -0400 Subject: [PATCH 54/55] Remove more FAST_CALLs contradicting stack switching --- src/engine/BytecodeIterator.v3 | 7 -- src/engine/CodeValidator.v3 | 9 --- src/engine/Opcodes.v3 | 103 ++++++++++--------------- src/engine/v3/V3Interpreter.v3 | 4 +- src/engine/x86-64/X86_64Interpreter.v3 | 7 -- 5 files changed, 43 insertions(+), 87 deletions(-) diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3 index 6048b21e6..c341bb881 100644 --- a/src/engine/BytecodeIterator.v3 +++ b/src/engine/BytecodeIterator.v3 @@ -826,13 +826,6 @@ class BytecodeIterator { FAST_CALL37 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); FAST_CALL38 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); FAST_CALL39 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL40 => 
v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL41 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL42 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL43 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL44 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL45 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); - FAST_CALL46 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); } } def trace(out: StringBuilder, module: Module, tracer: InstrTracer) { diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index a40970cab..bda019bfd 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -460,8 +460,6 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e } } - // code should have FAST_CALL replaced after CALL - FAST_CALL0 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL1 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL2 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); @@ -502,13 +500,6 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e FAST_CALL37 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL38 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); FAST_CALL39 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL40 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL41 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL42 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL43 => System.error("validation error", "trying 
to validate FAST_CALL internal opcode"); - FAST_CALL44 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL45 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL46 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); CALL_INDIRECT => { var sig = parser.readSigRef(); var table = parser.readTableRef(); diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index ccb857342..2e785e04c 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -611,53 +611,46 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: SWITCH (0x00, 0xE6, "switch", imm.CONT_TAG, null) // fast call instructions - FAST_CALL0 (0x00, 0x27, "fast_call0", imm.NONE, null), - FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.NONE, null), - FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.NONE, null), - FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.NONE, null), - FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.NONE, null), - FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.NONE, null), - FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.NONE, null), - FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.NONE, null), - FAST_CALL8 (0x00, 0xCC, "fast_call8", imm.NONE, null), - FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.NONE, null), - FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.NONE, null), - FAST_CALL11 (0x00, 0xCF, "fast_call11", imm.NONE, null), - FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.NONE, null), - FAST_CALL13 (0x00, 0xD8, "fast_call13", imm.NONE, null), - FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.NONE, null), - FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.NONE, null), - FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.NONE, null), - FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.NONE, null), - FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.NONE, null), - FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.NONE, null), - FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.NONE, null), - FAST_CALL21 (0x00, 0xE0, 
"fast_call21", imm.NONE, null), - FAST_CALL22 (0x00, 0xE1, "fast_call22", imm.NONE, null), - FAST_CALL23 (0x00, 0xE2, "fast_call23", imm.NONE, null), - FAST_CALL24 (0x00, 0xE3, "fast_call24", imm.NONE, null), - FAST_CALL25 (0x00, 0xE4, "fast_call25", imm.NONE, null), - FAST_CALL26 (0x00, 0xE5, "fast_call26", imm.NONE, null), - FAST_CALL27 (0x00, 0xE6, "fast_call27", imm.NONE, null), - FAST_CALL28 (0x00, 0xE7, "fast_call28", imm.NONE, null), - FAST_CALL29 (0x00, 0xE8, "fast_call29", imm.NONE, null), - FAST_CALL30 (0x00, 0xE9, "fast_call30", imm.NONE, null), - FAST_CALL31 (0x00, 0xEA, "fast_call31", imm.NONE, null), - FAST_CALL32 (0x00, 0xEB, "fast_call32", imm.NONE, null), - FAST_CALL33 (0x00, 0xEC, "fast_call33", imm.NONE, null), - FAST_CALL34 (0x00, 0xED, "fast_call34", imm.NONE, null), - FAST_CALL35 (0x00, 0xEE, "fast_call35", imm.NONE, null), - FAST_CALL36 (0x00, 0xEF, "fast_call36", imm.NONE, null), - FAST_CALL37 (0x00, 0xF2, "fast_call37", imm.NONE, null), - FAST_CALL38 (0x00, 0xF3, "fast_call38", imm.NONE, null), - FAST_CALL39 (0x00, 0xF4, "fast_call39", imm.NONE, null), - FAST_CALL40 (0x00, 0xF5, "fast_call40", imm.NONE, null), - FAST_CALL41 (0x00, 0xF6, "fast_call41", imm.NONE, null), - FAST_CALL42 (0x00, 0xF7, "fast_call42", imm.NONE, null), - FAST_CALL43 (0x00, 0xF8, "fast_call43", imm.NONE, null), - FAST_CALL44 (0x00, 0xF9, "fast_call44", imm.NONE, null), - FAST_CALL45 (0x00, 0xFA, "fast_call45", imm.NONE, null), - FAST_CALL46 (0x00, 0x17, "fast_call46", imm.FUNC, null) + FAST_CALL0 (0x00, 0x27, "fast_call0", imm.FUNC, null), + FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.FUNC, null), + FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.FUNC, null), + FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.FUNC, null), + FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.FUNC, null), + FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.FUNC, null), + FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.FUNC, null), + FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.FUNC, null), + FAST_CALL8 (0x00, 0xCC, 
"fast_call8", imm.FUNC, null), + FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.FUNC, null), + FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.FUNC, null), + FAST_CALL11 (0x00, 0xCF, "fast_call11", imm.FUNC, null), + FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.FUNC, null), + FAST_CALL13 (0x00, 0xD8, "fast_call13", imm.FUNC, null), + FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.FUNC, null), + FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.FUNC, null), + FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.FUNC, null), + FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.FUNC, null), + FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.FUNC, null), + FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.FUNC, null), + FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.FUNC, null), + FAST_CALL21 (0x00, 0xE7, "fast_call21", imm.FUNC, null), + FAST_CALL22 (0x00, 0xE8, "fast_call22", imm.FUNC, null), + FAST_CALL23 (0x00, 0xE9, "fast_call23", imm.FUNC, null), + FAST_CALL24 (0x00, 0xEA, "fast_call24", imm.FUNC, null), + FAST_CALL25 (0x00, 0xEB, "fast_call25", imm.FUNC, null), + FAST_CALL26 (0x00, 0xEC, "fast_call26", imm.FUNC, null), + FAST_CALL27 (0x00, 0xED, "fast_call27", imm.FUNC, null), + FAST_CALL28 (0x00, 0xEE, "fast_call28", imm.FUNC, null), + FAST_CALL29 (0x00, 0xEF, "fast_call29", imm.FUNC, null), + FAST_CALL30 (0x00, 0xF2, "fast_call30", imm.FUNC, null), + FAST_CALL31 (0x00, 0xF3, "fast_call31", imm.FUNC, null), + FAST_CALL32 (0x00, 0xF4, "fast_call32", imm.FUNC, null), + FAST_CALL33 (0x00, 0xF5, "fast_call33", imm.FUNC, null), + FAST_CALL34 (0x00, 0xF6, "fast_call34", imm.FUNC, null), + FAST_CALL35 (0x00, 0xF7, "fast_call35", imm.FUNC, null), + FAST_CALL36 (0x00, 0xF8, "fast_call36", imm.FUNC, null), + FAST_CALL37 (0x00, 0xF9, "fast_call37", imm.FUNC, null), + FAST_CALL38 (0x00, 0xFA, "fast_call38", imm.FUNC, null), + FAST_CALL39 (0x00, 0x17, "fast_call39", imm.FUNC, null), } @@ -848,7 +841,7 @@ component Opcodes { def code_pages = [page_FB, page_FC, page_FD, page_FE]; def var longestName: int; def var 
num_subpages: int; - def FAST_CALL_OPCODES = 47; + def FAST_CALL_OPCODES = 40; def var fast_calls: Array; private var nameMap: HashMap; @@ -1222,13 +1215,6 @@ component Opcodes { 37 => op = Opcode.FAST_CALL37; 38 => op = Opcode.FAST_CALL38; 39 => op = Opcode.FAST_CALL39; - 40 => op = Opcode.FAST_CALL40; - 41 => op = Opcode.FAST_CALL41; - 42 => op = Opcode.FAST_CALL42; - 43 => op = Opcode.FAST_CALL43; - 44 => op = Opcode.FAST_CALL44; - 45 => op = Opcode.FAST_CALL45; - 46 => op = Opcode.FAST_CALL46; _ => System.error("indexToFastCall", "out of range"); } return op; @@ -1276,13 +1262,6 @@ component Opcodes { FAST_CALL37 => idx = 37; FAST_CALL38 => idx = 38; FAST_CALL39 => idx = 39; - FAST_CALL40 => idx = 40; - FAST_CALL41 => idx = 41; - FAST_CALL42 => idx = 42; - FAST_CALL43 => idx = 43; - FAST_CALL44 => idx = 44; - FAST_CALL45 => idx = 45; - FAST_CALL46 => idx = 46; _ => System.error("fastCallToIndex", "not a FAST_CALL instruction"); } return idx; diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index 7e762e800..c84ad7267 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack { RETURN => { doReturn(frame.fp, frame.func.sig); } - CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46 => { + CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, 
FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => { var func_index = codeptr.read_uleb32(); var f = frame.func.instance.functions[func_index]; return doCallFunction(f); @@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack { // XXX: use read_opcode_and_skip() var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl); match (opcode) { - CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39, FAST_CALL40, FAST_CALL41, FAST_CALL42, FAST_CALL43, FAST_CALL44, FAST_CALL45, FAST_CALL46 => { + CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => { codeptr.skip_leb(); frame.pc = codeptr.pos; } diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 
001e03aed..7b3e74e91 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1353,13 +1353,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.FAST_CALL37); bindHandler(Opcode.FAST_CALL38); bindHandler(Opcode.FAST_CALL39); - bindHandler(Opcode.FAST_CALL40); - bindHandler(Opcode.FAST_CALL41); - bindHandler(Opcode.FAST_CALL42); - bindHandler(Opcode.FAST_CALL43); - bindHandler(Opcode.FAST_CALL44); - bindHandler(Opcode.FAST_CALL45); - bindHandler(Opcode.FAST_CALL46); masm.emit_intentional_crash(); //masm.emit_debugger_breakpoint(); var dispatchLabel = X86_64Label.new(); From 71194b8874ef64d9ad18615ea58cdbdd9964758f Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Fri, 3 Apr 2026 16:42:38 -0400 Subject: [PATCH 55/55] Move fast_target_code and fast_call_idx (conflict in internal offset!), also make more compatible with spectests --- src/engine/CodeValidator.v3 | 141 ++++++++++++++++++------------------ src/engine/Module.v3 | 5 +- src/engine/Tuning.v3 | 2 +- 3 files changed, 75 insertions(+), 73 deletions(-) diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index bda019bfd..d0d050019 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -422,84 +422,85 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e checkSignature(func.sig); // fast call: if function is exported with fast name, replace the bytecode with FAST_CALL - if (FastIntTuning.useFastFunctions) for (i < module.exports.length) { - def ex = module.exports[i]; - if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { - if (Trace.validation) Trace.OUT.puts(" function declared as fast: "); + if (FastIntTuning.useFastFunctions) { + for (i < module.exports.length) { + def ex = module.exports[i]; + if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { + if (Trace.validation) Trace.OUT.puts(" function declared as fast: "); - var fast_idx = -1; - def 
fast_funcs = module.fast_funcs; - // look for existing FAST_CALL instruction allocated for this function - for (i < fast_funcs.length) { - if (func == fast_funcs[i]) { - fast_idx = i; - if (Trace.validation) Trace.OUT.put1("allocated as FAST_CALL%d, ", fast_idx); - break; + var fast_idx = -1; + def fast_funcs = module.fast_funcs; + // look for existing FAST_CALL instruction allocated for this function + for (i < fast_funcs.length) { + if (func == fast_funcs[i]) { + fast_idx = i; + if (Trace.validation) Trace.OUT.put1("allocated as FAST_CALL%d, ", fast_idx); + break; + } } - } - // not found? allocate FAST_CALL instruction, if there's space - if (fast_idx < 0) { - if (fast_funcs.length < 48) { - fast_idx = fast_funcs.length; - func.fast_call_idx = fast_idx; - if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx); - fast_funcs.put(func); + // not found? allocate FAST_CALL instruction, if there's space + if (fast_idx < 0) { + if (fast_funcs.length < 40) { + fast_idx = fast_funcs.length; + func.fast_call_idx = fast_idx; + if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx); + fast_funcs.put(func); + } else { + if (Trace.validation) Trace.OUT.puts("not found, FAST_CALL table is full, "); + } + } + // replace the bytecode, if it's found or allocated + if (fast_idx >= 0) { + //if (Trace.validation) Trace.OUT.put2("replaceCall(opcode_pos, fast_idx)\n", opcode_pos, fast_idx); + if (Trace.validation) Trace.OUT.puts("replacing call\n"); + this.func.replaceCall(opcode_pos, fast_idx); } else { - if (Trace.validation) Trace.OUT.puts("not found, FAST_CALL table is full, "); + if (Trace.validation) Trace.OUT.puts("not replacing\n"); } } - // replace the bytecode, if it's found or allocated - if (fast_idx >= 0) { - //if (Trace.validation) Trace.OUT.put2("replaceCall(opcode_pos, fast_idx)\n", opcode_pos, fast_idx); - if (Trace.validation) Trace.OUT.puts("replacing call\n"); - this.func.replaceCall(opcode_pos, 
fast_idx); - } else { - if (Trace.validation) Trace.OUT.puts("not replacing\n"); - } } } - } - FAST_CALL0 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL1 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL2 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL3 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL4 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL5 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL6 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL7 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL8 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL9 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL10 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL11 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL12 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL13 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL14 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL15 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL16 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL17 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL18 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL19 => 
System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL20 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL21 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL22 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL23 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL24 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL25 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL26 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL27 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL28 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL29 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL30 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL31 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL32 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL33 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL34 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL35 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL36 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL37 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL38 => System.error("validation error", "trying to validate FAST_CALL internal opcode"); - FAST_CALL39 => 
System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL0 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL1 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL2 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL3 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL4 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL5 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL6 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL7 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL8 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL9 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL10 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL11 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL12 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL13 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL14 => if 
(FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL15 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL16 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL17 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL18 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL19 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL20 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL21 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL22 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL23 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL24 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL25 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL26 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL27 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL28 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal 
opcode"); + FAST_CALL29 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL30 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL31 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL32 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL33 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL34 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL35 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL36 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL37 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL38 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL39 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); CALL_INDIRECT => { var sig = parser.readSigRef(); var table = parser.readTableRef(); diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index ecdabfcdf..a26d75e74 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -142,10 +142,10 @@ class FuncDecl(sig_index: int) extends Decl { var cbd_sidetable: Array; // CBD u8 sidetable var frame_var_tags: Array; // value tags for frame variables var target_code: TargetCode; - var fast_target_code: TargetCode; - var fast_call_idx: int = -1; var 
tierup_trigger: int = int.max; var handlers = FuncHandlerInfo.new(); + var fast_target_code: TargetCode; + var fast_call_idx: int = -1; def render(names: NameSection, buf: StringBuilder) -> StringBuilder { var name = if (names != null, names.getFuncName(func_index)); @@ -217,6 +217,7 @@ class FuncDecl(sig_index: int) extends Decl { Trace.OUT.put3("(func=%q, tag=%d, throw_pc=%d)", this.render(instance.module.names, _), tag.decl.tag_index, throw_pc).ln(); } + while (i < handlers.length) { // XXX: speed this up with a binary search var e = handlers[i]; if (Trace.exception) Trace.OUT.put3(" entry[%d...%d] tag=%d", e.start, e.end, e.tag).ln(); diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3 index 2ca212cdc..323fcdd61 100644 --- a/src/engine/Tuning.v3 +++ b/src/engine/Tuning.v3 @@ -42,7 +42,7 @@ component FastIntTuning { def inlineGlobalAccess = true; // enable inline access of (primitive) globals def stealFlagBitForMemory64 = true; // use a bit in the memarg flags for memory64 def whammProbeTrampolineNumPages = 1024; - def useFastFunctions = true; // treat functions exported with `fast:` in the name as fast functions + def useFastFunctions = false; // treat functions exported with `fast:` in the name as fast functions } // Tuning settings for the single-pass compiler that have no effect on correctness.