From fe107d4af9c52a17494e0a01746faedb0daf66b1 Mon Sep 17 00:00:00 2001
From: Matthew Schneider <panat.matthew@gmail.com>
Date: Wed, 11 Mar 2026 17:00:31 -0400
Subject: [PATCH] Move genDispatch0 to macro assembler

---
 src/engine/x86-64/X86_64Interpreter.v3    | 67 +++++------------------
 src/engine/x86-64/X86_64MacroAssembler.v3 | 40 +++++++++++++-
 2 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
index 29307ca98..26b03aaac 100644
--- a/src/engine/x86-64/X86_64Interpreter.v3
+++ b/src/engine/x86-64/X86_64Interpreter.v3
@@ -784,7 +784,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			var pos = w.atEnd().pos;
 			var refsndary = t.1;
 			computeCurIpForTrap(-1);
-			genDispatch0(ip_ptr, refsndary, true);
+			masm.genDispatch0(ic, ip_ptr, refsndary, true);
 			// main[prefix] = prefix's table
 			writeDispatchEntry(ref0, prefix, pos);
 		}
@@ -801,7 +801,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				// happens only with non-minimal encoding
 				genSkipLeb();
 				asm.d.and_r_i(r_tmp0, 0x7F);		// clear upper bit
-				genDispatch0(null, refsndary, false);	// use secondary table
+				masm.genDispatch0(ic, null, refsndary, false);	// use secondary table
 			} else {
 				// code needs to grab next byte to see what to do
 				asm.movd_r_r(r_tmp2, r_tmp0);		// save opcode
@@ -833,7 +833,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				// 1000 0000 -> 0000 0001 upper bit 0, LEB continues
 				// 1000 0001 -> 0000 0011 upper bit 1, LEB continues
 				// entries 0-3 of the LEB table are otherwise unused :-)
-				genDispatch0(null, refleb, false);
+				masm.genDispatch0(ic, null, refleb, false);
 
 				// create and install the four dispatch sequences for mapped second byte
 
@@ -843,7 +843,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				// not used when encoding is minimal
 				asm.movd_r_r(r_tmp0, r_tmp2);
 				asm.d.and_r_i(r_tmp0, 0x7F);
-				genDispatch0(null, refsndary, false);
+				masm.genDispatch0(ic, null, refsndary, false);
 				writeDispatchEntry(refleb, 0x00, leb00_pos);
 
 				var leb01_pos = w.pos;
@@ -853,14 +853,14 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				genSkipLeb();
 				asm.movd_r_r(r_tmp0, r_tmp2);	// EBM check this!
 				asm.d.and_r_i(r_tmp0, 0x7F);
-				genDispatch0(null, refsndary, false);
+				masm.genDispatch0(ic, null, refsndary, false);
 				writeDispatchEntry(refleb, 0x01, leb01_pos);
 
 				var leb02_pos = w.pos;
 				// upper bit of opcode is 1, no LEB continuation:
 				// dispatch through LEB, don't clear upper bit
 				asm.movd_r_r(r_tmp0, r_tmp2);
-				genDispatch0(null, refleb, false);
+				masm.genDispatch0(ic, null, refleb, false);
 				writeDispatchEntry(refleb, 0x02, leb02_pos);
 
 				var leb03_pos = w.pos;
@@ -870,7 +870,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				// not used when encoding is minimal
 				genSkipLeb();
 				asm.movd_r_r(r_tmp0, r_tmp2);
-				genDispatch0(null, refleb, false);
+				masm.genDispatch0(ic, null, refleb, false);
 				writeDispatchEntry(refleb, 0x03, leb03_pos);
 			}
 			for (i = 128; i < 256; i++) {
@@ -899,7 +899,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			var sub_index_leb = sub_index | byte.view(0x80);
 			var refsub = t.4;
 
-			genDispatch0(ip_ptr, refsub, true);
+			masm.genDispatch0(ic, ip_ptr, refsub, true);
 			// all uses of a subpage will have the LEB bit set
 			writeDispatchEntry(refsndary, sub_index_leb, subpage_pos);
 
@@ -913,7 +913,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			var subpage_leb_pos = w.pos;
 			genSkipLeb();
 			asm.d.and_r_i(r_tmp0, 0x7F);
-			genDispatch0(null, refsub, false);
+			masm.genDispatch0(ic, null, refsub, false);
 			for (i = 128; i < 256; ++i) {
 				writeDispatchEntry(refsub, i, subpage_leb_pos);
 			}
@@ -1013,7 +1013,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			asm.inc_r(r_ip);
 			asm.cmp_r_i(opcode, Opcode.BLOCK.code);
 			asm.jc_rel_near(C.Z, repeated_block);
-			genDispatch0(null, null, false); // inlined rest of dispatch
+			masm.genDispatch0(ic, null, null, false); // inlined rest of dispatch
 		} else {
 			endHandler();
 		}
@@ -2646,7 +2646,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			asm.movq_r_m(origIp, r_func_decl.plus(offsets.FuncDecl_orig_bytecode));
 			asm.add_r_r(origIp, pc);
 			asm.sub_r_i(origIp, 1);
-			genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+			masm.genDispatch0(ic, origIp.indirect(), dispatchTables[0].1, false);
 		}
 		// Specialized handler for whamm probes, uses a trampoline.
 		if (FastIntTuning.enableWhammProbeTrampoline) {
@@ -2694,7 +2694,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			asm.add_r_r(origIp, pc);
 			asm.sub_r_i(origIp, 1);
 			// Dispatch to the original bytecode at this pc.
-			genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+			masm.genDispatch0(ic, origIp.indirect(), dispatchTables[0].1, false);
 		}
 		// Specialized handler for breakpoint probe, issues int(3), which gdb sees as a signal.
 		if (FastIntTuning.enableWhammProbeTrampoline) {
@@ -2709,7 +2709,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			masm.emit_v3_FuncDecl_orig_bytecode_r_r(xenv.tmp0, xenv.func_decl);
 			asm.add_r_r(origIp, pc);
 			asm.sub_r_i(origIp, 1);
-			genDispatch0(origIp.indirect(), dispatchTables[0].1, false);
+			masm.genDispatch0(ic, origIp.indirect(), dispatchTables[0].1, false);
 		}
 	}
 	def genGlobalProbeSupport() {
@@ -2720,7 +2720,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		}
 		genGlobalProbeCall();
 		asm.sub_r_i(r_ip, 1);
-		genDispatch0(r_ip.indirect(), dispatchTables[0].1, true);
+		masm.genDispatch0(ic, r_ip.indirect(), dispatchTables[0].1, true);
 
 		// LOOP, BLOCK: don't tier up or skip blocks; call global probe and skip block type
 		offset = w.atEnd().pos;
@@ -3999,44 +3999,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 	}
 	// Generate a dispatch from the main dispatch table.
 	def genDispatch() {
-		genDispatch0(ip_ptr, if (FeatureDisable.globalProbes, dispatchTables[0].1), true);
-	}
-	// Generate a load of the next bytecode and a dispatch through the dispatch table.
-	def genDispatch0(ptr: X86_64Addr, table: IcCodeRef, increment: bool) {
-		var opcode = r_tmp0;
-		var base = r_tmp1;
-		if (ptr != null) asm.movbzx_r_m(opcode, ptr);
-		if (increment) asm.inc_r(r_ip);
-		match (FastIntTuning.dispatchEntrySize) {
-			2 => {
-				if (table == null) asm.movq_r_r(base, r_dispatch);
-				else asm.lea(base, table); // RIP-relative LEA
-				asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset
-				asm.add_r_r(base, opcode);
-				if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
-				asm.ijmp_r(base);
-			}
-			4 => {
-				if (table == null) {
-					asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0));
-				} else {
-					var addr = ic.start + table.offset;
-					asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL)));
-				}
-				if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
-				asm.ijmp_r(base);
-			}
-			8 => {
-				if (table == null) {
-					if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
-					asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0));
-				} else {
-					var addr = ic.start + table.offset;
-					if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
-					asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL)));
-				}
-			}
-		}
+		masm.genDispatch0(ic, ip_ptr, if (FeatureDisable.globalProbes, dispatchTables[0].1), true); // main table is passed explicitly only when FeatureDisable.globalProbes is set
 	}
 	// Patch the dispatch table for the given opcode to go to the given position.
 	def patchDispatchTable(opcode: Opcode, pos: int) {
diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3
index 181aa9696..2d0e497b1 100644
--- a/src/engine/x86-64/X86_64MacroAssembler.v3
+++ b/src/engine/x86-64/X86_64MacroAssembler.v3
@@ -92,6 +92,44 @@ class X86_64MacroAssembler extends MacroAssembler {
 		return Reg(0);
 	}
 
+	// Emit an optional load of the next opcode byte from {ptr}, an optional increment of the ip register, and an indirect jump through {table} (or through the dispatch register when {table} is null).
+	def genDispatch0(ic: X86_64InterpreterCode, ptr: X86_64Addr, table: IcCodeRef, increment: bool) { // {ic} is only read (ic.start) by the 4- and 8-byte entry cases
+		def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV;
+		def opcode 	= G(xenv.tmp0); // holds the opcode byte; used as the table index below
+		def base 	= G(xenv.tmp1); // scratch: table base (2-byte entries) or loaded entry (4-byte entries)
+		def r_ip	= G(xenv.ip);
+		def r_dispatch	= G(xenv.dispatch);
+
+		if (ptr != null) asm.movbzx_r_m(opcode, ptr); // null {ptr}: no load is emitted, the opcode register is used as-is
+		if (increment) asm.inc_r(r_ip);
+		match (FastIntTuning.dispatchEntrySize) { // NOTE(review): the interpreter-side genDispatch0 this replaces recorded dispatchJmpOffset before each jump; nothing here does -- confirm that is intended
+			2 => { // entry is a 16-bit offset, sign-extended and added to the table base
+				if (table == null) asm.movq_r_r(base, r_dispatch);
+				else asm.lea(base, table); // RIP-relative LEA
+				asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset
+				asm.add_r_r(base, opcode);
+				asm.ijmp_r(base);
+			}
+			4 => { // entry is loaded with a 32-bit move and used directly as the jump target
+				if (table == null) {
+					asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0));
+				} else {
+					var addr = ic.start + table.offset; // absolute address of the table: code start plus the table's offset
+					asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); // table address is encoded as an int displacement with no base register
+				}
+				asm.ijmp_r(base);
+			}
+			8 => { // entry is used directly as the memory operand of the indirect jump
+				if (table == null) {
+					asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0));
+				} else {
+					var addr = ic.start + table.offset; // absolute address of the table: code start plus the table's offset
+					asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL))); // table address is encoded as an int displacement with no base register
+				}
+			}
+		} // NOTE(review): no default arm -- an entry size other than 2, 4 or 8 emits no jump at all
+	}
+
 	def emit_intentional_crash() {
 		recordCurSourceLoc();
 		asm.invalid();
@@ -1700,4 +1738,4 @@ def TRUNC_i32_f64_u = FloatTrunc.new(false, true, false);
 def TRUNC_i64_f32_s = FloatTrunc.new(true, false, true);
 def TRUNC_i64_f32_u = FloatTrunc.new(true, false, false);
 def TRUNC_i64_f64_s = FloatTrunc.new(true, true, true);
-def TRUNC_i64_f64_u = FloatTrunc.new(true, true, false);
\ No newline at end of file
+def TRUNC_i64_f64_u = FloatTrunc.new(true, true, false);