From 142750e7b99974f274bd3354dd61447ba83df5f2 Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Thu, 29 Jan 2026 17:43:03 -0500 Subject: [PATCH 1/2] [WIP] Migrate VspHelper to use MasmAddr --- src/engine/compiler/MacroAssembler.v3 | 1 + src/engine/compiler/RegSet.v3 | 8 +- src/engine/x86-64/X86_64Interpreter.v3 | 689 +++++++++++----------- src/engine/x86-64/X86_64MacroAssembler.v3 | 132 +++++ 4 files changed, 481 insertions(+), 349 deletions(-) diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3 index d3868cf1e..fbd5d836f 100644 --- a/src/engine/compiler/MacroAssembler.v3 +++ b/src/engine/compiler/MacroAssembler.v3 @@ -299,6 +299,7 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { def emit_binop_r_r(op: Opcode, reg: Reg, reg2: Reg); def emit_binop_r_m(op: Opcode, reg: Reg, addr: MasmAddr); + def emit_binop_m_r(op: Opcode, addr: MasmAddr, reg: Reg); def emit_binop_r_i(op: Opcode, reg: Reg, val: int); def emit_pop_r(kind: ValueKind, reg: Reg); diff --git a/src/engine/compiler/RegSet.v3 b/src/engine/compiler/RegSet.v3 index 0d4be1dda..4d7257990 100644 --- a/src/engine/compiler/RegSet.v3 +++ b/src/engine/compiler/RegSet.v3 @@ -3,7 +3,11 @@ // Architecture-independent representation of a register for use in {MacroAssembler} and portable // parts of compilers. Kept small to keep data structures small. The name is stored in a {RegSet}. -type Reg(index: byte) #unboxed { } +type Reg(index: byte) #unboxed { + def plus(offset: int) -> MasmAddr { + return MasmAddr(this, offset); + } +} // Describes the set of (maximum 256) registers for a target. // By convention, register #0 is reserved for indicating an unallocated register or no register. 
@@ -49,4 +53,4 @@ class RegPool32(regs: Array) { map = Array.new(max + 1); for (i < regs.length) map[regs[i].index] = u5.!(i); } -} \ No newline at end of file +} diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 30b4090da..2980390c7 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -220,8 +220,8 @@ class IntExecEnv { // Internal register configuration for variables live in the interpreter execution context. def R: X86_64Regs, GPRs = X86_64Regs.GPRs, C: X86_64Conds; // Helper for various slot addresses. -type SlotAddrs(tag: X86_64Addr, value: X86_64Addr, upper: X86_64Addr) #unboxed { } -class VspHelper(vsp: X86_64Gpr, valuerep: Tagging, depth: int) { +type SlotAddrs(tag: MasmAddr, value: MasmAddr, upper: MasmAddr) #unboxed { } +class VspHelper(vsp: Reg, valuerep: Tagging, depth: int) { private def slots = Array.new(depth + 1); new() { for (i < slots.length) { @@ -242,6 +242,9 @@ def TYPE_IS_LEB: byte = 0x40; def LEB_UPPER_BIT: byte = 0x80; def G = X86_64MasmRegs.toGpr; def X = X86_64MasmRegs.toXmmr; +def A(ma: MasmAddr) -> X86_64Addr { + return X86_64Addr.new(G(ma.base), null, 1, ma.offset); +} // Generates {X86_64InterpreterCode} for X86-64. 
class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { @@ -263,8 +266,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var hostTailCallStubLabel = masm.newLabel(-1); var hostCallStubLabel = masm.newLabel(-1); var spcEntryLabel = X86_64Label.new(); - var controlFallThruLabel = X86_64Label.new(); - var controlTransferLabel = X86_64Label.new(); + var controlFallThruLabel = masm.newLabel(-1); + var controlTransferLabel = masm.newLabel(-1); var controlSkipSidetableAndDispatchLabel = X86_64Label.new(); var probedDispatchTableRef: IcCodeRef; var typeTagTableOffset: int; @@ -323,7 +326,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { def k_frame_size = X86_64InterpreterFrame.size; - def vsph = VspHelper.new(r_vsp, valuerep, 3); + def vsph = VspHelper.new(xenv.vsp, valuerep, 3); def dispatchTables = Array<(byte, IcCodeRef, IcCodeRef, byte, IcCodeRef)>.new( Opcodes.code_pages.length + Opcodes.num_subpages + 1); @@ -676,8 +679,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.d.cmp_r_i(numGpr, 0); asm.jc_rel_near(C.Z, done2); asm.bind(start2); - genTagPushR(typeGpr); // *(sp) = type - asm.movq_m_i(vsph[0].value, 0); // *(sp + 8) = 0 + genTagPushR(xenv.tmp2 /* XXX typeGpr */); // *(sp) = type + masm.emit_mov_m_l(vsph[0].value, 0); // *(sp + 8) = 0 incrementVsp(); // sp += 1 slot // gen: while (--num != 0) asm.d.dec_r(numGpr); @@ -705,8 +708,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.q.cmp_r_r(r_start, r_end); asm.jc_rel_far(C.GE, retpoint); asm.movbzx_r_m(r_tag, r_start.plus(0)); - genTagPushR(r_tag); // *(sp) = type - asm.movq_m_i(vsph[0].value, 0); // *(sp + 8) = 0 + genTagPushR(xenv.tmp3 /* XXX r_tag */); // *(sp) = type + masm.emit_mov_m_l(vsph[0].value, 0); // *(sp + 8) = 0 incrementVsp(); // sp += 1 slot // gen: while (--num != 0) asm.q.inc_r(r_start); @@ -956,14 +959,14 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: 
DataWriter) { bindHandler(Opcode.I32_CONST); { genReadSleb32_inline(r_tmp1); genTagPush(BpTypeCode.I32.code); - asm.movq_m_r(vsph[0].value, r_tmp1); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp1); incrementVsp(); endHandler(); } bindHandler(Opcode.I64_CONST); { genReadSleb64_inline(r_tmp1); genTagPush(BpTypeCode.I64.code); - asm.movq_m_r(vsph[0].value, r_tmp1); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp1); incrementVsp(); endHandler(); } @@ -971,7 +974,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movd_r_m(r_tmp0, ip_ptr); asm.add_r_i(r_ip, 4); genTagPush(BpTypeCode.F32.code); - asm.movq_m_r(vsph[0].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp0); incrementVsp(); endHandler(); } @@ -979,7 +982,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(r_tmp0, ip_ptr); asm.add_r_i(r_ip, 8); genTagPush(BpTypeCode.F64.code); - asm.movq_m_r(vsph[0].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp0); incrementVsp(); endHandler(); } @@ -1018,13 +1021,12 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { endHandler(); } - var ctl_xfer_nostack = X86_64Label.new(); + var ctl_xfer_nostack = masm.newLabel(-1); // IF: check condition and either fall thru to next bytecode or ctl xfer (without stack copying) bindHandler(Opcode.IF); decrementVsp(); - asm.d.cmp_m_i(vsph[0].value, 0); - asm.jc_rel_far(C.Z, ctl_xfer_nostack); // XXX: can be near if no complex block types + masm.emit_br_m(vsph[0].value, MasmBrCond.I32_ZERO, ctl_xfer_nostack); // XXX: can be near if no complex block types genSkipBlockType(); asm.bind(controlSkipSidetableAndDispatchLabel); genSkipSidetableEntry(); @@ -1033,19 +1035,18 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // BR_IF: check condition and either fall thru to next bytecode or ctl xfer (with stack copying) bindHandler(Opcode.BR_IF); decrementVsp(); - 
asm.d.cmp_m_i(vsph[0].value, 0); - asm.jc_rel_far(C.Z, controlFallThruLabel); // XXX: move shared fallthrough closer? + masm.emit_br_m(vsph[0].value, MasmBrCond.I32_ZERO, controlFallThruLabel); // XXX: can be near if no complex block types // fallthru to BR // BR: unconditional ctl xfer with stack copying bindHandlerNoAlign(Opcode.BR); - asm.bind(controlTransferLabel); + masm.bindLabel(controlTransferLabel); var popcount = r_tmp0; var valcount = r_tmp1; // if popcount > 0 asm.movd_r_m(popcount, r_stp.plus(Sidetable_BrEntry.popcount.offset)); asm.d.cmp_r_i(popcount, 0); - asm.jc_rel_near(C.Z, ctl_xfer_nostack); + asm.jc_rel_near(C.Z, ctl_xfer_nostack.label); // load valcount asm.movd_r_m(valcount, r_stp.plus(Sidetable_BrEntry.valcount.offset)); // popcount = popcount * SLOT_SIZE @@ -1059,7 +1060,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var loop = X86_64Label.new(); asm.bind(loop); asm.d.dec_r(valcount); - asm.jc_rel_near(C.S, ctl_xfer_nostack); + asm.jc_rel_near(C.S, ctl_xfer_nostack.label); genCopySlot(r_vsp.plus(0), r_vsp.plusR(popcount, 1, 0)); incrementVsp(); asm.jmp_rel_near(loop); @@ -1068,7 +1069,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandlerNoAlign(Opcode.CATCH); bindHandlerNoAlign(Opcode.CATCH_ALL); bindHandlerNoAlign(Opcode.ELSE); - asm.bind(ctl_xfer_nostack); + masm.bindLabel(ctl_xfer_nostack); if (FastIntTuning.fourByteSidetable) { // load and sign-extend a 4-byte pc delta asm.movd_r_m(r_tmp0, r_stp.plus(Sidetable_BrEntry.pc_delta.offset)); asm.q.shl_r_i(r_tmp0, 32); @@ -1092,7 +1093,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var max = r_tmp0, key = r_tmp1; asm.movd_r_m(max, r_stp.plus(Sidetable_BrEntry.pc_delta.offset)); decrementVsp(); - asm.movd_r_m(key, vsph[0].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1 /* key */, vsph[0].value); asm.d.cmp_r_r(key, max); var ok = X86_64Label.new(); asm.jc_rel_near(C.NC, ok); @@ -1102,40 +1103,37 @@ 
class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.q.add_r_r(r_ip, max); asm.shl_r_i(max, u6.!(Ints.log(u32.!(Sidetable_BrEntry.size)))); asm.q.add_r_r(r_stp, max); - asm.jmp_rel_near(controlTransferLabel); + masm.emit_br(controlTransferLabel); // BR_ON_NULL: check condition and either fall thru to next bytecode or ctl xfer (with stack copying) bindHandler(Opcode.BR_ON_NULL); - asm.q.cmp_m_i(vsph[-1].value, 0); - asm.jc_rel_near(C.NZ, controlFallThruLabel); + masm.emit_br_m(vsph[-1].value, MasmBrCond.REF_NULL, controlFallThruLabel); decrementVsp(); - asm.jmp_rel_near(controlTransferLabel); + masm.emit_br(controlTransferLabel); // BR_ON_NON_NULL: check condition and either fall thru to next bytecode or ctl xfer (with stack copying) bindHandler(Opcode.BR_ON_NON_NULL); - asm.q.cmp_m_i(vsph[-1].value, 0); - asm.jc_rel_near(C.NZ, controlTransferLabel); + masm.emit_br_m(vsph[-1].value, MasmBrCond.REF_NONNULL, controlTransferLabel); decrementVsp(); // shared code for not-taken banches - asm.bind(controlFallThruLabel); + masm.bindLabel(controlFallThruLabel); genSkipLeb(); genSkipSidetableEntry(); endHandler(); bindHandler(Opcode.SELECT); { - var label = X86_64Label.new(); - asm.d.cmp_m_i(vsph[-1].value, 0); - asm.jc_rel_near(C.NZ, label); + var label = masm.newLabel(-1); + masm.emit_br_m(vsph[-1].value, MasmBrCond.I32_NONZERO, label); // false case; copy false value down if (valuerep.value_size == 16) { - asm.movdqu_s_m(r_xmm0, vsph[-2].value); - asm.movdqu_m_s(vsph[-3].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-2].value); // XXX not really the exact valuekind + masm.emit_mov_m_r(ValueKind.V128, vsph[-3].value, xenv.xmm0); } else { - asm.movq_r_m(r_tmp0, vsph[-2].value); - asm.movq_m_r(vsph[-3].value, r_tmp0); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-2].value); // XXX not exact valuekind + masm.emit_mov_m_r(ValueKind.I64, vsph[-3].value, xenv.tmp0); } // true case, nothing to do - asm.bind(label); + 
masm.bindLabel(label); adjustVsp(-2); endHandler(); } @@ -1149,7 +1147,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.jc_rel_near(C.NZ, skip); asm.d.shl_r_i(r_tmp0, valuerep.slot_size_log); - asm.movd_r_m(r_tmp1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); asm.sub_r_r(r_vsp, r_tmp0); decrementVsp(); // XXX: combine with above using lea asm.d.cmp_r_i(r_tmp1, 0); @@ -1208,7 +1206,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.THROW_REF); { computeCurIpForTrap(-1); computePcFromCurIp(); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); decrementVsp(); masm.emit_get_curstack(xenv.tmp1); saveCallerIVars(); @@ -1259,10 +1257,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.d.shl_r_i(r_tmp0, valuerep.slot_size_log); decrementVsp(); if (valuerep.value_size == 16) { - asm.movdqu_s_m(r_xmm0, vsph[0].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[0].value); asm.movdqu_m_s(r_vfp.plusR(r_tmp0, 1, valuerep.tag_size), r_xmm0); } else { - asm.movq_r_m(r_tmp1, vsph[0].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[0].value); asm.movq_m_r(r_vfp.plusR(r_tmp0, 1, valuerep.tag_size), r_tmp1); } endHandler(); @@ -1271,10 +1269,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { genReadUleb32(r_tmp0); asm.d.shl_r_i(r_tmp0, valuerep.slot_size_log); if (valuerep.value_size == 16) { - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); asm.movdqu_m_s(r_vfp.plusR(r_tmp0, 1, valuerep.tag_size), r_xmm0); } else { - asm.movq_r_m(r_tmp1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); asm.movq_m_r(r_vfp.plusR(r_tmp0, 1, valuerep.tag_size), r_tmp1); } endHandler(); @@ -1349,7 +1347,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { 
computeCurIpForTrap(-1); genSkipLeb(); // skip signature index decrementVsp(); - asm.movq_r_m(func_arg, vsph[0].value); + masm.emit_mov_r_m(ValueKind.REF, xenv.func_arg, vsph[0].value); asm.q.cmp_r_i(func_arg, 0); asm.jc_rel_near(X86_64Conds.NZ, callFunction); asm.jmp_rel_far(newTrapLabel(TrapReason.NULL_DEREF)); @@ -1393,7 +1391,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { computeCurIpForTrap(-1); genSkipLeb(); // skip signature index decrementVsp(); - asm.movq_r_m(func_arg, vsph[0].value); + masm.emit_mov_r_m(ValueKind.REF, xenv.func_arg, vsph[0].value); asm.q.cmp_r_i(func_arg, 0); asm.jc_rel_near(X86_64Conds.NZ, tailCallFunction); asm.jmp_rel_far(newTrapLabel(TrapReason.NULL_DEREF)); @@ -1411,7 +1409,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { genReadUleb32(table_index); decrementVsp(); - asm.movd_r_m(func_index, vsph[0].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0 /* XXX func_index */, vsph[0].value); var tmp = r_tmp2; var sig_id = r_tmp3; @@ -1490,21 +1488,21 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.jc_rel_near(C.Z, runtime_call); // do inline global get or set if (t.0 == Opcode.GLOBAL_GET) { - genTagPushR(r_tmp1); // set the tag + genTagPushR(xenv.tmp1); // set the tag if (valuerep.value_size == 16) { asm.movdqu_s_m(r_xmm0, r_global.plus(offsets.Global_low)); - asm.movdqu_m_s(vsph[0].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[0].value, xenv.xmm0); } else { asm.movq_r_m(r_tmp0, r_global.plus(offsets.Global_low)); - asm.movq_m_r(vsph[0].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp0); } incrementVsp(); } else { if (valuerep.value_size == 16) { - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); asm.movdqu_m_s(r_global.plus(offsets.Global_low), r_xmm0); } else { - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, 
vsph[-1].value); asm.movq_m_r(r_global.plus(offsets.Global_low), r_tmp0); } decrementVsp(); @@ -1621,11 +1619,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.I32_GE_U, C.NC) ]) { bindHandler(t.0); - asm.movd_r_m(r_tmp0, vsph[-1].value); - asm.d.cmp_m_r(vsph[-2].value, r_tmp0); - asm.set_r(t.1, r_tmp0); - asm.movbzx_r_r(r_tmp0, r_tmp0); - asm.movd_m_r(vsph[-2].value, r_tmp0); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); + masm.emit_cmpd_m_r(t.1, vsph[-2].value, xenv.tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -1644,12 +1640,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.I64_GE_U, C.NC) ]) { bindHandler(t.0); - asm.movq_r_m(r_tmp0, vsph[-1].value); - asm.q.cmp_m_r(vsph[-2].value, r_tmp0); - asm.set_r(t.1, r_tmp0); - asm.movbzx_r_r(r_tmp0, r_tmp0); - asm.movq_m_r(vsph[-2].value, r_tmp0); - if (valuerep.tagged) asm.movq_m_i(vsph[-2].tag, BpTypeCode.I32.code); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); + masm.emit_cmpq_m_r(t.1, vsph[-2].value, xenv.tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[-2].value, xenv.tmp0); + if (valuerep.tagged) masm.emit_mov_m_l(vsph[-2].tag, BpTypeCode.I32.code); // XXX check this decrementVsp(); endHandler(); } @@ -1657,37 +1651,30 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } def genI32Arith() { bindHandler(Opcode.I32_EQZ); { - asm.d.test_m_i(vsph[-1].value, -1); + masm.emit_testd_m_i(vsph[-1].value, -1); asm.set_r(C.Z, r_tmp0); asm.movbzx_r_r(r_tmp0, r_tmp0); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I32_CLZ); { - asm.movd_r_i(r_tmp1, -1); - asm.d.bsr_r_m(r_tmp0, vsph[-1].value); - asm.d.cmov_r(C.Z, r_tmp0, r_tmp1); - asm.movd_r_i(r_tmp1, 31); - asm.d.sub_r_r(r_tmp1, r_tmp0); - asm.movd_m_r(vsph[-1].value, r_tmp1); + 
masm.emit_i32_clz_r_m(xenv.tmp0, vsph[-1].value); endHandler(); } bindHandler(Opcode.I32_CTZ); { - asm.d.bsf_r_m(r_tmp0, vsph[-1].value); - asm.movd_r_i(r_tmp1, 32); - asm.d.cmov_r(C.Z, r_tmp0, r_tmp1); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_i32_ctz_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); // XXX check this endHandler(); } bindHandler(Opcode.I32_POPCNT); { - asm.d.popcnt_r_m(r_tmp0, vsph[-1].value); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_popcntd_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I32_MUL); { - asm.movd_r_m(r_tmp0, vsph[-1].value); - asm.d.imul_r_m(r_tmp0, vsph[-2].value); - asm.movd_m_r(vsph[-2].value, r_tmp0); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); + masm.emit_binop_r_m(Opcode.I32_MUL, xenv.tmp0, vsph[-2].value); + masm.emit_mov_m_r(ValueKind.I32, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -1699,27 +1686,29 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { ]) { bindHandler(t.0); computeCurIpForTrap(-1); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); spillReg(R.RAX); spillReg(R.RDX); - asm.movd_r_m(R.RAX, vsph[-2].value); + def a = X86_64Addr.new(G(vsph[-2].value.base), null, 1, vsph[-2].value.offset); + // XXX x86-64 address used here + asm.movd_r_m(R.RAX, a); t.1(r_tmp0); - asm.movd_m_r(vsph[-2].value, t.2); + asm.movd_m_r(a, t.2); restoreReg(R.RAX); restoreReg(R.RDX); decrementVsp(); endHandler(); } for (t in [ - (Opcode.I32_ADD, asm.d.add_m_r), - (Opcode.I32_SUB, asm.d.sub_m_r), - (Opcode.I32_AND, asm.d.and_m_r), - (Opcode.I32_OR, asm.d.or_m_r), - (Opcode.I32_XOR, asm.d.xor_m_r) + Opcode.I32_ADD, + Opcode.I32_SUB, + Opcode.I32_AND, + Opcode.I32_OR, + Opcode.I32_XOR ]) { - bindHandler(t.0); - asm.movd_r_m(r_tmp0, vsph[-1].value); - t.1(vsph[-2].value, r_tmp0); + 
bindHandler(t); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); + masm.emit_binop_m_r(t, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -1731,46 +1720,42 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.I32_ROTR, asm.d.ror_m_cl) ]) { bindHandler(t.0); - asm.movd_r_m(R.RCX, vsph[-1].value); - t.1(vsph[-2].value); + def a1 = X86_64Addr.new(G(vsph[-1].value.base), null, 1, vsph[-1].value.offset); + def a2 = X86_64Addr.new(G(vsph[-2].value.base), null, 1, vsph[-2].value.offset); + // XXX x86-64 address used here + asm.movd_r_m(R.RCX, a1); + t.1(a2); decrementVsp(); endHandler(); } } def genI64Arith() { bindHandler(Opcode.I64_EQZ); { - asm.q.test_m_i(vsph[-1].value, -1); + masm.emit_testq_m_i(vsph[-1].value, -1); asm.set_r(C.Z, r_tmp0); asm.movbzx_r_r(r_tmp0, r_tmp0); - asm.movd_m_r(vsph[-1].value, r_tmp0); - if (valuerep.tagged) asm.movd_m_i(vsph[-1].tag, BpTypeCode.I32.code); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); + if (valuerep.tagged) masm.emit_mov_m_i(vsph[-1].tag, BpTypeCode.I32.code); endHandler(); } bindHandler(Opcode.I64_CLZ); { - asm.movq_r_i(r_tmp1, -1); - asm.q.bsr_r_m(r_tmp0, vsph[-1].value); - asm.q.cmov_r(C.Z, r_tmp0, r_tmp1); - asm.movd_r_i(r_tmp1, 63); - asm.q.sub_r_r(r_tmp1, r_tmp0); - asm.movq_m_r(vsph[-1].value, r_tmp1); + masm.emit_i64_clz_r_m(xenv.tmp0, vsph[-1].value); endHandler(); } bindHandler(Opcode.I64_CTZ); { - asm.q.bsf_r_m(r_tmp0, vsph[-1].value); - asm.movd_r_i(r_tmp1, 64); - asm.q.cmov_r(C.Z, r_tmp0, r_tmp1); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_i64_ctz_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_POPCNT); { - asm.q.popcnt_r_m(r_tmp0, vsph[-1].value); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_popcntq_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_MUL); { 
- asm.movq_r_m(r_tmp0, vsph[-1].value); - asm.q.imul_r_m(r_tmp0, vsph[-2].value); - asm.movq_m_r(vsph[-2].value, r_tmp0); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); + masm.emit_binop_r_m(Opcode.I32_MUL, xenv.tmp0, vsph[-2].value); + masm.emit_mov_m_r(ValueKind.I64, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -1782,27 +1767,29 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { ]) { bindHandler(t.0); computeCurIpForTrap(-1); - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); spillReg(R.RAX); spillReg(R.RDX); - asm.movq_r_m(R.RAX, vsph[-2].value); + def a = X86_64Addr.new(G(vsph[-2].value.base), null, 1, vsph[-2].value.offset); + // XXX x86-64 registers used here + asm.movq_r_m(R.RAX, a); t.1(r_tmp0); - asm.movq_m_r(vsph[-2].value, t.2); + asm.movq_m_r(a, t.2); restoreReg(R.RAX); restoreReg(R.RDX); decrementVsp(); endHandler(); } for (t in [ - (Opcode.I64_ADD, asm.q.add_m_r), - (Opcode.I64_SUB, asm.q.sub_m_r), - (Opcode.I64_AND, asm.q.and_m_r), - (Opcode.I64_OR, asm.q.or_m_r), - (Opcode.I64_XOR, asm.q.xor_m_r) + Opcode.I64_ADD, + Opcode.I64_SUB, + Opcode.I64_AND, + Opcode.I64_OR, + Opcode.I64_XOR ]) { - bindHandler(t.0); - asm.movq_r_m(r_tmp0, vsph[-1].value); - t.1(vsph[-2].value, r_tmp0); + bindHandler(t); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); + masm.emit_binop_m_r(t, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -1814,8 +1801,11 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.I64_ROTR, asm.q.ror_m_cl) ]) { bindHandler(t.0); - asm.movq_r_m(R.RCX, vsph[-1].value); - t.1(vsph[-2].value); + def a1 = X86_64Addr.new(G(vsph[-1].value.base), null, 1, vsph[-1].value.offset); + def a2 = X86_64Addr.new(G(vsph[-2].value.base), null, 1, vsph[-2].value.offset); + // XXX x86-64 address used here + asm.movq_r_m(R.RCX, a1); + t.1(a2); decrementVsp(); endHandler(); } @@ -1922,20 +1912,20 @@ class 
X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } genReadUleb32(r_tmp0); // decode offset if (isCmpAndExchange) { - asm.movq_r_m(r_tmp1, vsph[-3].value); // read index + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-3].value); // read index asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-2].value); // new value for cmpxchg + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-2].value); // new value for cmpxchg spillReg(R.RAX); - asm.movq_r_m(R.RAX, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.ret_throw /* XXX rax */, vsph[-1].value); asm.lock(); op(r_mem0_base.plusR(r_tmp0, 1, 0), r_tmp1); // asm.movq_r_m(r_tmp1, r_mem0_base.plusR(r_tmp0, 1, 0)); // This will return the return of the operation restoreReg(R.RAX); // Restore the original RAX if it was used elsewhere decrementVsp(); } else { - asm.movq_r_m(r_tmp1, vsph[-2].value); // read index + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-2].value); // read index asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read value + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read value if (neg != null) { neg(r_tmp1); } else if (exchange != null) { @@ -1944,7 +1934,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.lock(); op(r_mem0_base.plusR(r_tmp0, 1, 0), r_tmp1); // asm.movq_r_m(r_tmp1, r_mem0_base.plusR(r_tmp0, 1, 0)); // This will return the return of the operation - asm.xchgq_m_r(vsph[-1].value, r_tmp1); + masm.emit_xchgq_m_r(vsph[-1].value, xenv.tmp1); } asm.bind(finish); endHandler(); @@ -1958,20 +1948,20 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { genReadUleb32(r_tmp0); // decode offset if (isCmpAndExchange) { - asm.movq_r_m(r_tmp1, vsph[-3].value); // read index + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-3].value); // read index asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, 
vsph[-2].value); // new value for cmpxchg + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-2].value); // new value for cmpxchg spillReg(R.RAX); - asm.movq_r_m(R.RAX, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.ret_throw /* XXX rax */, vsph[-1].value); asm.lock(); op(r_mem0_base.plusR(r_tmp0, 1, 0), r_tmp1); // asm.movq_r_m(r_tmp1, r_mem0_base.plusR(r_tmp0, 1, 0)); // This will return the return of the operation restoreReg(R.RAX); // Restore the original RAX if it was used elsewhere decrementVsp(); } else { - asm.movq_r_m(r_tmp1, vsph[-2].value); // read index + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-2].value); // read index asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read value + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read value if (neg != null) { neg(r_tmp1); } else if (exchange != null) { @@ -1980,7 +1970,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.lock(); op(r_mem0_base.plusR(r_tmp0, 1, 0), r_tmp1); // asm.movq_r_m(r_tmp1, r_mem0_base.plusR(r_tmp0, 1, 0)); // This will return the return of the operation - asm.xchgq_m_r(vsph[-1].value, r_tmp1); + masm.emit_xchgq_m_r(vsph[-1].value, xenv.tmp1); } asm.jmp_rel_near(finish); } @@ -2022,89 +2012,91 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { endHandler(); } bindHandler(Opcode.I32_EXTEND8_S); { - asm.d.movbsx_r_m(r_tmp0, vsph[-1].value); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_i32_extend8_s_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I32_EXTEND16_S); { - asm.d.movwsx_r_m(r_tmp0, vsph[-1].value); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_i32_extend16_s_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_EXTEND8_S); { - asm.q.movbsx_r_m(r_tmp0, vsph[-1].value); - 
asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_i64_extend8_s_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_EXTEND16_S); { - asm.q.movwsx_r_m(r_tmp0, vsph[-1].value); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_i64_extend16_s_r_m(xenv.tmp0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_EXTEND_I32_S); bindHandler(Opcode.I64_EXTEND32_S); { genTagUpdate(BpTypeCode.I64.code); - asm.movd_r_m(r_tmp0, vsph[-1].value); + // TODO see emit_movq_32s_r_m + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.q.shl_r_i(r_tmp0, 32); asm.q.sar_r_i(r_tmp0, 32); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.I64_EXTEND_I32_U); { genTagUpdate(BpTypeCode.I64.code); - asm.movd_m_i(vsph[-1].value.plus(4), 0); // zero upper portion + masm.emit_mov_m_i(vsph[-1].value.plus(4), 0); // zero upper portion endHandler(); } } def genF32Arith() { bindHandler(Opcode.F32_ABS); { - asm.d.and_m_i(vsph[-1].value, 0x7FFFFFFF); // explicit update of upper word + masm.emit_and_m_i(vsph[-1].value, 0x7FFFFFFF); // explicit update of upper word endHandler(); } bindHandler(Opcode.F32_NEG); { - asm.d.xor_m_i(vsph[-1].value, 0x80000000); // explicit update of upper word + masm.emit_xor_m_i(vsph[-1].value, 0x80000000); // explicit update of upper word endHandler(); } + // TODO make into loop bindHandler(Opcode.F32_ADD); { - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.addss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F32_ADD, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F32_SUB); { - asm.movss_s_m(r_xmm0, vsph[-2].value); - 
asm.subss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F32_SUB, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F32_MUL); { - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.mulss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F32_MUL, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F32_DIV); { - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.divss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F32_DIV, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F32_SQRT); { - asm.sqrtss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_sqrtf_r_m(xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F32_COPYSIGN); { - asm.movd_r_m(r_tmp0, vsph[-2].value); // XXX: tradeoff between memory operands and extra regs? - asm.d.and_r_i(r_tmp0, 0x7FFFFFFF); - asm.movd_r_m(r_tmp1, vsph[-1].value); - asm.d.and_r_i(r_tmp1, 0x80000000); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-2].value); // XXX: tradeoff between memory operands and extra regs? 
+ masm.emit_and_r_i(xenv.tmp0, 0x7FFFFFFF); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); + masm.emit_and_r_i(xenv.tmp1, 0x80000000); asm.d.or_r_r(r_tmp0, r_tmp1); - asm.movd_m_r(vsph[-2].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); } @@ -2115,60 +2107,61 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.F32_NEAREST, X86_64Rounding.TO_NEAREST) ]) { bindHandler(t.0); - asm.roundss_s_m(r_xmm0, vsph[-1].value, t.1); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_roundf_r_m(xenv.xmm0, vsph[-1].value, t.1); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } } def genF64Arith() { bindHandler(Opcode.F64_ABS); { - asm.d.and_m_i(vsph[-1].upper, 0x7FFFFFFF); + masm.emit_and_m_i(vsph[-1].upper, 0x7FFFFFFF); endHandler(); } bindHandler(Opcode.F64_NEG); { - asm.d.xor_m_i(vsph[-1].upper, 0x80000000); + masm.emit_xor_m_i(vsph[-1].upper, 0x80000000); endHandler(); } bindHandler(Opcode.F64_ADD); { - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.addsd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F64_ADD, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F64_SUB); { - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.subsd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F64_SUB, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F64_MUL); { - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.mulsd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-2].value, r_xmm0); - decrementVsp(); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, 
vsph[-2].value); + masm.emit_binop_r_m(Opcode.F64_MUL, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-2].value, xenv.xmm0); + decrementVsp(); endHandler(); } bindHandler(Opcode.F64_DIV); { - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.divsd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.F64_DIV, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } bindHandler(Opcode.F64_SQRT); { - asm.sqrtsd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_sqrtd_r_m(xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_COPYSIGN); { - asm.movd_r_m(r_tmp0, vsph[-2].upper); // XXX: tradeoff between memory operands and extra regs? - asm.d.and_r_i(r_tmp0, 0x7FFFFFFF); - asm.movd_r_m(r_tmp1, vsph[-1].upper); - asm.d.and_r_i(r_tmp1, 0x80000000); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-2].upper); // XXX: tradeoff between memory operands and extra regs? 
+ masm.emit_and_r_i(xenv.tmp0, 0x7FFFFFFF); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].upper); + masm.emit_and_r_i(xenv.tmp1, 0x80000000); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); asm.d.or_r_r(r_tmp0, r_tmp1); - asm.movd_m_r(vsph[-2].upper, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-2].upper, xenv.tmp0); decrementVsp(); endHandler(); } @@ -2179,8 +2172,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.F64_NEAREST, X86_64Rounding.TO_NEAREST) ]) { bindHandler(t.0); - asm.roundsd_s_m(r_xmm0, vsph[-1].value, t.1); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_roundd_r_m(xenv.xmm0, vsph[-1].value, t.1); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } } @@ -2194,8 +2187,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.F32_LE, C.A), (Opcode.F32_GE, C.C)]) { bindHandler(t.0); - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.ucomiss_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_cmpf_r_m(xenv.xmm0, vsph[-1].value); asm.jc_rel_near(C.P, if(t.0 == Opcode.F32_NE, ret_one, ret_zero)); asm.jc_rel_near(t.1, ret_zero); asm.jmp_rel_near(ret_one); @@ -2204,13 +2197,13 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.bind(ret_zero); decrementVsp(); genTagUpdate(BpTypeCode.I32.code); - asm.movd_m_i(vsph[-1].value, 0); + masm.emit_mov_m_i(vsph[-1].value, 0); endHandler(); asm.bind(ret_one); decrementVsp(); genTagUpdate(BpTypeCode.I32.code); - asm.movd_m_i(vsph[-1].value, 1); + masm.emit_mov_m_i(vsph[-1].value, 1); endHandler(); // XXX: too far of a near jump to share these between f32 and f64 @@ -2224,8 +2217,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.F64_LE, C.A), (Opcode.F64_GE, C.C)]) { bindHandler(t.0); - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.ucomisd_s_m(r_xmm0, vsph[-1].value); + 
masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_cmpd_r_m(xenv.xmm0, vsph[-1].value); asm.jc_rel_near(C.P, if(t.0 == Opcode.F64_NE, ret_one, ret_zero)); asm.jc_rel_near(t.1, ret_zero); asm.jmp_rel_near(ret_one); @@ -2234,49 +2227,49 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.bind(ret_zero); decrementVsp(); genTagUpdate(BpTypeCode.I32.code); - asm.movd_m_i(vsph[-1].value, 0); + masm.emit_mov_m_i(vsph[-1].value, 0); genDispatchOrJumpToDispatch(); asm.bind(ret_one); decrementVsp(); genTagUpdate(BpTypeCode.I32.code); - asm.movd_m_i(vsph[-1].value, 1); + masm.emit_mov_m_i(vsph[-1].value, 1); genDispatchOrJumpToDispatch(); } def genGcInstrs() { bindHandler(Opcode.REF_I31); { - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.d.shl_r_i(r_tmp0, 1); asm.d.or_r_i(r_tmp0, 1); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I31REF.code); endHandler(); } bindHandler(Opcode.I31_GET_S); { - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.d.cmp_r_i(r_tmp0, 0); asm.jc_rel_far(X86_64Conds.Z, newTrapLabel(TrapReason.NULL_DEREF)); asm.d.sar_r_i(r_tmp0, 1); - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); } bindHandler(Opcode.I31_GET_U); { - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.d.cmp_r_i(r_tmp0, 0); asm.jc_rel_far(X86_64Conds.Z, newTrapLabel(TrapReason.NULL_DEREF)); asm.d.shr_r_i(r_tmp0, 1); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); } bindHandler(Opcode.ARRAY_LEN); { - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, 
vsph[-1].value); asm.q.cmp_r_i(r_tmp0, 0); asm.jc_rel_far(X86_64Conds.Z, newTrapLabel(TrapReason.NULL_DEREF)); asm.movq_r_m(r_tmp0, r_tmp0.plus(offsets.HeapArray_vals)); asm.movd_r_m(r_tmp0, r_tmp0.plus(offsets.Array_length)); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); } @@ -2291,7 +2284,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { callRuntime(refRuntimeCall(X86_64RT.runtime_doCast), [r_tmp3, r_instance, nullable_reg, r_tmp1], false); asm.movbzx_r_r(r_tmp0, Target.V3_RET_GPRS[0]); // XXX: restore just VSP and update first? restoreCallerIVars(); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); // ref.test_null jumps back to ref.test @@ -2343,11 +2336,11 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // TODO: we recompute the start of the current instruction from the curpc and the code object, // because the sidetable entry's delta_ip is encoded relative to 1 + curpc asm.lea(r_ip, X86_64Addr.new(r_tmp1, r_curpc, 1, 1 + offsets.Array_contents)); - asm.jmp_rel_far(controlTransferLabel); + masm.emit_br(controlTransferLabel); } bindHandler(Opcode.REF_AS_NON_NULL); { computeCurIpForTrap(-1); - asm.q.cmp_m_i(vsph[-1].value, 0); + masm.emit_cmpq_m_i(vsph[-1].value, 0); asm.jc_rel_far(C.Z, newTrapLabel(TrapReason.NULL_DEREF)); endHandler(); } @@ -2362,26 +2355,26 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(r_tmp0, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); asm.movd_r_m(r_tmp1, r_tmp0.plus(offsets.NativeWasmMemory_num_pages)); asm.movb_r_m(r_tmp0, r_tmp0.plus(offsets.NativeWasmMemory_index_tag)); - genTagPushR(r_tmp0); - asm.movq_m_r(vsph[0].value, r_tmp1); + genTagPushR(xenv.tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp1); 
incrementVsp(); endHandler(); } bindHandler(Opcode.REF_NULL); { genSkipLeb(); genTagPush(BpTypeCode.REF_NULL.code); - asm.movq_m_i(vsph[0].value, 0); + masm.emit_mov_m_l(vsph[0].value, 0); // XXX: only clear upper slot when REF is guaranteed a REF_U64 - asm.movq_m_i(vsph[0].value.plus(8), 0); + masm.emit_mov_m_l(vsph[0].value.plus(8), 0); incrementVsp(); endHandler(); } bindHandler(Opcode.REF_IS_NULL); { - asm.d.test_m_i(vsph[-1].value, -1); + masm.emit_testd_m_i(vsph[-1].value, -1); asm.set_r(C.Z, r_tmp0); asm.movbzx_r_r(r_tmp0, r_tmp0); - if (valuerep.tagged) asm.movd_m_i(vsph[-1].tag, i7.view(BpTypeCode.I32.code)); - asm.movd_m_r(vsph[-1].value, r_tmp0); + if (valuerep.tagged) masm.emit_mov_m_i(vsph[-1].tag, i7.view(BpTypeCode.I32.code)); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); endHandler(); } bindHandler(Opcode.REF_FUNC); { @@ -2389,7 +2382,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_functions)); asm.movq_r_m(r_tmp0, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); genTagPush(BpTypeCode.FUNCREF.code); - asm.movq_m_r(vsph[0].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp0); incrementVsp(); endHandler(); } @@ -2410,10 +2403,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_tables)); asm.movq_r_m(r_tmp0, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); asm.movb_r_m(r_tmp2, r_tmp0.plus(offsets.Table_index_tag)); - genTagPushR(r_tmp2); + genTagPushR(xenv.tmp2); asm.movq_r_m(r_tmp0, r_tmp0.plus(offsets.Table_elems)); asm.movd_r_m(r_tmp0, r_tmp0.plus(offsets.Array_length)); - asm.movq_m_r(vsph[0].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[0].value, xenv.tmp0); incrementVsp(); endHandler(); } @@ -2530,7 +2523,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var cont = xenv.xmm0, ex = 
xenv.tmp0; var contStack = xenv.tmp1; genPopCont(X(cont)); - genPopInto(G(ex)); + genPopInto(ex); masm.emit_validate_and_consume_cont(contStack, cont); var curStack = xenv.tmp4; @@ -2743,35 +2736,35 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { def genFloatMinAndMax() { var ret_b = X86_64Label.new(), ret_a = X86_64Label.new(), is_nan32 = X86_64Label.new(), is_nan64 = X86_64Label.new(); bindHandler(Opcode.F32_MIN); - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.movss_s_m(r_xmm1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm1, vsph[-1].value); asm.ucomiss_s_s(r_xmm0, r_xmm1); asm.jc_rel_far(C.P, is_nan32); asm.jc_rel_near(C.C, ret_a); asm.jc_rel_near(C.A, ret_b); - asm.d.cmp_m_i(vsph[-1].value, 0); + masm.emit_cmpd_m_i(vsph[-1].value, 0); asm.jc_rel_near(C.S, ret_b); // handle min(-0, 0) == -0 asm.jmp_rel_near(ret_a); bindHandler(Opcode.F32_MAX); - asm.movss_s_m(r_xmm0, vsph[-2].value); - asm.movss_s_m(r_xmm1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.F32, xenv.xmm1, vsph[-1].value); asm.ucomiss_s_s(r_xmm0, r_xmm1); asm.jc_rel_far(C.P, is_nan32); asm.jc_rel_near(C.C, ret_b); asm.jc_rel_near(C.A, ret_a); - asm.d.cmp_m_i(vsph[-1].value, 0); + masm.emit_cmpd_m_i(vsph[-1].value, 0); asm.jc_rel_near(C.NS, ret_b); // handle max(-0, 0) == 0 asm.jmp_rel_near(ret_a); bindHandler(Opcode.F64_MIN); - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.movsd_s_m(r_xmm1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm1, vsph[-1].value); asm.ucomisd_s_s(r_xmm0, r_xmm1); asm.jc_rel_near(C.P, is_nan64); asm.jc_rel_near(C.C, ret_a); asm.jc_rel_near(C.A, ret_b); - asm.d.cmp_m_i(vsph[-1].upper, 0); + masm.emit_cmpd_m_i(vsph[-1].upper, 0); asm.jc_rel_near(C.S, ret_b); // handle min(-0, 0) == -0 // fall through to ret_a asm.bind(ret_a); @@ -2779,27 
+2772,27 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { endHandler(); bindHandler(Opcode.F64_MAX); - asm.movsd_s_m(r_xmm0, vsph[-2].value); - asm.movsd_s_m(r_xmm1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.F64, xenv.xmm1, vsph[-1].value); asm.ucomisd_s_s(r_xmm0, r_xmm1); asm.jc_rel_near(C.P, is_nan64); asm.jc_rel_near(C.C, ret_b); asm.jc_rel_near(C.A, ret_a); - asm.d.cmp_m_i(vsph[-1].upper, 0); + masm.emit_cmpd_m_i(vsph[-1].upper, 0); asm.jc_rel_near(C.S, ret_a); // handle max(-0, 0) == 0 // fall through to ret_b asm.bind(ret_b); - asm.movsd_m_s(vsph[-2].value, r_xmm1); + masm.emit_mov_m_r(ValueKind.F64, vsph[-2].value, xenv.xmm1); decrementVsp(); endHandler(); asm.bind(is_nan32); - asm.movd_m_i(vsph[-2].value, int.view(FloatUtils.f_nan)); + masm.emit_mov_m_i(vsph[-2].value, int.view(FloatUtils.f_nan)); asm.jmp_rel_near(ret_a); asm.bind(is_nan64); - asm.movd_m_i(vsph[-2].upper, int.view(FloatUtils.d_nan >> 32)); - asm.movd_m_i(vsph[-2].value, 0); + masm.emit_mov_m_i(vsph[-2].upper, int.view(FloatUtils.d_nan >> 32)); + masm.emit_mov_m_i(vsph[-2].value, 0); asm.jmp_rel_near(ret_a); } def genFloatTruncs() { @@ -2824,16 +2817,16 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // XXX: don't load current IP for saturating conversions computeCurIpForTrap(-1); // load value from stack - if (opcode.sig.params[0] == ValueType.F32) asm.movss_s_m(r_xmm0, vsph[-1].value); - else asm.movsd_s_m(r_xmm0, vsph[-1].value); + if (opcode.sig.params[0] == ValueType.F32) masm.emit_mov_r_m(ValueKind.F32, xenv.xmm0, vsph[-1].value); + else masm.emit_mov_r_m(ValueKind.F64, xenv.xmm0, vsph[-1].value); // emit conversion masm.emit_i_trunc_f(opcode, r_tmp0, r_xmm0, r_xmm1); // store and update tag if (opcode.sig.results[0] == ValueType.I32) { - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); 
genTagUpdate(BpTypeCode.I32.code); } else { - asm.movq_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I64.code); } endHandler(); @@ -2842,74 +2835,76 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { def genFloatConversions() { bindHandler(Opcode.F32_CONVERT_I32_S); { genTagUpdate(BpTypeCode.F32.code); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.q.shl_r_i(r_tmp0, 32); asm.q.sar_r_i(r_tmp0, 32); // sign-extend asm.cvtsi2ss_s_r(r_xmm0, r_tmp0); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F32_CONVERT_I32_U); { genTagUpdate(BpTypeCode.F32.code); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.cvtsi2ss_s_r(r_xmm0, r_tmp0); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F32_CONVERT_I64_S); { genTagUpdate(BpTypeCode.F32.code); - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); asm.cvtsi2ss_s_r(r_xmm0, r_tmp0); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F32_CONVERT_I64_U); { genTagUpdate(BpTypeCode.F32.code); - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); masm.emit_f32_convert_i64_u(r_xmm0, r_tmp0, r_xmm1, r_scratch); - asm.movss_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F32_DEMOTE_F64); { genTagUpdate(BpTypeCode.F32.code); - asm.cvtsd2ss_s_m(r_xmm0, vsph[-1].value); - asm.movss_m_s(vsph[-1].value, r_xmm0); + // XXX how come this is the only one that uses vsph in operation? 
+ masm.emit_demote_r_m(xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F32, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_CONVERT_I32_S); { genTagUpdate(BpTypeCode.F64.code); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.q.shl_r_i(r_tmp0, 32); asm.q.sar_r_i(r_tmp0, 32); // sign-extend asm.cvtsi2sd_s_r(r_xmm0, r_tmp0); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_CONVERT_I32_U); { genTagUpdate(BpTypeCode.F64.code); - asm.movd_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp0, vsph[-1].value); asm.cvtsi2sd_s_r(r_xmm0, r_tmp0); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_CONVERT_I64_S); { genTagUpdate(BpTypeCode.F64.code); - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); asm.cvtsi2sd_s_r(r_xmm0, r_tmp0); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_CONVERT_I64_U); { genTagUpdate(BpTypeCode.F64.code); - asm.movq_r_m(r_tmp0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); masm.emit_f64_convert_i64_u(r_xmm0, r_tmp0, r_xmm1, r_scratch); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } bindHandler(Opcode.F64_PROMOTE_F32); { genTagUpdate(BpTypeCode.F64.code); - asm.cvtss2sd_s_m(r_xmm0, vsph[-1].value); - asm.movsd_m_s(vsph[-1].value, r_xmm0); + // XXX how come this is the only two that uses vsph in operation? 
+ masm.emit_promote_r_m(xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.F64, vsph[-1].value, xenv.xmm0); endHandler(); } } @@ -2986,24 +2981,24 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // helper method to move values from vsp to registers // vsp[-2] -> s0, vsp[-1] -> s1 - def load_v128_s_s(s0: X86_64Xmmr, s1: X86_64Xmmr) { - asm.movdqu_s_m(s0, vsph[-2].value); - asm.movdqu_s_m(s1, vsph[-1].value); + def load_v128_s_s(s0: Reg, s1: Reg) { + masm.emit_mov_r_m(ValueKind.V128, s0, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.V128, s1, vsph[-1].value); } - def load_v128_s_r(s: X86_64Xmmr, r: X86_64Gpr) { - asm.movdqu_s_m(s, vsph[-2].value); - asm.movd_r_m(r, vsph[-1].value); + def load_v128_s_r(s: Reg, r: Reg) { + masm.emit_mov_r_m(ValueKind.V128, s, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.I32, r, vsph[-1].value); } // vsp[-2] -> xmm0, vsp[-1] -> xmm1 def load_v128_xmm0_xmm1(){ - load_v128_s_s(r_xmm0, r_xmm1); + load_v128_s_s(xenv.xmm0, xenv.xmm1); } // vsp[-2] -> xmm1, vsp[-1] -> xmm0 def load_v128_xmm1_xmm0(){ - load_v128_s_s(r_xmm1, r_xmm0); + load_v128_s_s(xenv.xmm1, xenv.xmm0); } def load_v128_xmm0_tmp0(){ - load_v128_s_r(r_xmm0, r_tmp0); + load_v128_s_r(xenv.xmm0, xenv.tmp0); } def load_imm8(r: X86_64Gpr) { asm.movbzx_r_m(r, ip_ptr); @@ -3045,7 +3040,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm_mov_m_r: (X86_64Addr, X86_64Gpr) -> T, signExt: bool) { bindHandler(opcode); - def dst: X86_64Addr = vsph[-1].value; + def dst: X86_64Addr = A(vsph[-1].value); // XXX skipped def idx: X86_64Gpr = r_tmp0; def src: X86_64Gpr = r_tmp1; // load imm (one byte) @@ -3067,8 +3062,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T, asm_mov_m_r: (X86_64Addr, X86_64Gpr) -> T) { bindHandler(opcode); - def dst: X86_64Addr = vsph[-2].value; - def src: X86_64Addr = vsph[-1].value; + def dst: X86_64Addr = A(vsph[-2].value); // XXX skipped 
+ def src: X86_64Addr = A(vsph[-1].value); // XXX skipped def idx: X86_64Gpr = r_tmp0; def val: X86_64Gpr = r_tmp1; def addr: X86_64Gpr = r_tmp2; @@ -3083,11 +3078,11 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { load_memarg: (X86_64Gpr, X86_64Addr, X86_64Gpr) -> void, asm_mov_m_r: (X86_64Addr, X86_64Gpr) -> T) { bindHandler(opcode); - def dst: X86_64Addr = vsph[-1].value; + def dst: X86_64Addr = A(vsph[-1].value); def idx: X86_64Gpr = r_tmp0; def val: X86_64Gpr = r_tmp1; def addr: X86_64Gpr = r_tmp2; - load_memarg(val, vsph[-2].value, addr); // should load memarg first per the bytecode order + load_memarg(val, A(vsph[-2].value), addr); // should load memarg first per the bytecode order load_imm8(idx); // then load imm asm.q.lea(addr, dst); // load address asm_mov_m_r(X86_64Addr.new(addr, idx, size, 0), val); // store (replace) value @@ -3098,10 +3093,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { load_memarg: (X86_64Gpr, X86_64Addr, X86_64Gpr) -> void, asm_insert_s_r_i: (X86_64Xmmr, X86_64Gpr, u8) -> T) { bindHandler(opcode); - load_memarg(r_tmp0, vsph[-1].value, r_tmp1); // load memarg to tmp0 + load_memarg(r_tmp0, A(vsph[-1].value), r_tmp1); // load memarg to tmp0 masm.emit_v128_zero(r_xmm0); // zero out xmm0 asm_insert_s_r_i(r_xmm0, r_tmp0, 0); // insert value to lowest bits - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); if (valuerep.tagged) genTagUpdate(BpTypeCode.V128.code); endHandler(); } @@ -3111,20 +3106,20 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { def dst = r_xmm0; def tmp1 = r_tmp0; def tmp2 = r_tmp1; - def src = decode_memarg(vsph[-1].value, tmp1, tmp2); + def src = decode_memarg(A(vsph[-1].value), tmp1, tmp2); asm_extend_s_m(dst, src); if (valuerep.tagged) genTagUpdate(BpTypeCode.V128.code); - asm.movdqu_m_s(vsph[-1].value, dst); + asm.movdqu_m_s(A(vsph[-1].value), dst); endHandler(); } def 
genStoreLane(opcode: Opcode, size: byte, asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T, asm_mov_m_r: (X86_64Addr, X86_64Gpr) -> T) { bindHandler(opcode); - def data: X86_64Addr = vsph[-1].value; + def data: X86_64Addr = A(vsph[-1].value); def idx: X86_64Gpr = r_tmp0; def val: X86_64Gpr = r_tmp1; - def mem_addr = decode_memarg(vsph[-2].value, idx, val); + def mem_addr = decode_memarg(A(vsph[-2].value), idx, val); load_imm8(idx); asm.q.lea(val, data); // load address of the vector, val as a temp variable asm_mov_r_m(val, X86_64Addr.new(val, idx, size, 0)); // extract the lane from the value stack @@ -3136,9 +3131,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T, masm_emit: (X86_64Xmmr, X86_64Gpr) -> void) { bindHandler(opcode); - asm_mov_r_m(r_tmp0, vsph[-1].value); + asm_mov_r_m(r_tmp0, A(vsph[-1].value)); masm_emit(r_xmm0, r_tmp0); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); genTagUpdate(BpTypeCode.V128.code); endHandler(); } @@ -3146,9 +3141,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { load_memarg: (X86_64Gpr, X86_64Addr, X86_64Gpr) -> void, masm_emit: (X86_64Xmmr, X86_64Gpr) -> void) { bindHandler(opcode); - load_memarg(r_tmp0, vsph[-1].value, r_tmp1); + load_memarg(r_tmp0, A(vsph[-1].value), r_tmp1); masm_emit(r_xmm0, r_tmp0); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); genTagUpdate(BpTypeCode.V128.code); endHandler(); } @@ -3157,7 +3152,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.V128_CONST); { asm.movdqu_s_m(r_xmm0, ip_ptr); asm.q.add_r_i(r_ip, 16); - asm.movdqu_m_s(vsph[0].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[0].value, xenv.xmm0); genTagPush(BpTypeCode.V128.code); incrementVsp(); endHandler(); @@ -3172,9 +3167,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: 
DataWriter) { // V128 load bindHandler(Opcode.V128_LOAD); { computeCurIpForTrap(-1); - load_memarg128(r_xmm0, vsph[-1].value, r_tmp0, r_tmp1); + load_memarg128(r_xmm0, A(vsph[-1].value), r_tmp0, r_tmp1); if (valuerep.tagged) genTagUpdate(BpTypeCode.V128.code); // update tag if necessary - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); endHandler(); } // V128 load_lane @@ -3207,8 +3202,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // V128 store bindHandler(Opcode.V128_STORE); { computeCurIpForTrap(-1); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); - store_memarg128(vsph[-2].value, r_xmm0, r_tmp0, r_tmp1); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); + store_memarg128(A(vsph[-2].value), r_xmm0, r_tmp0, r_tmp1); adjustVsp(-2); endHandler(); } @@ -3243,11 +3238,11 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // Use the bits in the control mask c to select the corresponding bit // from v1 when 1 and v2 when 0. 
// This operation is equivalent to v128.or(v128.and(v1, c), v128.and(v2, v128.not(c))) - asm.movdqu_s_m(r_xmm0, vsph[-3].value); // v1 - asm.movdqu_s_m(r_xmm1, vsph[-2].value); // v2 - asm.movdqu_s_m(r_xmm2, vsph[-1].value); // c + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-3].value); // v1 + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm1, vsph[-2].value); // v2 + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm2, vsph[-1].value); // c masm.emit_v128_bitselect(r_xmm0, r_xmm1, r_xmm2, r_xmm3); - asm.movdqu_m_s(vsph[-3].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-3].value, xenv.xmm0); adjustVsp(-2); endHandler(); } @@ -3271,7 +3266,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(t.0); load_v128_xmm0_tmp0(); t.1(r_xmm0, r_tmp0, r_tmp1, r_xmm1, r_xmm2); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } @@ -3288,7 +3283,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(t.0); load_v128_xmm0_tmp0(); masm.emit_v128_shift(r_xmm0, r_tmp0, byte.view(t.2), r_tmp1, r_xmm1, t.1); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } @@ -3467,7 +3462,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.I8X16_SHUFFLE); { var RHS = X86_64Label.new(), LOOP_PRO = X86_64Label.new(), LOOP_EPI = X86_64Label.new(); incrementVsp(); // make room for a local variable dst (the result) - def dst = vsph[-1].value; + def dst = A(vsph[-1].value); def dst_addr = r_tmp3; asm.q.lea(dst_addr, dst); def idx = r_tmp0; // idx's bound is [0, 31] @@ -3475,8 +3470,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { def i = r_tmp2; // loop counter def SIMD_128_SIZE: byte = 16; // vectors from the value stack - def lhs = vsph[-3].value; - def rhs = vsph[-2].value; + def lhs = 
A(vsph[-3].value); + def rhs = A(vsph[-2].value); // for (i: byte < 16) asm.movq_r_i(i, 0); // LOOP_PROLOGUE: @@ -3494,8 +3489,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.q.cmp_r_i(i, SIMD_128_SIZE); // loop when 0 <= i < 16 asm.jc_rel_near(C.L, LOOP_PRO); // Return - asm.movdqu_s_m(r_xmm0, vsph[-1].value); - asm.movdqu_m_s(vsph[-3].value, r_xmm0); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); + masm.emit_mov_m_r(ValueKind.V128, vsph[-3].value, xenv.xmm0); adjustVsp(-2); endHandler(); // RHS: @@ -3509,7 +3504,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { bindHandler(Opcode.I8X16_RELAXED_SWIZZLE); { // XXX: faster swizzle available? load_v128_xmm0_xmm1(); masm.emit_i8x16_swizzle(r_xmm0, r_xmm1, r_tmp0, r_xmm2); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } @@ -3521,23 +3516,23 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // Relaxed dot products. 
genSimdBinopCommute(Opcode.I16X8_RELAXED_DOT_I8X16_I7X16_S, asm.pmaddubsw_s_s); bindHandler(Opcode.I32X4_RELAXED_DOT_I8X16_I7X16_ADD_S); { - asm.movdqu_s_m(r_xmm0, vsph[-3].value); // a - asm.movdqu_s_m(r_xmm1, vsph[-2].value); // b - asm.movdqu_s_m(r_xmm2, vsph[-1].value); // c + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-3].value); // a + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm1, vsph[-2].value); // b + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm2, vsph[-1].value); // c asm.pmaddubsw_s_s(r_xmm1, r_xmm0); masm.load_v128_mask(r_xmm0, masm.mask_i16x8_splat_0x0001, r_tmp0); asm.pmaddwd_s_s(r_xmm0, r_xmm1); asm.paddd_s_s(r_xmm2, r_xmm0); - asm.movdqu_m_s(vsph[-3].value, r_xmm2); + masm.emit_mov_m_r(ValueKind.V128, vsph[-3].value, xenv.xmm2); adjustVsp(-2); endHandler(); } } def genSimdMultiplyAdd(op: Opcode, is64: bool, isNeg: bool) { bindHandler(op); - asm.movdqu_s_m(r_xmm0, vsph[-3].value); // a - asm.movdqu_s_m(r_xmm1, vsph[-2].value); // b - asm.movdqu_s_m(r_xmm2, vsph[-1].value); // c + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-3].value); // a + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm1, vsph[-2].value); // b + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm2, vsph[-1].value); // c if (is64) { asm.mulpd_s_s(r_xmm0, r_xmm1); if (isNeg) asm.subpd_s_s(r_xmm2, r_xmm0); @@ -3547,7 +3542,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { if (isNeg) asm.subps_s_s(r_xmm2, r_xmm0); else asm.addps_s_s(r_xmm2, r_xmm0); } - asm.movdqu_m_s(vsph[-3].value, r_xmm2); + masm.emit_mov_m_r(ValueKind.V128, vsph[-3].value, xenv.xmm2); adjustVsp(-2); endHandler(); } @@ -3556,27 +3551,27 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } def genSimdBinops(opcodes: Array, f: (X86_64Xmmr, X86_64Xmmr) -> T) { for (opcode in opcodes) bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, vsph[-2].value); - asm.movdqu_s_m(r_xmm1, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-2].value); + 
masm.emit_mov_r_m(ValueKind.V128, xenv.xmm1, vsph[-1].value); f(r_xmm0, r_xmm1); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } def genSimdBinopCommute(opcode: Opcode, f: (X86_64Xmmr, X86_64Xmmr) -> T) { bindHandler(opcode); - asm.movdqu_s_m(r_xmm1, vsph[-2].value); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm1, vsph[-2].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_xmm0, r_xmm1); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-2].value, xenv.xmm0); decrementVsp(); endHandler(); } def genSimdUnop_xx_x(opcode: Opcode, f: (X86_64Xmmr, X86_64Xmmr) -> T) { bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_xmm0, r_xmm0); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); endHandler(); } def genSimdUnop_x_x(opcode: Opcode, f: (X86_64Xmmr) -> T) { @@ -3584,31 +3579,31 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } def genSimdUnops_x_x(opcodes: Array, f: (X86_64Xmmr) -> T) { for (opcode in opcodes) bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_xmm0); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); endHandler(); } def genSimdUnop_xxtmp_x(opcode: Opcode, f: (X86_64Xmmr, X86_64Xmmr) -> T) { bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_xmm0, r_xmm1); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); + masm.emit_mov_m_r(ValueKind.V128, vsph[-1].value, xenv.xmm0); endHandler(); } def genSimdUnop_x_r(opcode: Opcode, f: (X86_64Gpr, X86_64Xmmr) -> T) { bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, 
vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_tmp0, r_xmm0); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); } def genSimdUnop_xxtmp_r(opcode: Opcode, f: (X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) { bindHandler(opcode); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_mov_r_m(ValueKind.V128, xenv.xmm0, vsph[-1].value); f(r_tmp0, r_xmm0, r_xmm1); - asm.movd_m_r(vsph[-1].value, r_tmp0); + masm.emit_mov_m_r(ValueKind.I32, vsph[-1].value, xenv.tmp0); genTagUpdate(BpTypeCode.I32.code); endHandler(); } @@ -3700,13 +3695,13 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { stk[i] = 1; // mark as done } def genTagUpdate(tag: byte) { - if (valuerep.tagged) asm.movq_m_i(vsph[-1].tag, tag); + if (valuerep.tagged) masm.emit_mov_m_l(vsph[-1].tag, tag); } def genTagPush(tag: byte) { - if (valuerep.tagged) asm.movq_m_i(vsph[0].tag, i7.view(tag)); + if (valuerep.tagged) masm.emit_mov_m_l(vsph[0].tag, i7.view(tag)); } - def genTagPushR(r: X86_64Gpr) { - if (valuerep.tagged) asm.movq_m_r(vsph[0].tag, r); + def genTagPushR(r: Reg) { + if (valuerep.tagged) masm.emit_mov_m_r(ValueKind.I64, vsph[0].tag, r); } def genCopySlot(dst: X86_64Addr, src: X86_64Addr) { match (valuerep.slot_size) { @@ -3761,12 +3756,12 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.jc_rel_near(C.NZ, has_index); } genReadUleb32(r_tmp0); // decode offset - asm.movd_r_m(r_tmp1, vsph[-1].value); // read index off value stack + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); // read index off value stack asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset gen(r_tmp1, r_mem0_base.plusR(r_tmp0, 1, 0)); asm.bind(finish); if (valuerep.tagged && tag != BpTypeCode.I32.code) genTagUpdate(tag); // update tag if necessary - asm.movq_m_r(vsph[-1].value, r_tmp1); + masm.emit_mov_m_r(ValueKind.I64, vsph[-1].value, 
xenv.tmp1); endHandler(); if (has_index != null) { asm.bind(has_index); @@ -3776,7 +3771,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(memN, memN.plusR(r_tmp0, 8, offsets.Array_contents)); asm.movq_r_m(memN, memN.plus(offsets.NativeWasmMemory_start)); genReadUleb32(r_tmp0); // decode offset - asm.movd_r_m(r_tmp1, vsph[-1].value); // read index off value stack + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); // read index off value stack asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset gen(r_tmp1, memN.plusR(r_tmp0, 1, 0)); asm.jmp_rel_near(finish); @@ -3785,7 +3780,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.bind(index64); // TODO: multi-memory with memory64 masm.emit_read_uleb(r_tmp3, r_ip, r_tmp1, r_tmp0); // decode offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read index off value stack + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read index off value stack asm.movq_r_r(r_tmp2, r_tmp1); asm.q.or_r_r(r_tmp2, r_tmp3); // check that neither offset or index have asm.q.shr_r_i(r_tmp2, 34); // upper bits set @@ -3813,9 +3808,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.jc_rel_near(C.NZ, has_index); } genReadUleb32(r_tmp0); // decode offset - asm.movd_r_m(r_tmp1, vsph[-2].value); // read index + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-2].value); // read index asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read value + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read value gen(r_mem0_base.plusR(r_tmp0, 1, 0), r_tmp1); asm.bind(finish); adjustVsp(-2); @@ -3828,9 +3823,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movq_r_m(memN, memN.plusR(r_tmp0, 8, offsets.Array_contents)); asm.movq_r_m(memN, memN.plus(offsets.NativeWasmMemory_start)); genReadUleb32(r_tmp0); // decode offset - asm.movd_r_m(r_tmp1, 
vsph[-2].value); // read index off value stack + masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-2].value); // read index off value stack asm.q.add_r_r(r_tmp0, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read value + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read value gen(memN.plusR(r_tmp0, 1, 0), r_tmp1); asm.jmp_rel_near(finish); } @@ -3838,14 +3833,14 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.bind(index64); // TODO: multi-memory with memory64 masm.emit_read_uleb(r_tmp3, r_ip, r_tmp1, r_tmp0); // decode offset - asm.movq_r_m(r_tmp1, vsph[-2].value); // read index off value stack + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-2].value); // read index off value stack asm.movq_r_r(r_tmp2, r_tmp1); asm.q.or_r_r(r_tmp2, r_tmp3); // check that neither offset or index have asm.q.shr_r_i(r_tmp2, 34); // upper bits set var label = newTrapLabel(TrapReason.MEMORY_OOB); asm.jc_rel_far(X86_64Conds.NZ, label); asm.q.add_r_r(r_tmp3, r_tmp1); // add index + offset - asm.movq_r_m(r_tmp1, vsph[-1].value); // read value + masm.emit_mov_r_m(ValueKind.I64, xenv.tmp1, vsph[-1].value); // read value gen(r_mem0_base.plusR(r_tmp3, 1, 0), r_tmp1); asm.jmp_rel_near(finish); } @@ -4215,16 +4210,16 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { fatal("ran out of buffer space"); return w; } - private def genPopInto(reg: X86_64Gpr) { + private def genPopInto(reg: Reg) { decrementVsp(); - asm.movq_r_m(reg, vsph[0].value); + masm.emit_mov_r_m(ValueKind.I64, reg, vsph[0].value); } private def genPopCont(reg: X86_64Xmmr) { decrementVsp(); - asm.movq_r_m(r_scratch, vsph[0].value); + masm.emit_mov_r_m(ValueKind.I64, xenv.scratch, vsph[0].value); asm.pinsrq_s_r_i(reg, r_scratch, 0); if (!FeatureDisable.unboxedConts) { - asm.movd_r_m(r_scratch, vsph[0].value.plus(8)); + masm.emit_mov_r_m(ValueKind.I32, xenv.scratch, vsph[0].value.plus(8)); asm.pinsrq_s_r_i(reg, r_scratch, 1); } } 
diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 5325e353f..9de127d83 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -505,11 +505,36 @@ class X86_64MacroAssembler extends MacroAssembler { I32_AND => asm.d.and_r_m(G(reg), A(ma)); I32_OR => asm.d.or_r_m(G(reg), A(ma)); I32_XOR => asm.d.xor_r_m(G(reg), A(ma)); + I32_MUL => asm.d.imul_r_m(G(reg), A(ma)); I64_ADD => asm.q.add_r_m(G(reg), A(ma)); I64_SUB => asm.q.sub_r_m(G(reg), A(ma)); I64_AND => asm.q.and_r_m(G(reg), A(ma)); I64_OR => asm.q.or_r_m(G(reg), A(ma)); I64_XOR => asm.q.xor_r_m(G(reg), A(ma)); + I64_MUL => asm.q.imul_r_m(G(reg), A(ma)); + F32_ADD => asm.addss_s_m(X(reg), A(ma)); + F32_SUB => asm.subss_s_m(X(reg), A(ma)); + F32_MUL => asm.mulss_s_m(X(reg), A(ma)); + F32_DIV => asm.divss_s_m(X(reg), A(ma)); + F64_ADD => asm.addsd_s_m(X(reg), A(ma)); + F64_SUB => asm.subsd_s_m(X(reg), A(ma)); + F64_MUL => asm.mulsd_s_m(X(reg), A(ma)); + F64_DIV => asm.divsd_s_m(X(reg), A(ma)); + _ => unimplemented(); + } + } + def emit_binop_m_r(op: Opcode, ma: MasmAddr, reg: Reg) { + match (op) { + I32_ADD => asm.d.add_m_r(A(ma), G(reg)); + I32_SUB => asm.d.sub_m_r(A(ma), G(reg)); + I32_AND => asm.d.and_m_r(A(ma), G(reg)); + I32_OR => asm.d.or_m_r(A(ma), G(reg)); + I32_XOR => asm.d.xor_m_r(A(ma), G(reg)); + I64_ADD => asm.q.add_m_r(A(ma), G(reg)); + I64_SUB => asm.q.sub_m_r(A(ma), G(reg)); + I64_AND => asm.q.and_m_r(A(ma), G(reg)); + I64_OR => asm.q.or_m_r(A(ma), G(reg)); + I64_XOR => asm.q.xor_m_r(A(ma), G(reg)); _ => unimplemented(); } } @@ -528,6 +553,42 @@ class X86_64MacroAssembler extends MacroAssembler { _ => unimplemented(); } } + def emit_testd_m_i(addr: MasmAddr, i: int) { + asm.d.test_m_i(A(addr), i); + } + def emit_testq_m_i(addr: MasmAddr, i: int) { + asm.q.test_m_i(A(addr), i); + } + def emit_and_r_i(reg: Reg, i: int) { + asm.d.and_r_i(G(reg), i); + } + def emit_and_m_i(addr: MasmAddr, i: int) { + 
asm.d.and_m_i(A(addr), i); + } + def emit_xor_m_i(addr: MasmAddr, i: int) { + asm.d.and_m_i(A(addr), i); + } + def emit_popcntd_r_m(reg: Reg, addr: MasmAddr) { + asm.d.popcnt_r_m(G(reg), A(addr)); + } + def emit_popcntq_r_m(reg: Reg, addr: MasmAddr) { + asm.q.popcnt_r_m(G(reg), A(addr)); + } + def emit_sqrtf_r_m(reg: Reg, addr: MasmAddr) { + asm.sqrtss_s_m(X(reg), A(addr)); + } + def emit_sqrtd_r_m(reg: Reg, addr: MasmAddr) { + asm.sqrtsd_s_m(X(reg), A(addr)); + } + def emit_demote_r_m(reg: Reg, addr: MasmAddr) { + asm.cvtsd2ss_s_m(X(reg), A(addr)); + } + def emit_promote_r_m(reg: Reg, addr: MasmAddr) { + asm.cvtss2sd_s_m(X(reg), A(addr)); + } + def emit_cmpq_m_i(addr: MasmAddr, val: int) { + asm.q.cmp_m_i(A(addr), val); + } def emit_cmpq_r_i(cond: X86_64Cond, r1: X86_64Gpr, val: int) { asm.q.cmp_r_i(r1, val); asm.set_r(cond, r1); @@ -553,6 +614,36 @@ class X86_64MacroAssembler extends MacroAssembler { asm.set_r(cond, r1); asm.q.movbzx_r_r(r1, r1); } + def emit_cmpq_m_r(cond: X86_64Cond, addr: MasmAddr, reg: Reg) { + def r1 = G(reg); + asm.q.cmp_m_r(A(addr), r1); + asm.set_r(cond, r1); + asm.q.movbzx_r_r(r1, r1); + } + def emit_cmpd_m_r(cond: X86_64Cond, addr: MasmAddr, reg: Reg) { + def r1 = G(reg); + asm.d.cmp_m_r(A(addr), r1); + asm.set_r(cond, r1); + asm.d.movbzx_r_r(r1, r1); + } + // TODO migrate into above later based on uses (as br_r) + def emit_cmpd_m_i(addr: MasmAddr, val: int) { + asm.d.cmp_m_i(A(addr), val); + } + def emit_cmpf_r_m(reg: Reg, addr: MasmAddr) { + // TODO migrate more in here? + asm.ucomiss_s_m(X(reg), A(addr)); + } + def emit_cmpd_r_m(reg: Reg, addr: MasmAddr) { + // TODO migrate more in here? 
+ asm.ucomisd_s_m(X(reg), A(addr)); + } + def emit_roundf_r_m(reg: Reg, addr: MasmAddr, rounding: X86_64Rounding) { + asm.roundss_s_m(X(reg), A(addr), rounding); + } + def emit_roundd_r_m(reg: Reg, addr: MasmAddr, rounding: X86_64Rounding) { + asm.roundsd_s_m(X(reg), A(addr), rounding); + } def emit_pop_r(kind: ValueKind, reg: Reg) { match (kind) { I32 => asm.d.popq_r(G(reg)); @@ -801,6 +892,9 @@ class X86_64MacroAssembler extends MacroAssembler { def emit_jump_r(reg: Reg) { asm.ijmp_r(G(reg)); } + def emit_xchgq_m_r(addr: MasmAddr, reg: Reg) { + asm.xchgq_m_r(A(addr), G(reg)); + } def emit_increment_CountProbe(tmp: Reg, probe: CountProbe, increment: u64) { var r1 = G(tmp); var refOffset = asm.movq_r_p(r1, Pointer.atObject(probe) - Pointer.NULL); @@ -990,11 +1084,24 @@ class X86_64MacroAssembler extends MacroAssembler { asm.d.sub_r_r(scratch, r); asm.movd_r_r(r, scratch); // XXX: can save an instruction here? } + def emit_i32_clz_r_m(r: Reg, m: MasmAddr) { + asm.movd_r_i(scratch, -1); + asm.d.bsr_r_m(G(r), A(m)); + asm.d.cmov_r(C.Z, G(r), scratch); + asm.movd_r_i(scratch, 31); + asm.d.sub_r_r(scratch, G(r)); + asm.movd_m_r(A(m), scratch); + } def emit_i32_ctz_r_r(r: X86_64Gpr, s: X86_64Gpr) { asm.d.bsf_r_r(r, s); asm.movd_r_i(scratch, 32); asm.d.cmov_r(C.Z, r, scratch); } + def emit_i32_ctz_r_m(r: Reg, m: MasmAddr) { + asm.d.bsf_r_m(G(r), A(m)); + asm.movd_r_i(scratch, 32); + asm.d.cmov_r(C.Z, G(r), scratch); + } def emit_i64_clz_r_r(r: X86_64Gpr, s: X86_64Gpr) { asm.movq_r_i(scratch, -1); asm.q.bsr_r_r(r, s); @@ -1003,11 +1110,24 @@ class X86_64MacroAssembler extends MacroAssembler { asm.q.sub_r_r(scratch, r); asm.movq_r_r(r, scratch); // XXX: can save an instruction with second output reg } + def emit_i64_clz_r_m(r: Reg, m: MasmAddr) { + asm.movq_r_i(scratch, -1); + asm.q.bsr_r_m(G(r), A(m)); + asm.q.cmov_r(C.Z, G(r), scratch); + asm.movq_r_i(scratch, 63); + asm.q.sub_r_r(scratch, G(r)); + asm.movq_m_r(A(m), scratch); + } def emit_i64_ctz_r_r(r: X86_64Gpr, s: 
X86_64Gpr) { asm.q.bsf_r_r(r, s); asm.movq_r_i(scratch, 64); asm.q.cmov_r(C.Z, r, scratch); } + def emit_i64_ctz_r_m(r: Reg, m: MasmAddr) { + asm.q.bsf_r_m(G(r), A(m)); + asm.movq_r_i(scratch, 64); + asm.q.cmov_r(C.Z, G(r), scratch); + } def emit_i64_extend_i32_s(r: X86_64Gpr) { asm.q.shl_r_i(r, 32); asm.q.sar_r_i(r, 32); @@ -1015,6 +1135,18 @@ class X86_64MacroAssembler extends MacroAssembler { def emit_i64_extend_i32_u(r: X86_64Gpr) { asm.movd_r_r(r, r); } + def emit_i32_extend8_s_r_m(reg: Reg, addr: MasmAddr) { + asm.d.movbsx_r_m(G(reg), A(addr)); + } + def emit_i32_extend16_s_r_m(reg: Reg, addr: MasmAddr) { + asm.d.movwsx_r_m(G(reg), A(addr)); + } + def emit_i64_extend8_s_r_m(reg: Reg, addr: MasmAddr) { + asm.q.movbsx_r_m(G(reg), A(addr)); + } + def emit_i64_extend16_s_r_m(reg: Reg, addr: MasmAddr) { + asm.q.movwsx_r_m(G(reg), A(addr)); + } // SSE assemblers and helpers // Masks for simd instructions def mask_i8x16_splat_0x0f: (u64, u64) = (0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F); From db2c66a5ef3305b34fab264841566f47464f5bfe Mon Sep 17 00:00:00 2001 From: Matthew Schneider Date: Fri, 30 Jan 2026 11:02:28 -0500 Subject: [PATCH 2/2] Fix failing tests in VspHelper update --- src/engine/x86-64/X86_64Interpreter.v3 | 5 ++--- src/engine/x86-64/X86_64MacroAssembler.v3 | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 2980390c7..bb6e636b8 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -1107,7 +1107,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // BR_ON_NULL: check condition and either fall thru to next bytecode or ctl xfer (with stack copying) bindHandler(Opcode.BR_ON_NULL); - masm.emit_br_m(vsph[-1].value, MasmBrCond.REF_NULL, controlFallThruLabel); + masm.emit_br_m(vsph[-1].value, MasmBrCond.REF_NONNULL, controlFallThruLabel); decrementVsp(); masm.emit_br(controlTransferLabel); @@ 
-1754,7 +1754,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } bindHandler(Opcode.I64_MUL); { masm.emit_mov_r_m(ValueKind.I64, xenv.tmp0, vsph[-1].value); - masm.emit_binop_r_m(Opcode.I32_MUL, xenv.tmp0, vsph[-2].value); + masm.emit_binop_r_m(Opcode.I64_MUL, xenv.tmp0, vsph[-2].value); masm.emit_mov_m_r(ValueKind.I64, vsph[-2].value, xenv.tmp0); decrementVsp(); endHandler(); @@ -2159,7 +2159,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { masm.emit_and_r_i(xenv.tmp0, 0x7FFFFFFF); masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].upper); masm.emit_and_r_i(xenv.tmp1, 0x80000000); - masm.emit_mov_r_m(ValueKind.I32, xenv.tmp1, vsph[-1].value); asm.d.or_r_r(r_tmp0, r_tmp1); masm.emit_mov_m_r(ValueKind.I32, vsph[-2].upper, xenv.tmp0); decrementVsp(); diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 9de127d83..30d787d90 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -566,7 +566,7 @@ class X86_64MacroAssembler extends MacroAssembler { asm.d.and_m_i(A(addr), i); } def emit_xor_m_i(addr: MasmAddr, i: int) { - asm.d.and_m_i(A(addr), i); + asm.d.xor_m_i(A(addr), i); } def emit_popcntd_r_m(reg: Reg, addr: MasmAddr) { asm.d.popcnt_r_m(G(reg), A(addr));