diff --git a/fast_call.wasm b/fast_call.wasm
new file mode 100644
index 000000000..36f858295
Binary files /dev/null and b/fast_call.wasm differ
diff --git a/fast_call.wat b/fast_call.wat
new file mode 100644
index 000000000..fa74fa213
--- /dev/null
+++ b/fast_call.wat
@@ -0,0 +1,10 @@
+(module
+  ;; host import taking one i32 (presumably prints it -- confirm against wizeng)
+  (import "wizeng" "puti" (func $puti (param i32)))
+  (func $f (result i32) (i32.const 10))
+  (func (export "main") (result i32)
+    ;; pass the result of $f to $puti, then return 0
+    (call $puti (call $f))
+    (i32.const 0)
+  )
+)
diff --git a/fast_call2.wasm b/fast_call2.wasm
new file mode 100644
index 000000000..b3dcbf2bf
Binary files /dev/null and b/fast_call2.wasm differ
diff --git a/fast_call2.wat b/fast_call2.wat
new file mode 100644
index 000000000..3dd58686b
--- /dev/null
+++ b/fast_call2.wat
@@ -0,0 +1,7 @@
+(module
+  ;; $f: takes nothing, returns the constant 10
+  (func $f (result i32) (i32.const 10))
+  (func (export "main") (result i32)
+    (call $f) ;; main's result is whatever $f returns
+  )
+)
diff --git a/fast_call_export.wasm b/fast_call_export.wasm
new file mode 100644
index 000000000..de5abe4d8
Binary files /dev/null and b/fast_call_export.wasm differ
diff --git a/fast_call_export.wat b/fast_call_export.wat
new file mode 100644
index 000000000..20c428045
--- /dev/null
+++ b/fast_call_export.wat
@@ -0,0 +1,10 @@
+;; The "fast:" prefix of the export name carries the fast-call marking, so the binary is not modified ahead of time.
+
+(module
+  ;; $fast is exported under a "fast:"-prefixed name; its body just yields 2
+  (func $fast (export "fast:foo") (result i32)
+    i32.const 2)
+  (func (export "main") (result i32)
+    call $fast ;; main returns $fast's result
+  )
+)
diff --git a/fast_call_nop.wasm b/fast_call_nop.wasm
new file mode 100644
index 000000000..403dc7cf1
Binary files /dev/null and b/fast_call_nop.wasm differ
diff --git a/fast_call_nop.wat b/fast_call_nop.wat
new file mode 100644
index 000000000..c406ac91a
--- /dev/null
+++ b/fast_call_nop.wat
@@ -0,0 +1,10 @@
+(module
+  (func $f) ;; empty; not called anywhere in this module
+  (func $g) ;; empty; the callee used by main
+  (func (export "main") (result i32)
+    ;; push an i64 and discard it right away, ahead of the call
+    (drop (i64.const 11))
+    (call $g)
+    (i32.const 0)
+  )
+)
diff --git a/fast_call_param.wasm b/fast_call_param.wasm
new file mode 100644
index 000000000..c70071c25
Binary files /dev/null and b/fast_call_param.wasm differ
diff --git a/fast_call_param.wat b/fast_call_param.wat
new file mode 100644
index 000000000..b3f4ad728
--- /dev/null
+++ b/fast_call_param.wat
@@ -0,0 +1,18 @@
+(module
+  ;; host import taking one i32 (presumably prints it -- confirm against wizeng)
+  (import "wizeng" "puti" (func $puti (param i32)))
+  ;; $f: 999 when the argument is non-zero, -216 when it is zero
+  (func $f (param i32) (result i32)
+    (if (result i32) (local.get 0)
+      (then (i32.const 999))
+      (else (i32.const -216))
+    )
+  )
+  (func (export "main") (result i32)
+    ;; exercise both arms of $f, handing each result to $puti
+    (call $puti (call $f (i32.const 1)))
+    (call $puti (call $f (i32.const 0)))
+    (i32.const 0)
+  )
+)
+
diff --git a/int/Export b/int/Export
new file mode 100755
index 000000000..df2a0f18a
--- /dev/null
+++ b/int/Export
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+S=${BASH_SOURCE[0]}  # path this script was invoked through (may be a symlink)
+while [ -h "$S" ]; do  # follow symlinks until the real script file is reached
+  DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+  S=$(readlink "$S")
+  [[ $S != /* ]] && S=$DIR/$S  # relative link target: resolve against the link's directory
+done
+DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+wizeng "$DIR/Export.wasm" "$@"  # quoted so directories containing spaces are not word-split
diff --git a/int/Export.v3 b/int/Export.v3
new file mode 100644
index 000000000..2409223df
--- /dev/null
+++ b/int/Export.v3
@@ -0,0 +1,12 @@
+export "fast:foo" def foo(x: int, y: int) -> int {
+	// Computes y + y * (0 + 1 + ... + (x - 1)).
+	// When x <= 0 the loop body never runs and y is returned unchanged.
+	var acc = y;
+	for (k = 0; k < x; k++) acc += k * y;
+	return acc;
+}
+
+export "main" def main() -> int { // entry point: prints foo(11, 2) and returns 0
+	System.puts(Strings.format1("%d\n", foo(11, 2))); // one decimal number followed by a newline
+	return 0;
+}
diff --git a/int/Export.wasm b/int/Export.wasm
new file mode 100644
index 000000000..65fa630c5
Binary files /dev/null and b/int/Export.wasm differ
diff --git a/int/Interpreter b/int/Interpreter
new file mode 100755
index 000000000..55ced8cc9
--- /dev/null
+++ b/int/Interpreter
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+S=${BASH_SOURCE[0]}  # path this script was invoked through (may be a symlink)
+while [ -h "$S" ]; do  # follow symlinks until the real script file is reached
+  DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+  S=$(readlink "$S")
+  [[ $S != /* ]] && S=$DIR/$S  # relative link target: resolve against the link's directory
+done
+DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+wizeng "$DIR/Interpreter.wasm" "$@"  # quoted so directories containing spaces are not word-split
diff --git a/int/Interpreter.v3 b/int/Interpreter.v3
new file mode 100644
index 000000000..13f0a1b9e
--- /dev/null
+++ b/int/Interpreter.v3
@@ -0,0 +1,403 @@
+export "fast:const0"	def const0() -> long { return 0; } // handler functions: one per Opcode, exported under "fast:" names
+export "fast:const1"	def const1() -> long { return 1; }
+export "fast:constN"	def constN(n: int) -> long { return n; } // widens the int argument to long
+export "fast:add"   	def add(l: long, r: long) -> long { return l + r; }
+export "fast:sub"   	def sub(l: long, r: long) -> long { return l - r; }
+export "fast:fact"  	def fact(n: long) -> long { // iterative factorial: product of 1..n (1 when n <= 0)
+				var v: long = 1;
+				for (i < n) {
+					v *= i + 1;
+				}
+				return v;
+			}
+export "fast:seq"   	def seq(f: long, s: long) -> long { return s; } // ignores the first value, keeps the second
+export "fast:select"	def select(c: long, t: long, f: long) -> long { return if(c != 0, t, f); }
+export "fast:if"    	def if_(c: long) -> bool { return c != 0; } // turns a long into the condition of a branch
+export "fast:nop"   	def nop() -> void {}
+export "fast:print"   	def print(n: long) -> long {
+				System.puts(Strings.format1("%d\n", n));
+				return 0; // printing still produces a value (0)
+			}
+export "fast:double"    def double(n: long) -> long { return add(n, n); }
+
+def HANDLER_CONST0 = CiWasmTarget.functionId(const0); // ids stored in Opcode.handler and emitted after CALL by wasmCompile (presumably wasm function indices -- confirm)
+def HANDLER_CONST1 = CiWasmTarget.functionId(const1);
+def HANDLER_CONSTN = CiWasmTarget.functionId(constN);
+def HANDLER_ADD    = CiWasmTarget.functionId(add);
+def HANDLER_SUB    = CiWasmTarget.functionId(sub);
+def HANDLER_FACT   = CiWasmTarget.functionId(fact);
+def HANDLER_SEQ    = CiWasmTarget.functionId(seq);
+def HANDLER_SELECT = CiWasmTarget.functionId(select);
+def HANDLER_IF     = CiWasmTarget.functionId(if_);
+def HANDLER_NOP    = CiWasmTarget.functionId(nop);
+def HANDLER_PRINT  = CiWasmTarget.functionId(print);
+def HANDLER_DOUBLE = CiWasmTarget.functionId(double);
+
+export "main" def main() -> int { // builds a sample AST, runs it through eval() and through wasmCompile(), printing both results
+	def buf = StringBuilder.new();
+
+	//def prog = Select(Sub(ConstN(1), Const1), Add(Const1, ConstN(100)), Seq(Sub(Add(Const1, ConstN(2)), Const0), ConstN(15)));
+
+	//def prog = AST.If(Const1, ConstN(2), ConstN(3));
+	def prog = Double(Fact(ConstN(13))); // the program actually run: double(fact(13))
+
+	//def prog = Const1;
+	def bytecode = compile(prog);
+	def val = eval(bytecode); // result from the reference interpreter
+
+	prog.display(buf);
+	buf.ln();
+	buf.put1("=> %d", val);
+	buf.ln();
+	System.puts(buf.extract());
+
+	def f: Func.F = wasmCompile(bytecode);
+	def val_ = f.f(); // result from the generated wasm function; expected to equal val
+
+	buf.put1("=> %d", val_);
+	buf.ln();
+	System.puts(buf.extract());
+
+	return 0;
+}
+
+def eval(bytecode: Array<byte>) -> long { // reference interpreter: dumps the bytecode, then runs it on a value stack and returns the top
+	def vstk = ArrayStack<long>.new();
+	var pc = 0;
+
+	// print out bytecode
+	def b = StringBuilder.new();
+	while (pc < bytecode.length) {
+		b.put1("+%d ", pc);
+
+		def instruction = Ref<Instruction>.at(bytecode, pc);
+		def opcode  = instruction.opcode;
+		def operand = instruction.operand;
+		pc += Instruction.size;
+
+		b.puts(opcode.name);
+		match (opcode) {
+			CONSTN, IF, ELSE => b.put1(" %d", operand); // the only opcodes whose operand byte is used
+			_ => ;
+		}
+		b.ln();
+	}
+	System.puts(b.extract());
+	
+	pc = 0;
+	while (pc < bytecode.length) {
+		System.puts(Strings.format1("pc=%d\n", pc)); // per-instruction trace output
+		def instruction = Ref<Instruction>.at(bytecode, pc);
+		def opcode  = instruction.opcode;
+		def operand = instruction.operand;
+		pc += Instruction.size; // pc now points at the next instruction; branch operands are relative to it
+
+		match (opcode) {
+			CONST0 => vstk.push(0);
+			CONST1 => vstk.push(1);
+			CONSTN => vstk.push(operand);
+			ADD => {
+				def right = vstk.pop(); // operands come off in reverse order: right first
+				def left  = vstk.pop();
+				vstk.push(left + right);
+			}
+			SUB => {
+				def right = vstk.pop();
+				def left  = vstk.pop();
+				vstk.push(left - right);
+			}
+			FACT => {
+				def arg = vstk.pop();
+				var val: long = 1;
+				for (i < arg) {
+					val *= i + 1;
+				}
+				vstk.push(val);
+			}
+			PRINT => {
+				def arg = vstk.pop();
+				System.puts(Strings.format1("%d\n", arg));
+				vstk.push(0);
+			}
+			DOUBLE => {
+				def arg = vstk.pop();
+				vstk.push(arg + arg);
+			}
+			SEQ => {
+				def snd = vstk.pop();
+				def fst = vstk.pop(); // popped only to discard it
+				vstk.push(snd);
+			}
+			SELECT => {
+				def snd = vstk.pop();
+				def fst = vstk.pop();
+				def cond = vstk.pop();
+				vstk.push(if(cond != 0, fst, snd));
+			}
+			IF => {
+				def cond = vstk.pop();
+				if (cond == 0) pc += operand; // false: jump forward by the patched distance (past the ELSE)
+			}
+			ELSE => {
+				pc += operand; // reached after the then-arm: jump forward over the else-arm
+			}
+			END => {} // nop
+		}
+	}
+	return vstk.peek();
+}
+
+enum Opcode(handler: int) { // guest opcodes; handler is the HANDLER_* id of the function wasmCompile calls for it
+	CONST0	(HANDLER_CONST0) 
+	CONST1	(HANDLER_CONST1) 
+	CONSTN	(HANDLER_CONSTN) 
+	ADD	(HANDLER_ADD) 
+	SUB	(HANDLER_SUB)
+	FACT	(HANDLER_FACT)
+	SEQ     (HANDLER_SEQ)
+	SELECT  (HANDLER_SELECT)
+	IF      (HANDLER_IF)
+	ELSE    (HANDLER_NOP) // HANDLER_NOP: wasmCompile emits no call for this opcode
+	END     (HANDLER_NOP) // HANDLER_NOP: wasmCompile emits no call for this opcode
+	PRINT   (HANDLER_PRINT)
+	DOUBLE  (HANDLER_DOUBLE)
+}
+
+layout Instruction { // one guest instruction: 2 bytes, read in place via Ref<Instruction>.at(bytecode, pc)
+	+0	opcode:  Opcode;
+	+1	operand: byte; // constant for CONSTN, forward jump distance for IF/ELSE, 0 otherwise
+	=2;
+}
+
+type AST { // expression tree; each case emits 2-byte instructions (opcode tag, operand byte) in postfix order
+	case Const0 {
+		def compile(w: DataWriter) {
+			w.putb(Opcode.CONST0.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.putc('0');
+		}
+	}
+	case Const1 {
+		def compile(w: DataWriter) {
+			w.putb(Opcode.CONST1.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.putc('1');
+		}
+	}
+	case ConstN(n: byte) {
+		def compile(w: DataWriter) {
+			w.putb(Opcode.CONSTN.tag).putb(n); // the constant travels in the operand byte
+		}
+		def display(s: StringBuilder) {
+			s.putd(n);
+		}
+	}
+	case Add(left: AST, right: AST) {
+		def compile(w: DataWriter) {
+			left.compile(w);
+			right.compile(w);
+			w.putb(Opcode.ADD.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.putc('(');
+			left.display(s);
+			s.puts(" + ");
+			right.display(s);
+			s.putc(')');
+		}
+	}
+	case Fact(arg: AST) {
+		def compile(w: DataWriter) {
+			arg.compile(w);
+			w.putb(Opcode.FACT.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.puts("(fact ");
+			arg.display(s);
+			s.putc(')');
+		}
+	}
+	case Print(arg: AST) {
+		def compile(w: DataWriter) {
+			arg.compile(w);
+			w.putb(Opcode.PRINT.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.puts("(print ");
+			arg.display(s);
+			s.putc(')');
+		}
+	}
+	case Double(arg: AST) {
+		def compile(w: DataWriter) {
+			arg.compile(w);
+			w.putb(Opcode.DOUBLE.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.puts("(double ");
+			arg.display(s);
+			s.putc(')');
+		}
+	}
+	case Sub(left: AST, right: AST) {
+		def compile(w: DataWriter) {
+			left.compile(w);
+			right.compile(w);
+			w.putb(Opcode.SUB.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.putc('(');
+			left.display(s);
+			s.puts(" - ");
+			right.display(s);
+			s.putc(')');
+		}
+	}
+	case Seq(fst: AST, snd: AST) {
+		def compile(w: DataWriter) {
+			fst.compile(w);
+			snd.compile(w);
+			w.putb(Opcode.SEQ.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.putc('(');
+			fst.display(s);
+			s.puts(" ; ");
+			snd.display(s);
+			s.putc(')');
+		}
+	}
+	// eager evaluation of branches
+	case Select(cond: AST, left: AST, right: AST) {
+		def compile(w: DataWriter) {
+			cond.compile(w);
+			left.compile(w);
+			right.compile(w);
+			w.putb(Opcode.SELECT.tag).putb(0);
+		}
+		def display(s: StringBuilder) {
+			s.puts("(select ");
+			cond.display(s);
+			s.putc(' ');
+			left.display(s);
+			s.putc(' ');
+			right.display(s);
+			s.putc(')');
+		}
+	}
+	// lazy evaluation of branches
+	case If(cond: AST, left: AST, right: AST) {
+		def compile(w: DataWriter) {
+			cond.compile(w);
+			w.putb(Opcode.IF.tag).putb(0); // operand 0 is a placeholder, patched below
+			def hole1 = w.pos; // position just after the IF instruction; its operand byte is at hole1 - 1
+			left.compile(w);
+			w.putb(Opcode.ELSE.tag).putb(0); // operand 0 is a placeholder, patched below
+			w.data[hole1 - 1] = byte.!(w.pos - hole1); // IF operand: distance from after IF to after ELSE
+			def hole2 = w.pos; // position just after the ELSE instruction
+			right.compile(w);
+			w.data[hole2 - 1] = byte.!(w.pos - hole2); // ELSE operand: distance from after ELSE to the END
+			w.putb(Opcode.END.tag).putb(0);
+
+		}
+		def display(s: StringBuilder) {
+			s.puts("(if ");
+			cond.display(s);
+			s.putc(' ');
+			left.display(s);
+			s.putc(' ');
+			right.display(s);
+			s.putc(')');
+		}
+	}
+
+	def compile(w: DataWriter); // appends this node's bytecode to w
+	def display(s: StringBuilder); // appends a textual rendering of this node to s
+}
+
+def Const0 = AST.Const0; // short aliases so programs can be written without the AST. prefix
+def Const1 = AST.Const1;
+def ConstN = AST.ConstN;
+def Add = AST.Add;
+def Sub = AST.Sub;
+def Fact= AST.Fact;
+def Print = AST.Print;
+def Seq = AST.Seq;
+def Select = AST.Select;
+def If = AST.If;
+def Double = AST.Double;
+
+def compile(prog: AST) -> Array<byte> {
+	// Serializes {prog} into the 2-byte-per-instruction bytecode read by
+	// eval() and wasmCompile(); the emission itself is done by the AST node.
+	def out = DataWriter.new();
+	prog.compile(out);
+	return out.extract();
+}
+
+type Func { // wrapper around the closure returned by wasmCompile
+	case F(f: () -> long);
+}
+
+def wasmCompile(bytecode: Array<byte>) -> Func.F { // translates guest bytecode into a wasm function body made of handler calls
+	def w = DataWriter.new();
+
+	w.put_uleb32(0); // 0 locals
+
+	var pc = 0;
+	while (pc < bytecode.length) {
+		def instruction = Ref<Instruction>.at(bytecode, pc);
+		def opcode  = instruction.opcode;
+		def operand = instruction.operand;
+		pc += Instruction.size;
+
+		// setup for handler, if necessary (guest-level operands)
+		match (opcode) {
+			CONSTN => {
+				w.putb(I32_CONST); // push the operand as an i32 constant, the argument of constN
+				w.put_sleb32(operand);
+			}
+			_ => ;
+		}
+		// call handler function
+		if (opcode.handler != HANDLER_NOP) { // ELSE and END map to HANDLER_NOP and get no call
+			w.putb(CALL);
+			w.put_uleb32(u32.!(opcode.handler));
+		}
+		// post-handler wasm bytecodes
+		match (opcode) {
+			IF => {
+				w.putb(IF); // wasm `if` consuming the bool produced by the if_ handler
+				w.putb(RESULT_I64);
+			}
+			ELSE => w.putb(ELSE); // didn't emit handler anyway
+			END => w.putb(END);   // didn't emit handler anyway
+			_ => ;
+		}
+	}
+	w.putb(END); // closes the function body
+
+	// create wasm function
+	def sig  = CiWasmTarget.functionTypeId<int, long>(); // NOTE(review): <int, long> here, but the closure below is forged as <void, void, long> -- confirm intended
+	def wasm = w.extract();
+	def fid  = wave.new_func(sig, Pointer.atContents(wasm), wasm.length); // NOTE(review): `wave` is not defined in the visible source
+	if (fid < 0) {
+		System.puts("failed to compile wasm function\n");
+		System.error("error", "failed to compile");
+	}
+	def func = CiRuntime.forgeClosure<void, void, long>(Pointer.NULL + fid, void); // fid is passed as a pointer-sized value
+
+	return Func.F(func);
+}
+
+def IF: byte = 0x04; // WebAssembly opcode bytes emitted by wasmCompile
+def ELSE: byte = 0x05;
+def END: byte = 0x0B;
+def CALL: byte = 0x10;
+def DROP: byte = 0x1A; // not referenced in the visible code
+def I32_CONST: byte = 0x41;
+def I64_CONST: byte = 0x42; // not referenced in the visible code
+
+def RESULT_I64: byte = 0x7E; // i64 value type, used as the block result type of `if`
diff --git a/int/Interpreter.wasm b/int/Interpreter.wasm
new file mode 100644
index 000000000..efa0a14d2
Binary files /dev/null and b/int/Interpreter.wasm differ
diff --git a/int/InterpreterBug b/int/InterpreterBug
new file mode 100755
index 000000000..a88079884
Binary files /dev/null and b/int/InterpreterBug differ
diff --git a/int/InterpreterBug.v3 b/int/InterpreterBug.v3
new file mode 100644
index 000000000..fa20b709d
--- /dev/null
+++ b/int/InterpreterBug.v3
@@ -0,0 +1,25 @@
+def main() -> int { // NOTE(review): file name suggests a bug repro; code is left exactly as written
+	def x = A.Z(ay); // Z wrapping the top-level value ay (A.Y)
+
+	x.foo(); // Z.foo forwards to a.foo()
+
+	return 0;
+}
+
+def ay = A.Y; // top-level value passed to A.Z in main
+
+type A { // three cases sharing the method foo
+	case X {
+		def foo() {}
+	}
+	case Y {
+		def foo() {}
+	}
+	case Z(a: A) {
+		def foo() { a.foo(); } // forwards to the wrapped value
+	}
+
+	def foo(); // implemented by every case above
+}
+
+
diff --git a/int/InterpreterBug.wasm b/int/InterpreterBug.wasm
new file mode 100644
index 000000000..dc25a1547
Binary files /dev/null and b/int/InterpreterBug.wasm differ
diff --git a/int/RiRuntime b/int/RiRuntime
new file mode 100755
index 000000000..716d1b2e8
--- /dev/null
+++ b/int/RiRuntime
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+S=${BASH_SOURCE[0]}  # path this script was invoked through (may be a symlink)
+while [ -h "$S" ]; do  # follow symlinks until the real script file is reached
+  DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+  S=$(readlink "$S")
+  [[ $S != /* ]] && S=$DIR/$S  # relative link target: resolve against the link's directory
+done
+DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd)
+wizeng "$DIR/RiRuntime.wasm" "$@"  # quoted so directories containing spaces are not word-split
diff --git a/int/RiRuntime.wasm b/int/RiRuntime.wasm
new file mode 100644
index 000000000..049ef40aa
Binary files /dev/null and b/int/RiRuntime.wasm differ
diff --git a/slow_call.wasm b/slow_call.wasm
new file mode 100644
index 000000000..8d09b720e
Binary files /dev/null and b/slow_call.wasm differ
diff --git a/slow_call_nop.wasm b/slow_call_nop.wasm
new file mode 100644
index 000000000..2af221a38
Binary files /dev/null and b/slow_call_nop.wasm differ
diff --git a/slow_call_nop.wat b/slow_call_nop.wat
new file mode 100644
index 000000000..0533b6638
--- /dev/null
+++ b/slow_call_nop.wat
@@ -0,0 +1,7 @@
+(module
+  (func $f) ;; empty function: no params, no results, no body
+  (func (export "main") (result i32)
+    (call $f)
+    (i32.const 0)
+  )
+)
diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3
index 37af6878d..c341bb881 100644
--- a/src/engine/BytecodeIterator.v3
+++ b/src/engine/BytecodeIterator.v3
@@ -779,6 +779,53 @@ class BytecodeIterator {
 			RESUME_THROW => 		v.visit_RESUME_THROW(read_CONT(), read_TAG(), read_HANDLERS());
 			RESUME_THROW_REF => 		v.visit_RESUME_THROW_REF(read_CONT(), read_HANDLERS());
 			SWITCH => 			v.visit_SWITCH(read_CONT(), read_TAG());
+
+			/* Here we require that replacing CALL with FAST_CALL does not touch the
+			 * operand, so that the original function can still be recovered from the bytecode itself.
+			 *
+			 * In other places, where the module is available, we can go directly from the bytecode to the func.
+			 */
+			// FIXME: fold these cases into a single `_` clause
+			FAST_CALL0  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL1  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL2  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL3  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL4  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL5  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL6  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL7  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL8  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL9  => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL10 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL11 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL12 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL13 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL14 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL15 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL16 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL17 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL18 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL19 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL20 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL21 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL22 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL23 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL24 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL25 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL26 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL27 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL28 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL29 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL30 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL31 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL32 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL33 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL34 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL35 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL36 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL37 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL38 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
+			FAST_CALL39 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC());
 		}
 	}
 	def trace(out: StringBuilder, module: Module, tracer: InstrTracer) {
diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3
index bf3511288..d0d050019 100644
--- a/src/engine/CodeValidator.v3
+++ b/src/engine/CodeValidator.v3
@@ -420,7 +420,87 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e
 					var func = parser.readFuncRef();
 					if (func == null) return;
 					checkSignature(func.sig);
+
+					// fast call: if function is exported with fast name, replace the bytecode with FAST_CALL
+					if (FastIntTuning.useFastFunctions) {
+						for (i < module.exports.length) {
+							def ex = module.exports[i];
+							if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) {
+								if (Trace.validation) Trace.OUT.puts("    function declared as fast: ");
+								var fast_idx = -1;
+								def fast_funcs = module.fast_funcs;
+								// look for existing FAST_CALL instruction allocated for this function
+								// (uses its own index name so the export index {i} is not reused)
+								for (j < fast_funcs.length) {
+									if (func == fast_funcs[j]) {
+										fast_idx = j;
+										if (Trace.validation) Trace.OUT.put1("allocated as FAST_CALL%d, ", fast_idx);
+										break;
+									}
+								}
+								// not found? allocate FAST_CALL instruction, if there's space
+								if (fast_idx < 0) {
+									if (fast_funcs.length < 40) {
+										fast_idx = fast_funcs.length;
+										func.fast_call_idx = fast_idx;
+										if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx);
+										fast_funcs.put(func);
+									} else {
+										if (Trace.validation) Trace.OUT.puts("not found, FAST_CALL table is full, ");
+									}
+								}
+								// replace the bytecode, if it's found or allocated
+								if (fast_idx >= 0) {
+									if (Trace.validation) Trace.OUT.puts("replacing call\n");
+									this.func.replaceCall(opcode_pos, fast_idx);
+								} else {
+									if (Trace.validation) Trace.OUT.puts("not replacing\n");
+								}
+								break; // one decision per call site: a second "fast:" export of the same function would make replaceCall fail on the already-rewritten opcode
+							}
+						}
+					}
 				}
+				FAST_CALL0  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL1  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL2  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL3  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL4  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL5  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL6  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL7  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL8  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL9  => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL10 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL11 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL12 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL13 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL14 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL15 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL16 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL17 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL18 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL19 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL20 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL21 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL22 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL23 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL24 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL25 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL26 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL27 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL28 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL29 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL30 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL31 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL32 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL33 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL34 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL35 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL36 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL37 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL38 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
+				FAST_CALL39 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode");
 				CALL_INDIRECT => {
 					var sig = parser.readSigRef();
 					var table = parser.readTableRef();
diff --git a/src/engine/Module.v3 b/src/engine/Module.v3
index 90d85a4ae..a26d75e74 100644
--- a/src/engine/Module.v3
+++ b/src/engine/Module.v3
@@ -17,6 +17,7 @@ class Module(filename: string) {
 	def exports = Vector<(string, Decl)>.new();
 	def elems = Vector<ElemDecl>.new();
 	def data = Vector<DataDecl>.new();
+	def fast_funcs = Vector<FuncDecl>.new();
 	def custom_sections = Vector<CustomSection>.new();
 	var probes: Array<Array<Probe>>;
 	var dyn_probes: Vector<(int, int, Probe)>;
@@ -143,6 +144,8 @@ class FuncDecl(sig_index: int) extends Decl {
 	var target_code: TargetCode;
 	var tierup_trigger: int = int.max;
 	var handlers = FuncHandlerInfo.new();
+	var fast_target_code: TargetCode;
+	var fast_call_idx: int = -1;
 
 	def render(names: NameSection, buf: StringBuilder) -> StringBuilder {
 		var name = if (names != null, names.getFuncName(func_index));
@@ -154,6 +157,7 @@ class FuncDecl(sig_index: int) extends Decl {
 		var tc: TargetCode;
 		var tr: TargetCode;
 		target_code = tc; // reset target code as well
+		fast_target_code = tc;
 		sidetable = Sidetables.NO_SIDETABLE;
 		cbd_sidetable = null;
 	}
@@ -168,6 +172,17 @@ class FuncDecl(sig_index: int) extends Decl {
 		if (cur_bytecode == orig_bytecode) return;
 		cur_bytecode[pc] = orig_bytecode[pc];
 	}
+	def replaceCall(pc: int, idx: int) {
+		// On the first in-place patch, split off a separate copy to serve as "orig" so the current code can be edited.
+		if (orig_bytecode == cur_bytecode) orig_bytecode = Arrays.dup(orig_bytecode);
+		def found = cur_bytecode[pc];
+		if (found != Opcode.CALL.code) { // sanity check: only a plain CALL may be rewritten
+			def msg = Strings.format1("not replacing call (got %s)", Opcodes.find(0, found).mnemonic);
+			System.error("replace bytecode", msg);
+		}
+		// Only the opcode byte is patched; the operands are deliberately left alone, as a convenience for BytecodeIterator.
+		cur_bytecode[pc] = byte.!(Opcodes.indexToFastCall(idx).code);
+	}
 	def reset() -> this {
 		if (cur_bytecode == orig_bytecode) return;
 		ArrayUtil.copyInto(cur_bytecode, 0, orig_bytecode, 0, orig_bytecode.length);
@@ -183,6 +198,7 @@ class FuncDecl(sig_index: int) extends Decl {
 		n.sidetable = this.sidetable;
 		n.num_locals = this.num_locals;
 		n.target_code = this.target_code;
+		n.fast_target_code = this.fast_target_code;
 		return n;
 	}
 	def findExHandler(instance: Instance, tag: Tag, throw_pc: int) -> ExHandler {
@@ -201,6 +217,7 @@ class FuncDecl(sig_index: int) extends Decl {
 			Trace.OUT.put3("(func=%q, tag=%d, throw_pc=%d)",
 			this.render(instance.module.names, _), tag.decl.tag_index, throw_pc).ln();
 		}
+		
 		while (i < handlers.length) { // XXX: speed this up with a binary search
 			var e = handlers[i];
 			if (Trace.exception) Trace.OUT.put3("  entry[%d...%d] tag=%d", e.start, e.end, e.tag).ln();
diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3
index b0a78d735..2e785e04c 100644
--- a/src/engine/Opcodes.v3
+++ b/src/engine/Opcodes.v3
@@ -609,8 +609,51 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array<ImmKind>, sig:
 	RESUME_THROW			(0x00, 0xE4, "resume_throw", imm.CONT_TAG_HANDLE, null),
 	RESUME_THROW_REF		(0x00, 0xE5, "resume_throw_ref", imm.CONT_HANDLE, null),
 	SWITCH				(0x00, 0xE6, "switch", imm.CONT_TAG, null)
+
+	// Internal fast-call opcodes FAST_CALL0..FAST_CALL39 (see Opcodes.indexToFastCall / fastCallToIndex).
+	FAST_CALL0               (0x00, 0x27, "fast_call0",  imm.FUNC, null),
+	FAST_CALL1               (0x00, 0xC5, "fast_call1",  imm.FUNC, null),
+	FAST_CALL2               (0x00, 0xC6, "fast_call2",  imm.FUNC, null),
+	FAST_CALL3               (0x00, 0xC7, "fast_call3",  imm.FUNC, null),
+	FAST_CALL4               (0x00, 0xC8, "fast_call4",  imm.FUNC, null),
+	FAST_CALL5               (0x00, 0xC9, "fast_call5",  imm.FUNC, null),
+	FAST_CALL6               (0x00, 0xCA, "fast_call6",  imm.FUNC, null),
+	FAST_CALL7               (0x00, 0xCB, "fast_call7",  imm.FUNC, null),
+	FAST_CALL8               (0x00, 0xCC, "fast_call8",  imm.FUNC, null),
+	FAST_CALL9               (0x00, 0xCD, "fast_call9",  imm.FUNC, null),
+	FAST_CALL10              (0x00, 0xCE, "fast_call10", imm.FUNC, null),
+	FAST_CALL11              (0x00, 0xCF, "fast_call11", imm.FUNC, null),
+	FAST_CALL12              (0x00, 0xD7, "fast_call12", imm.FUNC, null),
+	FAST_CALL13              (0x00, 0xD8, "fast_call13", imm.FUNC, null),
+	FAST_CALL14              (0x00, 0xD9, "fast_call14", imm.FUNC, null),
+	FAST_CALL15              (0x00, 0xDA, "fast_call15", imm.FUNC, null),
+	FAST_CALL16              (0x00, 0xDB, "fast_call16", imm.FUNC, null),
+	FAST_CALL17              (0x00, 0xDC, "fast_call17", imm.FUNC, null),
+	FAST_CALL18              (0x00, 0xDD, "fast_call18", imm.FUNC, null),
+	FAST_CALL19              (0x00, 0xDE, "fast_call19", imm.FUNC, null),
+	FAST_CALL20              (0x00, 0xDF, "fast_call20", imm.FUNC, null),
+	FAST_CALL21              (0x00, 0xE7, "fast_call21", imm.FUNC, null),
+	FAST_CALL22              (0x00, 0xE8, "fast_call22", imm.FUNC, null),
+	FAST_CALL23              (0x00, 0xE9, "fast_call23", imm.FUNC, null),
+	FAST_CALL24              (0x00, 0xEA, "fast_call24", imm.FUNC, null),
+	FAST_CALL25              (0x00, 0xEB, "fast_call25", imm.FUNC, null),
+	FAST_CALL26              (0x00, 0xEC, "fast_call26", imm.FUNC, null),
+	FAST_CALL27              (0x00, 0xED, "fast_call27", imm.FUNC, null),
+	FAST_CALL28              (0x00, 0xEE, "fast_call28", imm.FUNC, null),
+	FAST_CALL29              (0x00, 0xEF, "fast_call29", imm.FUNC, null),
+	FAST_CALL30              (0x00, 0xF2, "fast_call30", imm.FUNC, null),
+	FAST_CALL31              (0x00, 0xF3, "fast_call31", imm.FUNC, null),
+	FAST_CALL32              (0x00, 0xF4, "fast_call32", imm.FUNC, null),
+	FAST_CALL33              (0x00, 0xF5, "fast_call33", imm.FUNC, null),
+	FAST_CALL34              (0x00, 0xF6, "fast_call34", imm.FUNC, null),
+	FAST_CALL35              (0x00, 0xF7, "fast_call35", imm.FUNC, null),
+	FAST_CALL36              (0x00, 0xF8, "fast_call36", imm.FUNC, null),
+	FAST_CALL37              (0x00, 0xF9, "fast_call37", imm.FUNC, null),
+	FAST_CALL38              (0x00, 0xFA, "fast_call38", imm.FUNC, null),
+	FAST_CALL39              (0x00, 0x17, "fast_call39", imm.FUNC, null),
 }
 
+
 // Enumeration of the different kinds of immediates to opcodes.
 enum ImmKind {
 	ARRAY_TYPE_INDEX,	// ARRAYT
@@ -798,9 +841,16 @@ component Opcodes {
 	def code_pages = [page_FB, page_FC, page_FD, page_FE];
 	def var longestName: int;
 	def var num_subpages: int;
+	def FAST_CALL_OPCODES = 40;
+	def var fast_calls: Array<Opcode>;
 	private var nameMap: HashMap<string, Opcode>;
 
 	new() {
+
+		fast_calls = Array.new(FAST_CALL_OPCODES);
+		for (i < FAST_CALL_OPCODES) {
+			fast_calls[i] = indexToFastCall(i);
+		}
 		for (op in Opcode) {
 			if (op == Opcode.INVALID) continue;
 			init(op);
@@ -809,6 +859,7 @@ component Opcodes {
 		attributes[InternalOpcode.PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE;
 		attributes[InternalOpcode.WHAMM_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE;
 		attributes[InternalOpcode.BREAK_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE;
+		for (op in fast_calls) attributes[op.tag] = OpcodeAttribute.INTERNAL;
 
 		for (op in [Opcode.END, Opcode.I32_CONST, Opcode.I64_CONST, Opcode.F32_CONST, Opcode.F64_CONST, Opcode.GLOBAL_GET,
 				Opcode.REF_NULL, Opcode.REF_FUNC, Opcode.STRUCT_NEW, Opcode.STRUCT_NEW_DEFAULT,
@@ -1121,6 +1172,100 @@ component Opcodes {
 			}
 		}
 	}
+	def indexToFastCall(index: int) -> Opcode {
+		var none: Opcode; // default value, returned only if control continues past the out-of-range error
+		match (index) {
+			0  => return Opcode.FAST_CALL0;
+			1  => return Opcode.FAST_CALL1;
+			2  => return Opcode.FAST_CALL2;
+			3  => return Opcode.FAST_CALL3;
+			4  => return Opcode.FAST_CALL4;
+			5  => return Opcode.FAST_CALL5;
+			6  => return Opcode.FAST_CALL6;
+			7  => return Opcode.FAST_CALL7;
+			8  => return Opcode.FAST_CALL8;
+			9  => return Opcode.FAST_CALL9;
+			10 => return Opcode.FAST_CALL10;
+			11 => return Opcode.FAST_CALL11;
+			12 => return Opcode.FAST_CALL12;
+			13 => return Opcode.FAST_CALL13;
+			14 => return Opcode.FAST_CALL14;
+			15 => return Opcode.FAST_CALL15;
+			16 => return Opcode.FAST_CALL16;
+			17 => return Opcode.FAST_CALL17;
+			18 => return Opcode.FAST_CALL18;
+			19 => return Opcode.FAST_CALL19;
+			20 => return Opcode.FAST_CALL20;
+			21 => return Opcode.FAST_CALL21;
+			22 => return Opcode.FAST_CALL22;
+			23 => return Opcode.FAST_CALL23;
+			24 => return Opcode.FAST_CALL24;
+			25 => return Opcode.FAST_CALL25;
+			26 => return Opcode.FAST_CALL26;
+			27 => return Opcode.FAST_CALL27;
+			28 => return Opcode.FAST_CALL28;
+			29 => return Opcode.FAST_CALL29;
+			30 => return Opcode.FAST_CALL30;
+			31 => return Opcode.FAST_CALL31;
+			32 => return Opcode.FAST_CALL32;
+			33 => return Opcode.FAST_CALL33;
+			34 => return Opcode.FAST_CALL34;
+			35 => return Opcode.FAST_CALL35;
+			36 => return Opcode.FAST_CALL36;
+			37 => return Opcode.FAST_CALL37;
+			38 => return Opcode.FAST_CALL38;
+			39 => return Opcode.FAST_CALL39;
+			_ => System.error("indexToFastCall", "out of range");
+		}
+		return none;
+	}
+	def fastCallToIndex(op: Opcode) -> int {
+		// Inverse of indexToFastCall: recovers the slot number n from FAST_CALL<n>; any other opcode is an error.
+		match (op) {
+			FAST_CALL0  => return 0;
+			FAST_CALL1  => return 1;
+			FAST_CALL2  => return 2;
+			FAST_CALL3  => return 3;
+			FAST_CALL4  => return 4;
+			FAST_CALL5  => return 5;
+			FAST_CALL6  => return 6;
+			FAST_CALL7  => return 7;
+			FAST_CALL8  => return 8;
+			FAST_CALL9  => return 9;
+			FAST_CALL10 => return 10;
+			FAST_CALL11 => return 11;
+			FAST_CALL12 => return 12;
+			FAST_CALL13 => return 13;
+			FAST_CALL14 => return 14;
+			FAST_CALL15 => return 15;
+			FAST_CALL16 => return 16;
+			FAST_CALL17 => return 17;
+			FAST_CALL18 => return 18;
+			FAST_CALL19 => return 19;
+			FAST_CALL20 => return 20;
+			FAST_CALL21 => return 21;
+			FAST_CALL22 => return 22;
+			FAST_CALL23 => return 23;
+			FAST_CALL24 => return 24;
+			FAST_CALL25 => return 25;
+			FAST_CALL26 => return 26;
+			FAST_CALL27 => return 27;
+			FAST_CALL28 => return 28;
+			FAST_CALL29 => return 29;
+			FAST_CALL30 => return 30;
+			FAST_CALL31 => return 31;
+			FAST_CALL32 => return 32;
+			FAST_CALL33 => return 33;
+			FAST_CALL34 => return 34;
+			FAST_CALL35 => return 35;
+			FAST_CALL36 => return 36;
+			FAST_CALL37 => return 37;
+			FAST_CALL38 => return 38;
+			FAST_CALL39 => return 39;
+			_ => System.error("fastCallToIndex", "not a FAST_CALL instruction");
+		}
+		return 0;
+	}
 }
 
 // Renders instructions as text.
diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3
index c1c12d3e1..323fcdd61 100644
--- a/src/engine/Tuning.v3
+++ b/src/engine/Tuning.v3
@@ -42,6 +42,7 @@ component FastIntTuning {
 	def inlineGlobalAccess = true;	// enable inline access of (primitive) globals
 	def stealFlagBitForMemory64 = true;	// use a bit in the memarg flags for memory64
 	def whammProbeTrampolineNumPages = 1024;
+	def useFastFunctions = false;    // treat functions exported with `fast:` in the name as fast functions
 }
 
 // Tuning settings for the single-pass compiler that have no effect on correctness.
diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3
index a3eb8110b..294c47b8d 100644
--- a/src/engine/compiler/MacroAssembler.v3
+++ b/src/engine/compiler/MacroAssembler.v3
@@ -368,6 +368,8 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) {
 	// Destructive on {parent}.
 	def emit_cont_mv(from_vsp: Reg, contStack: Reg, n_vals: Reg, tmp1: Reg, tmp2: Reg, xmm0: Reg);
 
+	def emit_dispatchSequence();
+
 	// Validates {cont} and:
 	// - Mark {cont} as used
 	// - Move {cont.stack} to {destContStack}
diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3
index 616224c22..f3e657766 100644
--- a/src/engine/compiler/SinglePassCompiler.v3
+++ b/src/engine/compiler/SinglePassCompiler.v3
@@ -76,8 +76,12 @@ def KIND_V128 = SpcConsts.KIND_V128;
 def KIND_REF = SpcConsts.KIND_REF;
 def KIND_REF_U64 = SpcConsts.KIND_REF_U64;
 
+// Unlike frame.frameSize, which is 0 for fast contexts, this is always the
+// true frame size (needed by the stack reconstruction methods).
+def FRAME_SIZE = X86_64InterpreterFrame.size;
+
 // Compiles Wasm bytecode to machine code in a single pass via a MacroAssembler.
-class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits) extends BytecodeVisitor {
+class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits, fast: bool) extends BytecodeVisitor {
 	def instrTracer = if(Trace.compiler, InstrTracer.new());
 	def config = masm.regConfig;
 	def regs = xenv;
@@ -102,6 +106,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	var sig: SigDecl;
 	var num_locals: int;
 	var local_base_sp: u31; // can use a Range for 0-indexing instead of from offset
+	var ctl_base_sp: u31;   // index of the RETURN control in ctl_stack for the current frame
 
 	var success = true;
 	var osr_pc: int;
@@ -112,9 +117,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	var ret_label: MasmLabel;
 	var last_probe = 0;
 	var skip_to_end: bool;
-	// this is Whamm probe inlining, not arbitrary function inlining (yet)
-	var is_inlined = false;
-	var whamm_probe_ctl_base: u31; // ctl_stack.top when Whamm probe compilation started
+	var whamm_config: WhammInlineConfig;
+	var frames_reconstructed = false;
 	// XXX: hack
 	var handler_dest_info = Vector<SpcHandlerInfo>.new();
 
@@ -131,12 +135,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 	def gen(module: Module, func: FuncDecl, err: ErrorGen) -> bool {
 		this.osr_pc = -1;
 		this.err = err;
-		return Metrics.spc_time_us.run(gen0, (module, func));
+		return Metrics.spc_time_us.run(gen0(_, _), (module, func));
 	}
 	def genOsr(module: Module, func: FuncDecl, pc: int, err: ErrorGen) -> MasmLabel {
 		this.osr_pc = pc;
 		this.err = err;
-		var ok = Metrics.spc_time_us.run(gen0, (module, func));
+		var ok = Metrics.spc_time_us.run(gen0(_, _), (module, func));
 		return if(ok, osr_entry_label);
 	}
 	private def gen0(module: Module, func: FuncDecl) -> bool {
@@ -166,7 +170,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 
 		// Push initial frame for top-level function
 		state.frame_stack.clear();
-		var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0);
+		if (fast) {
+			// push a SpcFrame representing the interpreter frame already on the stack
+			var interp_frame = SpcFrame.new(null, module, 0, 0, 0, -1, null);
+			pushSpcFrame(interp_frame);
+		}
+		var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0, masm.newLabel(func.cur_bytecode.length));
 		pushSpcFrame(initial_frame);
 
 		// Emit prologue, which allocates the frame and initializes various registers.
@@ -182,7 +191,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		// Emit function entry probe, if any.
 		if (!FeatureDisable.entryProbes && func.entry_probed) {
 			var probe = Instrumentation.getLocalProbe(module, func.func_index, 0);
-			emitProbe0(0, probe);
+			withReconstructedInlinedFrames(fun =>
+				emitProbe0(0, probe));
 		}
 
 		masm.current_fid = func.func_index;
@@ -214,8 +224,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.bindLabel(label);
 
 			if (frames.length > 1) {
-				// no inlining yet: this should never happen
-				System.error("SpcError", "attempt to emit trap in inlined context");
+				unrefRegs();
+				emitReconstructStackFrames(frames);
 			} else {
 				masm.emit_mov_m_i(xenv.pc_slot, label.create_pos);
 			}
@@ -346,36 +356,43 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		if (!cond) bailout(Strings.format3(msg, p1, p2, p3));
 	}
 	def emitPrologue() {
-		// Allocate stack frame
-		masm.emit_subw_r_i(regs.sp, frame.frameSize);
-
-		// Spill VSP
-		emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state
-		// Spill wf: WasmFunction
-		masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg);
-		// Load wf.instance and spill
-		masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg);
-		masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance);
-		// Clear FrameAccessor
-		masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind
-		// Clear inlined whamm instance
-		if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) {
-			masm.emit_mov_m_l(frame.inlined_instance_slot, 0);
+		if (!fast) {
+			// Allocate stack frame
+			masm.emit_subw_r_i(regs.sp, frame.frameSize);
+
+			// Spill VSP
+			emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state
+			// Spill wf: WasmFunction
+			masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg);
+			// Load wf.instance and spill
+			masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg);
+			masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance);
+			// Clear FrameAccessor
+			masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind
+			// Clear inlined whamm instance
+			if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) {
+				masm.emit_mov_m_l(frame.inlined_instance_slot, 0);
+			}
+		} else {
+			masm.emit_addw_r_i(X86_64MasmRegs.INT_EXEC_ENV.ip, uleb_size(func.func_index));
 		}
 
 		// Compute VFP = VSP - sig.params.length * SLOT_SIZE
 		masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); // XXX: use 3-addr adjustment of VFP
 		masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size);
-		// XXX: skip spilling of VFP
-		masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp);
 
-		// Load instance.memories[0].start into MEM0_BASE and spill
-		if (module.memories.length > 0) {
-			// XXX: skip loading memory base if function doesn't access memory
-			masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance);
-			masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0);
-			masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base);
-			masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base);
+		if (!fast) {
+			// XXX: skip spilling of VFP
+			masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp);
+
+			// Load instance.memories[0].start into MEM0_BASE and spill
+			if (module.memories.length > 0) {
+				// XXX: skip loading memory base if function doesn't access memory
+				masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance);
+				masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0);
+				masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base);
+				masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base);
+			}
 		}
 	}
 	def visitLocalDecl(count: u32, vtc: ValueTypeCode) {
@@ -399,7 +416,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		if (last_probe == 0) return;
 		var probe = Instrumentation.getLocalProbe(module, func.func_index, last_probe);
 		last_probe = 0;
-		emitProbe0(it.pc, probe);
+		withReconstructedInlinedFrames(fun =>
+			emitProbe0(it.pc, probe));
 		if (Trace.compiler) traceOpcodeAndStack(true);
 	}
 	def emitProbe0(pc: int, probe: Probe) {
@@ -484,40 +502,33 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 
 	// saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe
 	def emitWhammProbe(probe: WhammProbe) {
+		if (Trace.compiler) Trace.OUT.puts("emitting whamm probe\n");
 		// set up args and push to frame slots.
 		var whamm_sig = probe.sig;
-		var inline_config = InlineConfig(false, false, false);
-		var new_local_base_sp = 0;
 		var orig_sp = state.sp;
 		var callee_func = WasmFunction.!(probe.func);
+		def inline_decision = shouldInline(callee_func.decl) && SpcTuning.inlineWhammProbes; // TODO move to shouldInline
+		var swap_instance = false;
+		var swap_membase = false;
 
-		if (SpcTuning.inlineWhammProbes) {
-			inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func);
-			if (!probe.inline_heuristic_checked) {
-				inline_config = funcCanInline(callee_func.decl);
-				probe.inline_heuristic_checked = true;
-				probe.spc_swap_instance = inline_config.swap_instance;
-				probe.spc_swap_membase = inline_config.swap_membase;
-				probe.spc_inline_func = inline_config.can_inline;
-			}
+		if (inline_decision) {
+			probe.checkSwap();
+			swap_instance = probe.swap_instance;
+			swap_membase = probe.swap_membase;
 
-			if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly
+			if (swap_instance) {
 				masm.emit_mov_r_Instance(regs.scratch, callee_func.instance);
 				masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, regs.scratch);
 			}
-
-			// overwrite mem0_base with whamm instance's memory base, restore from frame slot later
-			if (inline_config.swap_membase) {
-				var membase = callee_func.instance.memories[0].getMemBase64();
-				masm.emit_mov_r_l(regs.mem0_base, i64.view(membase));
+			if (swap_membase) {
+				if (callee_func.instance.memories.length > 0) {
+					var membase = callee_func.instance.memories[0].getMemBase64();
+					masm.emit_mov_r_l(regs.mem0_base, i64.view(membase));
+				}
 				masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base);
 			}
-		}
-
-		if (!inline_config.can_inline) {
-			state.emitSaveAll(resolver, probeSpillMode);
 		} else {
-			new_local_base_sp = int.view(state.sp);
+			state.emitSaveAll(resolver, probeSpillMode);
 		}
 
 		for (i < whamm_sig.length) {
@@ -526,13 +537,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			var kind: byte;
 			match(whamm_sig[i]) {
 				FrameAccessor => {
-					if (inline_config.can_inline) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack.
+					if (inline_decision) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack.
 					masm.emit_call_runtime_getFrameAccessorMetaRef();
 					emit_reload_regs();
-					if (inline_config.can_inline && !probeSpillMode.free_regs) state.emitRestoreAll(resolver);
+					if (inline_decision && !probeSpillMode.free_regs) state.emitRestoreAll(resolver);
 
 					// move result to mem slot or reg, depending on inlining
-					if (inline_config.can_inline) {
+					if (inline_decision) {
 						var reg = allocRegTos(ValueKind.REF);
 						masm.emit_mov_r_r(ValueKind.REF, reg, xenv.runtime_ret0);
 						state.push(KIND_REF | IN_REG, reg, 0);
@@ -544,7 +555,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				Val(val) => {
 					match (val) {
 						I31(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.REF);
 								masm.emit_mov_r_i(reg, i32.view(v) << 1);
 								state.push(KIND_REF | IN_REG, reg, 0);
@@ -554,7 +565,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.REF.code;
 						}
 						I32(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v));
 							} else {
 								masm.emit_mov_m_d(slot_addr, v);
@@ -562,7 +573,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.I32.code;
 						}
 						I64(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.I64);
 								masm.emit_mov_r_l(reg, i64.view(v));
 								state.push(KIND_I64 | IN_REG, reg, 0);
@@ -572,7 +583,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.I64.code;
 						}
 						F32(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.F32);
 								masm.emit_mov_r_f32(reg, v);
 								state.push(KIND_F32 | IN_REG, reg, 0);
@@ -582,7 +593,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.F32.code;
 						}
 						F64(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.F64);
 								masm.emit_mov_r_d64(reg, v);
 								state.push(KIND_F64 | IN_REG, reg, 0);
@@ -592,7 +603,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.F64.code;
 						}
 						V128(l, h) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.V128);
 								masm.emit_mov_r_q(reg, l, h);
 								state.push(KIND_V128 | IN_REG, reg, 0);
@@ -603,7 +614,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.V128.code;
 						}
 						Ref(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.REF);
 								masm.emit_mov_r_Object(reg, v);
 								state.push(KIND_REF | IN_REG, reg, 0);
@@ -614,7 +625,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 							kind = ValueKind.REF.code;
 						}
 						Cont(v) => {
-							if (inline_config.can_inline) {
+							if (inline_decision) {
 								var reg = allocRegTos(ValueKind.REF_U64);
 								masm.emit_mov_r_Cont(reg, v);
 								state.push(KIND_REF_U64 | IN_REG, reg, 0);
@@ -629,15 +640,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				}
 				Operand(_, i) => {
 					var index = orig_sp + u32.view(i) - 1;
-					if (inline_config.can_inline) {
-						visit_LOCAL_GET(u31.view(index));
+					if (inline_decision) {
+						visit_LOCAL_GET(u31.view(index - local_base_sp));
 					} else {
 						masm.emit_mov_m_m(state.state[index].kind(), slot_addr, masm.slotAddr(index));
 					}
 					kind = state.state[index].kind().code;
 				}
 				Local(_, i) => {
-					if (inline_config.can_inline) {
+					if (inline_decision) {
 						visit_LOCAL_GET(u31.view(i));
 					} else {
 						masm.emit_mov_m_m(state.state[u31.view(i)].kind(), slot_addr, masm.slotAddr(u32.view(i)));
@@ -646,7 +657,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				}
 				Null => System.error("whamm", "null whamm arg!");
 			}
-			if (!inline_config.can_inline) {
+			if (!inline_decision) {
 				masm.emit_mov_m_i(slot_tag_addr, kind);
 			}
 		}
@@ -654,49 +665,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		var func_id = callee_func.decl.func_index;
 		var whamm_module = whamm_instance.module;
 		var whamm_func_decl = callee_func.decl;
-		if (inline_config.can_inline) {
-			var prev_it = it;
-			it = BytecodeIterator.new().reset(whamm_func_decl);
-			var orig_module = module;
-
-			// prepare spc for inlining
-			this.local_base_sp = u31.view(new_local_base_sp);
-			this.module = whamm_module;
-			this.func = whamm_func_decl;
-			this.sig = whamm_func_decl.sig;
-
-			// inline codegen
-			it.dispatchLocalDecls(this);
-			this.is_inlined = true;
-			if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln();
-			while (it.more() && success) {
-				if (Trace.compiler) traceOpcodeAndStack(false);
-				last_probe = 0;
-				masm.source_loc = it.pc;
-				it.dispatch(this);
-				if (Trace.compiler && Trace.asm) {
-					OUT.puts("JIT code: ");
-					masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes());
-					codegen_offset = masm.curCodeBytes();
-					OUT.ln();
-				}
-				unrefRegs();
-				if (Debug.compiler) checkRegAlloc();
-				it.next();
+		if (inline_decision) {
+			whamm_config = WhammInlineConfig(swap_membase, swap_instance, true);
+			masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp);
+			emitInlinedCall(whamm_func_decl, probe);
+			whamm_config = WhammInlineConfig(false, false, false);
+			// Restore mem0_base after probe
+			if (module.memories.length > 0) {
+				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
 			}
-			if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln();
-
-			// restore spc after inlining
-			it  = prev_it;
-			this.local_base_sp = 0;
-			this.is_inlined = false;
-			this.module = orig_module;
-			this.func = it.func;
-			this.sig = it.func.sig;
-			masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
-
-			// clear callee params/locals from abstract state
-			dropN(state.sp - orig_sp);
 		} else {
 			var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
 			var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg);
@@ -739,7 +716,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		state.prepareLoop(resolver);
 		masm.bindLabel(ctl_top.label);
 		emitProbe();
-		if (it.pc == osr_pc) {
+		if (it.pc == osr_pc && !isInlined()) {
 			osr_state = state.ctl_stack.peek().copyMerge();
 			osr_loop_label = masm.newLabel(it.pc);
 			masm.bindLabel(osr_loop_label);
@@ -792,7 +769,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		setUnreachable();
 	}
 	def visit_END() {
-		if (!this.is_inlined) {
+		if (needsEpilogue()) {
 			var ctl_top = state.ctl_stack.peek();
 			if (ctl_top.opcode == Opcode.LOOP.code) {
 				state.ctl_stack.pop();
@@ -813,6 +790,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				masm.bindLabel(ctl_top.label);
 				state.resetToMerge(ctl_top);
 				state.ctl_stack.pop();
+			// case for END for fallthrough at end of function?
 			} else if (ctl_top.opcode == Opcode.RETURN.code) {
 				state.emitFallthru(resolver);
 				masm.bindLabel(ctl_top.label);
@@ -821,8 +799,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				if (ctl_top.merge_count > 1) emitReturn(ctl_top);
 				state.ctl_stack.pop();
 			}
-			emitProbe();
 		}
+		emitProbe();
 	}
 	def visit_BR(depth: u31) {
 		var target = state.getControl(depth);
@@ -851,35 +829,150 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		setUnreachable();
 	}
 	def visit_RETURN() {
-		var target = state.ctl_stack.elems[0];
+		var target = state.ctl_stack.elems[ctl_base_sp];
 		state.emitTransfer(target, resolver);
-		if (ret_label == null) ret_label = masm.newLabel(func.cur_bytecode.length);
 		masm.emit_br(ret_label);
 		setUnreachable();
 	}
-	def visitCallDirect(op: Opcode, index: u31, tailCall: bool) {
+	// for CALL, FAST_CALL, and RETURN_CALL
+	def visitCallDirect(op: Opcode, index: u31, prop: CallProperty) {
 		if (op == Opcode.CALL) {
 			Metrics.spc_static_calls.val++;
 			masm.emit_inc_metric(Metrics.spc_dynamic_calls);
 		}
 		var func = module.functions[index];
-		var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc);
-		// Load the instance (which must happen before frame is unwound).
-		var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
-		var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg);
-		var tmp = allocTmp(ValueKind.REF);
-		emit_load_instance(tmp);
 
-		// Load the function, XXX: skip and compute function from instance + code on stack?
-		masm.emit_v3_Instance_functions_r_r(func_reg, tmp);
-		masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index);
+		// Try inlining for intra-module, non-tail calls
+		if (prop != CallProperty.TAIL && shouldInline(func)) {
+			if (Trace.compiler) Trace.OUT.put2("Inlining call to func #%d (%d bytes)", index, func.orig_bytecode.length).ln();
+			if (op == Opcode.CALL) {
+				Metrics.spc_static_inlined_calls.val++;
+				masm.emit_inc_metric(Metrics.spc_dynamic_inlined_calls);
+			}
+			emitInlinedCall(func, null);
+			return;
+		}
+
+		withReconstructedInlinedFrames(fun {
+			var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc);
+			// Load the instance (which must happen before frame is unwound).
+			var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
+			var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg);
+			var tmp = allocTmp(ValueKind.REF);
+			emit_load_instance(tmp);
+
+			// Load the function, XXX: skip and compute function from instance + code on stack?
+			masm.emit_v3_Instance_functions_r_r(func_reg, tmp);
+			masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index);
+
+			emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, prop);
+		});
+	}
+	def emitInlinedCall(callee_func: FuncDecl, whamm: WhammProbe) {
+		var sig = callee_func.sig;
+		var params_count = u32.view(sig.params.length);
+		var results_count = u32.view(sig.results.length);
+		var orig_sp = state.sp;
+
+		// Arguments are already on stack
+		// Stack: [..., arg0, arg1, ..., argN] <- sp
+		// We want callee's local 0 = arg0, so:
+		var new_local_base_sp: u31 = u31.view(orig_sp - params_count);
+		var new_ctl_base_sp = u31.view(state.ctl_stack.top);
+
+		var num_locals = callee_func.num_slots();
+
+		// Push a RETURN control for the inlined callee's function body.
+		var end_label = masm.newLabel(callee_func.cur_bytecode.length);
+		var func_body_ctl = state.pushFuncBody(sig.params, sig.results, end_label);
+
+		var m: Module = module;
+
+		// Whamm probe configuration
+		if (whamm != null) {
+			def whamm_sig = whamm.sig;
+			def whamm_wf = WasmFunction.!(whamm.func);
+			def whamm_instance = whamm_wf.instance;
+			def whamm_func_decl = whamm_wf.decl;
+
+			m = whamm_instance.module;
+			new_local_base_sp = u31.view(state.sp) - u31.view(whamm_sig.length); // XXX
+			func_body_ctl.val_stack_top = new_local_base_sp; // correct val_stack_top for whamm arg count 
+		}
+
+		// create merge state based on outer function's base sp given inlined function's results
+		func_body_ctl.merge_state = state.getInMemoryMergeWithArgs(int.view(new_local_base_sp), sig.results);
+		func_body_ctl.merge_count = 1;
+
+		// Create and push frame for inlined function
+		var callee_frame = SpcFrame.new(callee_func,
+			m, new_local_base_sp, new_ctl_base_sp, num_locals, 0, masm.newLabel(callee_func.cur_bytecode.length));
+
+		pushSpcFrame(callee_frame);
+
+		// Emit function entry probe, if any.
+		// XXX expensive because frame materialization required
+		if (whamm == null && !FeatureDisable.entryProbes && func.entry_probed) {
+			var probe = Instrumentation.getLocalProbe(module, callee_func.func_index, 0);
+			withReconstructedInlinedFrames(fun =>
+				emitProbe0(0, probe));
+		}
+
+		// Allocate callee's non-parameter locals
+		it.dispatchLocalDecls(this);
+
+		// Compile callee's bytecode
+		if (Trace.compiler) Trace.OUT.puts("  Start inlined function body").ln();
+		while (it.more() && success) {
+			if (Trace.compiler) traceOpcodeAndStack(false);
+			last_probe = 0;
+			masm.source_loc = it.pc;
+			masm.current_fid = func.func_index;
+			it.dispatch(this);
+			if (Trace.compiler && Trace.asm) {
+				OUT.puts("JIT code: ");
+				masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes());
+				codegen_offset = masm.curCodeBytes();
+				OUT.ln();
+			}
+			unrefRegs();
+			if (Debug.compiler) checkRegAlloc();
+			it.next();
+			if (skip_to_end) doSkipToEndOfBlock();
+		}
+		if (Trace.compiler) Trace.OUT.puts("  End inlined function body").ln();
+
+		// Check if the inlined function is unreachable (e.g., ended with UNREACHABLE, RETURN, THROW)
+		var inlined_reachable = state.ctl_stack.peek().reachable;
+
+		// Restore caller context by popping frame
+		popSpcFrame();  // Automatically restores cached fields
+
+		// Note: Control stack cleanup (popping implicit BLOCK) is handled by visit_END
+
+		// If inlined function is unreachable, no results to clean up
+		if (!inlined_reachable) {
+			if (Trace.compiler) {
+				Trace.OUT.puts("  Inlined function unreachable, skipping result cleanup").ln();
+				Trace.OUT.put3("    state.sp=%d, new_local_base_sp=%d, callee_slots=%d",
+					state.sp, new_local_base_sp, state.sp - new_local_base_sp).ln();
+			}
+			// Drop all callee state (params + locals, no results)
+			var callee_slots = state.sp - new_local_base_sp;
+			if (callee_slots > 0) dropN(u32.view(callee_slots));
+			if (Trace.compiler) Trace.OUT.put1("    After dropN: state.sp=%d", state.sp).ln();
+			setUnreachable();
+			return;
+		}
 
-		emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall);
+		if (Trace.compiler) {
+			Trace.OUT.put1("  Inlined call complete, sp=%d", state.sp).ln();
+		}
 	}
-	def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) {
+	def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, prop: CallProperty) {
 		var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc);
 		// Handle the current stack state.
-		if (tailCall) emitMoveTailCallArgs(sig); // transfer tail call args
+		if (prop == CallProperty.TAIL) emitMoveTailCallArgs(sig); // transfer tail call args
 		else state.emitSaveAll(resolver, SpillMode.SAVE_AND_FREE_REGS); // spill entire value stack
 
 		// Compute the value stack pointer.
@@ -887,7 +980,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		if (checkHostCall) {
 			// A call to imported function must first check for WasmFunction.
 			masm.emit_br_r(func_reg, MasmBrCond.IS_WASM_FUNC, wasmcall_label);
-			if (tailCall) {
+			if (prop == CallProperty.TAIL) {
 				masm.emit_jump_HostCallStub(); // XXX: stub relies on func_arg and VSP
 			} else {
 				masm.emit_call_HostCallStub(); // XXX: stub relies on func_arg and VSP
@@ -900,7 +993,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		masm.emit_v3_FuncDecl_target_code_r_r(tmp, tmp);
 
 		// Call or jump to the entrypoint.
-		if (tailCall) {
+		if (prop == CallProperty.TAIL) {
 			masm.emit_jump_r(tmp);
 			setUnreachable();
 		} else {
@@ -924,7 +1017,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		// adjust frame
 		masm.emit_addw_r_i(regs.sp, frame.frameSize);
 	}
-	def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool) {
+	def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty) {
 		var sig = SigDecl.!(module.heaptypes[sig_index]);
 		var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc);
 		var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp);
@@ -981,9 +1074,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.bindLabel(end);
 		}
 
-		emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, tailCall);
+		emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, prop);
 	}
-	def visitCallRef(op: Opcode, index: u31, tailCall: bool) {
+	def visitCallRef(op: Opcode, index: u31, prop: CallProperty) {
 		var sig = SigDecl.!(module.heaptypes[index]);
 		var sv = state.peek();
 		if (sv.isConst() && sv.const == 0) {
@@ -996,7 +1089,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		var tmp = allocTmp(ValueKind.REF);
 		var func_reg = sv.reg;
 
-		emitCallToReg(sig, func_reg, vsp_reg, tmp, true, tailCall);
+		emitCallToReg(sig, func_reg, vsp_reg, tmp, true, prop);
 	}
 	def visit_DROP() {
 		dropN(1);
@@ -1939,12 +2032,17 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		state.emitSaveAll(resolver, runtimeSpillMode);
 		emit_compute_vsp(regs.vsp, state.sp);
 		masm.emit_store_curstack_vsp(regs.vsp);
-		masm.emit_get_curstack(regs.runtime_arg0);
-		masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp);
-		masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0);
-		emit_load_instance(regs.runtime_arg1);
-		masm.emit_mov_r_i(regs.runtime_arg2, arg1);
-		masm.emit_call_runtime_op(op);
+
+		def emit = fun {
+			masm.emit_get_curstack(regs.runtime_arg0);
+			masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp);
+			masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0);
+			emit_load_instance(regs.runtime_arg1);
+			masm.emit_mov_r_i(regs.runtime_arg2, arg1);
+			masm.emit_call_runtime_op(op);
+		};
+		// Reconstruct stack frames across runtime calls that might (Wasm-level) trap.
+ 		if (canTrap) withReconstructedInlinedFrames(emit); else emit();
 		masm.emit_get_curstack(regs.scratch);
 		masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch);
 		dropN(args);
@@ -1956,13 +2054,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		state.emitSaveAll(resolver, runtimeSpillMode);
 		emit_compute_vsp(regs.vsp, state.sp);
 		masm.emit_store_curstack_vsp(regs.vsp);
-		masm.emit_get_curstack(regs.runtime_arg0);
-		masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp);
-		masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0);
-		emit_load_instance(regs.runtime_arg1);
-		masm.emit_mov_r_i(regs.runtime_arg2, arg1);
-		masm.emit_mov_r_i(regs.runtime_arg3, arg2);
-		masm.emit_call_runtime_op(op);
+
+		def emit = fun {
+			masm.emit_get_curstack(regs.runtime_arg0);
+			masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp);
+			masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0);
+			emit_load_instance(regs.runtime_arg1);
+			masm.emit_mov_r_i(regs.runtime_arg2, arg1);
+			masm.emit_mov_r_i(regs.runtime_arg3, arg2);
+			masm.emit_call_runtime_op(op);
+		};
+		// Reconstruct stack frames across runtime calls that might (Wasm-level) trap.
+ 		if (canTrap) withReconstructedInlinedFrames(emit); else emit();
 		masm.emit_get_curstack(regs.scratch);
 		masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch);
 		dropN(args);
@@ -2034,13 +2137,16 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 			masm.emit_br(target.label);
 		}
 	}
+	// Return includes epilogue
 	def emitReturn(ctl: SpcControl) {
 		// All explicit RETURN instructions branch here.
 		if (ret_label != null) {
 			masm.bindLabel(ret_label);
 			ret_label = null;
 		}
+
 		var results = sig.results;
+		// fix values?
 		if (masm.valuerep.tagged) {
 			// update mismatched value tags
 			var params = sig.params;
@@ -2050,14 +2156,24 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 				masm.emit_mov_m_i(masm.tagAddr(state.sp - u32.view(results.length) + u32.view(i)), rtag.code);
 			}
 		}
+
+		if (!needsEpilogue()) return;
+
 		// Compute VSP = VFP + state.sp
 		emit_compute_vsp(regs.vsp, state.sp);
-		// Return to caller
-		masm.emit_mov_r_i(regs.ret_throw, 0);
-		// Deallocate stack frame
-		masm.emit_addw_r_i(regs.sp, frame.frameSize);
-		masm.emit_ret();
+		if (!fast) {
+			// Return to caller                           // \ fast context: do not emit these instructions
+			masm.emit_mov_r_i(regs.ret_throw, 0);         // | instead, emit the dispatch sequence from the interpreter
+			// Deallocate stack frame                     // |
+			masm.emit_addw_r_i(regs.sp, frame.frameSize); // |
+			masm.emit_ret();                              // /
+		} else {
+			// Restore VFP from interpreter frame
+			masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot);
+			emitFastDispatch();
+		}
 	}
+	def emitFastDispatch() -> void;
 	def emitOsrEntry(osr_entry_label: MasmLabel, state: Array<SpcVal>) {
 		if (Trace.compiler) Trace.OUT.put1("  OSR (+%d)", osr_entry_label.create_pos).ln();
 		masm.bindLabel(osr_entry_label);
@@ -2087,6 +2203,137 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		return label;
 	}
 	def getSpcInlinedFrameIp() -> long;
+	def saveCallerIVars(); // declared without a body here; called before frame reconstruction when {fast} is set
+	def restoreDispatchTableReg(); // declared without a body here; called after reconstructed frames are popped when {fast} is set
+	def restoreCallerIVars(); // declared without a body here; called after reconstructed frames are popped when {fast} is set
+	// Emit code to materialize a stack frame for each inlined function in {frames}; returns the stack bytes reserved.
+	def emitReconstructStackFrames(frames: Array<SpcFrame>) -> int {
+		Metrics.spc_static_reconst.val++;
+		masm.emit_inc_metric(Metrics.spc_dynamic_reconst);
+		if (fast) {
+			// pc already saved
+			saveCallerIVars();
+		} else {
+			def real_frame = frames[0];
+			masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc);
+		}
+
+		// NOTE we could use interpreter-backed registers for these instead of allocating new regs
+		// load instance and mem0 base from the existing (outermost) frame's slots
+		var inst_reg = allocTmp(ValueKind.REF);
+		masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot);
+		var mem_reg = allocTmp(ValueKind.REF);
+		masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot);
+		// Load instance.functions
+		def func_reg = allocTmp(ValueKind.REF);
+		masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg);
+		def vfp_reg = allocTmp(ValueKind.REF);
+		masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot);
+		var prev_base_sp = int.view(frames[0].local_base_sp);
+		var wasm_func_reg = allocTmp(ValueKind.REF);
+
+		var inl_inst_reg: Reg, inl_mem0_reg: Reg;
+		if (whamm_config.is_inlined) { // TODO investigate, check individual configs?
+			inl_inst_reg = allocTmp(ValueKind.REF);
+			inl_mem0_reg = allocTmp(ValueKind.REF);
+			masm.emit_mov_r_m(ValueKind.REF, inl_inst_reg, frame.inlined_instance_slot);
+			masm.emit_mov_r_m(ValueKind.REF, inl_mem0_reg, frame.inlined_mem0_base_slot);
+		}
+
+		// Pre-allocate stack space for all reconstructed frames at once (FRAME_SIZE plus an 8-byte return-address slot each).
+		def total_space = (frames.length - 1) * (FRAME_SIZE + 8);
+		masm.emit_subw_r_i(regs.sp, total_space);
+
+		// Process the inlined frames (skip the outermost which already exists on native stack); the last frame gets offset 0
+		for (i = 1; i < frames.length; i++) {
+			def frame_info = frames[i];
+			def cur_base_sp = int.view(frame_info.local_base_sp);
+			def delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size;
+			emitReconstructStackFrame(frame_info, frames.length - i - 1, delta,
+					wasm_func_reg, func_reg, inst_reg, mem_reg, vfp_reg, inl_inst_reg, inl_mem0_reg);
+			prev_base_sp = cur_base_sp;
+		}
+
+		return total_space;
+	}
+	def emitReconstructStackFrame(spcFrame: SpcFrame, offset: int, vfp_delta: int,
+			wasm_func_reg: Reg, func_reg: Reg, inst_reg: Reg, mem_reg: Reg, vfp_reg: Reg, inl_inst_reg: Reg, inl_mem0_reg: Reg) {
+		// Fills in one reconstructed frame; the inlined frame stub IP is the return address for all reconstructed frames
+		def return_addr = getSpcInlinedFrameIp();
+
+		def frame_offset = offset * (FRAME_SIZE + 8); // byte offset of this frame from {regs.sp}
+		// Write inlined frame stub IP as return address
+		def retaddr_slot = MasmAddr(regs.sp, frame_offset + FRAME_SIZE);
+		masm.emit_mov_m_l(retaddr_slot, return_addr);
+
+		// get functions[func_index] and save into frame ({wasm_func_reg} is overwritten)
+		def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset);
+		masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, spcFrame.func.func_index);
+		masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg);
+
+		// Save instance
+		def instance_slot = frame.instance_slot.plus(frame_offset);
+		masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg);
+
+		// Save mem0 base
+		def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset);
+		masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg);
+
+		// Step vfp_reg by change in local_base_sp from previous frame and save ({vfp_reg} stays advanced for the next call)
+		if (vfp_delta != 0) masm.emit_addw_r_i(vfp_reg, vfp_delta);
+		def vfp_slot = frame.vfp_slot.plus(frame_offset);
+		masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg);
+
+		// Save PC
+		def pc_slot = frame.pc_slot.plus(frame_offset);
+		masm.emit_mov_m_i(pc_slot, spcFrame.pc);
+
+		// Clear FrameAccessor
+		def accessor_slot = frame.accessor_slot.plus(frame_offset);
+		masm.emit_mov_m_l(accessor_slot, 0);
+
+		// if an inlined whamm probe, also copy the inlined instance/mem0 slots; otherwise zero them
+		if (whamm_config.is_inlined) {
+			def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset);
+			masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg);
+			def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset);
+			masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg);
+		} else {
+			def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset);
+			masm.emit_mov_m_l(inl_instance_slot, 0);
+			def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset);
+			masm.emit_mov_m_l(inl_mem0_base_slot, 0);
+		}
+	}
+	// Runs {emit}; when compiling inside an inlined frame, wraps it in frame reconstruction before and teardown after.
+	def withReconstructedInlinedFrames(emit: void -> void) {
+		if (isInlined()) {
+			if (frames_reconstructed) {
+				// FIXME this should not happen (but does):
+				// - in the case of deep nesting when one layer is a Whamm probe
+				// - when refactoring to avoid `with` clause, GC test fails (inlining depth 2)
+				if (Trace.compiler) Trace.OUT.puts("  nested frame reconstruction inhibited\n");
+				emit();
+				return;
+			}
+			unrefRegs();
+			frames_reconstructed = true; // marks reconstruction as active so a nested request takes the branch above
+			if (Trace.compiler) Trace.OUT.puts("performing frame reconstruction\n");
+			def space = emitReconstructStackFrames(snapshotFrames());
+			emit();
+			frames_reconstructed = false;
+			if (space > 0) { // pop the reconstructed frames, then reload VFP from the frame's vfp slot
+				masm.emit_addw_r_i(regs.sp, space);
+				masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot);
+				if (fast) {
+					restoreCallerIVars();
+					restoreDispatchTableReg();
+				}
+			}
+		} else {
+			emit();
+		}
+	}
 	def unsupported() {
 		success = false; // XXX: add opcode
 	}
@@ -2181,7 +2428,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		// XXX: recompute VFP from VSP - #slots?
 		masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot);
 		if (module.memories.length > 0) {
-			if (is_inlined) {
+			if (whamm_config.is_inlined) {
 				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.inlined_mem0_base_slot);
 			} else {
 				masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot);
@@ -2189,7 +2436,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		}
 	}
 	def emit_load_instance(reg: Reg) {
-		if (is_inlined) { // inline compilation
+		if (whamm_config.is_inlined) { // inline compilation
 			masm.emit_mov_r_m(ValueKind.REF, reg, frame.inlined_instance_slot);
 		} else {
 			masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot);
@@ -2517,15 +2764,20 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		if (func != null) masm.pushInlineContext(func.func_index);
 
 		def current = state.frame_stack.peek();
-		if (current != null) current.pc = it.pc;
+		if (current != null) {
+			current.pc = it.pc;
+			current.ret_label = ret_label;
+		}
 		state.frame_stack.push(frame);
 		// Update cached copies from new top frame
-		it.reset(frame.func).at(frame.pc, -1);
+		if (frame.func != null) it.reset(frame.func).at(frame.pc, -1);
 		module = frame.module;
 		func = frame.func;
-		sig = func.sig;
+		sig = if(func != null, func.sig);
 		num_locals = frame.num_locals;
 		local_base_sp = frame.local_base_sp;
+		ctl_base_sp = frame.ctl_base_sp;
+		ret_label = frame.ret_label;
 	}
 	def popSpcFrame() -> SpcFrame {
 		masm.popInlineContext();
@@ -2539,12 +2791,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		sig = func.sig;
 		num_locals = current.num_locals;
 		local_base_sp = current.local_base_sp;
+		ctl_base_sp = current.ctl_base_sp;
+		ret_label = current.ret_label;
 		return frame;
 	}
 
 	def isInlined() -> bool {
 		return state.frame_stack.top > 1;
 	}
+	def needsEpilogue() -> bool {
+		if (ctl_base_sp == 0) return true;
+		return !isInlined(); // an inlined callee falls through into its caller, so no epilogue is emitted for it
+	}
 	def inlineDepth() -> int {
 		return state.frame_stack.top - 1;
 	}
@@ -2553,10 +2811,41 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl
 		for (i < state.frame_stack.top) {
 			var f = state.frame_stack.elems[i];
 			var pc = if(i == state.frame_stack.top - 1, it.pc, f.pc);
-			frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc);
+			frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc, null);
 		}
 		return frames;
 	}
+	// Determine if a regular function call to {func} should be inlined; traces the decision and the reason for any refusal.
+	def shouldInline(func: FuncDecl) -> bool {
+		if (Trace.compiler) OUT.put1("deciding on inlining call to func #%d: ", func.func_index);
+
+		if (func.imp != null) return no("imported");
+		if (inlineDepth() >= SpcTuning.maxInlineDepth) return no("max inline depth exceeded");
+		if (func.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize) return no("func too large");
+		if (func.sig.params.length > SpcTuning.maxInlineParams) return no("too many parameters");
+
+		// Scan bytecode for unsupported instructions (tail calls, exception handling, stack switching)
+		var bi = BytecodeIterator.new().reset(func);
+		while (bi.more()) {
+			match (bi.current()) {
+				RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF =>
+					return no("uses tail call instruction");
+				TRY, CATCH, THROW, RETHROW, THROW_REF, DELEGATE, CATCH_ALL, TRY_TABLE =>
+					return no("uses exception handling instruction");
+				CONT_NEW, CONT_BIND, SUSPEND, RESUME, RESUME_THROW, RESUME_THROW_REF, SWITCH =>
+					return no("uses stack switching instruction");
+				_ => ;
+			}
+			bi.next();
+		}
+
+		if (Trace.compiler) OUT.puts("YES\n");
+		return true;
+	}
+	private def no(reason: string) -> bool { // traces "NO (reason)" when compiler tracing is on; always returns false
+		if (Trace.compiler) OUT.puts("NO (").puts(reason).putc(')').ln();
+		return false;
+	}
 }
 // Different branch instructions have different repush
 enum BrRepush(taken: bool, not_taken: bool) {
@@ -2710,8 +2999,9 @@ class SpcFrame {
 	var ctl_base_sp: u31;    // Base index into SpcState.ctl_stack
 	var num_locals: int;
 	var pc: int;
+	var ret_label: MasmLabel;
 
-	new(func, module, local_base_sp, ctl_base_sp, num_locals, pc) {}
+	new(func, module, local_base_sp, ctl_base_sp, num_locals, pc, ret_label) {}
 }
 
 class SpcState(regAlloc: RegAlloc) {
@@ -2728,7 +3018,7 @@ class SpcState(regAlloc: RegAlloc) {
 		ctl_stack.clear();
 		// manually set up first control entry and return merge state
 		var results = sig.results;
-		var ctl = pushControl(Opcode.RETURN.code, ValueTypes.NONE, results, ret_label);
+		var ctl = pushFuncBody(ValueTypes.NONE, results, ret_label);
 		var merge_state = Array<SpcVal>.new(results.length);
 		for (i < results.length) {
 			// request the merged values be stored to the stack, but don't require tags
@@ -2760,6 +3050,9 @@ class SpcState(regAlloc: RegAlloc) {
 	def pushBlock(params: Array<ValueType>, results: Array<ValueType>, end_label: MasmLabel) -> SpcControl {
 		return pushControl(Opcode.BLOCK.code, params, results, end_label);
 	}
+	def pushFuncBody(params: Array<ValueType>, results: Array<ValueType>, end_label: MasmLabel) -> SpcControl {
+		return pushControl(Opcode.RETURN.code, params, results, end_label); // a function-body entry is tagged with the RETURN opcode
+	}
 	def pushLoop(params: Array<ValueType>, results: Array<ValueType>, start_label: MasmLabel) -> SpcControl {
 		var ctl = pushControl(Opcode.LOOP.code, params, results, start_label);
 		return ctl;
@@ -3263,38 +3556,7 @@ class MoveNode {
 	var dstNext: MoveNode;	// next in a list of successors
 }
 
-// checks function bytecode to see if it can be inlined based on
-// simple heuristics: length <= maxInlineBytecodeSize and straightline code.
-def funcCanInline(decl: FuncDecl) -> InlineConfig {
-	var default = InlineConfig(false, false, false);
-	if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default;
-	var bi = BytecodeIterator.new().reset(decl);
-	var swap_instance = false;
-	var swap_membase = false;
-	while (bi.more()) {
-		var op = bi.current();
-		match (op) {
-			// Cannot handle control flow yet.
-			IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default;
-			// These opcodes require swapping the instance.
-			THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP,
-			ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true;
-			// Load/store opcodes require either the memory base or the instance.
-			I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32,
-			V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U,
-			I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => {
-				var memarg = bi.immptr().read_MemArg();
-				if (memarg.memory_index == 0) swap_membase = true;
-				else swap_instance = true;
-			}
-			_ => ;
-		}
-		bi.next();
-	}
-	return InlineConfig(swap_membase, swap_instance, true);
-}
-
-type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool);
+type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool);
 
 // Used to record the entry point of exception/suspension handlers. Jumping to {stub_label} allows
 // control transfer to its corresponding handler without falling back to fast-int.
@@ -3302,3 +3564,9 @@ type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool);
 // The SPC emits a stub at {stub_label} for each handler in the function. The stub restores the
 // expected state of the environment, then jumps to {dest_label} to continue execution at handler.
 type SpcHandlerInfo(is_dummy: bool, func_end: bool, dest_label: MasmLabel, stub_label: MasmLabel, merge_state: Array<SpcVal>);
+
+def uleb_size(v: int) -> int {
+	var size = 1; // an encoding always has at least one byte; {v} is viewed as unsigned 32-bit
+	for (rest = u32.view(v) >> 7; rest != 0; rest = rest >> 7) size++; // one more byte per remaining 7-bit group
+	return size;
+}
diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3
index ff4ae0d13..c84ad7267 100644
--- a/src/engine/v3/V3Interpreter.v3
+++ b/src/engine/v3/V3Interpreter.v3
@@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack {
 			RETURN => {
 				doReturn(frame.fp, frame.func.sig);
 			}
-			CALL => {
+			CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => {
 				var func_index = codeptr.read_uleb32();
 				var f = frame.func.instance.functions[func_index];
 				return doCallFunction(f);
@@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack {
 		// XXX: use read_opcode_and_skip()
 		var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl);
 		match (opcode) {
-			CALL, CALL_REF => {
+			CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => {
 				codeptr.skip_leb();
 				frame.pc = codeptr.pos;
 			}
diff --git a/src/engine/x86-64/Mmap.v3 b/src/engine/x86-64/Mmap.v3
index 5305ef0c6..25621ab12 100644
--- a/src/engine/x86-64/Mmap.v3
+++ b/src/engine/x86-64/Mmap.v3
@@ -18,6 +18,16 @@ component Mmap {
 		RiGc.registerFinalizer(mapping, range.unmap);
 		return mapping;
 	}
+	def reserve32(size: u64, prot: int) -> Mapping { // anonymous private mmap of {size} bytes with MAP_32BIT; null on failure
+		var flags = LinuxConst.MAP_PRIVATE | LinuxConst.MAP_ANONYMOUS | 0x40; // 0x40 = MAP_32BIT; NOTE(review): presumably to get a low (32-bit addressable) mapping, confirm against mmap(2)
+		var r = Linux.syscall(LinuxConst.SYS_mmap, (Pointer.NULL, size, prot, flags, 0, 0));
+		if (r.0 == -1) return null; // a result of -1 is treated as failure
+		var start = Pointer.NULL + r.0, end = start + i64.view(size);
+		var range = MemoryRange.new(start, end);
+		var mapping = Mapping.new(range);
+		RiGc.registerFinalizer(mapping, range.unmap); // registers {range.unmap} as the finalizer for {mapping}
+		return mapping;
+	}
 	def protect(start: Pointer, size: u64, prot: int) -> bool {
 		var r = Linux.syscall(LinuxConst.SYS_mprotect, (start, size, prot));
 		return r.0 == 0;
diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3
index 5de4559e9..2761ab15a 100644
--- a/src/engine/x86-64/V3Offsets.v3
+++ b/src/engine/x86-64/V3Offsets.v3
@@ -31,6 +31,7 @@ class V3Offsets {
 	def FuncDecl_orig_bytecode	= int.view(Pointer.atField(decl.orig_bytecode) - Pointer.atObject(decl));
 	def FuncDecl_sidetable		= int.view(Pointer.atField(decl.sidetable.entries) - Pointer.atObject(decl));
 	def FuncDecl_target_code	= int.view(Pointer.atField(decl.target_code.spc_entry) - Pointer.atObject(decl));
+	def FuncDecl_fast_target_code	= int.view(Pointer.atField(decl.fast_target_code.spc_entry) - Pointer.atObject(decl));
 	def FuncDecl_tierup_trigger	= int.view(Pointer.atField(decl.tierup_trigger) - Pointer.atObject(decl));
 	def FuncDecl_entry_probed	= int.view(Pointer.atField(decl.entry_probed) - Pointer.atObject(decl));
 	def FuncDecl_frame_var_tags	= int.view(Pointer.atField(decl.frame_var_tags) - Pointer.atObject(decl));
diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
index 29307ca98..7b3e74e91 100644
--- a/src/engine/x86-64/X86_64Interpreter.v3
+++ b/src/engine/x86-64/X86_64Interpreter.v3
@@ -536,6 +536,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		var tmp = r_scratch;
 		{ // Entrypoint for calls coming from V3
 			ic.header.intV3EntryOffset = w.pos;
+			//masm.emit_debugger_breakpoint();
 
 			// Allocate and initialize interpreter stack frame from incoming V3 args.
 			asm.q.sub_r_i(r_sp, k_frame_size);
@@ -1244,7 +1245,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 	}
 	def genLocals() {
 		bindHandler(Opcode.DROP);
+		//masm.emit_debugger_breakpoint();
 		decrementVsp();
+		//masm.emit_debugger_breakpoint();
 		endHandler();
 
 		bindHandler(Opcode.LOCAL_GET);
@@ -1307,6 +1310,89 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 			asm.movd_r_i(G(Target.V3_RET_GPRS[0]), 0);
 			genPopFrameAndRet();
 
+			// FAST_CALL
+			// TODO patch the dispatch table so it goes to the code directly,
+			// instead of this fast function lookup
+			bindHandler(Opcode.FAST_CALL0);
+			bindHandler(Opcode.FAST_CALL1);
+			bindHandler(Opcode.FAST_CALL2);
+			bindHandler(Opcode.FAST_CALL3);
+			bindHandler(Opcode.FAST_CALL4);
+			bindHandler(Opcode.FAST_CALL5);
+			bindHandler(Opcode.FAST_CALL6);
+			bindHandler(Opcode.FAST_CALL7);
+			bindHandler(Opcode.FAST_CALL8);
+			bindHandler(Opcode.FAST_CALL9);
+			bindHandler(Opcode.FAST_CALL10);
+			bindHandler(Opcode.FAST_CALL11);
+			bindHandler(Opcode.FAST_CALL12);
+			bindHandler(Opcode.FAST_CALL13);
+			bindHandler(Opcode.FAST_CALL14);
+			bindHandler(Opcode.FAST_CALL15);
+			bindHandler(Opcode.FAST_CALL16);
+			bindHandler(Opcode.FAST_CALL17);
+			bindHandler(Opcode.FAST_CALL18);
+			bindHandler(Opcode.FAST_CALL19);
+			bindHandler(Opcode.FAST_CALL20);
+			bindHandler(Opcode.FAST_CALL21);
+			bindHandler(Opcode.FAST_CALL22);
+			bindHandler(Opcode.FAST_CALL23);
+			bindHandler(Opcode.FAST_CALL24);
+			bindHandler(Opcode.FAST_CALL25);
+			bindHandler(Opcode.FAST_CALL26);
+			bindHandler(Opcode.FAST_CALL27);
+			bindHandler(Opcode.FAST_CALL28);
+			bindHandler(Opcode.FAST_CALL29);
+			bindHandler(Opcode.FAST_CALL30);
+			bindHandler(Opcode.FAST_CALL31);
+			bindHandler(Opcode.FAST_CALL32);
+			bindHandler(Opcode.FAST_CALL33);
+			bindHandler(Opcode.FAST_CALL34);
+			bindHandler(Opcode.FAST_CALL35);
+			bindHandler(Opcode.FAST_CALL36);
+			bindHandler(Opcode.FAST_CALL37);
+			bindHandler(Opcode.FAST_CALL38);
+			bindHandler(Opcode.FAST_CALL39);
+			masm.emit_intentional_crash();
+			//masm.emit_debugger_breakpoint();
+			var dispatchLabel = X86_64Label.new();
+			// genTagPush(BpTypeCode.I32.code);
+			// asm.movq_m_i(vsph[0].value, 770);
+			// incrementVsp();
+
+			/* TODO What should happen in a FAST_CALL?
+			 *
+			 * Ideally, we've patched the dispatch table with exactly what appears in fast_target_code
+			 * so it instantly jumps there and so we don't have to set up the jump first.
+			 *
+			 * Fast function implementation should include code to skip the original operand as
+			 * part of incrementing pc (will be done over in SPC).
+			 *
+			 * But, we could keep this for a quasi-fast call?
+			 */
+
+			genReadUleb32(r_tmp1);
+			asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_functions));
+			asm.movq_r_m(func_arg, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents));
+
+			var tmp = r_tmp2;
+			asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl));
+
+			//masm.emit_debugger_breakpoint();
+			asm.ijmp_m(tmp.plus(offsets.FuncDecl_fast_target_code));
+			//asm.icall_m(tmp.plus(offsets.FuncDecl_fast_target_code));
+			//asm.invalid();
+			
+			// don't go here
+			asm.bind(dispatchLabel);
+			masm.emit_nop();
+			masm.emit_nop();
+			masm.emit_nop();
+			masm.emit_nop();
+			masm.emit_nop();
+			masm.emit_nop();
+			endHandler();
+
 			bindHandler(Opcode.CALL);
 			computeCurIpForTrap(-1);
 			genReadUleb32(r_tmp1);
@@ -1326,6 +1412,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 				var tmp = r_tmp2;
 				asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl));
 				asm.icall_m(tmp.plus(offsets.FuncDecl_target_code));
+				// assembly call to target function
+				// if not compiled, interpreter's entry point
 			} else {
 				asm.call_rel_far(callReentryLabel);
 			}
@@ -2700,7 +2788,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		if (FastIntTuning.enableWhammProbeTrampoline) {
 			var pos = w.atEnd().pos;
 			writeDispatchEntry(dispatchTables[0].1, InternalOpcode.BREAK_PROBE.code, pos);
-			masm.emit_debugger_breakpoint();
+			//masm.emit_debugger_breakpoint();
 			// Compute a pointer to the original code at this pc offset
 			var pc = r_tmp1; // = IP - CODE
 			asm.movq_r_r(pc, r_ip);
@@ -4000,6 +4088,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 	// Generate a dispatch from the main dispatch table.
 	def genDispatch() {
 		genDispatch0(ip_ptr, if (FeatureDisable.globalProbes, dispatchTables[0].1), true);
+		//masm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, dispatchTables[0].1, true, ic);
 	}
 	// Generate a load of the next bytecode and a dispatch through the dispatch table.
 	def genDispatch0(ptr: X86_64Addr, table: IcCodeRef, increment: bool) {
diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3
index 20a76af68..8e50d5668 100644
--- a/src/engine/x86-64/X86_64MacroAssembler.v3
+++ b/src/engine/x86-64/X86_64MacroAssembler.v3
@@ -1592,6 +1592,49 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.pextrq_r_s_i(G(to), X(from), 1);
 	}
 
+	// Emits an interpreter dispatch: loads the opcode byte from {ptr} (if non-null), increments {r_ip}
+	// (if {increment}), then jumps via a table whose entry width is {FastIntTuning.dispatchEntrySize}.
+	// The table is {table} when non-null, otherwise the one addressed by {r_dispatch}.
+	// {opcode} and {base} may be overwritten as temporaries.
+	// Register notes kept from the original comment (xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV):
+	//   r_ip = rax, r_dispatch = r14, r_tmp0 = rcx, r_tmp1 = rdx
+	def emit_int_dispatch(opcode: X86_64Gpr, base: X86_64Gpr, r_ip: X86_64Gpr, r_dispatch: X86_64Gpr,
+			ptr: X86_64Addr, table: IcCodeRef, increment: bool, ic: X86_64InterpreterCode) {
+		if (ptr != null) asm.movbzx_r_m(opcode, ptr);
+		if (increment) asm.inc_r(r_ip);
+		match (FastIntTuning.dispatchEntrySize) {
+			2 => { // 16-bit entries: a sign-extended offset is added to the table base
+				if (table == null) asm.movq_r_r(base, r_dispatch);
+				else asm.lea(base, table); // RIP-relative LEA
+				asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset
+				asm.add_r_r(base, opcode);
+				//if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
+				asm.ijmp_r(base);
+			}
+			4 => { // 32-bit entries: the loaded entry is used directly as the jump target
+				if (table == null) {
+					asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0));
+				} else {
+					var addr = ic.start + table.offset;
+					asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL)));
+				}
+				//if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
+				asm.ijmp_r(base);
+			}
+			8 => { // 64-bit entries: indirect jump through the table entry in memory
+				if (table == null) {
+					//if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
+					asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0));
+				} else {
+					var addr = ic.start + table.offset;
+					//if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos;
+					asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL)));
+				}
+			}
+		}
+
+	}
+
 	// Reads a 32- or 64-bit unsigned LEB from {rw_ptr} into {w_dest}.
 	def emit_read_uleb(w_dest: X86_64Gpr, rw_ptr: X86_64Gpr, w_scratch1: X86_64Gpr, w_scratch2: X86_64Gpr) -> this {
 		// TODO: handle w_dest = rcx
diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3
index 0668c35a5..73af95b67 100644
--- a/src/engine/x86-64/X86_64MasmRegs.v3
+++ b/src/engine/x86-64/X86_64MasmRegs.v3
@@ -88,33 +88,34 @@ component X86_64MasmRegs {
 		return config;
 	})();
 
-	// Build both the SPC and INT execution environments together.
-	private def t = (fun -> (SpcExecEnv, IntExecEnv) {
+	// Build the SPC, fast-SPC, and INT execution environments together.
+	private def t = (fun -> (SpcExecEnv, SpcExecEnv, IntExecEnv) {
 		var xspc = SpcExecEnv.new();
+		var xfast = SpcExecEnv.new();
 		var xint = IntExecEnv.new();
 
-		xint.sp			=	xspc.sp			= RSP;
-		xint.func_arg		= 	xspc.func_arg		= RDX;
-		xint.vsp 		= 	xspc.vsp		= RSI;
-		xint.vfp 		= 	xspc.vfp		= R11;
-		xint.mem0_base 		= 	xspc.mem0_base		= R10;
-		xint.instance 		= 	xspc.instance		= RDI;
-		xint.runtime_arg0 	= 	xspc.runtime_arg0	= RSI;
-		xint.runtime_arg1 	=	xspc.runtime_arg1 	= RDX;
-		xint.runtime_arg2 	= 	xspc.runtime_arg2 	= RCX;
-		xint.runtime_arg3 	= 	xspc.runtime_arg3 	= R8;
-		xint.runtime_arg4 	= 	xspc.runtime_arg4 	= R9;
-		xint.ret_throw 		=	xspc.ret_throw		= RAX;
-		xint.runtime_ret0 	= 	xspc.runtime_ret0	= RAX;
-		xint.runtime_ret1 	=	xspc.runtime_ret1 	= RDX;
-		xint.scratch		=	xspc.scratch		= RBP;
+		xint.sp			=	xspc.sp		=	xfast.sp		= RSP;
+		xint.func_arg		= 	xspc.func_arg	=	xfast.func_arg		= RDX; // cache of frame (callee-restore)
+		xint.vsp 		= 	xspc.vsp	=	xfast.vsp		= RSI;
+		xint.vfp 		= 	xspc.vfp	=	xfast.vfp		= R11;
+		xint.mem0_base 		= 	xspc.mem0_base	=	xfast.mem0_base		= R10; // cache of frame (callee-restore)
+		xint.instance 		= 	xspc.instance	=	xfast.instance		= RDI; // cache of frame (callee-restore)
+		xint.runtime_arg0 	= 	xspc.runtime_arg0 =	xfast.runtime_arg0	= RSI;
+		xint.runtime_arg1 	=	xspc.runtime_arg1 =	xfast.runtime_arg1	= RDX;
+		xint.runtime_arg2 	= 	xspc.runtime_arg2 =	xfast.runtime_arg2	= RCX;
+		xint.runtime_arg3 	= 	xspc.runtime_arg3 =	xfast.runtime_arg3	= R8;
+		xint.runtime_arg4 	= 	xspc.runtime_arg4 =	xfast.runtime_arg4	= R9;
+		xint.ret_throw 		=	xspc.ret_throw	=	xfast.ret_throw		= RAX;
+		xint.runtime_ret0 	= 	xspc.runtime_ret0 =	xfast.runtime_ret0	= RAX;
+		xint.runtime_ret1 	=	xspc.runtime_ret1 =	xfast.runtime_ret1	= RDX;
+		xint.scratch		=	xspc.scratch	=	xfast.scratch		= RBP;
 
 		xint.curpc 		= R15;
 		xint.stp		= RBX;
 		xint.ip			= RAX;
 		xint.func_decl		= R12;
 		xint.eip		= R13;
-		xint.dispatch		= R14;
+		xint.dispatch		= R14;  // cache of field (see how it is saved/stored in interpreter)
 		xint.xmm0		= XMM0;
 		xint.xmm1		= XMM1;
 		xint.xmm2		= XMM2;
@@ -127,29 +128,32 @@ component X86_64MasmRegs {
 
 		def m = MasmAddr(xspc.sp, _);
 
-		xint.accessor_slot	=	xspc.accessor_slot		= m(X86_64InterpreterFrame.accessor.offset);
-		xint.instance_slot	=	xspc.instance_slot		= m(X86_64InterpreterFrame.instance.offset);
-		xint.mem0_base_slot	=	xspc.mem0_base_slot		= m(X86_64InterpreterFrame.mem0_base.offset);
-		xint.pc_slot		=	xspc.pc_slot			= m(X86_64InterpreterFrame.curpc.offset);
-		xint.vfp_slot		=	xspc.vfp_slot			= m(X86_64InterpreterFrame.vfp.offset);
-		xint.vsp_slot		=	xspc.vsp_slot			= m(X86_64InterpreterFrame.vsp.offset);
-		xint.wasm_func_slot	=	xspc.wasm_func_slot		= m(X86_64InterpreterFrame.wasm_func.offset);
-		xint.ip_slot		=	xspc.inlined_mem0_base_slot	= m(X86_64InterpreterFrame.ip.offset);
-		xint.stp_slot		=	xspc.inlined_instance_slot	= m(X86_64InterpreterFrame.stp.offset);
+		xint.accessor_slot	=	xspc.accessor_slot	=	xfast.accessor_slot		= m(X86_64InterpreterFrame.accessor.offset);
+		xint.instance_slot	=	xspc.instance_slot	=	xfast.instance_slot		= m(X86_64InterpreterFrame.instance.offset);
+		xint.mem0_base_slot	=	xspc.mem0_base_slot	=	xfast.mem0_base_slot		= m(X86_64InterpreterFrame.mem0_base.offset);
+		xint.pc_slot		=	xspc.pc_slot		=	xfast.pc_slot			= m(X86_64InterpreterFrame.curpc.offset);
+		xint.vfp_slot		=	xspc.vfp_slot		=	xfast.vfp_slot			= m(X86_64InterpreterFrame.vfp.offset);
+		xint.vsp_slot		=	xspc.vsp_slot		=	xfast.vsp_slot			= m(X86_64InterpreterFrame.vsp.offset);
+		xint.wasm_func_slot	=	xspc.wasm_func_slot	=	xfast.wasm_func_slot		= m(X86_64InterpreterFrame.wasm_func.offset);
+		xint.ip_slot		=	xspc.inlined_mem0_base_slot =	xfast.inlined_mem0_base_slot	= m(X86_64InterpreterFrame.ip.offset);
+		xint.stp_slot		=	xspc.inlined_instance_slot  =	xfast.inlined_instance_slot	= m(X86_64InterpreterFrame.stp.offset);
 
 		xint.func_decl_slot	=	m(X86_64InterpreterFrame.func_decl.offset);
 		xint.code_slot		=	m(X86_64InterpreterFrame.code.offset);
 		xint.eip_slot		=	m(X86_64InterpreterFrame.eip.offset);
 
 		xint.frameSize		=	xspc.frameSize			= X86_64InterpreterFrame.size;
+		xfast.frameSize		= 0;
 
-		return (xspc, xint);
+		return (xspc, xfast, xint);
 	})();
 
 	// The execution environment for single-pass compilation contexts.
 	def SPC_EXEC_ENV = t.0;
+	// The execution environment for fast single-pass compilation contexts.
+	def FAST_SPC_EXEC_ENV = t.1;
 	// The execution environment for interpreter compilation contexts.
-	def INT_EXEC_ENV = t.1;
+	def INT_EXEC_ENV = t.2;
 
 	// A register allocator for single-pass compilation contexts.
 	def SPC_ALLOC = (fun -> RegAlloc {
@@ -163,7 +167,8 @@ component X86_64MasmRegs {
 	// A register allocator for interpreter contexts.
 	def INT_ALLOC = (fun -> RegAlloc {
 		var pools = [
-			RegPool32.new([RCX, RDX, R8, R9]),
+			RegPool32.new([RCX, RDX, R8, R9]), // could use callee-restore (but put at end)
+			// if callee-restore registers are used, have to emit a restore at the end
 			RegPool32.new([XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14])
 		];
 		return RegAlloc.new(CONFIG.poolMap, pools, null);
@@ -189,3 +194,4 @@ component X86_64MasmRegs {
 		return reg;
 	}
 }
+
diff --git a/src/engine/x86-64/X86_64PreGenStubs.v3 b/src/engine/x86-64/X86_64PreGenStubs.v3
index f8a780792..d836db938 100644
--- a/src/engine/x86-64/X86_64PreGenStubs.v3
+++ b/src/engine/x86-64/X86_64PreGenStubs.v3
@@ -25,7 +25,7 @@ layout X86_64PreGenHeader {
 	+24	intV3EntryOffset:		i32;	// entry into interpreter from V3 caller
 	+28	intSpcEntryOffset:		i32;	// entry into interpreter from SPC caller
 	+32	intIntEntryOffset:		i32;	// entry into interpreter from interpreter caller
-	+36 intSuspendEntryOffset:	i32; // entry into interpreter from a suspended child stack
+	+36 	intSuspendEntryOffset:		i32; 	// entry into interpreter from a suspended child stack
 	+40	deoptReentryOffset:		i32;	// re-enter interpreter from optimized code
 	+44	oobMemoryHandlerOffset:		i32;	// handler for signals caused by OOB memory access
 	+48	divZeroHandlerOffset:		i32;	// handler for signals caused by divide by zero
@@ -222,8 +222,8 @@ component X86_64PreGenStubs {
 				ic.header.probedDispatchTableOffset,
 				ic.header.fastDispatchTableOffset);
 
-		// Write-protect the executable code for security and debugging
-		Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_EXEC);
+		// XXX: PROT_WRITE is kept so dispatch tables can be patched at runtime; the code region therefore stays both writable and executable.
+		Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_WRITE | Mmap.PROT_EXEC);
 
 		// The host call stub is part of interpreter code (TODO: does it need to be?)
 		hostCallStub.start = ic.start + ic.header.hostCallStubOffset;
diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3
index 8e5f0e370..3bdbbf6e2 100644
--- a/src/engine/x86-64/X86_64SinglePassCompiler.v3
+++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3
@@ -23,16 +23,102 @@ def KIND_F64 = SpcConsts.KIND_F64;
 def KIND_V128 = SpcConsts.KIND_V128;
 def KIND_REF = SpcConsts.KIND_REF;
 
+def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV;
+// Interpreter-state registers of {xenv}, converted to X86-64 general-purpose registers.
+def r_mem0_base	= G(xenv.mem0_base);
+def r_vfp	= G(xenv.vfp);
+def r_vsp	= G(xenv.vsp);
+def r_stp	= G(xenv.stp);
+def r_ip	= G(xenv.ip);
+def r_eip	= G(xenv.eip);
+def r_func_decl	= G(xenv.func_decl);
+def r_instance	= G(xenv.instance);
+def r_curpc	= G(xenv.curpc);
+def ip_ptr    	= r_ip.plus(0); // address [r_ip + 0]; emitFastDispatch loads the opcode byte from here
+def r_dispatch 	= G(xenv.dispatch);
+def r_tmp0     	= G(xenv.tmp0);		// RCX
+def r_tmp1     	= G(xenv.tmp1);		// RDX
+// RSP-relative frame slots (at X86_64InterpreterFrame offsets) corresponding to the registers above.
+def m_mem0_base 	= R.RSP.plus(X86_64InterpreterFrame.mem0_base.offset);
+def m_vfp 		= R.RSP.plus(X86_64InterpreterFrame.vfp.offset);
+def m_vsp 		= R.RSP.plus(X86_64InterpreterFrame.vsp.offset);
+def m_stp 		= R.RSP.plus(X86_64InterpreterFrame.stp.offset);
+def m_ip 		= R.RSP.plus(X86_64InterpreterFrame.ip.offset);
+def m_eip 		= R.RSP.plus(X86_64InterpreterFrame.eip.offset);
+def m_func_decl 	= R.RSP.plus(X86_64InterpreterFrame.func_decl.offset);
+def m_instance 		= R.RSP.plus(X86_64InterpreterFrame.instance.offset);
+def m_curpc 		= R.RSP.plus(X86_64InterpreterFrame.curpc.offset);
+// (register, frame slot) pairs, one per interpreter variable.
+def ivar_MEM0_BASE	= (r_mem0_base, m_mem0_base);
+def ivar_VFP		= (r_vfp, m_vfp);
+def ivar_VSP		= (r_vsp, m_vsp);
+def ivar_STP		= (r_stp, m_stp);
+def ivar_IP		= (r_ip, m_ip);
+def ivar_EIP		= (r_eip, m_eip);
+def ivar_FUNC_DECL	= (r_func_decl, m_func_decl);
+def ivar_INSTANCE	= (r_instance, m_instance);
+def ivar_CURPC		= (r_curpc, m_curpc);
+// Every pair above; X86_64SinglePassCompiler.saveIVar/restoreReg search this array by register.
+def all_ivars = [
+	ivar_MEM0_BASE,
+	ivar_VFP,
+	ivar_VSP,
+	ivar_STP,
+	ivar_IP,
+	ivar_EIP,
+	ivar_FUNC_DECL,
+	ivar_INSTANCE,
+	ivar_CURPC
+];
+
 // Implements the target-specific parts of the single-pass compiler for X86-64.
 class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def w = DataWriter.new();
 	def mmasm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG);
 	def asm = mmasm.asm;
+	var ic: X86_64InterpreterCode;
 
-	new(extensions: Extension.set, limits: Limits, config: RegConfig)
-		super(X86_64MasmRegs.SPC_EXEC_ENV, mmasm, X86_64MasmRegs.SPC_ALLOC.copy(), extensions, limits) {
+	new(ic, extensions: Extension.set, limits: Limits, config: RegConfig, fast: bool) // {ic} initializes the field of the same name
+		super(if(fast, X86_64MasmRegs.FAST_SPC_EXEC_ENV, X86_64MasmRegs.SPC_EXEC_ENV), mmasm, // fast mode: exec env with frameSize 0
+				if(fast, X86_64MasmRegs.INT_ALLOC.copy(), X86_64MasmRegs.SPC_ALLOC.copy()), // fast mode: copy of the interpreter-context register allocator
+				extensions, limits, fast) {
 		mmasm.trap_stubs = TRAPS_STUB;
 	}
+	def emitFastDispatch() { // emit a dispatch via the fast table of {ic}, or via {r_dispatch} when {ic} is null
+		var table: IcCodeRef = if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset));
+		mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, table, true, ic);
+	}
+	private def saveIVar(r: X86_64Gpr) { // emit a store of {r} to its frame slot, if {all_ivars} lists one
+		for (i < all_ivars.length) {
+			if (all_ivars[i].0 == r) asm.movq_m_r(all_ivars[i].1, r);
+		}
+	}
+	def saveCallerIVars() { // emit stores of the ip and stp registers (and curpc, see below) to their frame slots
+		saveIVar(r_ip);
+		saveIVar(r_stp);
+		if (!FeatureDisable.stacktraces) saveIVar(r_curpc); // curpc is only saved when stacktraces are not disabled
+	}
+	def restoreDispatchTableReg() {
+		// When the globalProbes feature is disabled, nothing is emitted.
+		if (FeatureDisable.globalProbes) return;
+		// Reload {r_dispatch} from the absolute location of Interpreter.dispatchTable.
+		def table_addr = mmasm.absPointer(masm.getOffsets().Interpreter_dispatchTable);
+		asm.movq_r_m(r_dispatch, table_addr);
+	}
+	private def restoreReg(r: X86_64Gpr) { // emit a reload of {r} from its frame slot, if {all_ivars} lists one
+		for (i < all_ivars.length) {
+			if (all_ivars[i].0 == r) asm.movq_r_m(r, all_ivars[i].1);
+		}
+	}
+	def restoreCallerIVars() { // emit reloads of the registers below from their frame slots (vsp and curpc are not reloaded here)
+		restoreReg(r_ip);
+		restoreReg(r_stp);
+		restoreReg(r_eip);
+		restoreReg(r_instance);
+		restoreReg(r_func_decl);
+		restoreReg(r_mem0_base);
+		restoreReg(r_vfp);
+	}
 
 	private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool {
 		var b = pop(), a = popReg();
@@ -1256,7 +1342,35 @@ class X86_64SpcModuleCode extends X86_64SpcCode {
 	}
 	// Reconstructs inlined interpreter frames for an inlined hardware trap context.
 	// Returns the new rsp to write into the ucontext (top of stack).
-	private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List<FuncLoc>) -> Pointer;
+	private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List<FuncLoc>) -> Pointer {
+		def frames: Array<FuncLoc> = Lists.toArray(inline_ctx); // NOTE(review): assumes {inline_ctx} is non-empty (next line indexes the last element) -- confirm caller
+		def outer = frames[frames.length - 1]; // last entry: location in the real (already existing) frame
+		def inlined = frames[0 ... (frames.length - 1)]; // all earlier entries: inlined frames to materialize
+		def count = inlined.length;
+
+		// Record the outermost function's pc in the real frame at {r_rsp}.
+		(r_rsp + X86_64InterpreterFrame.curpc.offset).store<int>(outer.pc);
+
+		// Read instance from the real outer frame (shared across all inlined frames)
+		var instance = (r_rsp + X86_64InterpreterFrame.instance.offset).load<Instance>();
+
+		// Push one frame per inlined entry, from index {count - 1} down to 0, so entry 0 ends up at the top of stack.
+		for (i = count - 1; i >= 0; i--) {
+			var fid = inlined[i].func_index;
+			var pc  = inlined[i].pc;
+
+			r_rsp += -8;
+			r_rsp.store<Pointer>(INLINED_FRAME_STUB.start); // presumably serves as the return address of the frame pushed below -- TODO confirm
+
+			r_rsp += -X86_64InterpreterFrame.size;  // reserve space for one interpreter frame below that slot
+			// Only wasm_func, curpc and a null accessor are written; the frame's other slots are left as-is.
+			var wasm_func = WasmFunction.!(instance.functions[fid]);
+			(r_rsp + X86_64InterpreterFrame.wasm_func.offset).store<WasmFunction>(wasm_func);
+			(r_rsp + X86_64InterpreterFrame.curpc.offset).store<int>(pc);
+			(r_rsp + X86_64InterpreterFrame.accessor.offset).store<X86_64FrameAccessor>(null);
+		}
+		return r_rsp;
+	}
 	// Look up the source {pc} of a location {i} in this code. Returns {-1} if no exact entry is found.
 	// Return addresses are treated differently than other addresses in the code.
 	def lookupPc(ip: Pointer, isRetAddr: bool) -> List<FuncLoc> {
@@ -1487,6 +1601,7 @@ def codePointer<P, R>(f: P -> R) -> Pointer {
 
 // Global functionality associated with the single-pass compiler for X86-64.
 component X86_64Spc {
+	var ic: X86_64InterpreterCode;
 	// A handy chokepoint for entering JIT code from V3.
 	def invoke(wf: WasmFunction, sp: Pointer) -> Throwable {
 		return V3_SPC_ENTRY_FUNC.get()(wf, sp, wf.decl.target_code.spc_entry);
@@ -1507,7 +1622,7 @@ component X86_64Spc {
 		return addr;
 	}
 	def estimateCodeSizeFor(decl: FuncDecl) -> int {
-		return 60 + decl.orig_bytecode.length * 20; // TODO: huge overestimate
+		return 60 + decl.orig_bytecode.length * 20 * (2 << byte.view(SpcTuning.maxInlineDepth)); // TODO: huge overestimate; NOTE(review): the factor is 2^(maxInlineDepth+1), so the shift/multiply can overflow int for large depths -- confirm the depth is bounded
 	}
 	private def lazyCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) {
 		// The global stub simply consults the execution strategy.
diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3
index 015db9508..02ddf51d3 100644
--- a/src/engine/x86-64/X86_64Target.v3
+++ b/src/engine/x86-64/X86_64Target.v3
@@ -65,6 +65,46 @@ component Target {
 		f.target_code = TargetCode(addr);
 		Debug.afterCompile(f, u64.view(addr - Pointer.NULL));
 	}
+	// Installs {addr} as the fast-call entry of {f}; the generated code occupies [addr, end).
+	def setFastTargetCode(f: FuncDecl, addr: Pointer, end: Pointer) {
+		var start_addr = addr - Pointer.NULL;
+		if (Trace.compiler) {
+			// Trace the entry address and code range of the new code.
+			Trace.OUT.put2("func[%d].fast_target_code: break *0x%x", f.func_index, start_addr);
+			Trace.OUT.put2(" disass 0x%x, 0x%x", start_addr, end - Pointer.NULL).ln();
+		}
+		if (Trace.compiler && Trace.asm) {
+			// Dump the raw machine-code bytes in hex.
+			Trace.OUT.puts("JIT code: ");
+			for (p = addr; p < end; p++) Trace.OUT.put1("%x ", p.load<u8>());
+			Trace.OUT.ln();
+		}
+		f.fast_target_code = TargetCode(addr);
+		patchFastCallDispatch(f, addr);
+		Debug.afterCompile(f, u64.view(start_addr));
+	}
+	// Redirects the fast dispatch table entry (only that table) of {f}'s fast-call opcode to the code at {addr}.
+	def patchFastCallDispatch(f: FuncDecl, addr: Pointer) {
+		if (f.fast_call_idx < 0) return;
+		def opcode = Opcodes.indexToFastCall(f.fast_call_idx);
+		def ic = X86_64PreGenStubs.getInterpreterCode();
+		def entry = ic.start + ic.header.fastDispatchTableOffset + opcode.code * FastIntTuning.dispatchEntrySize;
+		if (Trace.compiler) {
+			Trace.OUT.puts("patching dispatch type\n");
+			Trace.OUT.put1("start 0x%x\n", u64.view(ic.start));
+			Trace.OUT.put1("entry 0x%x\n", u64.view(entry));
+			Trace.OUT.put1("addr 0x%x\n", u64.view(addr));
+		}
+		match (FastIntTuning.dispatchEntrySize) {
+			// A 4-byte entry holds an absolute address: use a checked cast so an {addr} above 4GiB fails loudly.
+			4 => entry.store<u32>(u32.!(addr - Pointer.NULL));
+			8 => entry.store<long>(long.view(addr));
+			// Other sizes (2-byte entries hold relative offsets) are unsupported: report instead of claiming success.
+			_ => { if (Trace.compiler) Trace.OUT.puts("dispatch entry NOT patched: unsupported entry size\n"); return; }
+		}
+		if (Trace.compiler) Trace.OUT.puts("patched successfully\n");
+	}
+
 	def pregenIntoFile(filename: string) -> ErrorBuilder {
 		var data = System.fileLoad(filename);
 		var err = ErrorBuilder.new().puts("interpreter generator: ");
@@ -188,6 +228,7 @@ class X86_64ExecutionStrategy extends ExecutionStrategy {
 	}
 	// Compilation methods called directly by stubs.
 	def lazyCompile(wf: WasmFunction) -> SpcResultForStub;
+	def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub;
 	def tierupCompile(wf: WasmFunction) -> SpcResultForStub;
 	// Tiering may require setting up the whole module.
 	def onTestModule(module: Module) {
@@ -196,6 +237,20 @@ class X86_64ExecutionStrategy extends ExecutionStrategy {
 	def disableLazyNameDecodingDuringGC(module: Module) {
 		if (module.names != null) module.names.lazyDecodeDisabled = RiGc.inGC;
 	}
+
+	def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) {
+		// ensure entrypoint and lazy compile stubs are generated
+		X86_64PreGenStubs.gen();
+		// Apply {set} to every non-imported function (presumably installing a compile stub) and sum the code-size estimates.
+		var codeSize = MINIMUM_CODE_SIZE;
+		for (i < module.functions.length) {
+			var f = module.functions[i];
+			if (f.imported()) continue;
+			set(module, f);
+			codeSize += X86_64Spc.estimateCodeSizeFor(f);
+		}
+		allocateCodeForModule(module, codeSize); // allocate the module's code region using the accumulated estimate
+	}
 }
 
 // One tier: fast-int, modules require no pre-processing.
@@ -206,10 +261,18 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy {
 
 	def onModuleFinish(module: Module, size: u32, err: ErrorGen) {
 		disableLazyNameDecodingDuringGC(module);
+		fastCompileEntireModule(module, size, false, err, 1024); // compile "fast:" exports now; no interpreter fallback, 1024 bytes of ballast added to the code mapping
 	}
 	def onFuncValidationFinish(module: Module, func: FuncDecl, err: ErrorGen) {
 		if (err != null && !err.ok()) return;
 		Target.setUnconditionalInterpreterEntryIfMultiTier(func);
+		// Trace-only diagnostic: report when {func} is exported under a "fast:" name (silent and skipped unless tracing).
+		for (i = 0; Trace.compiler && i < module.exports.length; i++) {
+			def ex = module.exports[i];
+			if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) {
+				Trace.OUT.put1("fast function %s", ex.0).ln();
+			}
+		}
 	}
 	def onNewFunction(wf: WasmFunction, err: ErrorGen) {
 		Target.setUnconditionalInterpreterEntryIfMultiTier(wf.decl);
@@ -219,6 +282,100 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy {
 		if (FastIntTuning.enableWhammProbeTrampoline && WhammProbe.?(p))
 			X86_64WhammTrampoline.makeTrampoline(WhammProbe.!(p), X86_64PreGenStubs.getInterpreterCode());
 	}
+
+	// TODO: appears to duplicate the SPC lazy-compile path; NOTE(review): {ic} is unused and null is passed to newCompiler -- confirm intended.
+	def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub {
+		var module = wf.instance.module;
+		var code = module.target_module.spc_code;
+		var compiler = newCompiler(module.filename, true, null);
+		var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w;
+
+		// generate code for the function
+		var success = compiler.gen(module, wf.decl, null);
+
+		// Check for remaining code space
+		var regionSize = code.mapping.range.size();
+		var remaining = regionSize - u64.!(code.codeEnd);
+		var codeSize = w.atEnd().pos;
+		if (codeSize > remaining) {
+			if (Trace.compiler) Trace.OUT.put3("exhausted code space for module (%d of %d bytes remaining, need %d)",
+				remaining, regionSize, codeSize).ln();
+			success = false;
+		}
+
+		var entrypoint: Pointer;
+		if (success) {
+			// Copy code into end of region
+			entrypoint = code.appendCode(masm);
+			Target.setFastTargetCode(wf.decl, entrypoint, entrypoint + codeSize);
+		} else {
+			// Failed, enter interpreter
+			var f = wf.decl;
+			if (Trace.compiler) Trace.OUT.put1("func[%d] FAST compile failed", f.func_index).ln();
+			entrypoint = X86_64Spc.setInterpreterFallback(f);
+		}
+		return SpcResultForStub(wf, entrypoint, null);
+	}
+	// Compiles each non-imported function exported as "fast:..." into a new code region reserved with reserve32,
+	// installs the compiled entry points, and makes that region {module}'s target code. {size} is unused.
+	def fastCompileEntireModule(module: Module, size: u32, interpreter_fallback: bool, err: ErrorGen, ballast: u32) {
+		// ensure entrypoint and lazy compile stubs are generated
+		X86_64PreGenStubs.gen();
+		var compiler = newCompiler(module.filename, true, null), w = compiler.w;
+
+		// generate code for the selected functions; bounds[i] stays (0, 0) for functions that are never attempted
+		var bounds = Array<(int, int)>.new(module.functions.length);
+		var suberr = if(!interpreter_fallback, err);
+		for (i = 0; err.ok() && i < module.functions.length; i++) {
+			var f = module.functions[i];
+			if (f.imported()) continue;
+			for (j < module.exports.length) {
+				def ex = module.exports[j];
+				if (ex.1 == f && Strings.startsWith(ex.0, "fast:")) {
+					var start = w.atEnd().pos;
+					var compiled = compiler.gen(module, f, suberr);
+					bounds[i] = if(compiled, (start, w.end()), (-1, -1));
+					break; // compile once, even if the function is exported under several "fast:" names
+				}
+			}
+		}
+
+		// copy and map code (reserve32 ensures address fits in 32 bits for dispatch table patching)
+		var length = u64.view(w.atEnd().pos) + ballast;
+		var mapping = Mmap.reserve32(length, Mmap.PROT_WRITE), range = mapping.range; // TODO: handle failure
+		var masm = X86_64MacroAssembler.!(compiler.masm);
+		masm.setTargetAddress(u64.view(range.start - Pointer.NULL));
+		Target.copyInto(mapping.range, 0, w);
+		// TODO: for security, move embedded references out of the code region and make it non-writable
+		Mmap.protect(range.start, u64.!(range.end - range.start), Mmap.PROT_WRITE | Mmap.PROT_READ | Mmap.PROT_EXEC);
+		for (i < bounds.length) {
+			var b = bounds[i];
+			if (b.1 > b.0) { // compiled: non-empty code range [b.0, b.1)
+				var addr = mapping.range.start;
+				var f = module.functions[i];
+				Target.setFastTargetCode(f, addr + b.0, addr + b.1);
+			} else if (b.0 < 0) { // compilation was attempted and failed
+				var f = module.functions[i];
+				if (Trace.compiler) Trace.OUT.put1("func[%d] initial compile failed", f.func_index).ln();
+				X86_64Spc.setInterpreterFallback(f);
+			} // otherwise (0, 0): never selected for fast compilation, so leave the function untouched
+		}
+		// XXX: reduce duplication with {X86_64SpcModuleCode.appendCode}.
+		var code = X86_64SpcModuleCode.new(mapping);
+		if (masm.source_locs != null) {
+			code.sourcePcs = Vector.new();
+			code.sourcePcs.putv(masm.source_locs);
+		}
+		if (masm.embeddedRefOffsets != null) {
+			if (code.embeddedRefOffsets == null) code.embeddedRefOffsets = Vector.new();
+			code.embeddedRefOffsets.putv(masm.embeddedRefOffsets);
+		}
+
+		module.target_module = TargetModule(code);
+		RiRuntime.registerUserCode(code);
+		module.target_module.spc_code.keepAlive();
+		Debug.afterCompileModule(module);
+	}
 }
 
 // Base class of all strategies that use SPC.
@@ -242,7 +399,7 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy {
 
 		var module = wf.instance.module;
 		var code = module.target_module.spc_code;
-		var compiler = newCompiler(module.filename); // XXX: cache per-thread
+		var compiler = newCompiler(module.filename, false, null); // XXX: cache per-thread
 		var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w;
 
 		// generate code for the function
@@ -271,19 +428,6 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy {
 		}
 		return SpcResultForStub(wf, entrypoint, null);
 	}
-	def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) {
-		// ensure entrypoint and lazy compile stubs are generated
-		X86_64PreGenStubs.gen();
-		// Set all functions to refer to the tier-up compile stub.
-		var codeSize = MINIMUM_CODE_SIZE;
-		for (i < module.functions.length) {
-			var f = module.functions[i];
-			if (f.imported()) continue;
-			set(module, f);
-			codeSize += X86_64Spc.estimateCodeSizeFor(f);
-		}
-		allocateCodeForModule(module, codeSize);
-	}
 }
 
 // One tier: SPC, modules are eagerly compiled.
@@ -319,7 +463,7 @@ class X86_64SpcAotStrategy(interpreter_fallback: bool) extends X86_64SpcStrategy
 		// ensure entrypoint and lazy compile stubs are generated
 		X86_64PreGenStubs.gen();
 
-		var compiler = newCompiler(module.filename);
+		var compiler = newCompiler(module.filename, false, null);
 		var w = compiler.w;
 
 		// generate code for all functions
@@ -412,7 +556,7 @@ class X86_64DynamicStrategy extends X86_64SpcStrategy {
 	}
 	def onTierUp(wf: WasmFunction, pc: int) -> TargetOsrInfo {
 		var module = wf.instance.module;
-		var compiler = newCompiler(module.filename);
+		var compiler = newCompiler(module.filename, false, null);
 		if (!applyJitFilter(wf.instance.module, wf.decl, "osr")) {
 			// OSR compile suppressed
 			wf.decl.tierup_trigger = int.max; // no point in trying for a while
@@ -441,10 +585,10 @@ class X86_64DynamicStrategy extends X86_64SpcStrategy {
 	}
 }
 
-def newCompiler(filename: string) -> X86_64SinglePassCompiler {
+def newCompiler(filename: string, fast: bool, ic: X86_64InterpreterCode) -> X86_64SinglePassCompiler { // NOTE: {filename} is not used in this body
 	var extensions = Extension.set.all; // TODO: all extensions enabled for compilation
 	var limits = Limits.new();
-	var compiler = X86_64SinglePassCompiler.new(extensions, limits, X86_64MasmRegs.CONFIG);
+	var compiler = X86_64SinglePassCompiler.new(ic, extensions, limits, X86_64MasmRegs.CONFIG, fast); // {fast} and {ic} are forwarded to the compiler unchanged
 	return compiler;
 }
 def MINIMUM_CODE_SIZE = PAGE_SIZE_i;
diff --git a/src/util/BytecodeVisitor.v3 b/src/util/BytecodeVisitor.v3
index fbef4b056..cacd6c738 100644
--- a/src/util/BytecodeVisitor.v3
+++ b/src/util/BytecodeVisitor.v3
@@ -20,9 +20,9 @@ class BytecodeVisitor {
 	def visitMisc(op: Opcode) 			{ visitOp(op); }
 	def visitControl(op: Opcode) 			{ visitOp(op); }
 	def visitCall(op: Opcode)			{ visitOp(op); }
-	def visitCallDirect(op: Opcode, func_index: u31, tailCall: bool)	{ visitCall(op); }
-	def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool)	{ visitCall(op); }
-	def visitCallRef(op: Opcode, sig_index: u31, tailCall: bool)		{ visitCall(op); }
+	def visitCallDirect(op: Opcode, func_index: u31, prop: CallProperty)	{ visitCall(op); }
+	def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty)	{ visitCall(op); }
+	def visitCallRef(op: Opcode, sig_index: u31, prop: CallProperty)		{ visitCall(op); }
 	def visitLocal(op: Opcode, local_index: u31)	{ visitOp(op); }
 	def visitGlobal(op: Opcode, local_index: u31)	{ visitOp(op); }
 	def visitTable(op: Opcode, table_index: u31)	{ visitOp(op); }
@@ -69,12 +69,13 @@ class BytecodeVisitor {
 	def visit_BR_IF		(depth: u31) 					{ visitControl(Opcode.BR_IF); }
 	def visit_BR_TABLE	(labels: Range<u31>)			 	{ visitControl(Opcode.BR_TABLE); }
 	def visit_RETURN	() 						{ visitControl(Opcode.RETURN); }
-	def visit_CALL		(func_index: u31) 				{ visitCallDirect(Opcode.CALL, func_index, false); }
-	def visit_CALL_INDIRECT	(sig_index: u31, table_index: u31) 		{ visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, false); }
-	def visit_RETURN_CALL	(func_index: u31) 				{ visitCallDirect(Opcode.RETURN_CALL, func_index, true); }
-	def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31)	{ visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, true); }
-	def visit_CALL_REF	(sig_index: u31) 				{ visitCallRef(Opcode.CALL_REF, sig_index, false); }
-	def visit_RETURN_CALL_REF(sig_index: u31) 				{ visitCallRef(Opcode.RETURN_CALL_REF, sig_index, true); }
+	def visit_CALL		(func_index: u31) 				{ visitCallDirect(Opcode.CALL, func_index, SLOW); }
+	def visit_FAST_CALL	(fast_index: int, func_index: u31) 		{ visitCallDirect(Opcodes.indexToFastCall(fast_index), func_index, FAST); }
+	def visit_CALL_INDIRECT	(sig_index: u31, table_index: u31) 		{ visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, SLOW); }
+	def visit_RETURN_CALL	(func_index: u31) 				{ visitCallDirect(Opcode.RETURN_CALL, func_index, TAIL); }
+	def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31)	{ visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, TAIL); }
+	def visit_CALL_REF	(sig_index: u31) 				{ visitCallRef(Opcode.CALL_REF, sig_index, SLOW); }
+	def visit_RETURN_CALL_REF(sig_index: u31) 				{ visitCallRef(Opcode.RETURN_CALL_REF, sig_index, TAIL); }
 	def visit_DELEGATE	(depth: u31) 					{ visitControl(Opcode.DELEGATE); }
 	def visit_CATCH_ALL	() 						{ visitControl(Opcode.CATCH_ALL); }
 	def visit_DROP		() 						{ visitMisc(Opcode.DROP); }
@@ -653,3 +654,11 @@ class BytecodeVisitor {
 	def visit_SUSPEND					(tag: u31)  	{ visitOp(Opcode.SUSPEND); }
 	def visit_SWITCH					(cont: u31, tag: u31) 	{ visitOp(Opcode.SWITCH); }
 }
+
+enum CallProperty {
+	SLOW, TAIL, FAST // SLOW: call/call_indirect/call_ref; TAIL: the return_call* variants; FAST: visit_FAST_CALL
+}
+// Shorthands for the enum cases, as passed by the visit_* call methods of BytecodeVisitor.
+def SLOW = CallProperty.SLOW;
+def TAIL = CallProperty.TAIL;
+def FAST = CallProperty.FAST;
diff --git a/src/util/Whamm.v3 b/src/util/Whamm.v3
index 9b93b746d..ae1649d8b 100644
--- a/src/util/Whamm.v3
+++ b/src/util/Whamm.v3
@@ -175,10 +175,9 @@ component Whamm {
 class WhammProbe(func: Function, sig: Array<WhammArg>) extends Probe {
 	var trampoline: TargetCode;
 	// properties set by the spc to make inlining optimization decisions.
-	var inline_heuristic_checked = false;
-	var spc_inline_func = false;
-	var spc_swap_instance = false;
-	var spc_swap_membase = false;
+	var swap_checked = false;
+	var swap_instance = false;
+	var swap_membase = false;
 
 	private def args = if(sig.length == 0, Values.NONE, Array<Value>.new(sig.length));
 
@@ -203,6 +202,31 @@ class WhammProbe(func: Function, sig: Array<WhammArg>) extends Probe {
 		}
 		return ProbeAction.Continue;
 	}
+
+	// Scans the probe function's bytecode once (guarded by {swap_checked}) and sets {swap_instance} / {swap_membase} if inlining it needs the instance or mem0_base swapped.
+	def checkSwap() {
+		if (swap_checked) return;
+		var bi = BytecodeIterator.new().reset(WasmFunction.!(func).decl); // NOTE(review): the cast assumes {func} is a WasmFunction -- confirm for all callers
+		while (bi.more()) {
+			var op = bi.current();
+			match (op) {
+				// These opcodes require swapping the instance.
+				THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP,
+				ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true;
+				// Load/store opcodes require either the memory base or the instance.
+				I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32,
+				V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U,
+				I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => {
+					var memarg = bi.immptr().read_MemArg();
+					if (memarg.memory_index == 0) swap_membase = true; // memory 0 is reached through mem0_base
+					else swap_instance = true; // any other memory index needs the instance
+				}
+				_ => ; // NOTE(review): opcodes not listed above (e.g. RETURN_CALL*, CALL_REF) set neither flag -- confirm the lists are complete
+			}
+			bi.next();
+		}
+		swap_checked = true;
+	}
 }
 
 def parseParam0(r: TextReader) -> WhammParam {
diff --git a/test/inline/failures.x86-64-linux b/test/inline/failures.x86-64-linux
deleted file mode 100644
index 925e70891..000000000
--- a/test/inline/failures.x86-64-linux
+++ /dev/null
@@ -1,3 +0,0 @@
-inline_test_arithmetic.wasm
-inline_test_locals_control.wasm
-inline_test_nesting.wasm
diff --git a/test/inline/failures.x86-64-linux.dyn b/test/inline/failures.x86-64-linux.dyn
index da02fa079..50325688b 100644
--- a/test/inline/failures.x86-64-linux.dyn
+++ b/test/inline/failures.x86-64-linux.dyn
@@ -1,4 +1,5 @@
 inline_test_arithmetic.wasm
 inline_test_locals_control.wasm
 inline_test_nesting.wasm
+inline_test_return.wasm
 
diff --git a/test/inline/inline_test_return.wasm b/test/inline/inline_test_return.wasm
new file mode 100644
index 000000000..d7bcbbaa0
Binary files /dev/null and b/test/inline/inline_test_return.wasm differ
diff --git a/test/inline/inline_test_return.wasm.exit b/test/inline/inline_test_return.wasm.exit
new file mode 100644
index 000000000..573541ac9
--- /dev/null
+++ b/test/inline/inline_test_return.wasm.exit
@@ -0,0 +1 @@
+0
diff --git a/test/inline/inline_test_return.wasm.flags b/test/inline/inline_test_return.wasm.flags
new file mode 100644
index 000000000..0c2fe67af
--- /dev/null
+++ b/test/inline/inline_test_return.wasm.flags
@@ -0,0 +1 @@
+--metrics=spc*calls --inline-max-depth=1
diff --git a/test/inline/inline_test_return.wasm.out b/test/inline/inline_test_return.wasm.out
new file mode 100644
index 000000000..79d1497bf
--- /dev/null
+++ b/test/inline/inline_test_return.wasm.out
@@ -0,0 +1,4 @@
+spc:static_calls         : 6         calls
+spc:static_inlined_calls : 6         calls
+spc:dynamic_calls        : 6         calls
+spc:dynamic_inlined_calls : 6        calls
diff --git a/test/inline/inline_test_return.wat b/test/inline/inline_test_return.wat
new file mode 100644
index 000000000..c1dd8b196
--- /dev/null
+++ b/test/inline/inline_test_return.wat
@@ -0,0 +1,97 @@
+;; Test inlined functions with explicit RETURN, including nested control flow
+;; and paths where extra values are on the stack at the time of return.
+(module
+  ;; Two levels of nested ifs; in the early-return path, 2*a is an extra value
+  ;; on the value stack below the returned a+b.
+  (func $weighted (param i32) (param i32) (result i32)  ;; param 0 = a, param 1 = b
+    block (result i32)  ;; outer block: its i32 result is the function's fallthrough result
+      local.get 0
+      i32.const 2
+      i32.mul           ;; [2a] -- extra below when early return fires
+      block             ;; inner block with no result; 2a stays on the stack beneath it
+        local.get 0
+        i32.const 0
+        i32.gt_s
+        if              ;; taken when a > 0 (signed compare)
+          local.get 1
+          i32.const 0
+          i32.gt_s
+          if            ;; taken when b > 0 (signed compare)
+            ;; both positive: return a+b; 2a is extra on stack
+            local.get 0
+            local.get 1
+            i32.add
+            return      ;; exits the function from inside block/block/if/if
+          end
+        end
+      end
+      local.get 1
+      i32.add           ;; fallthrough: 2a+b
+    end
+  )
+
+  ;; Clamp x to [lo, hi]; two sequential single-level ifs, returns on multiple paths.
+  (func $clamp (param i32) (param i32) (param i32) (result i32)  ;; param 0 = x, 1 = lo, 2 = hi
+    local.get 0
+    local.get 1
+    i32.lt_s
+    if                  ;; taken when x < lo (signed compare)
+      local.get 1
+      return            ;; early return: lo
+    end
+    local.get 0
+    local.get 2
+    i32.gt_s
+    if                  ;; taken when x > hi (signed compare)
+      local.get 2
+      return            ;; early return: hi
+    end
+    local.get 0         ;; fallthrough: neither branch fired, result is x itself
+  )
+
+  (func (export "main") (result i32)  ;; result is 0 only if all six checks below match
+    i32.const 3
+    i32.const 4
+    call $weighted
+    i32.const 7         ;; both positive: 3+4=7
+    i32.ne              ;; 1 on mismatch, 0 on match; seeds the failure flag
+
+    i32.const 3
+    i32.const -1
+    call $weighted
+    i32.const 5         ;; b<=0: 2*3+(-1)=5
+    i32.ne
+    i32.or              ;; fold this check into the running failure flag
+
+    i32.const -1
+    i32.const 4
+    call $weighted
+    i32.const 2         ;; a<=0: 2*(-1)+4=2
+    i32.ne
+    i32.or
+
+    i32.const 5
+    i32.const 0
+    i32.const 10
+    call $clamp
+    i32.const 5         ;; 0 <= 5 <= 10: value passes through unchanged
+    i32.ne
+    i32.or
+
+    i32.const -3
+    i32.const 0
+    i32.const 10
+    call $clamp
+    i32.const 0         ;; -3 < 0: first early return yields the lower bound
+    i32.ne
+    i32.or
+
+    i32.const 15
+    i32.const 0
+    i32.const 10
+    call $clamp
+    i32.const 10        ;; 15 > 10: second early return yields the upper bound
+    i32.ne
+    i32.or              ;; final failure flag is left on the stack as main's result
+  )
+)