diff --git a/fast_call.wasm b/fast_call.wasm new file mode 100644 index 000000000..36f858295 Binary files /dev/null and b/fast_call.wasm differ diff --git a/fast_call.wat b/fast_call.wat new file mode 100644 index 000000000..fa74fa213 --- /dev/null +++ b/fast_call.wat @@ -0,0 +1,10 @@ +(module + (import "wizeng" "puti" (func $puti (param i32))) + (func $f (result i32) + i32.const 10) + (func (export "main") (result i32) + call $f ;; result of $f (10) becomes the argument of $puti + call $puti + i32.const 0 + ) +) diff --git a/fast_call2.wasm b/fast_call2.wasm new file mode 100644 index 000000000..b3dcbf2bf Binary files /dev/null and b/fast_call2.wasm differ diff --git a/fast_call2.wat b/fast_call2.wat new file mode 100644 index 000000000..3dd58686b --- /dev/null +++ b/fast_call2.wat @@ -0,0 +1,7 @@ +(module + (func $f (result i32) + i32.const 10) + (func (export "main") (result i32) + call $f ;; main returns the result of $f directly + ) +) diff --git a/fast_call_export.wasm b/fast_call_export.wasm new file mode 100644 index 000000000..de5abe4d8 Binary files /dev/null and b/fast_call_export.wasm differ diff --git a/fast_call_export.wat b/fast_call_export.wat new file mode 100644 index 000000000..20c428045 --- /dev/null +++ b/fast_call_export.wat @@ -0,0 +1,10 @@ +;; The "fast:" prefix of the export name marks $fast as a fast function; the binary itself is not modified ahead of time. + +(module + (func $fast (export "fast:foo") (result i32) + (i32.const 2) + ) + (func (export "main") (result i32) + (call $fast) + ) +) diff --git a/fast_call_nop.wasm b/fast_call_nop.wasm new file mode 100644 index 000000000..403dc7cf1 Binary files /dev/null and b/fast_call_nop.wasm differ diff --git a/fast_call_nop.wat b/fast_call_nop.wat new file mode 100644 index 000000000..c406ac91a --- /dev/null +++ b/fast_call_nop.wat @@ -0,0 +1,10 @@ +(module + (func $f) ;; declared but never called in this module + (func $g) + (func (export "main") (result i32) + i64.const 11 + drop + call $g + i32.const 0 + ) +) diff --git a/fast_call_param.wasm b/fast_call_param.wasm new file mode 100644 index 000000000..c70071c25 Binary files /dev/null and b/fast_call_param.wasm differ
diff --git a/fast_call_param.wat b/fast_call_param.wat new file mode 100644 index 000000000..b3f4ad728 --- /dev/null +++ b/fast_call_param.wat @@ -0,0 +1,18 @@ +(module + (import "wizeng" "puti" (func $puti (param i32))) + (func $f (param i32) (result i32) + local.get 0 + if (result i32) + i32.const 999 + else + i32.const -216 + end + ) + (func (export "main") (result i32) + (call $f (i32.const 1)) + call $puti + (call $f (i32.const 0)) + call $puti + i32.const 0 + ) +) diff --git a/int/Export b/int/Export new file mode 100755 index 000000000..df2a0f18a --- /dev/null +++ b/int/Export @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do # follow symlinks until S names the real script + DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd) # inner quotes keep a path with spaces as one word + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S # a relative link target is taken relative to the directory of the link +done +DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd) +wizeng "$DIR/Export.wasm" "$@" diff --git a/int/Export.v3 b/int/Export.v3 new file mode 100644 index 000000000..2409223df --- /dev/null +++ b/int/Export.v3 @@ -0,0 +1,12 @@ +export "fast:foo" def foo(x: int, y: int) -> int { + var val = y; + for (i < x) { + val += i * y; + } + return val; +} + +export "main" def main() -> int { + System.puts(Strings.format1("%d\n", foo(11, 2))); + return 0; +} diff --git a/int/Export.wasm b/int/Export.wasm new file mode 100644 index 000000000..65fa630c5 Binary files /dev/null and b/int/Export.wasm differ diff --git a/int/Interpreter b/int/Interpreter new file mode 100755 index 000000000..55ced8cc9 --- /dev/null +++ b/int/Interpreter @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do # follow symlinks until S names the real script + DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd) # inner quotes keep a path with spaces as one word + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S # a relative link target is taken relative to the directory of the link +done +DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd) +wizeng "$DIR/Interpreter.wasm" "$@" diff --git a/int/Interpreter.v3 b/int/Interpreter.v3 new file mode 100644 index 000000000..13f0a1b9e --- /dev/null +++ b/int/Interpreter.v3 @@ -0,0 +1,403 @@ +export "fast:const0" def const0() -> long { return
0; } +export "fast:const1" def const1() -> long { return 1; } +export "fast:constN" def constN(n: int) -> long { return n; } +export "fast:add" def add(l: long, r: long) -> long { return l + r; } +export "fast:sub" def sub(l: long, r: long) -> long { return l - r; } +export "fast:fact" def fact(n: long) -> long { + var v: long = 1; + for (i < n) { + v *= i + 1; + } + return v; + } +export "fast:seq" def seq(f: long, s: long) -> long { return s; } // yields the second value; the first is ignored +export "fast:select" def select(c: long, t: long, f: long) -> long { return if(c != 0, t, f); } +export "fast:if" def if_(c: long) -> bool { return c != 0; } // only tests the condition; wasmCompile emits the wasm if after calling it +export "fast:nop" def nop() -> void {} +export "fast:print" def print(n: long) -> long { + System.puts(Strings.format1("%d\n", n)); + return 0; + } +export "fast:double" def double(n: long) -> long { return add(n, n); } + +def HANDLER_CONST0 = CiWasmTarget.functionId(const0); // handler ids are what wasmCompile writes as call operands +def HANDLER_CONST1 = CiWasmTarget.functionId(const1); +def HANDLER_CONSTN = CiWasmTarget.functionId(constN); +def HANDLER_ADD = CiWasmTarget.functionId(add); +def HANDLER_SUB = CiWasmTarget.functionId(sub); +def HANDLER_FACT = CiWasmTarget.functionId(fact); +def HANDLER_SEQ = CiWasmTarget.functionId(seq); +def HANDLER_SELECT = CiWasmTarget.functionId(select); +def HANDLER_IF = CiWasmTarget.functionId(if_); +def HANDLER_NOP = CiWasmTarget.functionId(nop); // also the marker wasmCompile checks to skip emitting a call +def HANDLER_PRINT = CiWasmTarget.functionId(print); +def HANDLER_DOUBLE = CiWasmTarget.functionId(double); + +export "main" def main() -> int { + def buf = StringBuilder.new(); + + //def prog = Select(Sub(ConstN(1), Const1), Add(Const1, ConstN(100)), Seq(Sub(Add(Const1, ConstN(2)), Const0), ConstN(15))); + + //def prog = AST.If(Const1, ConstN(2), ConstN(3)); + def prog = Double(Fact(ConstN(13))); // program under test: double(fact(13)) + + //def prog = Const1; + def bytecode = compile(prog); + def val = eval(bytecode); // result from the bytecode interpreter + + prog.display(buf); + buf.ln(); + buf.put1("=> %d", val); + buf.ln(); + System.puts(buf.extract()); + + def f: Func.F = wasmCompile(bytecode); // same bytecode translated into a wasm function + def val_ = f.f();
+ + buf.put1("=> %d", val_); + buf.ln(); + System.puts(buf.extract()); + + return 0; +} + +def eval(bytecode: Array) -> long { + def vstk = ArrayStack.new(); + var pc = 0; + + // print out bytecode + def b = StringBuilder.new(); + while (pc < bytecode.length) { + b.put1("+%d ", pc); + + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; + + b.puts(opcode.name); + match (opcode) { + CONSTN, IF, ELSE => b.put1(" %d", operand); // only these opcodes carry a meaningful operand + _ => ; + } + b.ln(); + } + System.puts(b.extract()); + + pc = 0; // rewind after the listing pass; the loop below executes the bytecode + while (pc < bytecode.length) { + System.puts(Strings.format1("pc=%d\n", pc)); + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; // pc already points past this instruction when a branch offset is applied + + match (opcode) { + CONST0 => vstk.push(0); + CONST1 => vstk.push(1); + CONSTN => vstk.push(operand); + ADD => { + def right = vstk.pop(); + def left = vstk.pop(); + vstk.push(left + right); + } + SUB => { + def right = vstk.pop(); + def left = vstk.pop(); + vstk.push(left - right); + } + FACT => { + def arg = vstk.pop(); + var val: long = 1; + for (i < arg) { + val *= i + 1; + } + vstk.push(val); + } + PRINT => { + def arg = vstk.pop(); + System.puts(Strings.format1("%d\n", arg)); + vstk.push(0); + } + DOUBLE => { + def arg = vstk.pop(); + vstk.push(arg + arg); + } + SEQ => { + def snd = vstk.pop(); + def fst = vstk.pop(); // popped only to discard it + vstk.push(snd); + } + SELECT => { + def snd = vstk.pop(); // operands come off in reverse of the order they were compiled + def fst = vstk.pop(); + def cond = vstk.pop(); + vstk.push(if(cond != 0, fst, snd)); + } + IF => { + def cond = vstk.pop(); + if (cond == 0) pc += operand; // false: skip forward by the operand; true: fall through + } + ELSE => { + pc += operand; // skip forward by the operand + } + END => {} // nop + } + } + return vstk.peek(); // result is whatever is left on top of the value stack +} + +enum Opcode(handler: int) { + CONST0 (HANDLER_CONST0) + CONST1 (HANDLER_CONST1) + CONSTN (HANDLER_CONSTN) + ADD (HANDLER_ADD) + SUB (HANDLER_SUB) + FACT (HANDLER_FACT) + SEQ (HANDLER_SEQ) + SELECT (HANDLER_SELECT) + IF (HANDLER_IF) + ELSE
(HANDLER_NOP) + END (HANDLER_NOP) + PRINT (HANDLER_PRINT) + DOUBLE (HANDLER_DOUBLE) +} + +layout Instruction { + +0 opcode: Opcode; + +1 operand: byte; + =2; +} + +type AST { + case Const0 { + def compile(w: DataWriter) { + w.putb(Opcode.CONST0.tag).putb(0); // every instruction is an opcode byte followed by an operand byte + } + def display(s: StringBuilder) { + s.putc('0'); + } + } + case Const1 { + def compile(w: DataWriter) { + w.putb(Opcode.CONST1.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('1'); + } + } + case ConstN(n: byte) { + def compile(w: DataWriter) { + w.putb(Opcode.CONSTN.tag).putb(n); // the constant travels in the operand byte + } + def display(s: StringBuilder) { + s.putd(n); + } + } + case Add(left: AST, right: AST) { + def compile(w: DataWriter) { + left.compile(w); + right.compile(w); + w.putb(Opcode.ADD.tag).putb(0); // operands are compiled first, then the operator + } + def display(s: StringBuilder) { + s.putc('('); + left.display(s); + s.puts(" + "); + right.display(s); + s.putc(')'); + } + } + case Fact(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.FACT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(fact "); + arg.display(s); + s.putc(')'); + } + } + case Print(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.PRINT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(print "); + arg.display(s); + s.putc(')'); + } + } + case Double(arg: AST) { + def compile(w: DataWriter) { + arg.compile(w); + w.putb(Opcode.DOUBLE.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(double "); + arg.display(s); + s.putc(')'); + } + } + case Sub(left: AST, right: AST) { + def compile(w: DataWriter) { + left.compile(w); + right.compile(w); + w.putb(Opcode.SUB.tag).putb(0); + } + def display(s: StringBuilder) { + s.putc('('); + left.display(s); + s.puts(" - "); + right.display(s); + s.putc(')'); + } + } + case Seq(fst: AST, snd: AST) { + def compile(w: DataWriter) { + fst.compile(w); + snd.compile(w); + w.putb(Opcode.SEQ.tag).putb(0); // both sides are evaluated; SEQ keeps only the second + } + def display(s: StringBuilder) { + s.putc('('); + fst.display(s); +
s.puts(" ; "); + snd.display(s); + s.putc(')'); + } + } + // eager evaluation of branches + case Select(cond: AST, left: AST, right: AST) { + def compile(w: DataWriter) { + cond.compile(w); + left.compile(w); + right.compile(w); + w.putb(Opcode.SELECT.tag).putb(0); + } + def display(s: StringBuilder) { + s.puts("(select "); + cond.display(s); + s.putc(' '); + left.display(s); + s.putc(' '); + right.display(s); + s.putc(')'); + } + } + // lazy evaluation of branches + case If(cond: AST, left: AST, right: AST) { + def compile(w: DataWriter) { + cond.compile(w); + w.putb(Opcode.IF.tag).putb(0); // operand byte is a placeholder, patched below + def hole1 = w.pos; + left.compile(w); + w.putb(Opcode.ELSE.tag).putb(0); // operand byte is a placeholder, patched below + w.data[hole1 - 1] = byte.!(w.pos - hole1); // IF operand: distance from just after IF to just after ELSE + def hole2 = w.pos; + right.compile(w); + w.data[hole2 - 1] = byte.!(w.pos - hole2); // ELSE operand: distance from just after ELSE to the END instruction + w.putb(Opcode.END.tag).putb(0); + + } + def display(s: StringBuilder) { + s.puts("(if "); + cond.display(s); + s.putc(' '); + left.display(s); + s.putc(' '); + right.display(s); + s.putc(')'); + } + } + + def compile(w: DataWriter); + def display(s: StringBuilder); +} + +def Const0 = AST.Const0; +def Const1 = AST.Const1; +def ConstN = AST.ConstN; +def Add = AST.Add; +def Sub = AST.Sub; +def Fact= AST.Fact; +def Print = AST.Print; +def Seq = AST.Seq; +def Select = AST.Select; +def If = AST.If; +def Double = AST.Double; + +def compile(prog: AST) -> Array { + def w = DataWriter.new(); + + prog.compile(w); + + return w.extract(); // the bytecode consumed by both eval and wasmCompile +} + +type Func { + case F(f: () -> long); +} + +def wasmCompile(bytecode: Array) -> Func.F { + def w = DataWriter.new(); + + w.put_uleb32(0); // 0 locals + + var pc = 0; + while (pc < bytecode.length) { + def instruction = Ref.at(bytecode, pc); + def opcode = instruction.opcode; + def operand = instruction.operand; + pc += Instruction.size; + + // setup for handler, if necessary (guest-level operands) + match (opcode) { + CONSTN => { + w.putb(I32_CONST); + w.put_sleb32(operand); // becomes the int argument of the constN handler called below + } + _ => ; + } + // call handler function + if (opcode.handler !=
HANDLER_NOP) { + w.putb(CALL); + w.put_uleb32(u32.!(opcode.handler)); + } + // post-handler wasm bytecodes + match (opcode) { + IF => { + w.putb(IF); + w.putb(RESULT_I64); + } + ELSE => w.putb(ELSE); // didn't emit handler anyway + END => w.putb(END); // didn't emit handler anyway + _ => ; + } + } + w.putb(END); // closes the function body + + // create wasm function + def sig = CiWasmTarget.functionTypeId(); + def wasm = w.extract(); + def fid = wave.new_func(sig, Pointer.atContents(wasm), wasm.length); + if (fid < 0) { + System.puts("failed to compile wasm function\n"); + System.error("error", "failed to compile"); + } + def func = CiRuntime.forgeClosure(Pointer.NULL + fid, void); + + return Func.F(func); +} + +def IF: byte = 0x04; +def ELSE: byte = 0x05; +def END: byte = 0x0B; +def CALL: byte = 0x10; +def DROP: byte = 0x1A; +def I32_CONST: byte = 0x41; +def I64_CONST: byte = 0x42; + +def RESULT_I64: byte = 0x7E; diff --git a/int/Interpreter.wasm b/int/Interpreter.wasm new file mode 100644 index 000000000..efa0a14d2 Binary files /dev/null and b/int/Interpreter.wasm differ diff --git a/int/InterpreterBug b/int/InterpreterBug new file mode 100755 index 000000000..a88079884 Binary files /dev/null and b/int/InterpreterBug differ diff --git a/int/InterpreterBug.v3 b/int/InterpreterBug.v3 new file mode 100644 index 000000000..fa20b709d --- /dev/null +++ b/int/InterpreterBug.v3 @@ -0,0 +1,25 @@ +def main() -> int { + def x = A.Z(ay); + + x.foo(); + + return 0; +} + +def ay = A.Y; + +type A { + case X { + def foo() {} + } + case Y { + def foo() {} + } + case Z(a: A) { + def foo() { a.foo(); } + } + + def foo(); +} + + diff --git a/int/InterpreterBug.wasm b/int/InterpreterBug.wasm new file mode 100644 index 000000000..dc25a1547 Binary files /dev/null and b/int/InterpreterBug.wasm differ diff --git a/int/RiRuntime b/int/RiRuntime new file mode 100755 index 000000000..716d1b2e8 --- /dev/null +++ b/int/RiRuntime @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +S=${BASH_SOURCE[0]} +while [ -h "$S" ]; do # follow symlinks until S names the real script + DIR=$(cd -P "$(dirname "$S")" >/dev/null 2>&1 && pwd) # inner quotes keep a path with spaces as one word + S=$(readlink "$S") + [[ $S != /* ]] && S=$DIR/$S # a relative link target is taken relative to the directory of the link +done +DIR=$(cd
-P "$(dirname "$S")" >/dev/null 2>&1 && pwd) +wizeng "$DIR/RiRuntime.wasm" "$@" diff --git a/int/RiRuntime.wasm b/int/RiRuntime.wasm new file mode 100644 index 000000000..049ef40aa Binary files /dev/null and b/int/RiRuntime.wasm differ diff --git a/slow_call.wasm b/slow_call.wasm new file mode 100644 index 000000000..8d09b720e Binary files /dev/null and b/slow_call.wasm differ diff --git a/slow_call_nop.wasm b/slow_call_nop.wasm new file mode 100644 index 000000000..2af221a38 Binary files /dev/null and b/slow_call_nop.wasm differ diff --git a/slow_call_nop.wat b/slow_call_nop.wat new file mode 100644 index 000000000..0533b6638 --- /dev/null +++ b/slow_call_nop.wat @@ -0,0 +1,7 @@ +(module + (func $f) + (func (export "main") (result i32) + call $f + i32.const 0 + ) +) diff --git a/src/engine/BytecodeIterator.v3 b/src/engine/BytecodeIterator.v3 index 37af6878d..c341bb881 100644 --- a/src/engine/BytecodeIterator.v3 +++ b/src/engine/BytecodeIterator.v3 @@ -779,6 +779,53 @@ class BytecodeIterator { RESUME_THROW => v.visit_RESUME_THROW(read_CONT(), read_TAG(), read_HANDLERS()); RESUME_THROW_REF => v.visit_RESUME_THROW_REF(read_CONT(), read_HANDLERS()); SWITCH => v.visit_SWITCH(read_CONT(), read_TAG()); + + /* here, we require that replacing CALL with FAST_CALL does not touch the + * operand, so that the original function can still be recovered from the bytecode itself + * + * in other places, where we have the module, we can go direct from bytecode to func + */ + // FIXME wrap into _ clause + FAST_CALL0 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL1 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL2 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL3 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL4 =>
v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL5 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL6 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL7 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL8 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL9 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL10 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL11 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL12 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL13 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL14 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL15 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL16 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL17 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL18 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL19 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL20 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL21 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL22 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL23 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL24 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL25 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL26 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL27 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL28 => 
v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL29 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL30 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL31 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL32 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL33 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL34 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL35 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL36 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL37 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL38 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); + FAST_CALL39 => v.visit_FAST_CALL(Opcodes.fastCallToIndex(opcode), read_FUNC()); } } def trace(out: StringBuilder, module: Module, tracer: InstrTracer) { diff --git a/src/engine/CodeValidator.v3 b/src/engine/CodeValidator.v3 index bf3511288..d0d050019 100644 --- a/src/engine/CodeValidator.v3 +++ b/src/engine/CodeValidator.v3 @@ -420,7 +420,87 @@ class CodeValidator(extensions: Extension.set, limits: Limits, module: Module, e var func = parser.readFuncRef(); if (func == null) return; checkSignature(func.sig); + + // fast call: if function is exported with fast name, replace the bytecode with FAST_CALL + if (FastIntTuning.useFastFunctions) { + for (i < module.exports.length) { + def ex = module.exports[i]; + if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { + if (Trace.validation) Trace.OUT.puts(" function declared as fast: "); + + var fast_idx = -1; + def fast_funcs = module.fast_funcs; + // look for existing FAST_CALL instruction allocated for this function + for (j < fast_funcs.length) { + if (func == fast_funcs[j]) { + fast_idx = j; + if (Trace.validation) Trace.OUT.put1("allocated as
FAST_CALL%d, ", fast_idx); + break; + } + } + // not found? allocate FAST_CALL instruction, if there's space + if (fast_idx < 0) { + if (fast_funcs.length < Opcodes.FAST_CALL_OPCODES) { + fast_idx = fast_funcs.length; + func.fast_call_idx = fast_idx; + if (Trace.validation) Trace.OUT.put1("not found, allocating FAST_CALL%d, ", fast_idx); + fast_funcs.put(func); + } else { + if (Trace.validation) Trace.OUT.puts("not found, FAST_CALL table is full, "); + } + } + // replace the bytecode, if it's found or allocated + if (fast_idx >= 0) { + if (Trace.validation) Trace.OUT.puts("replacing call\n"); + this.func.replaceCall(opcode_pos, fast_idx); + } else { + if (Trace.validation) Trace.OUT.puts("not replacing\n"); + } + break; // stop after the first "fast:" export of this function: replaceCall errors if the opcode at opcode_pos is no longer CALL + } + } + } } + FAST_CALL0 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); // NOTE(review): with useFastFunctions off these arms do nothing, so an input byte decoding to FAST_CALLn is not rejected here; confirm that is intended + FAST_CALL1 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL2 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL3 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL4 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL5 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL6 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL7 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL8 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate
FAST_CALL internal opcode"); + FAST_CALL9 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL10 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL11 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL12 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL13 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL14 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL15 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL16 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL17 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL18 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL19 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL20 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL21 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL22 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL23 => if (FastIntTuning.useFastFunctions) System.error("validation 
error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL24 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL25 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL26 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL27 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL28 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL29 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL30 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL31 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL32 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL33 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL34 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL35 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL36 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL37 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL38 => if (FastIntTuning.useFastFunctions) 
System.error("validation error", "trying to validate FAST_CALL internal opcode"); + FAST_CALL39 => if (FastIntTuning.useFastFunctions) System.error("validation error", "trying to validate FAST_CALL internal opcode"); CALL_INDIRECT => { var sig = parser.readSigRef(); var table = parser.readTableRef(); diff --git a/src/engine/Module.v3 b/src/engine/Module.v3 index 90d85a4ae..a26d75e74 100644 --- a/src/engine/Module.v3 +++ b/src/engine/Module.v3 @@ -17,6 +17,7 @@ class Module(filename: string) { def exports = Vector<(string, Decl)>.new(); def elems = Vector.new(); def data = Vector.new(); + def fast_funcs = Vector.new(); // functions given a FAST_CALLn opcode by the validator; position in the vector is n def custom_sections = Vector.new(); var probes: Array>; var dyn_probes: Vector<(int, int, Probe)>; @@ -143,6 +144,8 @@ class FuncDecl(sig_index: int) extends Decl { var target_code: TargetCode; var tierup_trigger: int = int.max; var handlers = FuncHandlerInfo.new(); + var fast_target_code: TargetCode; + var fast_call_idx: int = -1; // FAST_CALLn index assigned by the validator; -1 while none is assigned def render(names: NameSection, buf: StringBuilder) -> StringBuilder { var name = if (names != null, names.getFuncName(func_index)); @@ -154,6 +157,7 @@ class FuncDecl(sig_index: int) extends Decl { var tc: TargetCode; var tr: TargetCode; target_code = tc; // reset target code as well + fast_target_code = tc; // cleared together with target_code sidetable = Sidetables.NO_SIDETABLE; cbd_sidetable = null; } @@ -168,6 +172,17 @@ class FuncDecl(sig_index: int) extends Decl { if (cur_bytecode == orig_bytecode) return; cur_bytecode[pc] = orig_bytecode[pc]; } + def replaceCall(pc: int, idx: int) { // overwrites the CALL opcode byte at pc with the FAST_CALL opcode for idx + // "orig" will become a copy of the original code, to allow in-place modification of old code + if (cur_bytecode == orig_bytecode) orig_bytecode = Arrays.dup(orig_bytecode); + // sanity check + if (cur_bytecode[pc] != Opcode.CALL.code) { + def realOp = Opcodes.find(0, cur_bytecode[pc]); + System.error("replace bytecode", Strings.format1("not replacing call (got %s)", realOp.mnemonic)); + } + cur_bytecode[pc] = byte.!(Opcodes.indexToFastCall(idx).code); + // do NOT replace
the operands, as a convenience for BytecodeIterator + } def reset() -> this { if (cur_bytecode == orig_bytecode) return; ArrayUtil.copyInto(cur_bytecode, 0, orig_bytecode, 0, orig_bytecode.length); @@ -183,6 +198,7 @@ class FuncDecl(sig_index: int) extends Decl { n.sidetable = this.sidetable; n.num_locals = this.num_locals; n.target_code = this.target_code; + n.fast_target_code = this.fast_target_code; // NOTE(review): fast_call_idx is not carried over in this hunk; confirm that is intended return n; } def findExHandler(instance: Instance, tag: Tag, throw_pc: int) -> ExHandler { @@ -201,6 +217,7 @@ class FuncDecl(sig_index: int) extends Decl { Trace.OUT.put3("(func=%q, tag=%d, throw_pc=%d)", this.render(instance.module.names, _), tag.decl.tag_index, throw_pc).ln(); } + while (i < handlers.length) { // XXX: speed this up with a binary search var e = handlers[i]; if (Trace.exception) Trace.OUT.put3(" entry[%d...%d] tag=%d", e.start, e.end, e.tag).ln(); diff --git a/src/engine/Opcodes.v3 b/src/engine/Opcodes.v3 index b0a78d735..2e785e04c 100644 --- a/src/engine/Opcodes.v3 +++ b/src/engine/Opcodes.v3 @@ -609,8 +609,51 @@ enum Opcode(prefix: u8, code: u16, mnemonic: string, imms: Array, sig: RESUME_THROW (0x00, 0xE4, "resume_throw", imm.CONT_TAG_HANDLE, null), RESUME_THROW_REF (0x00, 0xE5, "resume_throw_ref", imm.CONT_HANDLE, null), SWITCH (0x00, 0xE6, "switch", imm.CONT_TAG, null) + + // fast call instructions + FAST_CALL0 (0x00, 0x27, "fast_call0", imm.FUNC, null), + FAST_CALL1 (0x00, 0xC5, "fast_call1", imm.FUNC, null), + FAST_CALL2 (0x00, 0xC6, "fast_call2", imm.FUNC, null), + FAST_CALL3 (0x00, 0xC7, "fast_call3", imm.FUNC, null), + FAST_CALL4 (0x00, 0xC8, "fast_call4", imm.FUNC, null), + FAST_CALL5 (0x00, 0xC9, "fast_call5", imm.FUNC, null), + FAST_CALL6 (0x00, 0xCA, "fast_call6", imm.FUNC, null), + FAST_CALL7 (0x00, 0xCB, "fast_call7", imm.FUNC, null), + FAST_CALL8 (0x00, 0xCC, "fast_call8", imm.FUNC, null), + FAST_CALL9 (0x00, 0xCD, "fast_call9", imm.FUNC, null), + FAST_CALL10 (0x00, 0xCE, "fast_call10", imm.FUNC, null), + FAST_CALL11 (0x00, 0xCF,
"fast_call11", imm.FUNC, null), + FAST_CALL12 (0x00, 0xD7, "fast_call12", imm.FUNC, null), + FAST_CALL13 (0x00, 0xD8, "fast_call13", imm.FUNC, null), + FAST_CALL14 (0x00, 0xD9, "fast_call14", imm.FUNC, null), + FAST_CALL15 (0x00, 0xDA, "fast_call15", imm.FUNC, null), + FAST_CALL16 (0x00, 0xDB, "fast_call16", imm.FUNC, null), + FAST_CALL17 (0x00, 0xDC, "fast_call17", imm.FUNC, null), + FAST_CALL18 (0x00, 0xDD, "fast_call18", imm.FUNC, null), + FAST_CALL19 (0x00, 0xDE, "fast_call19", imm.FUNC, null), + FAST_CALL20 (0x00, 0xDF, "fast_call20", imm.FUNC, null), + FAST_CALL21 (0x00, 0xE7, "fast_call21", imm.FUNC, null), + FAST_CALL22 (0x00, 0xE8, "fast_call22", imm.FUNC, null), + FAST_CALL23 (0x00, 0xE9, "fast_call23", imm.FUNC, null), + FAST_CALL24 (0x00, 0xEA, "fast_call24", imm.FUNC, null), + FAST_CALL25 (0x00, 0xEB, "fast_call25", imm.FUNC, null), + FAST_CALL26 (0x00, 0xEC, "fast_call26", imm.FUNC, null), + FAST_CALL27 (0x00, 0xED, "fast_call27", imm.FUNC, null), + FAST_CALL28 (0x00, 0xEE, "fast_call28", imm.FUNC, null), + FAST_CALL29 (0x00, 0xEF, "fast_call29", imm.FUNC, null), + FAST_CALL30 (0x00, 0xF2, "fast_call30", imm.FUNC, null), + FAST_CALL31 (0x00, 0xF3, "fast_call31", imm.FUNC, null), + FAST_CALL32 (0x00, 0xF4, "fast_call32", imm.FUNC, null), + FAST_CALL33 (0x00, 0xF5, "fast_call33", imm.FUNC, null), + FAST_CALL34 (0x00, 0xF6, "fast_call34", imm.FUNC, null), + FAST_CALL35 (0x00, 0xF7, "fast_call35", imm.FUNC, null), + FAST_CALL36 (0x00, 0xF8, "fast_call36", imm.FUNC, null), + FAST_CALL37 (0x00, 0xF9, "fast_call37", imm.FUNC, null), + FAST_CALL38 (0x00, 0xFA, "fast_call38", imm.FUNC, null), + FAST_CALL39 (0x00, 0x17, "fast_call39", imm.FUNC, null), } + // Enumeration of the different kinds of immediates to opcodes. 
enum ImmKind { ARRAY_TYPE_INDEX, // ARRAYT @@ -798,9 +841,16 @@ component Opcodes { def code_pages = [page_FB, page_FC, page_FD, page_FE]; def var longestName: int; def var num_subpages: int; + def FAST_CALL_OPCODES = 40; + def var fast_calls: Array; private var nameMap: HashMap; new() { + + fast_calls = Array.new(FAST_CALL_OPCODES); + for (i < FAST_CALL_OPCODES) { + fast_calls[i] = indexToFastCall(i); + } for (op in Opcode) { if (op == Opcode.INVALID) continue; init(op); @@ -809,6 +859,7 @@ component Opcodes { attributes[InternalOpcode.PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.WHAMM_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; attributes[InternalOpcode.BREAK_PROBE.code] = OpcodeAttribute.INTERNAL | OpcodeAttribute.PROBE; + for (op in fast_calls) attributes[op.tag] = OpcodeAttribute.INTERNAL; for (op in [Opcode.END, Opcode.I32_CONST, Opcode.I64_CONST, Opcode.F32_CONST, Opcode.F64_CONST, Opcode.GLOBAL_GET, Opcode.REF_NULL, Opcode.REF_FUNC, Opcode.STRUCT_NEW, Opcode.STRUCT_NEW_DEFAULT, @@ -1121,6 +1172,100 @@ component Opcodes { } } } + def indexToFastCall(index: int) -> Opcode { + var op: Opcode; + match (index) { + 0 => op = Opcode.FAST_CALL0; + 1 => op = Opcode.FAST_CALL1; + 2 => op = Opcode.FAST_CALL2; + 3 => op = Opcode.FAST_CALL3; + 4 => op = Opcode.FAST_CALL4; + 5 => op = Opcode.FAST_CALL5; + 6 => op = Opcode.FAST_CALL6; + 7 => op = Opcode.FAST_CALL7; + 8 => op = Opcode.FAST_CALL8; + 9 => op = Opcode.FAST_CALL9; + 10 => op = Opcode.FAST_CALL10; + 11 => op = Opcode.FAST_CALL11; + 12 => op = Opcode.FAST_CALL12; + 13 => op = Opcode.FAST_CALL13; + 14 => op = Opcode.FAST_CALL14; + 15 => op = Opcode.FAST_CALL15; + 16 => op = Opcode.FAST_CALL16; + 17 => op = Opcode.FAST_CALL17; + 18 => op = Opcode.FAST_CALL18; + 19 => op = Opcode.FAST_CALL19; + 20 => op = Opcode.FAST_CALL20; + 21 => op = Opcode.FAST_CALL21; + 22 => op = Opcode.FAST_CALL22; + 23 => op = Opcode.FAST_CALL23; + 24 => op = 
Opcode.FAST_CALL24; + 25 => op = Opcode.FAST_CALL25; + 26 => op = Opcode.FAST_CALL26; + 27 => op = Opcode.FAST_CALL27; + 28 => op = Opcode.FAST_CALL28; + 29 => op = Opcode.FAST_CALL29; + 30 => op = Opcode.FAST_CALL30; + 31 => op = Opcode.FAST_CALL31; + 32 => op = Opcode.FAST_CALL32; + 33 => op = Opcode.FAST_CALL33; + 34 => op = Opcode.FAST_CALL34; + 35 => op = Opcode.FAST_CALL35; + 36 => op = Opcode.FAST_CALL36; + 37 => op = Opcode.FAST_CALL37; + 38 => op = Opcode.FAST_CALL38; + 39 => op = Opcode.FAST_CALL39; + _ => System.error("indexToFastCall", "out of range"); + } + return op; + } + def fastCallToIndex(op: Opcode) -> int { + var idx: int; + match (op) { + FAST_CALL0 => idx = 0; + FAST_CALL1 => idx = 1; + FAST_CALL2 => idx = 2; + FAST_CALL3 => idx = 3; + FAST_CALL4 => idx = 4; + FAST_CALL5 => idx = 5; + FAST_CALL6 => idx = 6; + FAST_CALL7 => idx = 7; + FAST_CALL8 => idx = 8; + FAST_CALL9 => idx = 9; + FAST_CALL10 => idx = 10; + FAST_CALL11 => idx = 11; + FAST_CALL12 => idx = 12; + FAST_CALL13 => idx = 13; + FAST_CALL14 => idx = 14; + FAST_CALL15 => idx = 15; + FAST_CALL16 => idx = 16; + FAST_CALL17 => idx = 17; + FAST_CALL18 => idx = 18; + FAST_CALL19 => idx = 19; + FAST_CALL20 => idx = 20; + FAST_CALL21 => idx = 21; + FAST_CALL22 => idx = 22; + FAST_CALL23 => idx = 23; + FAST_CALL24 => idx = 24; + FAST_CALL25 => idx = 25; + FAST_CALL26 => idx = 26; + FAST_CALL27 => idx = 27; + FAST_CALL28 => idx = 28; + FAST_CALL29 => idx = 29; + FAST_CALL30 => idx = 30; + FAST_CALL31 => idx = 31; + FAST_CALL32 => idx = 32; + FAST_CALL33 => idx = 33; + FAST_CALL34 => idx = 34; + FAST_CALL35 => idx = 35; + FAST_CALL36 => idx = 36; + FAST_CALL37 => idx = 37; + FAST_CALL38 => idx = 38; + FAST_CALL39 => idx = 39; + _ => System.error("fastCallToIndex", "not a FAST_CALL instruction"); + } + return idx; + } } // Renders instructions as text. 
diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3 index c1c12d3e1..323fcdd61 100644 --- a/src/engine/Tuning.v3 +++ b/src/engine/Tuning.v3 @@ -42,6 +42,7 @@ component FastIntTuning { def inlineGlobalAccess = true; // enable inline access of (primitive) globals def stealFlagBitForMemory64 = true; // use a bit in the memarg flags for memory64 def whammProbeTrampolineNumPages = 1024; + def useFastFunctions = false; // treat functions exported with `fast:` in the name as fast functions } // Tuning settings for the single-pass compiler that have no effect on correctness. diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3 index a3eb8110b..294c47b8d 100644 --- a/src/engine/compiler/MacroAssembler.v3 +++ b/src/engine/compiler/MacroAssembler.v3 @@ -368,6 +368,8 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { // Destructive on {parent}. def emit_cont_mv(from_vsp: Reg, contStack: Reg, n_vals: Reg, tmp1: Reg, tmp2: Reg, xmm0: Reg); + def emit_dispatchSequence(); + // Validates {cont} and: // - Mark {cont} as used // - Move {cont.stack} to {destContStack} diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 616224c22..f3e657766 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -76,8 +76,12 @@ def KIND_V128 = SpcConsts.KIND_V128; def KIND_REF = SpcConsts.KIND_REF; def KIND_REF_U64 = SpcConsts.KIND_REF_U64; +// Unlike frame.frameSize, which is 0 for fast contexts, this is always the +// true frame size (for stack reconstruction methods). +def FRAME_SIZE = X86_64InterpreterFrame.size; + // Compiles Wasm bytecode to machine code in a single pass via a MacroAssembler. 
-class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits) extends BytecodeVisitor { +class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits, fast: bool) extends BytecodeVisitor { def instrTracer = if(Trace.compiler, InstrTracer.new()); def config = masm.regConfig; def regs = xenv; @@ -102,6 +106,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var sig: SigDecl; var num_locals: int; var local_base_sp: u31; // can use a Range for 0-indexing instead of from offset + var ctl_base_sp: u31; // index of the RETURN control in ctl_stack for the current frame var success = true; var osr_pc: int; @@ -112,9 +117,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var ret_label: MasmLabel; var last_probe = 0; var skip_to_end: bool; - // this is Whamm probe inlining, not arbitrary function inlining (yet) - var is_inlined = false; - var whamm_probe_ctl_base: u31; // ctl_stack.top when Whamm probe compilation started + var whamm_config: WhammInlineConfig; + var frames_reconstructed = false; // XXX: hack var handler_dest_info = Vector.new(); @@ -131,12 +135,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl def gen(module: Module, func: FuncDecl, err: ErrorGen) -> bool { this.osr_pc = -1; this.err = err; - return Metrics.spc_time_us.run(gen0, (module, func)); + return Metrics.spc_time_us.run(gen0(_, _), (module, func)); } def genOsr(module: Module, func: FuncDecl, pc: int, err: ErrorGen) -> MasmLabel { this.osr_pc = pc; this.err = err; - var ok = Metrics.spc_time_us.run(gen0, (module, func)); + var ok = Metrics.spc_time_us.run(gen0(_, _), (module, func)); return if(ok, osr_entry_label); } private def gen0(module: Module, func: FuncDecl) -> bool { @@ -166,7 +170,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, 
regAlloc: RegAl // Push initial frame for top-level function state.frame_stack.clear(); - var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0); + if (fast) { + // push a SpcFrame representing the interpreter frame already on the stack + var interp_frame = SpcFrame.new(null, module, 0, 0, 0, -1, null); + pushSpcFrame(interp_frame); + } + var initial_frame = SpcFrame.new(func, module, 0, 0, func.num_slots(), 0, masm.newLabel(func.cur_bytecode.length)); pushSpcFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. @@ -182,7 +191,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Emit function entry probe, if any. if (!FeatureDisable.entryProbes && func.entry_probed) { var probe = Instrumentation.getLocalProbe(module, func.func_index, 0); - emitProbe0(0, probe); + withReconstructedInlinedFrames(fun => + emitProbe0(0, probe)); } masm.current_fid = func.func_index; @@ -214,8 +224,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(label); if (frames.length > 1) { - // no inlining yet: this should never happen - System.error("SpcError", "attempt to emit trap in inlined context"); + unrefRegs(); + emitReconstructStackFrames(frames); } else { masm.emit_mov_m_i(xenv.pc_slot, label.create_pos); } @@ -346,36 +356,43 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (!cond) bailout(Strings.format3(msg, p1, p2, p3)); } def emitPrologue() { - // Allocate stack frame - masm.emit_subw_r_i(regs.sp, frame.frameSize); - - // Spill VSP - emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state - // Spill wf: WasmFunction - masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg); - // Load wf.instance and spill - masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg); - masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance); - // Clear FrameAccessor - 
masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind - // Clear inlined whamm instance - if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) { - masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + if (!fast) { + // Allocate stack frame + masm.emit_subw_r_i(regs.sp, frame.frameSize); + + // Spill VSP + emit_spill_vsp(regs.vsp); // XXX: track VSP-spilled state + // Spill wf: WasmFunction + masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, regs.func_arg); + // Load wf.instance and spill + masm.emit_v3_WasmFunction_instance_r_r(regs.instance, regs.func_arg); + masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, regs.instance); + // Clear FrameAccessor + masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind + // Clear inlined whamm instance + if (SpcTuning.inlineWhammProbes && SpcTuning.intrinsifyWhammProbe) { + masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + } + } else { + masm.emit_addw_r_i(X86_64MasmRegs.INT_EXEC_ENV.ip, uleb_size(func.func_index)); } // Compute VFP = VSP - sig.params.length * SLOT_SIZE masm.emit_mov_r_r(ValueKind.REF, regs.vfp, regs.vsp); // XXX: use 3-addr adjustment of VFP masm.emit_subw_r_i(regs.vfp, sig.params.length * masm.valuerep.slot_size); - // XXX: skip spilling of VFP - masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); - // Load instance.memories[0].start into MEM0_BASE and spill - if (module.memories.length > 0) { - // XXX: skip loading memory base if function doesn't access memory - masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0); - masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base); - masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base); + if (!fast) { + // XXX: skip spilling of VFP + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); + + // Load instance.memories[0].start into MEM0_BASE and spill + if (module.memories.length > 0) { + // XXX: 
skip loading memory base if function doesn't access memory + masm.emit_v3_Instance_memories_r_r(regs.mem0_base, regs.instance); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, regs.mem0_base, regs.mem0_base, 0); + masm.emit_v3_Memory_start_r_r(regs.mem0_base, regs.mem0_base); + masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, regs.mem0_base); + } } } def visitLocalDecl(count: u32, vtc: ValueTypeCode) { @@ -399,7 +416,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (last_probe == 0) return; var probe = Instrumentation.getLocalProbe(module, func.func_index, last_probe); last_probe = 0; - emitProbe0(it.pc, probe); + withReconstructedInlinedFrames(fun => + emitProbe0(it.pc, probe)); if (Trace.compiler) traceOpcodeAndStack(true); } def emitProbe0(pc: int, probe: Probe) { @@ -484,40 +502,33 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe def emitWhammProbe(probe: WhammProbe) { + if (Trace.compiler) Trace.OUT.puts("emitting whamm probe\n"); // set up args and push to frame slots. 
var whamm_sig = probe.sig; - var inline_config = InlineConfig(false, false, false); - var new_local_base_sp = 0; var orig_sp = state.sp; var callee_func = WasmFunction.!(probe.func); + def inline_decision = shouldInline(callee_func.decl) && SpcTuning.inlineWhammProbes; // TODO move to shouldInline + var swap_instance = false; + var swap_membase = false; - if (SpcTuning.inlineWhammProbes) { - inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func); - if (!probe.inline_heuristic_checked) { - inline_config = funcCanInline(callee_func.decl); - probe.inline_heuristic_checked = true; - probe.spc_swap_instance = inline_config.swap_instance; - probe.spc_swap_membase = inline_config.swap_membase; - probe.spc_inline_func = inline_config.can_inline; - } + if (inline_decision) { + probe.checkSwap(); + swap_instance = probe.swap_instance; + swap_membase = probe.swap_membase; - if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly + if (swap_instance) { masm.emit_mov_r_Instance(regs.scratch, callee_func.instance); masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, regs.scratch); } - - // overwrite mem0_base with whamm instance's memory base, restore from frame slot later - if (inline_config.swap_membase) { - var membase = callee_func.instance.memories[0].getMemBase64(); - masm.emit_mov_r_l(regs.mem0_base, i64.view(membase)); + if (swap_membase) { + if (callee_func.instance.memories.length > 0) { + var membase = callee_func.instance.memories[0].getMemBase64(); + masm.emit_mov_r_l(regs.mem0_base, i64.view(membase)); + } masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base); } - } - - if (!inline_config.can_inline) { - state.emitSaveAll(resolver, probeSpillMode); } else { - new_local_base_sp = int.view(state.sp); + state.emitSaveAll(resolver, probeSpillMode); } for (i < whamm_sig.length) { @@ -526,13 +537,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: 
MacroAssembler, regAlloc: RegAl var kind: byte; match(whamm_sig[i]) { FrameAccessor => { - if (inline_config.can_inline) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack. + if (inline_decision) state.emitSaveAll(resolver, probeSpillMode); // spill entire value stack. masm.emit_call_runtime_getFrameAccessorMetaRef(); emit_reload_regs(); - if (inline_config.can_inline && !probeSpillMode.free_regs) state.emitRestoreAll(resolver); + if (inline_decision && !probeSpillMode.free_regs) state.emitRestoreAll(resolver); // move result to mem slot or reg, depending on inlining - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_r(ValueKind.REF, reg, xenv.runtime_ret0); state.push(KIND_REF | IN_REG, reg, 0); @@ -544,7 +555,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl Val(val) => { match (val) { I31(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_i(reg, i32.view(v) << 1); state.push(KIND_REF | IN_REG, reg, 0); @@ -554,7 +565,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.REF.code; } I32(v) => { - if (inline_config.can_inline) { + if (inline_decision) { state.push(KIND_I32 | IS_CONST, NO_REG, i32.view(v)); } else { masm.emit_mov_m_d(slot_addr, v); @@ -562,7 +573,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.I32.code; } I64(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.I64); masm.emit_mov_r_l(reg, i64.view(v)); state.push(KIND_I64 | IN_REG, reg, 0); @@ -572,7 +583,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.I64.code; } F32(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.F32); masm.emit_mov_r_f32(reg, v); 
state.push(KIND_F32 | IN_REG, reg, 0); @@ -582,7 +593,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.F32.code; } F64(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.F64); masm.emit_mov_r_d64(reg, v); state.push(KIND_F64 | IN_REG, reg, 0); @@ -592,7 +603,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.F64.code; } V128(l, h) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.V128); masm.emit_mov_r_q(reg, l, h); state.push(KIND_V128 | IN_REG, reg, 0); @@ -603,7 +614,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.V128.code; } Ref(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF); masm.emit_mov_r_Object(reg, v); state.push(KIND_REF | IN_REG, reg, 0); @@ -614,7 +625,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl kind = ValueKind.REF.code; } Cont(v) => { - if (inline_config.can_inline) { + if (inline_decision) { var reg = allocRegTos(ValueKind.REF_U64); masm.emit_mov_r_Cont(reg, v); state.push(KIND_REF_U64 | IN_REG, reg, 0); @@ -629,15 +640,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } Operand(_, i) => { var index = orig_sp + u32.view(i) - 1; - if (inline_config.can_inline) { - visit_LOCAL_GET(u31.view(index)); + if (inline_decision) { + visit_LOCAL_GET(u31.view(index - local_base_sp)); } else { masm.emit_mov_m_m(state.state[index].kind(), slot_addr, masm.slotAddr(index)); } kind = state.state[index].kind().code; } Local(_, i) => { - if (inline_config.can_inline) { + if (inline_decision) { visit_LOCAL_GET(u31.view(i)); } else { masm.emit_mov_m_m(state.state[u31.view(i)].kind(), slot_addr, masm.slotAddr(u32.view(i))); @@ -646,7 +657,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, 
masm: MacroAssembler, regAlloc: RegAl } Null => System.error("whamm", "null whamm arg!"); } - if (!inline_config.can_inline) { + if (!inline_decision) { masm.emit_mov_m_i(slot_tag_addr, kind); } } @@ -654,49 +665,15 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var func_id = callee_func.decl.func_index; var whamm_module = whamm_instance.module; var whamm_func_decl = callee_func.decl; - if (inline_config.can_inline) { - var prev_it = it; - it = BytecodeIterator.new().reset(whamm_func_decl); - var orig_module = module; - - // prepare spc for inlining - this.local_base_sp = u31.view(new_local_base_sp); - this.module = whamm_module; - this.func = whamm_func_decl; - this.sig = whamm_func_decl.sig; - - // inline codegen - it.dispatchLocalDecls(this); - this.is_inlined = true; - if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln(); - while (it.more() && success) { - if (Trace.compiler) traceOpcodeAndStack(false); - last_probe = 0; - masm.source_loc = it.pc; - it.dispatch(this); - if (Trace.compiler && Trace.asm) { - OUT.puts("JIT code: "); - masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); - codegen_offset = masm.curCodeBytes(); - OUT.ln(); - } - unrefRegs(); - if (Debug.compiler) checkRegAlloc(); - it.next(); + if (inline_decision) { + whamm_config = WhammInlineConfig(swap_membase, swap_instance, true); + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, regs.vfp); + emitInlinedCall(whamm_func_decl, probe); + whamm_config = WhammInlineConfig(false, false, false); + // Restore mem0_base after probe + if (module.memories.length > 0) { + masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); } - if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln(); - - // restore spc after inlining - it = prev_it; - this.local_base_sp = 0; - this.is_inlined = false; - this.module = orig_module; - this.func = it.func; - this.sig = it.func.sig; - 
masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); - - // clear callee params/locals from abstract state - dropN(state.sp - orig_sp); } else { var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); @@ -739,7 +716,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.prepareLoop(resolver); masm.bindLabel(ctl_top.label); emitProbe(); - if (it.pc == osr_pc) { + if (it.pc == osr_pc && !isInlined()) { osr_state = state.ctl_stack.peek().copyMerge(); osr_loop_label = masm.newLabel(it.pc); masm.bindLabel(osr_loop_label); @@ -792,7 +769,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - if (!this.is_inlined) { + if (needsEpilogue()) { var ctl_top = state.ctl_stack.peek(); if (ctl_top.opcode == Opcode.LOOP.code) { state.ctl_stack.pop(); @@ -813,6 +790,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(ctl_top.label); state.resetToMerge(ctl_top); state.ctl_stack.pop(); + // case for END for fallthrough at end of function? 
} else if (ctl_top.opcode == Opcode.RETURN.code) { state.emitFallthru(resolver); masm.bindLabel(ctl_top.label); @@ -821,8 +799,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (ctl_top.merge_count > 1) emitReturn(ctl_top); state.ctl_stack.pop(); } - emitProbe(); } + emitProbe(); } def visit_BR(depth: u31) { var target = state.getControl(depth); @@ -851,35 +829,150 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_RETURN() { - var target = state.ctl_stack.elems[0]; + var target = state.ctl_stack.elems[ctl_base_sp]; state.emitTransfer(target, resolver); - if (ret_label == null) ret_label = masm.newLabel(func.cur_bytecode.length); masm.emit_br(ret_label); setUnreachable(); } - def visitCallDirect(op: Opcode, index: u31, tailCall: bool) { + // for CALL, FAST_CALL, and RETURN_CALL + def visitCallDirect(op: Opcode, index: u31, prop: CallProperty) { if (op == Opcode.CALL) { Metrics.spc_static_calls.val++; masm.emit_inc_metric(Metrics.spc_dynamic_calls); } var func = module.functions[index]; - var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); - // Load the instance (which must happen before frame is unwound). - var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); - var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); - var tmp = allocTmp(ValueKind.REF); - emit_load_instance(tmp); - // Load the function, XXX: skip and compute function from instance + code on stack? 
- masm.emit_v3_Instance_functions_r_r(func_reg, tmp); - masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); + // Try inlining for intra-module, non-tail calls + if (prop != CallProperty.TAIL && shouldInline(func)) { + if (Trace.compiler) Trace.OUT.put2("Inlining call to func #%d (%d bytes)", index, func.orig_bytecode.length).ln(); + if (op == Opcode.CALL) { + Metrics.spc_static_inlined_calls.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_inlined_calls); + } + emitInlinedCall(func, null); + return; + } + + withReconstructedInlinedFrames(fun { + var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); + // Load the instance (which must happen before frame is unwound). + var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); + var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); + var tmp = allocTmp(ValueKind.REF); + emit_load_instance(tmp); + + // Load the function, XXX: skip and compute function from instance + code on stack? + masm.emit_v3_Instance_functions_r_r(func_reg, tmp); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); + + emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, prop); + }); + } + def emitInlinedCall(callee_func: FuncDecl, whamm: WhammProbe) { + var sig = callee_func.sig; + var params_count = u32.view(sig.params.length); + var results_count = u32.view(sig.results.length); + var orig_sp = state.sp; + + // Arguments are already on stack + // Stack: [..., arg0, arg1, ..., argN] <- sp + // We want callee's local 0 = arg0, so: + var new_local_base_sp: u31 = u31.view(orig_sp - params_count); + var new_ctl_base_sp = u31.view(state.ctl_stack.top); + + var num_locals = callee_func.num_slots(); + + // Push a RETURN control for the inlined callee's function body. 
+ var end_label = masm.newLabel(callee_func.cur_bytecode.length); + var func_body_ctl = state.pushFuncBody(sig.params, sig.results, end_label); + + var m: Module = module; + + // Whamm probe configuration + if (whamm != null) { + def whamm_sig = whamm.sig; + def whamm_wf = WasmFunction.!(whamm.func); + def whamm_instance = whamm_wf.instance; + def whamm_func_decl = whamm_wf.decl; + + m = whamm_instance.module; + new_local_base_sp = u31.view(state.sp) - u31.view(whamm_sig.length); // XXX + func_body_ctl.val_stack_top = new_local_base_sp; // correct val_stack_top for whamm arg count + } + + // create merge state based on outer function's base sp given inlined function's results + func_body_ctl.merge_state = state.getInMemoryMergeWithArgs(int.view(new_local_base_sp), sig.results); + func_body_ctl.merge_count = 1; + + // Create and push frame for inlined function + var callee_frame = SpcFrame.new(callee_func, + m, new_local_base_sp, new_ctl_base_sp, num_locals, 0, masm.newLabel(callee_func.cur_bytecode.length)); + + pushSpcFrame(callee_frame); + + // Emit function entry probe, if any. 
+ // XXX expensive because frame materialization required + if (whamm == null && !FeatureDisable.entryProbes && func.entry_probed) { + var probe = Instrumentation.getLocalProbe(module, callee_func.func_index, 0); + withReconstructedInlinedFrames(fun => + emitProbe0(0, probe)); + } + + // Allocate callee's non-parameter locals + it.dispatchLocalDecls(this); + + // Compile callee's bytecode + if (Trace.compiler) Trace.OUT.puts(" Start inlined function body").ln(); + while (it.more() && success) { + if (Trace.compiler) traceOpcodeAndStack(false); + last_probe = 0; + masm.source_loc = it.pc; + masm.current_fid = func.func_index; + it.dispatch(this); + if (Trace.compiler && Trace.asm) { + OUT.puts("JIT code: "); + masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); + codegen_offset = masm.curCodeBytes(); + OUT.ln(); + } + unrefRegs(); + if (Debug.compiler) checkRegAlloc(); + it.next(); + if (skip_to_end) doSkipToEndOfBlock(); + } + if (Trace.compiler) Trace.OUT.puts(" End inlined function body").ln(); + + // Check if the inlined function is unreachable (e.g., ended with UNREACHABLE, RETURN, THROW) + var inlined_reachable = state.ctl_stack.peek().reachable; + + // Restore caller context by popping frame + popSpcFrame(); // Automatically restores cached fields + + // Note: Control stack cleanup (popping implicit BLOCK) is handled by visit_END + + // If inlined function is unreachable, no results to clean up + if (!inlined_reachable) { + if (Trace.compiler) { + Trace.OUT.puts(" Inlined function unreachable, skipping result cleanup").ln(); + Trace.OUT.put3(" state.sp=%d, new_local_base_sp=%d, callee_slots=%d", + state.sp, new_local_base_sp, state.sp - new_local_base_sp).ln(); + } + // Drop all callee state (params + locals, no results) + var callee_slots = state.sp - new_local_base_sp; + if (callee_slots > 0) dropN(u32.view(callee_slots)); + if (Trace.compiler) Trace.OUT.put1(" After dropN: state.sp=%d", state.sp).ln(); + setUnreachable(); + return; + } - 
emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); + if (Trace.compiler) { + Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); + } } - def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) { + def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, prop: CallProperty) { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); // Handle the current stack state. - if (tailCall) emitMoveTailCallArgs(sig); // transfer tail call args + if (prop == CallProperty.TAIL) emitMoveTailCallArgs(sig); // transfer tail call args else state.emitSaveAll(resolver, SpillMode.SAVE_AND_FREE_REGS); // spill entire value stack // Compute the value stack pointer. @@ -887,7 +980,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (checkHostCall) { // A call to imported function must first check for WasmFunction. masm.emit_br_r(func_reg, MasmBrCond.IS_WASM_FUNC, wasmcall_label); - if (tailCall) { + if (prop == CallProperty.TAIL) { masm.emit_jump_HostCallStub(); // XXX: stub relies on func_arg and VSP } else { masm.emit_call_HostCallStub(); // XXX: stub relies on func_arg and VSP @@ -900,7 +993,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_v3_FuncDecl_target_code_r_r(tmp, tmp); // Call or jump to the entrypoint. 
- if (tailCall) { + if (prop == CallProperty.TAIL) { masm.emit_jump_r(tmp); setUnreachable(); } else { @@ -924,7 +1017,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // adjust frame masm.emit_addw_r_i(regs.sp, frame.frameSize); } - def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool) { + def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty) { var sig = SigDecl.!(module.heaptypes[sig_index]); var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); @@ -981,9 +1074,9 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.bindLabel(end); } - emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, tailCall); + emitCallToReg(sig, func_reg, vsp_reg, tmp_reg, true, prop); } - def visitCallRef(op: Opcode, index: u31, tailCall: bool) { + def visitCallRef(op: Opcode, index: u31, prop: CallProperty) { var sig = SigDecl.!(module.heaptypes[index]); var sv = state.peek(); if (sv.isConst() && sv.const == 0) { @@ -996,7 +1089,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var tmp = allocTmp(ValueKind.REF); var func_reg = sv.reg; - emitCallToReg(sig, func_reg, vsp_reg, tmp, true, tailCall); + emitCallToReg(sig, func_reg, vsp_reg, tmp, true, prop); } def visit_DROP() { dropN(1); @@ -1939,12 +2032,17 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); - masm.emit_get_curstack(regs.runtime_arg0); - masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); - masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); - emit_load_instance(regs.runtime_arg1); - masm.emit_mov_r_i(regs.runtime_arg2, arg1); - masm.emit_call_runtime_op(op); + + def emit = fun { + 
masm.emit_get_curstack(regs.runtime_arg0); + masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); + masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); + emit_load_instance(regs.runtime_arg1); + masm.emit_mov_r_i(regs.runtime_arg2, arg1); + masm.emit_call_runtime_op(op); + }; + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + if (canTrap) withReconstructedInlinedFrames(emit); else emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -1956,13 +2054,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); - masm.emit_get_curstack(regs.runtime_arg0); - masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); - masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); - emit_load_instance(regs.runtime_arg1); - masm.emit_mov_r_i(regs.runtime_arg2, arg1); - masm.emit_mov_r_i(regs.runtime_arg3, arg2); - masm.emit_call_runtime_op(op); + + def emit = fun { + masm.emit_get_curstack(regs.runtime_arg0); + masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); + masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); + emit_load_instance(regs.runtime_arg1); + masm.emit_mov_r_i(regs.runtime_arg2, arg1); + masm.emit_mov_r_i(regs.runtime_arg3, arg2); + masm.emit_call_runtime_op(op); + }; + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + if (canTrap) withReconstructedInlinedFrames(emit); else emit(); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -2034,13 +2137,16 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_br(target.label); } } + // Return includes epilogue def emitReturn(ctl: SpcControl) { // All explicit RETURN instructions branch here. 
if (ret_label != null) { masm.bindLabel(ret_label); ret_label = null; } + var results = sig.results; + // fix values? if (masm.valuerep.tagged) { // update mismatched value tags var params = sig.params; @@ -2050,14 +2156,24 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_mov_m_i(masm.tagAddr(state.sp - u32.view(results.length) + u32.view(i)), rtag.code); } } + + if (!needsEpilogue()) return; + // Compute VSP = VFP + state.sp emit_compute_vsp(regs.vsp, state.sp); - // Return to caller - masm.emit_mov_r_i(regs.ret_throw, 0); - // Deallocate stack frame - masm.emit_addw_r_i(regs.sp, frame.frameSize); - masm.emit_ret(); + if (!fast) { + // Return to caller // \ fast context: do not emit these instructions + masm.emit_mov_r_i(regs.ret_throw, 0); // | instead, emit the dispatch sequence from the interpreter + // Deallocate stack frame // | + masm.emit_addw_r_i(regs.sp, frame.frameSize); // | + masm.emit_ret(); // / + } else { + // Restore VFP from interpreter frame + masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); + emitFastDispatch(); + } } + def emitFastDispatch() -> void; def emitOsrEntry(osr_entry_label: MasmLabel, state: Array) { if (Trace.compiler) Trace.OUT.put1(" OSR (+%d)", osr_entry_label.create_pos).ln(); masm.bindLabel(osr_entry_label); @@ -2087,6 +2203,137 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl return label; } def getSpcInlinedFrameIp() -> long; + def saveCallerIVars(); + def restoreDispatchTableReg(); + def restoreCallerIVars(); + // Emit code to materialize stack frames for each inlined function. 
+ def emitReconstructStackFrames(frames: Array) -> int { + Metrics.spc_static_reconst.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_reconst); + if (fast) { + // pc already saved + saveCallerIVars(); + } else { + def real_frame = frames[0]; + masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + } + + // NOTE we could use interpreter-backed registers for these instead of allocating new regs + // load instance + var inst_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); + var mem_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); + // Load instance.functions + def func_reg = allocTmp(ValueKind.REF); + masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg); + def vfp_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot); + var prev_base_sp = int.view(frames[0].local_base_sp); + var wasm_func_reg = allocTmp(ValueKind.REF); + + var inl_inst_reg: Reg, inl_mem0_reg: Reg; + if (whamm_config.is_inlined) { // TODO investigate, check individual configs? + inl_inst_reg = allocTmp(ValueKind.REF); + inl_mem0_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, inl_inst_reg, frame.inlined_instance_slot); + masm.emit_mov_r_m(ValueKind.REF, inl_mem0_reg, frame.inlined_mem0_base_slot); + } + + // Pre-allocate stack space for all reconstructed frames at once. 
+ def total_space = (frames.length - 1) * (FRAME_SIZE + 8); + masm.emit_subw_r_i(regs.sp, total_space); + + // Process the inlined frames (skip the outermost which already exists on native stack) + for (i = 1; i < frames.length; i++) { + def frame_info = frames[i]; + def cur_base_sp = int.view(frame_info.local_base_sp); + def delta = (cur_base_sp - prev_base_sp) * masm.valuerep.slot_size; + emitReconstructStackFrame(frame_info, frames.length - i - 1, delta, + wasm_func_reg, func_reg, inst_reg, mem_reg, vfp_reg, inl_inst_reg, inl_mem0_reg); + prev_base_sp = cur_base_sp; + } + + return total_space; + } + def emitReconstructStackFrame(spcFrame: SpcFrame, offset: int, vfp_delta: int, + wasm_func_reg: Reg, func_reg: Reg, inst_reg: Reg, mem_reg: Reg, vfp_reg: Reg, inl_inst_reg: Reg, inl_mem0_reg: Reg) { + // Use inlined frame stub IP as return address for all reconstructed frames + def return_addr = getSpcInlinedFrameIp(); + + def frame_offset = offset * (FRAME_SIZE + 8); + // Write inlined frame stub IP as return address + def retaddr_slot = MasmAddr(regs.sp, frame_offset + FRAME_SIZE); + masm.emit_mov_m_l(retaddr_slot, return_addr); + + // get functions[func_index] and save into frame + def wasm_func_slot = frame.wasm_func_slot.plus(frame_offset); + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, spcFrame.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, wasm_func_slot, wasm_func_reg); + + // Save instance + def instance_slot = frame.instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, instance_slot, inst_reg); + + // Save mem0 base + def mem0_base_slot = frame.mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, mem0_base_slot, mem_reg); + + // Step vfp_reg by change in local_base_sp from previous frame and save + if (vfp_delta != 0) masm.emit_addw_r_i(vfp_reg, vfp_delta); + def vfp_slot = frame.vfp_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, vfp_slot, vfp_reg); + + // Save PC + def pc_slot = 
frame.pc_slot.plus(frame_offset); + masm.emit_mov_m_i(pc_slot, spcFrame.pc); + + // Clear FrameAccessor + def accessor_slot = frame.accessor_slot.plus(frame_offset); + masm.emit_mov_m_l(accessor_slot, 0); + + // if an inlined whamm probe, also grab inlined slots + if (whamm_config.is_inlined) { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_instance_slot, inl_inst_reg); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_r(ValueKind.REF, inl_mem0_base_slot, inl_mem0_reg); + } else { + def inl_instance_slot = frame.inlined_instance_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_instance_slot, 0); + def inl_mem0_base_slot = frame.inlined_mem0_base_slot.plus(frame_offset); + masm.emit_mov_m_l(inl_mem0_base_slot, 0); + } + } + // Guards compiler code with frame reconstruction (if necessary). + def withReconstructedInlinedFrames(emit: void -> void) { + if (isInlined()) { + if (frames_reconstructed) { + // FIXME this should not happen (but does): + // - in the case of deep nesting when one layer is a Whamm probe + // - when refactoring to avoid `with` clause, GC test fails (inlining depth 2) + if (Trace.compiler) Trace.OUT.puts(" nested frame reconstruction inhibited\n"); + emit(); + return; + } + unrefRegs(); + frames_reconstructed = true; + if (Trace.compiler) Trace.OUT.puts("performing frame reconstruction\n"); + def space = emitReconstructStackFrames(snapshotFrames()); + emit(); + frames_reconstructed = false; + if (space > 0) { + masm.emit_addw_r_i(regs.sp, space); + masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); + if (fast) { + restoreCallerIVars(); + restoreDispatchTableReg(); + } + } + } else { + emit(); + } + } def unsupported() { success = false; // XXX: add opcode } @@ -2181,7 +2428,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // XXX: recompute VFP from VSP - #slots? 
masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); if (module.memories.length > 0) { - if (is_inlined) { + if (whamm_config.is_inlined) { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.inlined_mem0_base_slot); } else { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); @@ -2189,7 +2436,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } } def emit_load_instance(reg: Reg) { - if (is_inlined) { // inline compilation + if (whamm_config.is_inlined) { // inline compilation masm.emit_mov_r_m(ValueKind.REF, reg, frame.inlined_instance_slot); } else { masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot); @@ -2517,15 +2764,20 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (func != null) masm.pushInlineContext(func.func_index); def current = state.frame_stack.peek(); - if (current != null) current.pc = it.pc; + if (current != null) { + current.pc = it.pc; + current.ret_label = ret_label; + } state.frame_stack.push(frame); // Update cached copies from new top frame - it.reset(frame.func).at(frame.pc, -1); + if (frame.func != null) it.reset(frame.func).at(frame.pc, -1); module = frame.module; func = frame.func; - sig = func.sig; + sig = if(func != null, func.sig); num_locals = frame.num_locals; local_base_sp = frame.local_base_sp; + ctl_base_sp = frame.ctl_base_sp; + ret_label = frame.ret_label; } def popSpcFrame() -> SpcFrame { masm.popInlineContext(); @@ -2539,12 +2791,18 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl sig = func.sig; num_locals = current.num_locals; local_base_sp = current.local_base_sp; + ctl_base_sp = current.ctl_base_sp; + ret_label = current.ret_label; return frame; } def isInlined() -> bool { return state.frame_stack.top > 1; } + def needsEpilogue() -> bool { + // inlined callees will fallthrough and don't need epilogue to be emitted + return !isInlined() || ctl_base_sp == 0; + } def inlineDepth() 
-> int { return state.frame_stack.top - 1; } @@ -2553,10 +2811,41 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl for (i < state.frame_stack.top) { var f = state.frame_stack.elems[i]; var pc = if(i == state.frame_stack.top - 1, it.pc, f.pc); - frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc); + frames[i] = SpcFrame.new(f.func, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc, null); } return frames; } + // Determine if a regular function call should be inlined + def shouldInline(func: FuncDecl) -> bool { + if (Trace.compiler) OUT.put1("deciding on inlining call to func #%d: ", func.func_index); + + if (func.imp != null) return no("imported"); + if (inlineDepth() >= SpcTuning.maxInlineDepth) return no("max inline depth exceeded"); + if (func.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize) return no("func too large"); + if (func.sig.params.length > SpcTuning.maxInlineParams) return no("too many parameters"); + + // Scan bytecode for unsupported instructions + var bi = BytecodeIterator.new().reset(func); + while (bi.more()) { + match (bi.current()) { + RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF => + return no("uses return instruction"); + TRY, CATCH, THROW, RETHROW, THROW_REF, DELEGATE, CATCH_ALL, TRY_TABLE => + return no("uses exception handling instruction"); + CONT_NEW, CONT_BIND, SUSPEND, RESUME, RESUME_THROW, RESUME_THROW_REF, SWITCH => + return no("uses stack switching instruction"); + _ => ; + } + bi.next(); + } + + if (Trace.compiler) OUT.puts("YES\n"); + return true; + } + private def no(reason: string) -> bool { + if (Trace.compiler) OUT.puts("NO (").puts(reason).putc(')').ln(); + return false; + } } // Different branch instructions have different repush enum BrRepush(taken: bool, not_taken: bool) { @@ -2710,8 +2999,9 @@ class SpcFrame { var ctl_base_sp: u31; // Base index into SpcState.ctl_stack var num_locals: int; var pc: int; + var ret_label: 
MasmLabel; - new(func, module, local_base_sp, ctl_base_sp, num_locals, pc) {} + new(func, module, local_base_sp, ctl_base_sp, num_locals, pc, ret_label) {} } class SpcState(regAlloc: RegAlloc) { @@ -2728,7 +3018,7 @@ class SpcState(regAlloc: RegAlloc) { ctl_stack.clear(); // manually set up first control entry and return merge state var results = sig.results; - var ctl = pushControl(Opcode.RETURN.code, ValueTypes.NONE, results, ret_label); + var ctl = pushFuncBody(ValueTypes.NONE, results, ret_label); var merge_state = Array.new(results.length); for (i < results.length) { // request the merged values be stored to the stack, but don't require tags @@ -2760,6 +3050,9 @@ class SpcState(regAlloc: RegAlloc) { def pushBlock(params: Array, results: Array, end_label: MasmLabel) -> SpcControl { return pushControl(Opcode.BLOCK.code, params, results, end_label); } + def pushFuncBody(params: Array, results: Array, end_label: MasmLabel) -> SpcControl { + return pushControl(Opcode.RETURN.code, params, results, end_label); + } def pushLoop(params: Array, results: Array, start_label: MasmLabel) -> SpcControl { var ctl = pushControl(Opcode.LOOP.code, params, results, start_label); return ctl; @@ -3263,38 +3556,7 @@ class MoveNode { var dstNext: MoveNode; // next in a list of successors } -// checks function bytecode to see if it can be inlined based on -// simple heuristics: length <= maxInlineBytecodeSize and straightline code. -def funcCanInline(decl: FuncDecl) -> InlineConfig { - var default = InlineConfig(false, false, false); - if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; - var bi = BytecodeIterator.new().reset(decl); - var swap_instance = false; - var swap_membase = false; - while (bi.more()) { - var op = bi.current(); - match (op) { - // Cannot handle control flow yet. 
- IF, BR, BR_IF, BR_TABLE, BR_ON_NULL, BR_ON_NON_NULL, BR_ON_CAST, BR_ON_CAST_FAIL, RETURN => return default; - // These opcodes require swapping the instance. - THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, - ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; - // Load/store opcodes require either the memory base or the instance. - I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, - V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, - I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { - var memarg = bi.immptr().read_MemArg(); - if (memarg.memory_index == 0) swap_membase = true; - else swap_instance = true; - } - _ => ; - } - bi.next(); - } - return InlineConfig(swap_membase, swap_instance, true); -} - -type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); +type WhammInlineConfig(swap_membase: bool, swap_instance: bool, is_inlined: bool); // Used to record the entry point of exception/suspension handlers. Jumping to {stub_label} allows // control transfer to its corresponding handler without falling back to fast-int. @@ -3302,3 +3564,9 @@ type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); // The SPC emits a stub at {stub_label} for each handler in the function. The stub restores the // expected state of the environment, then jumps to {dest_label} to continue execution at handler. 
type SpcHandlerInfo(is_dummy: bool, func_end: bool, dest_label: MasmLabel, stub_label: MasmLabel, merge_state: Array); + +def uleb_size(v: int) -> int { + var n = 1, data = u32.view(v); + while (data >= 0x80) { data = data >> 7; n++; } + return n; +} diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index ff4ae0d13..c84ad7267 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -349,7 +349,7 @@ class V3Interpreter extends WasmStack { RETURN => { doReturn(frame.fp, frame.func.sig); } - CALL => { + CALL, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => { var func_index = codeptr.read_uleb32(); var f = frame.func.instance.functions[func_index]; return doCallFunction(f); @@ -1615,7 +1615,7 @@ class V3Interpreter extends WasmStack { // XXX: use read_opcode_and_skip() var opcode = codeptr.read_opcode_but_skip_probe(frame.func.decl); match (opcode) { - CALL, CALL_REF => { + CALL, CALL_REF, FAST_CALL0, FAST_CALL1, FAST_CALL2, FAST_CALL3, FAST_CALL4, FAST_CALL5, FAST_CALL6, FAST_CALL7, FAST_CALL8, FAST_CALL9, FAST_CALL10, FAST_CALL11, FAST_CALL12, FAST_CALL13, FAST_CALL14, FAST_CALL15, FAST_CALL16, FAST_CALL17, FAST_CALL18, FAST_CALL19, FAST_CALL20, FAST_CALL21, FAST_CALL22, FAST_CALL23, FAST_CALL24, FAST_CALL25, FAST_CALL26, FAST_CALL27, FAST_CALL28, FAST_CALL29, FAST_CALL30, FAST_CALL31, FAST_CALL32, FAST_CALL33, FAST_CALL34, FAST_CALL35, FAST_CALL36, FAST_CALL37, FAST_CALL38, FAST_CALL39 => { codeptr.skip_leb(); frame.pc = codeptr.pos; } diff --git 
a/src/engine/x86-64/Mmap.v3 b/src/engine/x86-64/Mmap.v3 index 5305ef0c6..25621ab12 100644 --- a/src/engine/x86-64/Mmap.v3 +++ b/src/engine/x86-64/Mmap.v3 @@ -18,6 +18,16 @@ component Mmap { RiGc.registerFinalizer(mapping, range.unmap); return mapping; } + def reserve32(size: u64, prot: int) -> Mapping { + var flags = LinuxConst.MAP_PRIVATE | LinuxConst.MAP_ANONYMOUS | 0x40; // 0x40 = MAP_32BIT + var r = Linux.syscall(LinuxConst.SYS_mmap, (Pointer.NULL, size, prot, flags, 0, 0)); + if (r.0 == -1) return null; + var start = Pointer.NULL + r.0, end = start + i64.view(size); + var range = MemoryRange.new(start, end); + var mapping = Mapping.new(range); + RiGc.registerFinalizer(mapping, range.unmap); + return mapping; + } def protect(start: Pointer, size: u64, prot: int) -> bool { var r = Linux.syscall(LinuxConst.SYS_mprotect, (start, size, prot)); return r.0 == 0; diff --git a/src/engine/x86-64/V3Offsets.v3 b/src/engine/x86-64/V3Offsets.v3 index 5de4559e9..2761ab15a 100644 --- a/src/engine/x86-64/V3Offsets.v3 +++ b/src/engine/x86-64/V3Offsets.v3 @@ -31,6 +31,7 @@ class V3Offsets { def FuncDecl_orig_bytecode = int.view(Pointer.atField(decl.orig_bytecode) - Pointer.atObject(decl)); def FuncDecl_sidetable = int.view(Pointer.atField(decl.sidetable.entries) - Pointer.atObject(decl)); def FuncDecl_target_code = int.view(Pointer.atField(decl.target_code.spc_entry) - Pointer.atObject(decl)); + def FuncDecl_fast_target_code = int.view(Pointer.atField(decl.fast_target_code.spc_entry) - Pointer.atObject(decl)); def FuncDecl_tierup_trigger = int.view(Pointer.atField(decl.tierup_trigger) - Pointer.atObject(decl)); def FuncDecl_entry_probed = int.view(Pointer.atField(decl.entry_probed) - Pointer.atObject(decl)); def FuncDecl_frame_var_tags = int.view(Pointer.atField(decl.frame_var_tags) - Pointer.atObject(decl)); diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 29307ca98..7b3e74e91 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 
+++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -536,6 +536,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var tmp = r_scratch; { // Entrypoint for calls coming from V3 ic.header.intV3EntryOffset = w.pos; + //masm.emit_debugger_breakpoint(); // Allocate and initialize interpreter stack frame from incoming V3 args. asm.q.sub_r_i(r_sp, k_frame_size); @@ -1244,7 +1245,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } def genLocals() { bindHandler(Opcode.DROP); + //masm.emit_debugger_breakpoint(); decrementVsp(); + //masm.emit_debugger_breakpoint(); endHandler(); bindHandler(Opcode.LOCAL_GET); @@ -1307,6 +1310,89 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movd_r_i(G(Target.V3_RET_GPRS[0]), 0); genPopFrameAndRet(); + // FAST_CALL + // TODO patch the dispatch table so it goes to the code directly, + // instead of this fast function lookup + bindHandler(Opcode.FAST_CALL0); + bindHandler(Opcode.FAST_CALL1); + bindHandler(Opcode.FAST_CALL2); + bindHandler(Opcode.FAST_CALL3); + bindHandler(Opcode.FAST_CALL4); + bindHandler(Opcode.FAST_CALL5); + bindHandler(Opcode.FAST_CALL6); + bindHandler(Opcode.FAST_CALL7); + bindHandler(Opcode.FAST_CALL8); + bindHandler(Opcode.FAST_CALL9); + bindHandler(Opcode.FAST_CALL10); + bindHandler(Opcode.FAST_CALL11); + bindHandler(Opcode.FAST_CALL12); + bindHandler(Opcode.FAST_CALL13); + bindHandler(Opcode.FAST_CALL14); + bindHandler(Opcode.FAST_CALL15); + bindHandler(Opcode.FAST_CALL16); + bindHandler(Opcode.FAST_CALL17); + bindHandler(Opcode.FAST_CALL18); + bindHandler(Opcode.FAST_CALL19); + bindHandler(Opcode.FAST_CALL20); + bindHandler(Opcode.FAST_CALL21); + bindHandler(Opcode.FAST_CALL22); + bindHandler(Opcode.FAST_CALL23); + bindHandler(Opcode.FAST_CALL24); + bindHandler(Opcode.FAST_CALL25); + bindHandler(Opcode.FAST_CALL26); + bindHandler(Opcode.FAST_CALL27); + bindHandler(Opcode.FAST_CALL28); + bindHandler(Opcode.FAST_CALL29); + 
bindHandler(Opcode.FAST_CALL30); + bindHandler(Opcode.FAST_CALL31); + bindHandler(Opcode.FAST_CALL32); + bindHandler(Opcode.FAST_CALL33); + bindHandler(Opcode.FAST_CALL34); + bindHandler(Opcode.FAST_CALL35); + bindHandler(Opcode.FAST_CALL36); + bindHandler(Opcode.FAST_CALL37); + bindHandler(Opcode.FAST_CALL38); + bindHandler(Opcode.FAST_CALL39); + masm.emit_intentional_crash(); + //masm.emit_debugger_breakpoint(); + var dispatchLabel = X86_64Label.new(); + // genTagPush(BpTypeCode.I32.code); + // asm.movq_m_i(vsph[0].value, 770); + // incrementVsp(); + + /* TODO What should happen in a FAST_CALL? + * + * Ideally, we've patched the dispatch table with exactly what appears in fast_target_code + * so it instantly jumps there and so we don't have to set up the jump first. + * + * Fast function implementation should include code to skip the original operand as + * part of incrementing pc (will be done over in SPC). + * + * But, we could keep this for a quasi-fast call? + */ + + genReadUleb32(r_tmp1); + asm.movq_r_m(r_tmp0, r_instance.plus(offsets.Instance_functions)); + asm.movq_r_m(func_arg, r_tmp0.plusR(r_tmp1, offsets.REF_SIZE, offsets.Array_contents)); + + var tmp = r_tmp2; + asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl)); + + //masm.emit_debugger_breakpoint(); + asm.ijmp_m(tmp.plus(offsets.FuncDecl_fast_target_code)); + //asm.icall_m(tmp.plus(offsets.FuncDecl_fast_target_code)); + //asm.invalid(); + + // don't go here + asm.bind(dispatchLabel); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + masm.emit_nop(); + endHandler(); + bindHandler(Opcode.CALL); computeCurIpForTrap(-1); genReadUleb32(r_tmp1); @@ -1326,6 +1412,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { var tmp = r_tmp2; asm.movq_r_m(tmp, func_arg.plus(offsets.WasmFunction_decl)); asm.icall_m(tmp.plus(offsets.FuncDecl_target_code)); + // assembly call to target function + // if not compiled, interpreter's entry point 
} else { asm.call_rel_far(callReentryLabel); } @@ -2700,7 +2788,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { if (FastIntTuning.enableWhammProbeTrampoline) { var pos = w.atEnd().pos; writeDispatchEntry(dispatchTables[0].1, InternalOpcode.BREAK_PROBE.code, pos); - masm.emit_debugger_breakpoint(); + //masm.emit_debugger_breakpoint(); // Compute a pointer to the original code at this pc offset var pc = r_tmp1; // = IP - CODE asm.movq_r_r(pc, r_ip); @@ -4000,6 +4088,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { // Generate a dispatch from the main dispatch table. def genDispatch() { genDispatch0(ip_ptr, if (FeatureDisable.globalProbes, dispatchTables[0].1), true); + //masm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, dispatchTables[0].1, true, ic); } // Generate a load of the next bytecode and a dispatch through the dispatch table. def genDispatch0(ptr: X86_64Addr, table: IcCodeRef, increment: bool) { diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 20a76af68..8e50d5668 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -1592,6 +1592,49 @@ class X86_64MacroAssembler extends MacroAssembler { asm.pextrq_r_s_i(G(to), X(from), 1); } + // xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + // r_ip rax + // ip_ptr + // r_dispatch r14 + // r_tmp0 rcx + // r_tmp1 rdx + def emit_int_dispatch(opcode: X86_64Gpr, base: X86_64Gpr, r_ip: X86_64Gpr, r_dispatch: X86_64Gpr, + ptr: X86_64Addr, table: IcCodeRef, increment: bool, ic: X86_64InterpreterCode) { + if (ptr != null) asm.movbzx_r_m(opcode, ptr); + if (increment) asm.inc_r(r_ip); + match (FastIntTuning.dispatchEntrySize) { + 2 => { + if (table == null) asm.movq_r_r(base, r_dispatch); + else asm.lea(base, table); // RIP-relative LEA + asm.movwsx_r_m(opcode, base.plusR(opcode, 2, 0)); // load 16-bit offset + asm.add_r_r(base, opcode); + //if 
(dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 4 => { + if (table == null) { + asm.movd_r_m(base, r_dispatch.plusR(opcode, 4, 0)); + } else { + var addr = ic.start + table.offset; + asm.movd_r_m(base, X86_64Addr.new(null, opcode, 4, int.!(addr - Pointer.NULL))); + } + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_r(base); + } + 8 => { + if (table == null) { + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(r_dispatch.plusR(opcode, 8, 0)); + } else { + var addr = ic.start + table.offset; + //if (dispatchJmpOffset < 0) dispatchJmpOffset = w.pos; + asm.ijmp_m(X86_64Addr.new(null, opcode, 8, int.!(addr - Pointer.NULL))); + } + } + } + + } + // Reads a 32- or 64-bit unsigned LEB from {rw_ptr} into {w_dest}. def emit_read_uleb(w_dest: X86_64Gpr, rw_ptr: X86_64Gpr, w_scratch1: X86_64Gpr, w_scratch2: X86_64Gpr) -> this { // TODO: handle w_dest = rcx diff --git a/src/engine/x86-64/X86_64MasmRegs.v3 b/src/engine/x86-64/X86_64MasmRegs.v3 index 0668c35a5..73af95b67 100644 --- a/src/engine/x86-64/X86_64MasmRegs.v3 +++ b/src/engine/x86-64/X86_64MasmRegs.v3 @@ -88,33 +88,34 @@ component X86_64MasmRegs { return config; })(); - // Build both the SPC and INT execution environments together. - private def t = (fun -> (SpcExecEnv, IntExecEnv) { + // Build the SPC, fast-SPC, and INT execution environments together. 
+ private def t = (fun -> (SpcExecEnv, SpcExecEnv, IntExecEnv) { var xspc = SpcExecEnv.new(); + var xfast = SpcExecEnv.new(); var xint = IntExecEnv.new(); - xint.sp = xspc.sp = RSP; - xint.func_arg = xspc.func_arg = RDX; - xint.vsp = xspc.vsp = RSI; - xint.vfp = xspc.vfp = R11; - xint.mem0_base = xspc.mem0_base = R10; - xint.instance = xspc.instance = RDI; - xint.runtime_arg0 = xspc.runtime_arg0 = RSI; - xint.runtime_arg1 = xspc.runtime_arg1 = RDX; - xint.runtime_arg2 = xspc.runtime_arg2 = RCX; - xint.runtime_arg3 = xspc.runtime_arg3 = R8; - xint.runtime_arg4 = xspc.runtime_arg4 = R9; - xint.ret_throw = xspc.ret_throw = RAX; - xint.runtime_ret0 = xspc.runtime_ret0 = RAX; - xint.runtime_ret1 = xspc.runtime_ret1 = RDX; - xint.scratch = xspc.scratch = RBP; + xint.sp = xspc.sp = xfast.sp = RSP; + xint.func_arg = xspc.func_arg = xfast.func_arg = RDX; // cache of frame (callee-restore) + xint.vsp = xspc.vsp = xfast.vsp = RSI; + xint.vfp = xspc.vfp = xfast.vfp = R11; + xint.mem0_base = xspc.mem0_base = xfast.mem0_base = R10; // cache of frame (callee-restore) + xint.instance = xspc.instance = xfast.instance = RDI; // cache of frame (callee-restore) + xint.runtime_arg0 = xspc.runtime_arg0 = xfast.runtime_arg0 = RSI; + xint.runtime_arg1 = xspc.runtime_arg1 = xfast.runtime_arg1 = RDX; + xint.runtime_arg2 = xspc.runtime_arg2 = xfast.runtime_arg2 = RCX; + xint.runtime_arg3 = xspc.runtime_arg3 = xfast.runtime_arg3 = R8; + xint.runtime_arg4 = xspc.runtime_arg4 = xfast.runtime_arg4 = R9; + xint.ret_throw = xspc.ret_throw = xfast.ret_throw = RAX; + xint.runtime_ret0 = xspc.runtime_ret0 = xfast.runtime_ret0 = RAX; + xint.runtime_ret1 = xspc.runtime_ret1 = xfast.runtime_ret1 = RDX; + xint.scratch = xspc.scratch = xfast.scratch = RBP; xint.curpc = R15; xint.stp = RBX; xint.ip = RAX; xint.func_decl = R12; xint.eip = R13; - xint.dispatch = R14; + xint.dispatch = R14; // cache of field (see how it is saved/stored in interpreter) xint.xmm0 = XMM0; xint.xmm1 = XMM1; xint.xmm2 = XMM2; @@ 
-127,29 +128,32 @@ component X86_64MasmRegs { def m = MasmAddr(xspc.sp, _); - xint.accessor_slot = xspc.accessor_slot = m(X86_64InterpreterFrame.accessor.offset); - xint.instance_slot = xspc.instance_slot = m(X86_64InterpreterFrame.instance.offset); - xint.mem0_base_slot = xspc.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset); - xint.pc_slot = xspc.pc_slot = m(X86_64InterpreterFrame.curpc.offset); - xint.vfp_slot = xspc.vfp_slot = m(X86_64InterpreterFrame.vfp.offset); - xint.vsp_slot = xspc.vsp_slot = m(X86_64InterpreterFrame.vsp.offset); - xint.wasm_func_slot = xspc.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset); - xint.ip_slot = xspc.inlined_mem0_base_slot = m(X86_64InterpreterFrame.ip.offset); - xint.stp_slot = xspc.inlined_instance_slot = m(X86_64InterpreterFrame.stp.offset); + xint.accessor_slot = xspc.accessor_slot = xfast.accessor_slot = m(X86_64InterpreterFrame.accessor.offset); + xint.instance_slot = xspc.instance_slot = xfast.instance_slot = m(X86_64InterpreterFrame.instance.offset); + xint.mem0_base_slot = xspc.mem0_base_slot = xfast.mem0_base_slot = m(X86_64InterpreterFrame.mem0_base.offset); + xint.pc_slot = xspc.pc_slot = xfast.pc_slot = m(X86_64InterpreterFrame.curpc.offset); + xint.vfp_slot = xspc.vfp_slot = xfast.vfp_slot = m(X86_64InterpreterFrame.vfp.offset); + xint.vsp_slot = xspc.vsp_slot = xfast.vsp_slot = m(X86_64InterpreterFrame.vsp.offset); + xint.wasm_func_slot = xspc.wasm_func_slot = xfast.wasm_func_slot = m(X86_64InterpreterFrame.wasm_func.offset); + xint.ip_slot = xspc.inlined_mem0_base_slot = xfast.inlined_mem0_base_slot = m(X86_64InterpreterFrame.ip.offset); + xint.stp_slot = xspc.inlined_instance_slot = xfast.inlined_instance_slot = m(X86_64InterpreterFrame.stp.offset); xint.func_decl_slot = m(X86_64InterpreterFrame.func_decl.offset); xint.code_slot = m(X86_64InterpreterFrame.code.offset); xint.eip_slot = m(X86_64InterpreterFrame.eip.offset); xint.frameSize = xspc.frameSize = X86_64InterpreterFrame.size; + 
xfast.frameSize = 0; - return (xspc, xint); + return (xspc, xfast, xint); })(); // The execution environment for single-pass compilation contexts. def SPC_EXEC_ENV = t.0; + // The execution environment for fast single-pass compilation contexts. + def FAST_SPC_EXEC_ENV = t.1; // The execution environment for interpreter compilation contexts. - def INT_EXEC_ENV = t.1; + def INT_EXEC_ENV = t.2; // A register allocator for single-pass compilation contexts. def SPC_ALLOC = (fun -> RegAlloc { @@ -163,7 +167,8 @@ component X86_64MasmRegs { // A register allocator for interpreter contexts. def INT_ALLOC = (fun -> RegAlloc { var pools = [ - RegPool32.new([RCX, RDX, R8, R9]), + RegPool32.new([RCX, RDX, R8, R9]), // could use callee-restore (but put at end) + // if callee-restore registers are used, have to emit a restore at the end RegPool32.new([XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14]) ]; return RegAlloc.new(CONFIG.poolMap, pools, null); @@ -189,3 +194,4 @@ component X86_64MasmRegs { return reg; } } + diff --git a/src/engine/x86-64/X86_64PreGenStubs.v3 b/src/engine/x86-64/X86_64PreGenStubs.v3 index f8a780792..d836db938 100644 --- a/src/engine/x86-64/X86_64PreGenStubs.v3 +++ b/src/engine/x86-64/X86_64PreGenStubs.v3 @@ -25,7 +25,7 @@ layout X86_64PreGenHeader { +24 intV3EntryOffset: i32; // entry into interpreter from V3 caller +28 intSpcEntryOffset: i32; // entry into interpreter from SPC caller +32 intIntEntryOffset: i32; // entry into interpreter from interpreter caller - +36 intSuspendEntryOffset: i32; // entry into interpreter from a suspended child stack + +36 intSuspendEntryOffset: i32; // entry into interpreter from a suspended child stack +40 deoptReentryOffset: i32; // re-enter interpreter from optimized code +44 oobMemoryHandlerOffset: i32; // handler for signals caused by OOB memory access +48 divZeroHandlerOffset: i32; // handler for signals caused by divide by zero @@ -222,8 +222,8 @@ component X86_64PreGenStubs { 
ic.header.probedDispatchTableOffset, ic.header.fastDispatchTableOffset); - // Write-protect the executable code for security and debugging - Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_EXEC); + // XXX: PROT_WRITE included to allow runtime dispatch table patching + Mmap.protect(range.start + ic.header.codeStart, u64.!(ic.header.codeEnd - ic.header.codeStart), Mmap.PROT_READ | Mmap.PROT_WRITE | Mmap.PROT_EXEC); // The host call stub is part of interpreter code (TODO: does it need to be?) hostCallStub.start = ic.start + ic.header.hostCallStubOffset; diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 8e5f0e370..3bdbbf6e2 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -23,16 +23,102 @@ def KIND_F64 = SpcConsts.KIND_F64; def KIND_V128 = SpcConsts.KIND_V128; def KIND_REF = SpcConsts.KIND_REF; +def xenv: IntExecEnv = X86_64MasmRegs.INT_EXEC_ENV; + +def r_mem0_base = G(xenv.mem0_base); +def r_vfp = G(xenv.vfp); +def r_vsp = G(xenv.vsp); +def r_stp = G(xenv.stp); +def r_ip = G(xenv.ip); +def r_eip = G(xenv.eip); +def r_func_decl = G(xenv.func_decl); +def r_instance = G(xenv.instance); +def r_curpc = G(xenv.curpc); +def ip_ptr = r_ip.plus(0); +def r_dispatch = G(xenv.dispatch); +def r_tmp0 = G(xenv.tmp0); // RCX +def r_tmp1 = G(xenv.tmp1); // RDX + +def m_mem0_base = R.RSP.plus(X86_64InterpreterFrame.mem0_base.offset); +def m_vfp = R.RSP.plus(X86_64InterpreterFrame.vfp.offset); +def m_vsp = R.RSP.plus(X86_64InterpreterFrame.vsp.offset); +def m_stp = R.RSP.plus(X86_64InterpreterFrame.stp.offset); +def m_ip = R.RSP.plus(X86_64InterpreterFrame.ip.offset); +def m_eip = R.RSP.plus(X86_64InterpreterFrame.eip.offset); +def m_func_decl = R.RSP.plus(X86_64InterpreterFrame.func_decl.offset); +def m_instance = R.RSP.plus(X86_64InterpreterFrame.instance.offset); +def m_curpc = 
R.RSP.plus(X86_64InterpreterFrame.curpc.offset); + +def ivar_MEM0_BASE = (r_mem0_base, m_mem0_base); +def ivar_VFP = (r_vfp, m_vfp); +def ivar_VSP = (r_vsp, m_vsp); +def ivar_STP = (r_stp, m_stp); +def ivar_IP = (r_ip, m_ip); +def ivar_EIP = (r_eip, m_eip); +def ivar_FUNC_DECL = (r_func_decl, m_func_decl); +def ivar_INSTANCE = (r_instance, m_instance); +def ivar_CURPC = (r_curpc, m_curpc); + +def all_ivars = [ + ivar_MEM0_BASE, + ivar_VFP, + ivar_VSP, + ivar_STP, + ivar_IP, + ivar_EIP, + ivar_FUNC_DECL, + ivar_INSTANCE, + ivar_CURPC +]; + // Implements the target-specific parts of the single-pass compiler for X86-64. class X86_64SinglePassCompiler extends SinglePassCompiler { def w = DataWriter.new(); def mmasm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); def asm = mmasm.asm; + var ic: X86_64InterpreterCode; - new(extensions: Extension.set, limits: Limits, config: RegConfig) - super(X86_64MasmRegs.SPC_EXEC_ENV, mmasm, X86_64MasmRegs.SPC_ALLOC.copy(), extensions, limits) { + new(ic, extensions: Extension.set, limits: Limits, config: RegConfig, fast: bool) + super(if(fast, X86_64MasmRegs.FAST_SPC_EXEC_ENV, X86_64MasmRegs.SPC_EXEC_ENV), mmasm, + if(fast, X86_64MasmRegs.INT_ALLOC.copy(), X86_64MasmRegs.SPC_ALLOC.copy()), + extensions, limits, fast) { mmasm.trap_stubs = TRAPS_STUB; } + def emitFastDispatch() { + mmasm.emit_int_dispatch(r_tmp0, r_tmp1, r_ip, r_dispatch, ip_ptr, + if(ic != null, IcCodeRef.new(ic.header.fastDispatchTableOffset)), true, ic); + } + private def saveIVar(r: X86_64Gpr) { + for (t in all_ivars) { + if (t.0 == r) asm.movq_m_r(t.1, r); + } + } + def saveCallerIVars() { + saveIVar(r_ip); + saveIVar(r_stp); + if (!FeatureDisable.stacktraces) saveIVar(r_curpc); + } + def restoreDispatchTableReg() { + if (!FeatureDisable.globalProbes) { + // restore dispatch table from Interpreter.dispatchTable + def offsets = masm.getOffsets(); + asm.movq_r_m(r_dispatch, mmasm.absPointer(offsets.Interpreter_dispatchTable)); + } + } + private def 
restoreReg(r: X86_64Gpr) { + for (t in all_ivars) { + if (t.0 == r) asm.movq_r_m(r, t.1); + } + } + def restoreCallerIVars() { + restoreReg(r_ip); + restoreReg(r_stp); + restoreReg(r_eip); + restoreReg(r_instance); + restoreReg(r_func_decl); + restoreReg(r_mem0_base); + restoreReg(r_vfp); + } private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); @@ -1256,7 +1342,35 @@ class X86_64SpcModuleCode extends X86_64SpcCode { } // Reconstructs inlined interpreter frames for an inlined hardware trap context. // Returns the new rsp to write into the ucontext (top of stack). - private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List) -> Pointer; + private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List) -> Pointer { + def frames: Array = Lists.toArray(inline_ctx); + def outer = frames[frames.length - 1]; + def inlined = frames[0 ... (frames.length - 1)]; + def count = inlined.length; + + // set outermost pc in the real frame + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(outer.pc); + + // Read instance from the real outer frame (shared across all inlined frames) + var instance = (r_rsp + X86_64InterpreterFrame.instance.offset).load(); + + // Push inlined frames + for (i = count - 1; i >= 0; i--) { + var fid = inlined[i].func_index; + var pc = inlined[i].pc; + + r_rsp += -8; + r_rsp.store(INLINED_FRAME_STUB.start); + + r_rsp += -X86_64InterpreterFrame.size; // move rsp? + // write func, pc, frame accessor + var wasm_func = WasmFunction.!(instance.functions[fid]); + (r_rsp + X86_64InterpreterFrame.wasm_func.offset).store(wasm_func); + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(pc); + (r_rsp + X86_64InterpreterFrame.accessor.offset).store(null); + } + return r_rsp; + } // Look up the source {pc} of a location {i} in this code. Returns {-1} if no exact entry is found. // Return addresses are treated differently than other addresses in the code. 
def lookupPc(ip: Pointer, isRetAddr: bool) -> List { @@ -1487,6 +1601,7 @@ def codePointer(f: P -> R) -> Pointer { // Global functionality associated with the single-pass compiler for X86-64. component X86_64Spc { + var ic: X86_64InterpreterCode; // A handy chokepoint for entering JIT code from V3. def invoke(wf: WasmFunction, sp: Pointer) -> Throwable { return V3_SPC_ENTRY_FUNC.get()(wf, sp, wf.decl.target_code.spc_entry); @@ -1507,7 +1622,7 @@ component X86_64Spc { return addr; } def estimateCodeSizeFor(decl: FuncDecl) -> int { - return 60 + decl.orig_bytecode.length * 20; // TODO: huge overestimate + return 60 + decl.orig_bytecode.length * 20 * (2 << byte.view(SpcTuning.maxInlineDepth)); // TODO: huge overestimate } private def lazyCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. diff --git a/src/engine/x86-64/X86_64Target.v3 b/src/engine/x86-64/X86_64Target.v3 index 015db9508..02ddf51d3 100644 --- a/src/engine/x86-64/X86_64Target.v3 +++ b/src/engine/x86-64/X86_64Target.v3 @@ -65,6 +65,46 @@ component Target { f.target_code = TargetCode(addr); Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); } + def setFastTargetCode(f: FuncDecl, addr: Pointer, end: Pointer) { + if (Trace.compiler) { + Trace.OUT.put2("func[%d].fast_target_code: break *0x%x", f.func_index, addr - Pointer.NULL) + .put2(" disass 0x%x, 0x%x", addr - Pointer.NULL, end - Pointer.NULL).ln(); + if (Trace.asm) { + var cur_byte = addr; + Trace.OUT.puts("JIT code: "); + while (cur_byte < end) { + Trace.OUT.put1("%x ", cur_byte.load()); + cur_byte++; + } + Trace.OUT.ln(); + } + } + f.fast_target_code = TargetCode(addr); + patchFastCallDispatch(f, addr); + Debug.afterCompile(f, u64.view(addr - Pointer.NULL)); + } + def patchFastCallDispatch(f: FuncDecl, addr: Pointer) { + if (f.fast_call_idx < 0) return; + def opcode = Opcodes.indexToFastCall(f.fast_call_idx); + def ic = X86_64PreGenStubs.getInterpreterCode(); + // XXX Patch 
only fast dispatch tables + def fast_offset = ic.header.fastDispatchTableOffset; + def entry = ic.start + fast_offset + opcode.code * FastIntTuning.dispatchEntrySize; + if (Trace.compiler) { + Trace.OUT.puts("patching dispatch type\n"); + Trace.OUT.put1("start 0x%x\n", u64.view(ic.start)); + Trace.OUT.put1("entry 0x%x\n", u64.view(entry)); + Trace.OUT.put1("addr 0x%x\n", u64.view(addr)); + } + // XXX we require 8 entry size because of `addr` position + match (FastIntTuning.dispatchEntrySize) { + 4 => entry.store(u32.view(addr)); + 8 => entry.store(long.view(addr)); + // 2-byte relative case would need a relative offset + } + if (Trace.compiler) Trace.OUT.puts("patched successfully\n"); + } + def pregenIntoFile(filename: string) -> ErrorBuilder { var data = System.fileLoad(filename); var err = ErrorBuilder.new().puts("interpreter generator: "); @@ -188,6 +228,7 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { } // Compilation methods called directly by stubs. def lazyCompile(wf: WasmFunction) -> SpcResultForStub; + def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub; def tierupCompile(wf: WasmFunction) -> SpcResultForStub; // Tiering may require setting up the whole module. def onTestModule(module: Module) { @@ -196,6 +237,20 @@ class X86_64ExecutionStrategy extends ExecutionStrategy { def disableLazyNameDecodingDuringGC(module: Module) { if (module.names != null) module.names.lazyDecodeDisabled = RiGc.inGC; } + + def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) { + // ensure entrypoint and lazy compile stubs are generated + X86_64PreGenStubs.gen(); + // Set all functions to refer to the tier-up compile stub. 
+ var codeSize = MINIMUM_CODE_SIZE; + for (i < module.functions.length) { + var f = module.functions[i]; + if (f.imported()) continue; + set(module, f); + codeSize += X86_64Spc.estimateCodeSizeFor(f); + } + allocateCodeForModule(module, codeSize); + } } // One tier: fast-int, modules require no pre-processing. @@ -206,10 +261,18 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { def onModuleFinish(module: Module, size: u32, err: ErrorGen) { disableLazyNameDecodingDuringGC(module); + fastCompileEntireModule(module, size, false, err, 1024); } def onFuncValidationFinish(module: Module, func: FuncDecl, err: ErrorGen) { if (err != null && !err.ok()) return; Target.setUnconditionalInterpreterEntryIfMultiTier(func); + + for (i < module.exports.length) { + def ex = module.exports[i]; + if (ex.1 == func && Strings.startsWith(ex.0, "fast:")) { + System.puts(Strings.format1("fast function %s\n", ex.0)); + } + } } def onNewFunction(wf: WasmFunction, err: ErrorGen) { Target.setUnconditionalInterpreterEntryIfMultiTier(wf.decl); @@ -219,6 +282,100 @@ class X86_64InterpreterOnlyStrategy extends X86_64ExecutionStrategy { if (FastIntTuning.enableWhammProbeTrampoline && WhammProbe.?(p)) X86_64WhammTrampoline.makeTrampoline(WhammProbe.!(p), X86_64PreGenStubs.getInterpreterCode()); } + + // TODO avoid duplicated function here + def fastCompile(wf: WasmFunction, ic: X86_64InterpreterCode) -> SpcResultForStub { + var module = wf.instance.module; + var code = module.target_module.spc_code; + var compiler = newCompiler(module.filename, true, null); + var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; + + // generate code for the function + var success = compiler.gen(module, wf.decl, null); + + // Check for remaining code space + var regionSize = code.mapping.range.size(); + var remaining = regionSize - u64.!(code.codeEnd); + var codeSize = w.atEnd().pos; + if (codeSize > remaining) { + if (Trace.compiler) Trace.OUT.put3("exhausted code space for module 
(%d of %d bytes remaining, need %d)", + remaining, regionSize, codeSize).ln(); + success = false; + } + + var entrypoint: Pointer; + if (success) { + // Copy code into end of region + entrypoint = code.appendCode(masm); + Target.setFastTargetCode(wf.decl, entrypoint, entrypoint + codeSize); + } else { + // Failed, enter interpreter + var f = wf.decl; + if (Trace.compiler) Trace.OUT.put1("func[%d] FAST compile failed", f.func_index).ln(); + entrypoint = X86_64Spc.setInterpreterFallback(f); + } + return SpcResultForStub(wf, entrypoint, null); + } + def fastCompileEntireModule(module: Module, size: u32, interpreter_fallback: bool, err: ErrorGen, ballast: u32) { + // ensure entrypoint and lazy compile stubs are generated + X86_64PreGenStubs.gen(); + + var compiler = newCompiler(module.filename, true, null); + var w = compiler.w; + + // generate code for all functions + var bounds = Array<(int, int)>.new(module.functions.length); + var suberr = if(!interpreter_fallback, err); + for (i = 0; err.ok() && i < module.functions.length; i++) { + var f = module.functions[i]; + if (f.imported()) continue; + for (j < module.exports.length) { + def ex = module.exports[j]; + if (ex.1 == f && Strings.startsWith(ex.0, "fast:")) { + var start = w.atEnd().pos; + var compiled = compiler.gen(module, f, suberr); + if (compiled) bounds[i] = (start, w.end()); + else bounds[i] = (-1, -1); + } + } + } + + // copy and map code (reserve32 ensures address fits in 32 bits for dispatch table patching) + var length = u64.view(w.atEnd().pos) + ballast; + var mapping = Mmap.reserve32(length, Mmap.PROT_WRITE), range = mapping.range; // TODO: handle failure + var masm = X86_64MacroAssembler.!(compiler.masm); + masm.setTargetAddress(u64.view(range.start - Pointer.NULL)); + Target.copyInto(mapping.range, 0, w); + // TODO: for security, move embedded references out of the code region and make it non-writable + Mmap.protect(range.start, u64.!(range.end - range.start), Mmap.PROT_WRITE | Mmap.PROT_READ | 
Mmap.PROT_EXEC); + for (i < bounds.length) { + var b = bounds[i]; + if (b.0 >= 0) { + var addr = mapping.range.start; + var f = module.functions[i]; + Target.setFastTargetCode(f, addr + b.0, addr + b.1); + } else { + var f = module.functions[i]; + if (Trace.compiler) Trace.OUT.put1("func[%d] initial compile failed", f.func_index).ln(); + X86_64Spc.setInterpreterFallback(f); + } + } + // XXX: reduce duplication with {X86_64SpcModuleCode.appendCode}. + var code = X86_64SpcModuleCode.new(mapping); + if (masm.source_locs != null) { + code.sourcePcs = Vector.new(); + code.sourcePcs.putv(masm.source_locs); + } + if (masm.embeddedRefOffsets != null) { + if (code.embeddedRefOffsets == null) code.embeddedRefOffsets = Vector.new(); + code.embeddedRefOffsets.putv(masm.embeddedRefOffsets); + } + + module.target_module = TargetModule(code); + RiRuntime.registerUserCode(code); + module.target_module.spc_code.keepAlive(); + Debug.afterCompileModule(module); + } } // Base class of all strategies that use SPC. @@ -242,7 +399,7 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy { var module = wf.instance.module; var code = module.target_module.spc_code; - var compiler = newCompiler(module.filename); // XXX: cache per-thread + var compiler = newCompiler(module.filename, false, null); // XXX: cache per-thread var masm = X86_64MacroAssembler.!(compiler.masm), w = masm.asm.w; // generate code for the function @@ -271,19 +428,6 @@ class X86_64SpcStrategy extends X86_64ExecutionStrategy { } return SpcResultForStub(wf, entrypoint, null); } - def installStubForModule(module: Module, set: (Module, FuncDecl) -> void) { - // ensure entrypoint and lazy compile stubs are generated - X86_64PreGenStubs.gen(); - // Set all functions to refer to the tier-up compile stub. 
- var codeSize = MINIMUM_CODE_SIZE; - for (i < module.functions.length) { - var f = module.functions[i]; - if (f.imported()) continue; - set(module, f); - codeSize += X86_64Spc.estimateCodeSizeFor(f); - } - allocateCodeForModule(module, codeSize); - } } // One tier: SPC, modules are eagerly compiled. @@ -319,7 +463,7 @@ class X86_64SpcAotStrategy(interpreter_fallback: bool) extends X86_64SpcStrategy // ensure entrypoint and lazy compile stubs are generated X86_64PreGenStubs.gen(); - var compiler = newCompiler(module.filename); + var compiler = newCompiler(module.filename, false, null); var w = compiler.w; // generate code for all functions @@ -412,7 +556,7 @@ class X86_64DynamicStrategy extends X86_64SpcStrategy { } def onTierUp(wf: WasmFunction, pc: int) -> TargetOsrInfo { var module = wf.instance.module; - var compiler = newCompiler(module.filename); + var compiler = newCompiler(module.filename, false, null); if (!applyJitFilter(wf.instance.module, wf.decl, "osr")) { // OSR compile suppressed wf.decl.tierup_trigger = int.max; // no point in trying for a while @@ -441,10 +585,10 @@ class X86_64DynamicStrategy extends X86_64SpcStrategy { } } -def newCompiler(filename: string) -> X86_64SinglePassCompiler { +def newCompiler(filename: string, fast: bool, ic: X86_64InterpreterCode) -> X86_64SinglePassCompiler { var extensions = Extension.set.all; // TODO: all extensions enabled for compilation var limits = Limits.new(); - var compiler = X86_64SinglePassCompiler.new(extensions, limits, X86_64MasmRegs.CONFIG); + var compiler = X86_64SinglePassCompiler.new(ic, extensions, limits, X86_64MasmRegs.CONFIG, fast); return compiler; } def MINIMUM_CODE_SIZE = PAGE_SIZE_i; diff --git a/src/util/BytecodeVisitor.v3 b/src/util/BytecodeVisitor.v3 index fbef4b056..cacd6c738 100644 --- a/src/util/BytecodeVisitor.v3 +++ b/src/util/BytecodeVisitor.v3 @@ -20,9 +20,9 @@ class BytecodeVisitor { def visitMisc(op: Opcode) { visitOp(op); } def visitControl(op: Opcode) { visitOp(op); } def 
visitCall(op: Opcode) { visitOp(op); } - def visitCallDirect(op: Opcode, func_index: u31, tailCall: bool) { visitCall(op); } - def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, tailCall: bool) { visitCall(op); } - def visitCallRef(op: Opcode, sig_index: u31, tailCall: bool) { visitCall(op); } + def visitCallDirect(op: Opcode, func_index: u31, prop: CallProperty) { visitCall(op); } + def visitCallIndirect(op: Opcode, sig_index: u31, table_index: u31, prop: CallProperty) { visitCall(op); } + def visitCallRef(op: Opcode, sig_index: u31, prop: CallProperty) { visitCall(op); } def visitLocal(op: Opcode, local_index: u31) { visitOp(op); } def visitGlobal(op: Opcode, local_index: u31) { visitOp(op); } def visitTable(op: Opcode, table_index: u31) { visitOp(op); } @@ -69,12 +69,13 @@ class BytecodeVisitor { def visit_BR_IF (depth: u31) { visitControl(Opcode.BR_IF); } def visit_BR_TABLE (labels: Range) { visitControl(Opcode.BR_TABLE); } def visit_RETURN () { visitControl(Opcode.RETURN); } - def visit_CALL (func_index: u31) { visitCallDirect(Opcode.CALL, func_index, false); } - def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, false); } - def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, true); } - def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, true); } - def visit_CALL_REF (sig_index: u31) { visitCallRef(Opcode.CALL_REF, sig_index, false); } - def visit_RETURN_CALL_REF(sig_index: u31) { visitCallRef(Opcode.RETURN_CALL_REF, sig_index, true); } + def visit_CALL (func_index: u31) { visitCallDirect(Opcode.CALL, func_index, SLOW); } + def visit_FAST_CALL (fast_index: int, func_index: u31) { visitCallDirect(Opcodes.indexToFastCall(fast_index), func_index, FAST); } + def visit_CALL_INDIRECT (sig_index: u31, table_index: u31) { 
visitCallIndirect(Opcode.CALL_INDIRECT, sig_index, table_index, SLOW); } + def visit_RETURN_CALL (func_index: u31) { visitCallDirect(Opcode.RETURN_CALL, func_index, TAIL); } + def visit_RETURN_CALL_INDIRECT(sig_index: u31, table_index: u31) { visitCallIndirect(Opcode.RETURN_CALL_INDIRECT, sig_index, table_index, TAIL); } + def visit_CALL_REF (sig_index: u31) { visitCallRef(Opcode.CALL_REF, sig_index, SLOW); } + def visit_RETURN_CALL_REF(sig_index: u31) { visitCallRef(Opcode.RETURN_CALL_REF, sig_index, TAIL); } def visit_DELEGATE (depth: u31) { visitControl(Opcode.DELEGATE); } def visit_CATCH_ALL () { visitControl(Opcode.CATCH_ALL); } def visit_DROP () { visitMisc(Opcode.DROP); } @@ -653,3 +654,11 @@ class BytecodeVisitor { def visit_SUSPEND (tag: u31) { visitOp(Opcode.SUSPEND); } def visit_SWITCH (cont: u31, tag: u31) { visitOp(Opcode.SWITCH); } } + +enum CallProperty { + SLOW, TAIL, FAST +} + +def SLOW = CallProperty.SLOW; +def TAIL = CallProperty.TAIL; +def FAST = CallProperty.FAST; diff --git a/src/util/Whamm.v3 b/src/util/Whamm.v3 index 9b93b746d..ae1649d8b 100644 --- a/src/util/Whamm.v3 +++ b/src/util/Whamm.v3 @@ -175,10 +175,9 @@ component Whamm { class WhammProbe(func: Function, sig: Array) extends Probe { var trampoline: TargetCode; // properties set by the spc to make inlining optimization decisions. - var inline_heuristic_checked = false; - var spc_inline_func = false; - var spc_swap_instance = false; - var spc_swap_membase = false; + var swap_checked = false; + var swap_instance = false; + var swap_membase = false; private def args = if(sig.length == 0, Values.NONE, Array.new(sig.length)); @@ -203,6 +202,31 @@ class WhammProbe(func: Function, sig: Array) extends Probe { } return ProbeAction.Continue; } + + // If function is to be inlined, check to see if instance or mem0_base need to be swapped. 
+ def checkSwap() { + if (swap_checked) return; + var bi = BytecodeIterator.new().reset(WasmFunction.!(func).decl); + while (bi.more()) { + var op = bi.current(); + match (op) { + // These opcodes require swapping the instance. + THROW, CALL, CALL_INDIRECT, MEMORY_INIT, MEMORY_SIZE, MEMORY_GROW, MEMORY_COPY, MEMORY_FILL, REF_FUNC, DATA_DROP, + ELEM_DROP, TABLE_INIT, TABLE_SIZE, TABLE_COPY, TABLE_GROW, GLOBAL_SET, GLOBAL_GET, TABLE_SET, TABLE_GET => swap_instance = true; + // Load/store opcodes require either the memory base or the instance. + I32_STORE, I64_STORE, F32_STORE, F64_STORE, I32_STORE8, I32_STORE16, I64_STORE8, I64_STORE16, I64_STORE32, + V128_STORE, I32_LOAD, I64_LOAD, F32_LOAD, F64_LOAD, I32_LOAD8_S, I32_LOAD8_U, I32_LOAD16_S, I32_LOAD16_U, + I64_LOAD8_S, I64_LOAD8_U, I64_LOAD16_S, I64_LOAD16_U, I64_LOAD32_S, I64_LOAD32_U, V128_LOAD => { + var memarg = bi.immptr().read_MemArg(); + if (memarg.memory_index == 0) swap_membase = true; + else swap_instance = true; + } + _ => ; + } + bi.next(); + } + swap_checked = true; + } } def parseParam0(r: TextReader) -> WhammParam { diff --git a/test/inline/failures.x86-64-linux b/test/inline/failures.x86-64-linux deleted file mode 100644 index 925e70891..000000000 --- a/test/inline/failures.x86-64-linux +++ /dev/null @@ -1,3 +0,0 @@ -inline_test_arithmetic.wasm -inline_test_locals_control.wasm -inline_test_nesting.wasm diff --git a/test/inline/failures.x86-64-linux.dyn b/test/inline/failures.x86-64-linux.dyn index da02fa079..50325688b 100644 --- a/test/inline/failures.x86-64-linux.dyn +++ b/test/inline/failures.x86-64-linux.dyn @@ -1,4 +1,5 @@ inline_test_arithmetic.wasm inline_test_locals_control.wasm inline_test_nesting.wasm +inline_test_return.wasm diff --git a/test/inline/inline_test_return.wasm b/test/inline/inline_test_return.wasm new file mode 100644 index 000000000..d7bcbbaa0 Binary files /dev/null and b/test/inline/inline_test_return.wasm differ diff --git a/test/inline/inline_test_return.wasm.exit 
b/test/inline/inline_test_return.wasm.exit new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/inline/inline_test_return.wasm.exit @@ -0,0 +1 @@ +0 diff --git a/test/inline/inline_test_return.wasm.flags b/test/inline/inline_test_return.wasm.flags new file mode 100644 index 000000000..0c2fe67af --- /dev/null +++ b/test/inline/inline_test_return.wasm.flags @@ -0,0 +1 @@ +--metrics=spc*calls --inline-max-depth=1 diff --git a/test/inline/inline_test_return.wasm.out b/test/inline/inline_test_return.wasm.out new file mode 100644 index 000000000..79d1497bf --- /dev/null +++ b/test/inline/inline_test_return.wasm.out @@ -0,0 +1,4 @@ +spc:static_calls : 6 calls +spc:static_inlined_calls : 6 calls +spc:dynamic_calls : 6 calls +spc:dynamic_inlined_calls : 6 calls diff --git a/test/inline/inline_test_return.wat b/test/inline/inline_test_return.wat new file mode 100644 index 000000000..c1dd8b196 --- /dev/null +++ b/test/inline/inline_test_return.wat @@ -0,0 +1,97 @@ +;; Test inlined functions with explicit RETURN, including nested control flow +;; and paths where extra values are on the stack at the time of return. +(module + ;; Two levels of nested ifs; in the early-return path, 2*a is an extra value + ;; on the value stack below the returned a+b. + (func $weighted (param i32) (param i32) (result i32) + block (result i32) + local.get 0 + i32.const 2 + i32.mul ;; [2a] -- extra below when early return fires + block + local.get 0 + i32.const 0 + i32.gt_s + if + local.get 1 + i32.const 0 + i32.gt_s + if + ;; both positive: return a+b; 2a is extra on stack + local.get 0 + local.get 1 + i32.add + return + end + end + end + local.get 1 + i32.add ;; fallthrough: 2a+b + end + ) + + ;; Clamp x to [lo, hi]; two levels of nesting, returns on multiple paths. 
+ (func $clamp (param i32) (param i32) (param i32) (result i32) + local.get 0 + local.get 1 + i32.lt_s + if + local.get 1 + return + end + local.get 0 + local.get 2 + i32.gt_s + if + local.get 2 + return + end + local.get 0 + ) + + (func (export "main") (result i32) + i32.const 3 + i32.const 4 + call $weighted + i32.const 7 ;; both positive: 3+4=7 + i32.ne + + i32.const 3 + i32.const -1 + call $weighted + i32.const 5 ;; b<=0: 2*3+(-1)=5 + i32.ne + i32.or + + i32.const -1 + i32.const 4 + call $weighted + i32.const 2 ;; a<=0: 2*(-1)+4=2 + i32.ne + i32.or + + i32.const 5 + i32.const 0 + i32.const 10 + call $clamp + i32.const 5 + i32.ne + i32.or + + i32.const -3 + i32.const 0 + i32.const 10 + call $clamp + i32.const 0 + i32.ne + i32.or + + i32.const 15 + i32.const 0 + i32.const 10 + call $clamp + i32.const 10 + i32.ne + i32.or + ) +)