diff --git a/src/passes/Asyncify.cpp b/src/passes/Asyncify.cpp index 48f9cc59419..02fc76e6280 100644 --- a/src/passes/Asyncify.cpp +++ b/src/passes/Asyncify.cpp @@ -258,6 +258,7 @@ // out why a certain function was instrumented. // // --pass-arg=asyncify-memory@memory +// // Picks which exported memory of the module to store and load data from // and to (useful if the module contains multiple memories). // diff --git a/src/passes/AutoBatch.cpp b/src/passes/AutoBatch.cpp new file mode 100644 index 00000000000..b1440c753fc --- /dev/null +++ b/src/passes/AutoBatch.cpp @@ -0,0 +1,435 @@ +/* + * Copyright 2026 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Automatically batch calls to imports. This can be useful to reduce overhead +// on the wasm/JS boundary. For example, consider this code: +// +// gl_bind_buffer(10, 20); +// gl_run_shader(123); +// +// If each of these is a call to a JS import, then we cross the wasm/JS +// boundary twice. Instead, we can serialize the commands we want to run on the +// JS side, and call out once to JS, then read the buffer and execute them both, +// doing a single boundary crossing. If there are very many crossings, this +// batching can be worthwhile. 
+// +// The idea of batching Web API calls is used in Emscripten's GL proxying, +// +// https://github.com/emscripten-core/emscripten/tree/main/system/lib/gl +// +// where most functions are proxied in an async way to the main thread, which +// means the calling thread effectively only "flushes" the command buffer when +// we need to execute a synchronous method. The WebCC project does this as well, +// +// https://github.com/io-eric/webcc/blob/main/docs/architecture.md#architecture +// +// This AutoBatch pass is different in that it *automatically* batches calls, +// without a fixed set of APIs that it recognizes. Whenever it sees an import, +// +// * If the import has no return value, it wraps it in a function that +// serializes it to the command buffer. +// * If the import does have a return value, the wrapper flushes the command +// buffer before calling it. +// +// The serialization format, and the code to serialize in wasm and deserialize +// in JS, is all generated based on the actual imports seen in the wasm. This +// avoids the "big switch of calls" problem that such proxying/serialization +// implementations usually have, where they map integer IDs to functions to be +// called, which has the result of keeping all those functions alive (since it +// doesn't see the integer IDs actually used at runtime). +// +// A flush() import is added, which is called to flush the command buffer. This +// receives two parameters, one to the start of the buffer and one to the +// location just past the end. This pass generates the JS to be called in +// flush(), and you just need to connect that JS to the import for the wasm. +// That import should have module "autobatch" and base name "flush" TODO flag +// +// --pass-arg=autobatch-js@filename +// +// A filename to write the JS code for deserialization, that is, the +// implementation of flush() which flushes the command buffer. 
This code
+// assumes the following variables are available:
+//
+//  * imports: The import object the wasm is instantiated with, so it can
+//    call imports.
+//  * HEAP32: An Int32Array view on the memory the wasm uses, so we can
+//    read the command buffer.
+//  * HEAP64: A BigInt64Array view on the memory.
+//  * HEAPF32: A Float32Array view on the memory.
+//  * HEAPF64: A Float64Array view on the memory.
+//
+//   --pass-arg=autobatch-asserts
+//
+//      This enables extra asserts in the output, like checking if we exceed the
+//      size of the command buffer.
+//
+// TODO: flags to control special exports etc.
+//
+// Internal ABI: The command buffer's start is assumed to be 8-byte aligned.
+// Each command is a function id (32 bits) followed by the parameters. 8-byte
+// parameters are fully aligned (so JS can read them with a typed array). We
+// extend each command to be a multiple of 8 bytes so that each command can
+// assume itself to be 8-byte aligned.
+//
+// TODO: tools to detect problems: reentrancy and stale data (serialized
+// pointers refer to data that might get changed)
+//
+// Benchmarks:
+//
+//  * emscripten microbenchmark (autobatch_bench.cpp): 2x speedup
+//    * emcc test/autobatch_bench.cpp -O3 --profiling -o b.html
+//      -sGLOBAL_BASE=1MB -sTOTAL_MEMORY=128MB
+//    * bin/wasm-opt -all b.wasm --autobatch -o b.wasm
+//      --pass-arg=autobatch-js@autobatch.js
+//    * Then paste autobatch.js in the right place in b.js (see the cpp).
+//  * webcc: 1.5x speedup (had to add flush() and make set_main_loop not throw)
+//  * emscripten glgears (hello_world_gles.c): no change (profiling confirms
+//                                             boundary is not an issue; build
+//                                             with -DANIMATE)
+//
+
+#include "ir/module-utils.h"
+#include "ir/names.h"
+#include "pass.h"
+#include "support/file.h"
+#include "wasm-builder.h"
+#include "wasm.h"
+
+namespace wasm {
+
+namespace {
+
+struct AutoBatch : public Pass {
+  // The original imports, before we wrapped them, in order of ids.
+  std::vector<Function*> originalImports;
+
+  // Map import names to the ids we use to serialize them.
+  std::unordered_map<Name, Index> importIds;
+
+  bool asserts;
+
+  std::unique_ptr<Builder> builder;
+
+  // The name of the global containing the command buffer's base.
+  Name commandBufferBaseGlobal;
+  // The name of the global containing the command buffer current position
+  // relative to the base, that is, the end of the command buffer.
+  Name commandBufferPosGlobal;
+  // TODO: add a size as well, and a new export so users can set the pos+size.
+
+  // The memory we serialize to.
+  Name memory;
+
+  // The internal name of the flush import.
+  Name flushName;
+
+  void run(Module* module) override {
+    asserts = hasArgument("autobatch-asserts");
+
+    builder = std::make_unique<Builder>(*module);
+
+    auto numOriginalFunctions = module->functions.size();
+
+    // Build the mapping of integer ID to imports. We map imports with no
+    // results, which are exactly the things we serialize commands to.
+    for (auto& func : module->functions) {
+      if (!func->imported() || func->getResults() != Type::none) {
+        continue;
+      }
+      Index id = importIds.size();
+      importIds[func->name] = id;
+    }
+
+    // Add the flush import, which receives start, end params.
+    flushName = Names::getValidFunctionName(*module, "flush");
+    auto flushType =
+      Type(Signature({Type::i32, Type::i32}, Type::none), NonNullable, Inexact);
+    auto* flushFunc = module->addFunction(
+      builder->makeFunction(flushName, flushType, {}, nullptr));
+    // TODO: flags?
+    flushFunc->module = "autobatch";
+    flushFunc->base = "flush";
+
+    // Use the first memory. TODO: use multi-memory?
+    assert(!module->memories.empty());
+    memory = module->memories[0]->name;
+
+    // Add the command buffer base global.
+ commandBufferBaseGlobal = Names::getValidGlobalName(*module, "cmdbufbase"); + // TODO: allow setting a non-0 value here, right now we just use the start + // of the memory + module->addGlobal(builder->makeGlobal(commandBufferBaseGlobal, + Type::i32, + builder->makeConst(int32_t(0)), + Builder::Mutable)); + + // Add the command buffer position global. + commandBufferPosGlobal = Names::getValidGlobalName(*module, "cmdbufpos"); + // TODO: support 64-bit offsets? + module->addGlobal(builder->makeGlobal(commandBufferPosGlobal, + Type::i32, + builder->makeConst(int32_t(0)), + Builder::Mutable)); + + // Wrap every import (but leave our new import alone). Loop until the + // original number of functions, so we do not modify flush() or any of the + // new functions we add. + for (Index i = 0; i < numOriginalFunctions; i++) { + auto* func = module->functions[i].get(); + if (func->imported() && func->name != flushName) { + // Copy the original import to create the actual import that the wrapper + // calls. Doing it this way avoids needing to update callers: we replace + // the original import in-place, so existing calls go to the wrapper + // now. + auto newImportName = Names::getValidFunctionName(*module, func->name); + auto* originalImport = + ModuleUtils::copyFunction(func, *module, newImportName); + + // This one is no longer an import. + func->module = func->base = Name(); + assert(!func->imported()); + func->type = func->type.with(Exact); + + // Fill in the wrapper body. + if (func->getResults() == Type::none) { + wrapNonReturning(func, newImportName); + originalImports.push_back(originalImport); + } else { + wrapReturning(func, newImportName); + } + } + } + + // Emit the JS. + auto jsFile = getArgumentOrDefault("autobatch-js", ""); + if (jsFile.empty()) { + std::cerr << "warning: not emitting JS. Use " + << "--pass-arg=autobatch-js@FILENAME\n"; + } else { + emitJS(jsFile, module); + } + } + + // Serialize a given value to the command buffer. 
Receives the index of a
+  // local with the command buffer position, and the offset relative to that
+  // local. Returns the code to serialize, and updates the offset to the place
+  // for the thing after it.
+  Expression* serialize(Expression* value, Index posLocal, Index& offset) {
+    auto type = value->type;
+    // TODO: if we cannot serialize something, return an error, and the
+    // caller can flush and call, giving up on batching.
+    assert(type.isBasic());
+    switch (type.getBasic()) {
+      case Type::i32:
+      case Type::i64:
+      case Type::f32:
+      case Type::f64: {
+        auto size = type.getByteSize();
+        // Ensure values are aligned.
+        auto miss = offset % size;
+        if (miss) {
+          offset += size - miss;
+        }
+        auto* ptr = builder->makeLocalGet(posLocal, Type::i32);
+        auto* ret =
+          builder->makeStore(size, offset, size, ptr, value, type, memory);
+        offset += size;
+        return ret;
+        break;
+      }
+      default: {
+        Fatal() << "AutoBatch: unsupported serialization type " << type;
+      }
+    }
+  }
+
+  // Wrap a function that does not return a result. We add it to the command
+  // buffer.
+  void wrapNonReturning(Function* func, Name importToCall) {
+    std::vector<Expression*> body;
+
+    // Stash the command buffer's position before our additions.
+    auto posLocal = Builder::addVar(func, Type::i32);
+    body.push_back(builder->makeLocalSet(
+      posLocal, builder->makeGlobalGet(commandBufferPosGlobal, Type::i32)));
+
+    Index offset = 0;
+
+    // Serialize the id.
+    // TODO: we could use an 8 or 16 bit id when the # of imports is small
+    body.push_back(serialize(
+      builder->makeConst(int32_t(importIds[func->name])), posLocal, offset));
+
+    // Serialize the params.
+    auto params = func->getParams();
+    for (Index i = 0; i < params.size(); i++) {
+      body.push_back(
+        serialize(builder->makeLocalGet(i, params[i]), posLocal, offset));
+    }
+
+    // The total we emit for this command must be aligned.
+    ensure8ByteAlign(offset);
+
+    // Update the command buffer position.
+    auto* total =
+      builder->makeBinary(AddInt32,
+                          builder->makeLocalGet(posLocal, Type::i32),
+                          builder->makeConst(int32_t(offset)));
+    body.push_back(builder->makeGlobalSet(commandBufferPosGlobal, total));
+
+    // TODO: add assertion here when asserts
+
+    func->body = builder->makeBlock(body);
+  }
+
+  // Wrap a function that returns a result. We flush the command buffer, then
+  // call it. TODO: we could also add it to the command buffer itself, to save
+  // a call.
+  void wrapReturning(Function* func, Name importToCall) {
+    std::vector<Expression*> body;
+
+    // Flush the command buffer and reset the position, if we have anything.
+    auto* check = builder->makeGlobalGet(commandBufferPosGlobal, Type::i32);
+    auto* start = builder->makeGlobalGet(commandBufferBaseGlobal, Type::i32);
+    auto* end = builder->makeGlobalGet(commandBufferPosGlobal, Type::i32);
+    auto* flush = builder->makeCall(flushName, {start, end}, Type::none);
+    auto* start2 = builder->makeGlobalGet(commandBufferBaseGlobal, Type::i32);
+    auto* reset = builder->makeGlobalSet(commandBufferPosGlobal, start2);
+    auto* iff = builder->makeIf(check, builder->makeSequence(flush, reset));
+    body.push_back(iff);
+
+    // Call the import.
+    auto params = func->getParams();
+    std::vector<Expression*> args;
+    for (Index i = 0; i < params.size(); i++) {
+      args.push_back(builder->makeLocalGet(i, params[i]));
+    }
+    body.push_back(builder->makeCall(importToCall, args, func->getResults()));
+
+    func->body = builder->makeBlock(body);
+  }
+
+  void emitJS(const std::string& jsFile, Module* module) {
+    Output out(jsFile, Flags::Text);
+
+    // The main loop goes over commands, each time switching over which function
+    // to call.
+    out << R"(function flush(pos, end) {
+  while (pos != end) {
+    let funcId = HEAP32[pos >> 2];
+    switch (funcId) {
+)";
+
+    // Emit deserialization code for each function.
+    for (Index id = 0; id < originalImports.size(); id++) {
+      auto* import = originalImports[id];
+      if (import->getResults() != Type::none) {
+        // This is not something we serialize.
+        continue;
+      }
+
+      // Track the offset relative to `pos`, which begins after the id.
+      Index offset = 4;
+
+      // Emit a case for the function.
+      out << "      case ";
+      out << std::to_string(id);
+      out << ": {\n";
+
+      // Emit a call to the function.
+      out << "        imports[";
+      out << "'" << import->module << "'";
+      out << "][";
+      out << "'" << import->base << "'";
+      out << "](";
+
+      // Emit deserialization for each param.
+      auto params = import->getParams();
+      for (Index i = 0; i < params.size(); i++) {
+        if (i > 0) {
+          out << ", ";
+        }
+        auto type = params[i];
+        assert(type.isBasic());
+        switch (type.getBasic()) {
+          case Type::i32: {
+            out << "HEAP32[pos + " + std::to_string(offset) + " >> 2]";
+            offset += 4;
+            break;
+          }
+          case Type::f32: {
+            out << "HEAPF32[pos + " + std::to_string(offset) + " >> 2]";
+            offset += 4;
+            break;
+          }
+          case Type::i64: {
+            ensure8ByteAlign(offset);
+            out << "HEAP64[pos + " + std::to_string(offset) + " >> 3]";
+            offset += 8;
+            break;
+          }
+          case Type::f64: {
+            ensure8ByteAlign(offset);
+            out << "HEAPF64[pos + " + std::to_string(offset) + " >> 3]";
+            offset += 8;
+            break;
+          }
+          default: {
+            Fatal() << "AutoBatch: unsupported serialization type " << type;
+          }
+        }
+      }
+
+      // Finish the call.
+      out << ");\n";
+
+      // Bump the position to the proper alignment, if we need to, and update.
+      ensure8ByteAlign(offset); // no need for bool TODO
+      out << "        pos += " << std::to_string(offset) << ";\n";
+
+      // Finish the switch case.
+      out << "        continue;\n";
+      out << "      }\n";
+    }
+
+    // Error handling.
+    out << "      default: throw `invalid id ${funcId}`;\n";
+
+    // End the switch, loop, and function.
+    out << "    }\n";
+    out << "  }\n";
+    out << "}\n";
+  }
+
+  // Given an offset, bump it to 8-byte alignment.
(We only need to handle the + // case of offset 4, as our values are all 32 or 64-bit.) Returns true if we + // changed the value. + bool ensure8ByteAlign(Index& offset) { + if (offset % 8) { + assert(offset % 8 == 4); + offset += 4; + return true; + } + return false; + } +}; + +} // anonymous namespace + +Pass* createAutoBatchPass() { return new AutoBatch(); } + +} // namespace wasm diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index c2952e174b8..d43a1b53438 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -20,6 +20,7 @@ set(passes_SOURCES AbstractTypeRefining.cpp AlignmentLowering.cpp Asyncify.cpp + AutoBatch.cpp AvoidReinterprets.cpp CoalesceLocals.cpp CodePushing.cpp diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 0e6e28267c2..a31365d9c53 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -96,6 +96,8 @@ void PassRegistry::registerPasses() { registerPass("asyncify", "async/await style transform, allowing pausing and resuming", createAsyncifyPass); + registerPass( + "autobatch", "automatically batch calls to imports", createAutoBatchPass); registerPass("avoid-reinterprets", "Tries to avoid reinterpret operations via more loads", createAvoidReinterpretsPass); diff --git a/src/passes/passes.h b/src/passes/passes.h index be06369a9f8..2c77adce179 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -25,6 +25,7 @@ class Pass; Pass* createAbstractTypeRefiningPass(); Pass* createAlignmentLoweringPass(); Pass* createAsyncifyPass(); +Pass* createAutoBatchPass(); Pass* createAvoidReinterpretsPass(); Pass* createCoalesceLocalsPass(); Pass* createCoalesceLocalsWithLearningPass(); diff --git a/test/lit/node/autobatch-js-post.js b/test/lit/node/autobatch-js-post.js new file mode 100644 index 00000000000..c1d1dc9a6f4 --- /dev/null +++ b/test/lit/node/autobatch-js-post.js @@ -0,0 +1,25 @@ +// Code that goes after the flush() function that autobatch generates. This +// instantiates and runs the code. 
+ +// We test with and without autobatching. Without, there is no flush method to +// add. +if (typeof flush === 'function') { + imports.autobatch = { + flush: flush, + }; +} + +let instance = new WebAssembly.Instance(mod, imports); + +let buffer = instance.exports.mem.buffer; +HEAP32 = new Int32Array(buffer); +HEAP64 = new BigInt64Array(buffer); +HEAPF32 = new Float32Array(buffer); +HEAPF64 = new Float64Array(buffer); + +console.log('calling caller'); +let result = instance.exports.caller(); +console.log(`result: ${result}`); + +console.log('test complete.'); + diff --git a/test/lit/node/autobatch-js-pre.js b/test/lit/node/autobatch-js-pre.js new file mode 100644 index 00000000000..ca79a26f99e --- /dev/null +++ b/test/lit/node/autobatch-js-pre.js @@ -0,0 +1,29 @@ +// Code that goes before the flush() function that autobatch generates. This +// loads the binary and prepares the imports and other globals. + +let argv = process.argv.slice(2); + +let binary = require('fs').readFileSync(argv[0]); + +let mod = new WebAssembly.Module(binary); + +let imports = { + outside: { + foo1: (x, y) => { + console.log(`foo1: ${x} ${y}`); + }, + foo2: (x, y) => { + console.log(`foo2: ${x} ${y}`); + }, + foo3: (x, y) => { + console.log(`foo3: ${x} ${y}`); + }, + bar: () => { + console.log('bar'); + return 42; + }, + }, +}; + +let HEAP32, HEAP64, HEAPF32, HEAPF64; + diff --git a/test/lit/node/autobatch-js.wast b/test/lit/node/autobatch-js.wast new file mode 100644 index 00000000000..a352c321d78 --- /dev/null +++ b/test/lit/node/autobatch-js.wast @@ -0,0 +1,100 @@ +;; Similar testcase to autobatch.wast, but here we test the JS output and +;; execution. + +(module + ;; This serializes as [i32 id, i32 param, f64 param], which is a total of 16 + ;; bytes. The f64 is aligned properly just by how the offsets work out. 
+ (import "outside" "foo1" (func $noresult1 (param i32) (param f64))) + + ;; This serializes as [i32 id, i64 param, f32 param], which is a total of 16 + ;; bytes again, but now the 64-bit param must have a 4-byte buffer before it, + ;; so it is aligned. + (import "outside" "foo2" (func $noresult2 (param i64) (param f32))) + + ;; This serializes as [i32 id, i32 param, f32 param], which is a total of 12 + ;; bytes. We bump $cmdbufpos by 16, to keep the thing after us aligned. + (import "outside" "foo3" (func $noresult3 (param i32) (param f32))) + + (import "outside" "bar" (func $result (result f64))) + + (memory $mem 10 20) + (export "mem" (memory $mem)) + + (func $caller (export "caller") (result f64) + ;; Two calls and a flush. + (call $noresult1 + (i32.const 42) + (f64.const 3.14159) + ) + (call $noresult2 + (i64.const 1234) + (f32.const 2.71828) + ) + (drop (call $result)) + + ;; One call and a flush. + (call $noresult3 + (i32.const -1) + (f32.const -2.3) + ) + (drop (call $result)) + + ;; Flush and test we get the result. + (call $result) + ) +) + +;; Build the autobatched wasm and JS. +;; RUN: wasm-opt %s --autobatch -o %t.wasm --pass-arg=autobatch-js@%t.js + +;; Compare the JS to what we expect. 
+;; RUN: cat %t.js | filecheck %s --check-prefix=JS + +;; JS: function flush(pos, end) { +;; JS-NEXT: while (pos != end) { +;; JS-NEXT: let funcId = HEAP32[pos >> 2]; +;; JS-NEXT: switch (funcId) { +;; JS-NEXT: case 0: { +;; JS-NEXT: imports['outside']['foo1'](HEAP32[pos + 4 >> 2], HEAPF64[pos + 8 >> 3]); +;; JS-NEXT: pos += 16; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: case 1: { +;; JS-NEXT: imports['outside']['foo2'](HEAP64[pos + 8 >> 3], HEAPF32[pos + 16 >> 2]); +;; JS-NEXT: pos += 24; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: case 2: { +;; JS-NEXT: imports['outside']['foo3'](HEAP32[pos + 4 >> 2], HEAPF32[pos + 8 >> 2]); +;; JS-NEXT: pos += 16; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: } +;; JS-NEXT: } +;; JS-NEXT: } + +;; Combine our test JS with the generated JS. +;; RUN: cat %S/autobatch-js-pre.js > %t.combined.js +;; RUN: cat %t.js >> %t.combined.js +;; RUN: cat %S/autobatch-js-post.js >> %t.combined.js + +;; Execute the autobatched wasm. +;; RUN: node %t.combined.js %t.wasm | filecheck %s --check-prefix=EXEC + +;; EXEC: calling caller +;; EXEC-NEXT: foo1: 42 3.14159 +;; EXEC-NEXT: foo2: 1234 2.718280076980591 +;; EXEC-NEXT: bar +;; EXEC-NEXT: foo3: -1 -2.299999952316284 +;; EXEC-NEXT: bar +;; EXEC-NEXT: bar +;; EXEC-NEXT: result: 42 +;; EXEC-NEXT: test complete. + +;; Execute it again, without autobatching. The execution is the same. +;; RUN: wasm-opt %s -o %t.wasm +;; RUN: cat %S/autobatch-js-pre.js > %t.combined.js +;; RUN: cat %S/autobatch-js-post.js >> %t.combined.js +;; RUN: node %t.combined.js %t.wasm | filecheck %s --check-prefix=EXEC + + diff --git a/test/lit/passes/autobatch.wast b/test/lit/passes/autobatch.wast new file mode 100644 index 00000000000..a0f1da5574b --- /dev/null +++ b/test/lit/passes/autobatch.wast @@ -0,0 +1,204 @@ +;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. 
+ +;; RUN: wasm-opt %s --autobatch -S -o - | filecheck %s + +;; The output will replace non-returning imports with wrappers that serialize +;; commands. +;; +;; Value-returning imports will flush in their wrappers, then do the call. +;; +;; $caller does not change at all, as calls to the imports now call wrappers +;; with the same names. + +(module + ;; This serializes as [i32 id, i32 param, f64 param], which is a total of 16 + ;; bytes. The f64 is aligned properly just by how the offsets work out. + (import "outside" "foo1" (func $noresult1 (param i32) (param f64))) + + ;; This serializes as [i32 id, i64 param, f32 param], which is a total of 16 + ;; bytes again, but now the 64-bit param must have a 4-byte buffer before it, + ;; so it is aligned. + (import "outside" "foo2" (func $noresult2 (param i64) (param f32))) + + ;; A function with a result. We purposefully put this before "foo3" to test + ;; for mixups in the indexing (we do not index "bar", as calls to functions + ;; returning a value are not serialized). + (import "outside" "bar" (func $result (result f64))) + + ;; This serializes as [i32 id, i32 param, f32 param], which is a total of 12 + ;; bytes. We bump $cmdbufpos by 16, to keep the thing after us aligned. 
+ (import "outside" "foo3" (func $noresult3 (param i32) (param f32))) + + ;; CHECK: (type $0 (func (result f64))) + + ;; CHECK: (type $1 (func (param i32 f64))) + + ;; CHECK: (type $2 (func (param i64 f32))) + + ;; CHECK: (type $3 (func (param i32 f32))) + + ;; CHECK: (type $4 (func (param i32 i32))) + + ;; CHECK: (import "autobatch" "flush" (func $flush (param i32 i32))) + + ;; CHECK: (import "outside" "foo1" (func $noresult1_6 (param i32 f64))) + + ;; CHECK: (import "outside" "foo2" (func $noresult2_7 (param i64 f32))) + + ;; CHECK: (import "outside" "bar" (func $result_8 (result f64))) + + ;; CHECK: (import "outside" "foo3" (func $noresult3_9 (param i32 f32))) + + ;; CHECK: (global $cmdbufbase (mut i32) (i32.const 0)) + + ;; CHECK: (global $cmdbufpos (mut i32) (i32.const 0)) + + ;; CHECK: (memory $mem 10 20) + (memory $mem 10 20) + + ;; CHECK: (func $noresult1 (param $0 i32) (param $1 f64) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store offset=4 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (f64.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 16) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $noresult2 (param $0 i64) (param $1 f32) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i64.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: 
(f32.store offset=16 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 24) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $result (result f64) + ;; CHECK-NEXT: (if + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: (then + ;; CHECK-NEXT: (call $flush + ;; CHECK-NEXT: (global.get $cmdbufbase) + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (global.get $cmdbufbase) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $result_8) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $noresult3 (param $0 i32) (param $1 f32) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 2) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store offset=4 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (f32.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 16) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $caller (result f64) + ;; CHECK-NEXT: (call $noresult1 + ;; CHECK-NEXT: (i32.const 42) + ;; CHECK-NEXT: (f64.const 3.14159) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult2 + ;; CHECK-NEXT: (i64.const 1234) + ;; CHECK-NEXT: (f32.const 2.718280076980591) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (call $result) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult3 + ;; CHECK-NEXT: (i32.const -1) + ;; CHECK-NEXT: (f32.const -2.299999952316284) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult1 
+ ;; CHECK-NEXT: (i32.const 942) + ;; CHECK-NEXT: (f64.const 93.14159) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult2 + ;; CHECK-NEXT: (i64.const 91234) + ;; CHECK-NEXT: (f32.const 92.71827697753906) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $result) + ;; CHECK-NEXT: ) + (func $caller (result f64) + ;; A bunch of calls to them all. + (call $noresult1 + (i32.const 42) + (f64.const 3.14159) + ) + (call $noresult2 + (i64.const 1234) + (f32.const 2.71828) + ) + (drop (call $result)) + + (call $noresult3 + (i32.const -1) + (f32.const -2.3) + ) + + (call $noresult1 + (i32.const 942) + (f64.const 93.14159) + ) + (call $noresult2 + (i64.const 91234) + (f32.const 92.71828) + ) + (call $result) + ) +) +