diff --git a/src/passes/Asyncify.cpp b/src/passes/Asyncify.cpp index 48f9cc59419..02fc76e6280 100644 --- a/src/passes/Asyncify.cpp +++ b/src/passes/Asyncify.cpp @@ -258,6 +258,7 @@ // out why a certain function was instrumented. // // --pass-arg=asyncify-memory@memory +// // Picks which exported memory of the module to store and load data from // and to (useful if the module contains multiple memories). // diff --git a/src/passes/AutoBatch.cpp b/src/passes/AutoBatch.cpp new file mode 100644 index 00000000000..b1440c753fc --- /dev/null +++ b/src/passes/AutoBatch.cpp @@ -0,0 +1,435 @@ +/* + * Copyright 2026 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Automatically batch calls to imports. This can be useful to reduce overhead +// on the wasm/JS boundary. For example, consider this code: +// +// gl_bind_buffer(10, 20); +// gl_run_shader(123); +// +// If each of these is a call to a JS import, then we cross the wasm/JS +// boundary twice. Instead, we can serialize the commands we want to run on the +// JS side, and call out once to JS, then read the buffer and execute them both, +// doing a single boundary crossing. If there are very many crossings, this +// batching can be worthwhile. 
+// +// The idea of batching Web API calls is used in Emscripten's GL proxying, +// +// https://github.com/emscripten-core/emscripten/tree/main/system/lib/gl +// +// where most functions are proxied in an async way to the main thread, which +// means the calling thread effectively only "flushes" the command buffer when +// we need to execute a synchronous method. The WebCC project does this as well, +// +// https://github.com/io-eric/webcc/blob/main/docs/architecture.md#architecture +// +// This AutoBatch pass is different in that it *automatically* batches calls, +// without a fixed set of APIs that it recognizes. Whenever it sees an import, +// +// * If the import has no return value, it wraps it in a function that +// serializes it to the command buffer. +// * If the import does have a return value, the wrapper flushes the command +// buffer before calling it. +// +// The serialization format, and the code to serialize in wasm and deserialize +// in JS, is all generated based on the actual imports seen in the wasm. This +// avoids the "big switch of calls" problem that such proxying/serialization +// implementations usually have, where they map integer IDs to functions to be +// called, which has the result of keeping all those functions alive (since it +// doesn't see the integer IDs actually used at runtime). +// +// A flush() import is added, which is called to flush the command buffer. This +// receives two parameters, one to the start of the buffer and one to the +// location just past the end. This pass generates the JS to be called in +// flush(), and you just need to connect that JS to the import for the wasm. +// That import should have module "autobatch" and base name "flush" TODO flag +// +// --pass-arg=autobatch-js@filename +// +// A filename to write the JS code for deserialization, that is, the +// implementation of flush() which flushes the command buffer. 
This code
+// assumes the following variables are available:
+//
+//  * imports: The import object the wasm is instantiated with, so it can
+//    call imports.
+//  * HEAP32: An Int32Array view on the memory the wasm uses, so we can
+//    read the command buffer.
+//  * HEAP64: A BigInt64Array view on the memory.
+//  * HEAPF32: A Float32Array view on the memory.
+//  * HEAPF64: A Float64Array view on the memory.
+//
+//   --pass-arg=autobatch-asserts
+//
+//      This enables extra asserts in the output, like checking if we exceed the
+//      size of the command buffer.
+//
+// TODO: flags to control special exports etc.
+//
+// Internal ABI: The command buffer's start is assumed to be 8-byte aligned.
+// Each command is a function id (32 bits) followed by the parameters. 8-byte
+// parameters are fully aligned (so JS can read them with a typed array). We
+// extend each command to be a multiple of 8 bytes so that each command can
+// assume itself to be 8-byte aligned.
+//
+// TODO: tools to detect problems: reentrancy and stale data (serialized
+// pointers refer to data that might get changed)
+//
+// Benchmarks:
+//
+//  * emscripten microbenchmark (autobatch_bench.cpp): 2x speedup
+//    * emcc test/autobatch_bench.cpp -O3 --profiling -o b.html
+//      -sGLOBAL_BASE=1MB -sTOTAL_MEMORY=128MB
+//    * bin/wasm-opt -all b.wasm --autobatch -o b.wasm
+//      --pass-arg=autobatch-js@autobatch.js
+//    * Then paste autobatch.js in the right place in b.js (see the cpp).
+//  * webcc: 1.5x speedup (had to add flush() and make set_main_loop not throw)
+//  * emscripten glgears (hello_world_gles.c): no change (profiling confirms
+//                                             boundary is not an issue; build
+//                                             with -DANIMATE)
+//
+
+#include "ir/module-utils.h"
+#include "ir/names.h"
+#include "pass.h"
+#include "support/file.h"
+#include "wasm-builder.h"
+#include "wasm.h"
+
+namespace wasm {
+
+namespace {
+
+struct AutoBatch : public Pass {
+  // The original imports, before we wrapped them, in order of ids.
+  std::vector<Function*> originalImports;
+
+  // Map import names to the ids we use to serialize them.
+  std::unordered_map<Name, Index> importIds;
+
+  bool asserts;
+
+  std::unique_ptr<Builder> builder;
+
+  // The name of the global containing the command buffer's base.
+  Name commandBufferBaseGlobal;
+  // The name of the global containing the command buffer current position
+  // relative to the base, that is, the end of the command buffer.
+  Name commandBufferPosGlobal;
+  // TODO: add a size as well, and a new export so users can set the pos+size.
+
+  // The memory we serialize to.
+  Name memory;
+
+  // The internal name of the flush import.
+  Name flushName;
+
+  void run(Module* module) override {
+    asserts = hasArgument("autobatch-asserts");
+
+    builder = std::make_unique<Builder>(*module);
+
+    auto numOriginalFunctions = module->functions.size();
+
+    // Build the mapping of integer ID to imports. We map imports with no
+    // results, which are exactly the things we serialize commands to.
+    for (auto& func : module->functions) {
+      if (!func->imported() || func->getResults() != Type::none) {
+        continue;
+      }
+      Index id = importIds.size();
+      importIds[func->name] = id;
+    }
+
+    // Add the flush import, which receives start, end params.
+    flushName = Names::getValidFunctionName(*module, "flush");
+    auto flushType =
+      Type(Signature({Type::i32, Type::i32}, Type::none), NonNullable, Inexact);
+    auto* flushFunc = module->addFunction(
+      builder->makeFunction(flushName, flushType, {}, nullptr));
+    // TODO: flags?
+    flushFunc->module = "autobatch";
+    flushFunc->base = "flush";
+
+    // Use the first memory. TODO: use multi-memory?
+    assert(!module->memories.empty());
+    memory = module->memories[0]->name;
+
+    // Add the command buffer base global.
+ commandBufferBaseGlobal = Names::getValidGlobalName(*module, "cmdbufbase"); + // TODO: allow setting a non-0 value here, right now we just use the start + // of the memory + module->addGlobal(builder->makeGlobal(commandBufferBaseGlobal, + Type::i32, + builder->makeConst(int32_t(0)), + Builder::Mutable)); + + // Add the command buffer position global. + commandBufferPosGlobal = Names::getValidGlobalName(*module, "cmdbufpos"); + // TODO: support 64-bit offsets? + module->addGlobal(builder->makeGlobal(commandBufferPosGlobal, + Type::i32, + builder->makeConst(int32_t(0)), + Builder::Mutable)); + + // Wrap every import (but leave our new import alone). Loop until the + // original number of functions, so we do not modify flush() or any of the + // new functions we add. + for (Index i = 0; i < numOriginalFunctions; i++) { + auto* func = module->functions[i].get(); + if (func->imported() && func->name != flushName) { + // Copy the original import to create the actual import that the wrapper + // calls. Doing it this way avoids needing to update callers: we replace + // the original import in-place, so existing calls go to the wrapper + // now. + auto newImportName = Names::getValidFunctionName(*module, func->name); + auto* originalImport = + ModuleUtils::copyFunction(func, *module, newImportName); + + // This one is no longer an import. + func->module = func->base = Name(); + assert(!func->imported()); + func->type = func->type.with(Exact); + + // Fill in the wrapper body. + if (func->getResults() == Type::none) { + wrapNonReturning(func, newImportName); + originalImports.push_back(originalImport); + } else { + wrapReturning(func, newImportName); + } + } + } + + // Emit the JS. + auto jsFile = getArgumentOrDefault("autobatch-js", ""); + if (jsFile.empty()) { + std::cerr << "warning: not emitting JS. Use " + << "--pass-arg=autobatch-js@FILENAME\n"; + } else { + emitJS(jsFile, module); + } + } + + // Serialize a given value to the command buffer. 
Receives the index of a
+  // local with the command buffer position, and the offset relative to that
+  // local. Returns the code to serialize, and updates the offset to the place
+  // for the thing after it.
+  Expression* serialize(Expression* value, Index posLocal, Index& offset) {
+    auto type = value->type;
+    // TODO: if we cannot serialize something, return an error, and the
+    // caller can flush and call, giving up on batching.
+    assert(type.isBasic());
+    switch (type.getBasic()) {
+      case Type::i32:
+      case Type::i64:
+      case Type::f32:
+      case Type::f64: {
+        auto size = type.getByteSize();
+        // Ensure values are aligned.
+        auto miss = offset % size;
+        if (miss) {
+          offset += size - miss;
+        }
+        auto* ptr = builder->makeLocalGet(posLocal, Type::i32);
+        auto* ret =
+          builder->makeStore(size, offset, size, ptr, value, type, memory);
+        offset += size;
+        return ret;
+        break;
+      }
+      default: {
+        Fatal() << "AutoBatch: unsupported serialization type " << type;
+      }
+    }
+  }
+
+  // Wrap a function that does not return a result. We add it to the command
+  // buffer.
+  void wrapNonReturning(Function* func, Name importToCall) {
+    std::vector<Expression*> body;
+
+    // Stash the command buffer's position before our additions.
+    auto posLocal = Builder::addVar(func, Type::i32);
+    body.push_back(builder->makeLocalSet(
+      posLocal, builder->makeGlobalGet(commandBufferPosGlobal, Type::i32)));
+
+    Index offset = 0;
+
+    // Serialize the id.
+    // TODO: we could use an 8 or 16 bit id when the # of imports is small
+    body.push_back(serialize(
+      builder->makeConst(int32_t(importIds[func->name])), posLocal, offset));
+
+    // Serialize the params.
+    auto params = func->getParams();
+    for (Index i = 0; i < params.size(); i++) {
+      body.push_back(
+        serialize(builder->makeLocalGet(i, params[i]), posLocal, offset));
+    }
+
+    // The total we emit for this command must be aligned.
+    ensure8ByteAlign(offset);
+
+    // Update the command buffer position.
+    auto* total =
+      builder->makeBinary(AddInt32,
+                          builder->makeLocalGet(posLocal, Type::i32),
+                          builder->makeConst(int32_t(offset)));
+    body.push_back(builder->makeGlobalSet(commandBufferPosGlobal, total));
+
+    // TODO: add assertion here when asserts
+
+    func->body = builder->makeBlock(body);
+  }
+
+  // Wrap a function that returns a result. We flush the command buffer, then
+  // call it. TODO: we could also add it to the command buffer itself, to save
+  // a call.
+  void wrapReturning(Function* func, Name importToCall) {
+    std::vector<Expression*> body;
+
+    // Flush the command buffer and reset the position, if we have anything.
+    auto* check = builder->makeGlobalGet(commandBufferPosGlobal, Type::i32);
+    auto* start = builder->makeGlobalGet(commandBufferBaseGlobal, Type::i32);
+    auto* end = builder->makeGlobalGet(commandBufferPosGlobal, Type::i32);
+    auto* flush = builder->makeCall(flushName, {start, end}, Type::none);
+    auto* start2 = builder->makeGlobalGet(commandBufferBaseGlobal, Type::i32);
+    auto* reset = builder->makeGlobalSet(commandBufferPosGlobal, start2);
+    auto* iff = builder->makeIf(check, builder->makeSequence(flush, reset));
+    body.push_back(iff);
+
+    // Call the import.
+    auto params = func->getParams();
+    std::vector<Expression*> args;
+    for (Index i = 0; i < params.size(); i++) {
+      args.push_back(builder->makeLocalGet(i, params[i]));
+    }
+    body.push_back(builder->makeCall(importToCall, args, func->getResults()));
+
+    func->body = builder->makeBlock(body);
+  }
+
+  void emitJS(const std::string& jsFile, Module* module) {
+    Output out(jsFile, Flags::Text);
+
+    // The main loop goes over commands, each time switching over which function
+    // to call.
+    out << R"(function flush(pos, end) {
+  while (pos != end) {
+    let funcId = HEAP32[pos >> 2];
+    switch (funcId) {
+)";
+
+    // Emit deserialization code for each function.
+    for (Index id = 0; id < originalImports.size(); id++) {
+      auto* import = originalImports[id];
+      if (import->getResults() != Type::none) {
+        // This is not something we serialize.
+        continue;
+      }
+
+      // Track the offset relative to `pos`, which begins after the id.
+      Index offset = 4;
+
+      // Emit a case for the function.
+      out << "      case ";
+      out << std::to_string(id);
+      out << ": {\n";
+
+      // Emit a call to the function.
+      out << "        imports[";
+      out << "'" << import->module << "'";
+      out << "][";
+      out << "'" << import->base << "'";
+      out << "](";
+
+      // Emit deserialization for each param.
+      auto params = import->getParams();
+      for (Index i = 0; i < params.size(); i++) {
+        if (i > 0) {
+          out << ", ";
+        }
+        auto type = params[i];
+        assert(type.isBasic());
+        switch (type.getBasic()) {
+          case Type::i32: {
+            out << "HEAP32[pos + " + std::to_string(offset) + " >> 2]";
+            offset += 4;
+            break;
+          }
+          case Type::f32: {
+            out << "HEAPF32[pos + " + std::to_string(offset) + " >> 2]";
+            offset += 4;
+            break;
+          }
+          case Type::i64: {
+            ensure8ByteAlign(offset);
+            out << "HEAP64[pos + " + std::to_string(offset) + " >> 3]";
+            offset += 8;
+            break;
+          }
+          case Type::f64: {
+            ensure8ByteAlign(offset);
+            out << "HEAPF64[pos + " + std::to_string(offset) + " >> 3]";
+            offset += 8;
+            break;
+          }
+          default: {
+            Fatal() << "AutoBatch: unsupported serialization type " << type;
+          }
+        }
+      }
+
+      // Finish the call.
+      out << ");\n";
+
+      // Bump the position to the proper alignment, if we need to, and update.
+      ensure8ByteAlign(offset); // no need for bool TODO
+      out << "        pos += " << std::to_string(offset) << ";\n";
+
+      // Finish the switch case.
+      out << "        continue;\n";
+      out << "      }\n";
+    }
+
+    // Error handling.
+    out << "      default: throw `invalid id ${funcId}`;\n";
+
+    // End the switch, loop, and function.
+    out << "    }\n";
+    out << "  }\n";
+    out << "}\n";
+  }
+
+  // Given an offset, bump it to 8-byte alignment.
(We only need to handle the + // case of offset 4, as our values are all 32 or 64-bit.) Returns true if we + // changed the value. + bool ensure8ByteAlign(Index& offset) { + if (offset % 8) { + assert(offset % 8 == 4); + offset += 4; + return true; + } + return false; + } +}; + +} // anonymous namespace + +Pass* createAutoBatchPass() { return new AutoBatch(); } + +} // namespace wasm diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt index c2952e174b8..d43a1b53438 100644 --- a/src/passes/CMakeLists.txt +++ b/src/passes/CMakeLists.txt @@ -20,6 +20,7 @@ set(passes_SOURCES AbstractTypeRefining.cpp AlignmentLowering.cpp Asyncify.cpp + AutoBatch.cpp AvoidReinterprets.cpp CoalesceLocals.cpp CodePushing.cpp diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp index 0e6e28267c2..a31365d9c53 100644 --- a/src/passes/pass.cpp +++ b/src/passes/pass.cpp @@ -96,6 +96,8 @@ void PassRegistry::registerPasses() { registerPass("asyncify", "async/await style transform, allowing pausing and resuming", createAsyncifyPass); + registerPass( + "autobatch", "automatically batch calls to imports", createAutoBatchPass); registerPass("avoid-reinterprets", "Tries to avoid reinterpret operations via more loads", createAvoidReinterpretsPass); diff --git a/src/passes/passes.h b/src/passes/passes.h index be06369a9f8..2c77adce179 100644 --- a/src/passes/passes.h +++ b/src/passes/passes.h @@ -25,6 +25,7 @@ class Pass; Pass* createAbstractTypeRefiningPass(); Pass* createAlignmentLoweringPass(); Pass* createAsyncifyPass(); +Pass* createAutoBatchPass(); Pass* createAvoidReinterpretsPass(); Pass* createCoalesceLocalsPass(); Pass* createCoalesceLocalsWithLearningPass(); diff --git a/test/lit/node/autobatch-js-post.js b/test/lit/node/autobatch-js-post.js new file mode 100644 index 00000000000..c1d1dc9a6f4 --- /dev/null +++ b/test/lit/node/autobatch-js-post.js @@ -0,0 +1,25 @@ +// Code that goes after the flush() function that autobatch generates. This +// instantiates and runs the code. 
+ +// We test with and without autobatching. Without, there is no flush method to +// add. +if (typeof flush === 'function') { + imports.autobatch = { + flush: flush, + }; +} + +let instance = new WebAssembly.Instance(mod, imports); + +let buffer = instance.exports.mem.buffer; +HEAP32 = new Int32Array(buffer); +HEAP64 = new BigInt64Array(buffer); +HEAPF32 = new Float32Array(buffer); +HEAPF64 = new Float64Array(buffer); + +console.log('calling caller'); +let result = instance.exports.caller(); +console.log(`result: ${result}`); + +console.log('test complete.'); + diff --git a/test/lit/node/autobatch-js-pre.js b/test/lit/node/autobatch-js-pre.js new file mode 100644 index 00000000000..ca79a26f99e --- /dev/null +++ b/test/lit/node/autobatch-js-pre.js @@ -0,0 +1,29 @@ +// Code that goes before the flush() function that autobatch generates. This +// loads the binary and prepares the imports and other globals. + +let argv = process.argv.slice(2); + +let binary = require('fs').readFileSync(argv[0]); + +let mod = new WebAssembly.Module(binary); + +let imports = { + outside: { + foo1: (x, y) => { + console.log(`foo1: ${x} ${y}`); + }, + foo2: (x, y) => { + console.log(`foo2: ${x} ${y}`); + }, + foo3: (x, y) => { + console.log(`foo3: ${x} ${y}`); + }, + bar: () => { + console.log('bar'); + return 42; + }, + }, +}; + +let HEAP32, HEAP64, HEAPF32, HEAPF64; + diff --git a/test/lit/node/autobatch-js.wast b/test/lit/node/autobatch-js.wast new file mode 100644 index 00000000000..a352c321d78 --- /dev/null +++ b/test/lit/node/autobatch-js.wast @@ -0,0 +1,100 @@ +;; Similar testcase to autobatch.wast, but here we test the JS output and +;; execution. + +(module + ;; This serializes as [i32 id, i32 param, f64 param], which is a total of 16 + ;; bytes. The f64 is aligned properly just by how the offsets work out. 
+ (import "outside" "foo1" (func $noresult1 (param i32) (param f64))) + + ;; This serializes as [i32 id, i64 param, f32 param], which is a total of 16 + ;; bytes again, but now the 64-bit param must have a 4-byte buffer before it, + ;; so it is aligned. + (import "outside" "foo2" (func $noresult2 (param i64) (param f32))) + + ;; This serializes as [i32 id, i32 param, f32 param], which is a total of 12 + ;; bytes. We bump $cmdbufpos by 16, to keep the thing after us aligned. + (import "outside" "foo3" (func $noresult3 (param i32) (param f32))) + + (import "outside" "bar" (func $result (result f64))) + + (memory $mem 10 20) + (export "mem" (memory $mem)) + + (func $caller (export "caller") (result f64) + ;; Two calls and a flush. + (call $noresult1 + (i32.const 42) + (f64.const 3.14159) + ) + (call $noresult2 + (i64.const 1234) + (f32.const 2.71828) + ) + (drop (call $result)) + + ;; One call and a flush. + (call $noresult3 + (i32.const -1) + (f32.const -2.3) + ) + (drop (call $result)) + + ;; Flush and test we get the result. + (call $result) + ) +) + +;; Build the autobatched wasm and JS. +;; RUN: wasm-opt %s --autobatch -o %t.wasm --pass-arg=autobatch-js@%t.js + +;; Compare the JS to what we expect. 
+;; RUN: cat %t.js | filecheck %s --check-prefix=JS + +;; JS: function flush(pos, end) { +;; JS-NEXT: while (pos != end) { +;; JS-NEXT: let funcId = HEAP32[pos >> 2]; +;; JS-NEXT: switch (funcId) { +;; JS-NEXT: case 0: { +;; JS-NEXT: imports['outside']['foo1'](HEAP32[pos + 4 >> 2], HEAPF64[pos + 8 >> 3]); +;; JS-NEXT: pos += 16; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: case 1: { +;; JS-NEXT: imports['outside']['foo2'](HEAP64[pos + 8 >> 3], HEAPF32[pos + 16 >> 2]); +;; JS-NEXT: pos += 24; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: case 2: { +;; JS-NEXT: imports['outside']['foo3'](HEAP32[pos + 4 >> 2], HEAPF32[pos + 8 >> 2]); +;; JS-NEXT: pos += 16; +;; JS-NEXT: continue; +;; JS-NEXT: } +;; JS-NEXT: } +;; JS-NEXT: } +;; JS-NEXT: } + +;; Combine our test JS with the generated JS. +;; RUN: cat %S/autobatch-js-pre.js > %t.combined.js +;; RUN: cat %t.js >> %t.combined.js +;; RUN: cat %S/autobatch-js-post.js >> %t.combined.js + +;; Execute the autobatched wasm. +;; RUN: node %t.combined.js %t.wasm | filecheck %s --check-prefix=EXEC + +;; EXEC: calling caller +;; EXEC-NEXT: foo1: 42 3.14159 +;; EXEC-NEXT: foo2: 1234 2.718280076980591 +;; EXEC-NEXT: bar +;; EXEC-NEXT: foo3: -1 -2.299999952316284 +;; EXEC-NEXT: bar +;; EXEC-NEXT: bar +;; EXEC-NEXT: result: 42 +;; EXEC-NEXT: test complete. + +;; Execute it again, without autobatching. The execution is the same. +;; RUN: wasm-opt %s -o %t.wasm +;; RUN: cat %S/autobatch-js-pre.js > %t.combined.js +;; RUN: cat %S/autobatch-js-post.js >> %t.combined.js +;; RUN: node %t.combined.js %t.wasm | filecheck %s --check-prefix=EXEC + + diff --git a/test/lit/passes/autobatch.wast b/test/lit/passes/autobatch.wast new file mode 100644 index 00000000000..a0f1da5574b --- /dev/null +++ b/test/lit/passes/autobatch.wast @@ -0,0 +1,204 @@ +;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. 
+ +;; RUN: wasm-opt %s --autobatch -S -o - | filecheck %s + +;; The output will replace non-returning imports with wrappers that serialize +;; commands. +;; +;; Value-returning imports will flush in their wrappers, then do the call. +;; +;; $caller does not change at all, as calls to the imports now call wrappers +;; with the same names. + +(module + ;; This serializes as [i32 id, i32 param, f64 param], which is a total of 16 + ;; bytes. The f64 is aligned properly just by how the offsets work out. + (import "outside" "foo1" (func $noresult1 (param i32) (param f64))) + + ;; This serializes as [i32 id, i64 param, f32 param], which is a total of 16 + ;; bytes again, but now the 64-bit param must have a 4-byte buffer before it, + ;; so it is aligned. + (import "outside" "foo2" (func $noresult2 (param i64) (param f32))) + + ;; A function with a result. We purposefully put this before "foo3" to test + ;; for mixups in the indexing (we do not index "bar", as calls to functions + ;; returning a value are not serialized). + (import "outside" "bar" (func $result (result f64))) + + ;; This serializes as [i32 id, i32 param, f32 param], which is a total of 12 + ;; bytes. We bump $cmdbufpos by 16, to keep the thing after us aligned. 
+ (import "outside" "foo3" (func $noresult3 (param i32) (param f32))) + + ;; CHECK: (type $0 (func (result f64))) + + ;; CHECK: (type $1 (func (param i32 f64))) + + ;; CHECK: (type $2 (func (param i64 f32))) + + ;; CHECK: (type $3 (func (param i32 f32))) + + ;; CHECK: (type $4 (func (param i32 i32))) + + ;; CHECK: (import "autobatch" "flush" (func $flush (param i32 i32))) + + ;; CHECK: (import "outside" "foo1" (func $noresult1_6 (param i32 f64))) + + ;; CHECK: (import "outside" "foo2" (func $noresult2_7 (param i64 f32))) + + ;; CHECK: (import "outside" "bar" (func $result_8 (result f64))) + + ;; CHECK: (import "outside" "foo3" (func $noresult3_9 (param i32 f32))) + + ;; CHECK: (global $cmdbufbase (mut i32) (i32.const 0)) + + ;; CHECK: (global $cmdbufpos (mut i32) (i32.const 0)) + + ;; CHECK: (memory $mem 10 20) + (memory $mem 10 20) + + ;; CHECK: (func $noresult1 (param $0 i32) (param $1 f64) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store offset=4 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (f64.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 16) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $noresult2 (param $0 i64) (param $1 f32) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i64.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: 
(f32.store offset=16 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 24) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $result (result f64) + ;; CHECK-NEXT: (if + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: (then + ;; CHECK-NEXT: (call $flush + ;; CHECK-NEXT: (global.get $cmdbufbase) + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (global.get $cmdbufbase) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $result_8) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $noresult3 (param $0 i32) (param $1 f32) + ;; CHECK-NEXT: (local $2 i32) + ;; CHECK-NEXT: (local.set $2 + ;; CHECK-NEXT: (global.get $cmdbufpos) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 2) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.store offset=4 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (f32.store offset=8 + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (global.set $cmdbufpos + ;; CHECK-NEXT: (i32.add + ;; CHECK-NEXT: (local.get $2) + ;; CHECK-NEXT: (i32.const 16) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + + ;; CHECK: (func $caller (result f64) + ;; CHECK-NEXT: (call $noresult1 + ;; CHECK-NEXT: (i32.const 42) + ;; CHECK-NEXT: (f64.const 3.14159) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult2 + ;; CHECK-NEXT: (i64.const 1234) + ;; CHECK-NEXT: (f32.const 2.718280076980591) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (call $result) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult3 + ;; CHECK-NEXT: (i32.const -1) + ;; CHECK-NEXT: (f32.const -2.299999952316284) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult1 
+ ;; CHECK-NEXT: (i32.const 942) + ;; CHECK-NEXT: (f64.const 93.14159) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $noresult2 + ;; CHECK-NEXT: (i64.const 91234) + ;; CHECK-NEXT: (f32.const 92.71827697753906) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $result) + ;; CHECK-NEXT: ) + (func $caller (result f64) + ;; A bunch of calls to them all. + (call $noresult1 + (i32.const 42) + (f64.const 3.14159) + ) + (call $noresult2 + (i64.const 1234) + (f32.const 2.71828) + ) + (drop (call $result)) + + (call $noresult3 + (i32.const -1) + (f32.const -2.3) + ) + + (call $noresult1 + (i32.const 942) + (f64.const 93.14159) + ) + (call $noresult2 + (i64.const 91234) + (f32.const 92.71828) + ) + (call $result) + ) +) +