feat: add WASM SIMD128 path for f_pixel::diff()

lilith · kornelski · commit b1df2d271552 · 2026-02-11T02:56:21.000Z
Add a wasm32+simd128 implementation of the hot diff() function using
safe core::arch::wasm32 intrinsics (f32x4 constructor, no unsafe).

Translates the existing SSE/NEON pattern:
- f32x4() to pack ARGB into a v128 (safe, no pointer load)
- f32x4_sub/add/mul/max for packed arithmetic
- f32x4_extract_lane + scalar add for horizontal RGB sum

Also adds wasm32+simd128 to the repr(C, align(16)) cfg and excludes
it from the scalar fallback guard.

Measured ~1.9x end-to-end speedup on a 256x256 quantization benchmark
running in wasmtime (scalar: 260ms/iter → simd128: 135ms/iter).
diff --git a/src/pal.rs b/src/pal.rs
@@ -27,15 +27,15 @@ const LIQ_WEIGHT_MSE: f64 = 0.45;
 /// ARGB layout is important for x86 SIMD.
 /// I've created the newtype wrapper to try a 16-byte alignment, but it didn't improve perf :(
 #[cfg_attr(
-    any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64")),
+    any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128")),
     repr(C, align(16))
 )]
 #[derive(Debug, Copy, Clone, Default, PartialEq)]
 #[allow(non_camel_case_types)]
 pub struct f_pixel(pub ARGBF);
 
 impl f_pixel {
-    #[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"))))]
+    #[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128"))))]
     #[inline(always)]
     pub fn diff(&self, other: &f_pixel) -> f32 {
         let alphas = other.0.a - self.0.a;
@@ -51,6 +51,31 @@ impl f_pixel {
         (black.b * black.b).max(white.b * white.b)
     }
 
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    #[inline(always)]
+    pub fn diff(&self, other: &f_pixel) -> f32 {
+        use core::arch::wasm32::*;
+
+        let px = f32x4(self.0.a, self.0.r, self.0.g, self.0.b);
+        let py = f32x4(other.0.a, other.0.r, other.0.g, other.0.b);
+
+        // y.a - x.a, then broadcast lane 0 to all four
+        let alpha_diff = f32x4_sub(py, px);
+        let alphas = f32x4_splat(f32x4_extract_lane::<0>(alpha_diff));
+
+        let mut onblack = f32x4_sub(px, py); // x - y
+        let mut onwhite = f32x4_add(onblack, alphas); // x - y + (y.a - x.a)
+
+        onblack = f32x4_mul(onblack, onblack);
+        onwhite = f32x4_mul(onwhite, onwhite);
+        let max = f32x4_max(onwhite, onblack);
+
+        // add rgb (lanes 1,2,3), not a (lane 0)
+        f32x4_extract_lane::<1>(max)
+            + f32x4_extract_lane::<2>(max)
+            + f32x4_extract_lane::<3>(max)
+    }
+
     #[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
     #[inline(always)]
     pub fn diff(&self, other: &Self) -> f32 {