Skip to content

Commit b1df2d2

Browse files
lilithkornelski
authored and committed
feat: add WASM SIMD128 path for f_pixel::diff()
Add a wasm32+simd128 implementation of the hot diff() function using safe core::arch::wasm32 intrinsics (f32x4 constructor, no unsafe). Translates the existing SSE/NEON pattern:
- f32x4() to pack ARGB into a v128 (safe, no pointer load)
- f32x4_sub/add/mul/max for packed arithmetic
- f32x4_extract_lane + scalar add for horizontal RGB sum

Also adds wasm32+simd128 to the repr(C, align(16)) cfg and excludes it from the scalar fallback guard. Measured ~1.9x end-to-end speedup on a 256x256 quantization benchmark running in wasmtime (scalar: 260ms/iter → simd128: 135ms/iter).
1 parent 21a6396 commit b1df2d2

1 file changed

Lines changed: 27 additions & 2 deletions

File tree

src/pal.rs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ const LIQ_WEIGHT_MSE: f64 = 0.45;
2727
/// ARGB layout is important for x86 SIMD.
2828
/// I've created the newtype wrapper to try a 16-byte alignment, but it didn't improve perf :(
2929
#[cfg_attr(
30-
any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64")),
30+
any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128")),
3131
repr(C, align(16))
3232
)]
3333
#[derive(Debug, Copy, Clone, Default, PartialEq)]
3434
#[allow(non_camel_case_types)]
3535
pub struct f_pixel(pub ARGBF);
3636

3737
impl f_pixel {
38-
#[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"))))]
38+
#[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128"))))]
3939
#[inline(always)]
4040
pub fn diff(&self, other: &f_pixel) -> f32 {
4141
let alphas = other.0.a - self.0.a;
@@ -51,6 +51,31 @@ impl f_pixel {
5151
(black.b * black.b).max(white.b * white.b)
5252
}
5353

54+
/// WASM SIMD128 variant of `diff`.
///
/// Packs both pixels as `(a, r, g, b)` lanes, then for each colour lane takes
/// the larger of `(self - other)^2` and `(self - other + alpha_delta)^2`,
/// where `alpha_delta = other.a - self.a`. Returns the sum of those maxima
/// over the r, g and b lanes; the alpha lane does not contribute directly.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline(always)]
pub fn diff(&self, other: &f_pixel) -> f32 {
    use core::arch::wasm32::{
        f32x4, f32x4_add, f32x4_extract_lane, f32x4_max, f32x4_mul, f32x4_splat, f32x4_sub,
    };

    // Lane order is a, r, g, b (lane 0 = alpha).
    let lhs = f32x4(self.0.a, self.0.r, self.0.g, self.0.b);
    let rhs = f32x4(other.0.a, other.0.r, other.0.g, other.0.b);

    // other.a - self.a, replicated into every lane.
    let alpha_delta = f32x4_splat(f32x4_extract_lane::<0>(f32x4_sub(rhs, lhs)));

    // self - other, and the same difference shifted by the alpha delta.
    let on_black = f32x4_sub(lhs, rhs);
    let on_white = f32x4_add(on_black, alpha_delta);

    // Per-lane maximum of the two squared differences.
    let worst = f32x4_max(
        f32x4_mul(on_white, on_white),
        f32x4_mul(on_black, on_black),
    );

    // Horizontal sum of the colour lanes only (1 = r, 2 = g, 3 = b).
    let red = f32x4_extract_lane::<1>(worst);
    let green = f32x4_extract_lane::<2>(worst);
    let blue = f32x4_extract_lane::<3>(worst);
    red + green + blue
}
78+
5479
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
5580
#[inline(always)]
5681
pub fn diff(&self, other: &Self) -> f32 {

0 commit comments

Comments
 (0)