AI-Hypercomputer
diff --git a/src/maxdiffusion/__init__.py b/src/maxdiffusion/__init__.py
Lines changed: 182 additions & 196 deletions
diff --git a/src/maxdiffusion/kernels/splash_attention/splash_attention_kernel_test.py b/src/maxdiffusion/kernels/splash_attention/splash_attention_kernel_test.py
Lines changed: 5 additions & 9 deletions
diff --git a/src/maxdiffusion/kernels/splash_attention/splash_attention_mask.py b/src/maxdiffusion/kernels/splash_attention/splash_attention_mask.py
Lines changed: 19 additions & 25 deletions
diff --git a/src/maxdiffusion/kernels/splash_attention/splash_attention_mask_info.py b/src/maxdiffusion/kernels/splash_attention/splash_attention_mask_info.py
Lines changed: 4 additions & 6 deletions
diff --git a/src/maxdiffusion/kernels/splash_attention/splash_attention_mask_test.py b/src/maxdiffusion/kernels/splash_attention/splash_attention_mask_test.py
Lines changed: 52 additions & 58 deletions
diff --git a/src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py b/src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py
Lines changed: 42 additions & 46 deletions
diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py
Lines changed: 0 additions & 1 deletion
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan_vace.py b/src/maxdiffusion/models/wan/transformers/transformer_wan_vace.py
Lines changed: 5 additions & 7 deletions
diff --git a/src/maxdiffusion/pedagogical_examples/to_tfrecords.py b/src/maxdiffusion/pedagogical_examples/to_tfrecords.py
Lines changed: 6 additions & 8 deletions
@@ -290,14 +290,7 @@ def _generate_inputs(
     is_mqa: bool,
     is_segmented: bool,
     use_sinks: bool = False,
-) -> tuple[
-    jax.Array,
-    jax.Array,
-    jax.Array,
-    jax.Array | None,
-    splash.SegmentIds | None,
-    jax.Array,
-]:
+) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array | None, splash.SegmentIds | None, jax.Array,]:
   seed = data.draw(seed_strategy())
   key = random.key(seed)
   k1, k2, k3, k_sinks, k_do = random.split(key, 5)
@@ -351,7 +344,10 @@ def test_splash_attention(self, is_mqa, is_segmented, is_dynamic_mask, data):
     q_seq_len, kv_seq_len = model_config.q_seq_len, model_config.kv_seq_len
     q, k, v, _, segment_ids, _ = _generate_inputs(data, model_config, is_mqa, is_segmented)
     attn_logits_soft_cap = data.draw(attn_logits_soft_cap_strategy())
-    mask = data.draw(mask_strategy(q_seq_len, kv_seq_len)).get_mask()
+    mask_obj = data.draw(mask_strategy(q_seq_len, kv_seq_len))
+    mask = mask_obj.get_mask()
+    # Skip edge case: single attention head + random mask triggers JAX/Mosaic compilation bug
+    hp.assume(not (model_config.num_q_heads == 1 and isinstance(mask_obj, RandomMask)))
     check_mask_no_empty_rows(mask, segment_ids)
     if is_dynamic_mask:
       mask = jnp.array(mask[:, :])
 
@@ -278,14 +278,12 @@ def __eq__(self, other: object):
     return self.shape == other.shape and self.offset == other.offset and np.array_equal(self.q_sequence, other.q_sequence)
 
   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.offset,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.offset,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))
 
 
 class ChunkedCausalMask(_ComputableMask):
@@ -340,14 +338,12 @@ def __eq__(self, other: object):
     )
 
   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.chunk_size,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.chunk_size,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))
 
 
 class LocalMask(_ComputableMask):
@@ -419,15 +415,13 @@ def __eq__(self, other: object):
     )
 
   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.window_size,
-            self.offset,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.window_size,
+        self.offset,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))
 
 
 @dataclasses.dataclass(slots=True)
 
@@ -446,12 +446,10 @@ def _process_mask(
   # Partial blocks are deduplicated and stored in unique_chunks to save memory.
   for coords in np.ndindex((q_blocks_count, kv_blocks_count)):
     (q_idx, kv_idx) = coords
-    chunk = mask[
-        (
-            slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
-            slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
-        )
-    ]
+    chunk = mask[(
+        slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
+        slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
+    )]
     if chunk.any():
       if chunk.all():
         state_grid[q_idx, kv_idx] = 2
 
@@ -374,39 +374,37 @@ def test_lazy_causal_mask_chunking(self, block_size: tuple[int, int], shape: tup
         block_size,
     )
 
-  @parameterized.parameters(
-      [
-          ((256, 256), (1024, 1024), (128, None), 0),
-          ((256, 128), (1024, 1024), (128, None), 16),
-          ((128, 256), (1024, 1024), (128, None), 16),
-          ((256, 256), (1024, 1024), (128, 256), 0),
-          ((256, 128), (1024, 1024), (128, 256), 0),
-          ((128, 256), (1024, 1024), (128, 256), 16),
-          ((256, 256), (1024, 1024), (None, 256), 0),
-          ((256, 128), (1024, 1024), (None, 256), 32),
-          ((128, 256), (1024, 1024), (None, 256), 32),
-          #
-          ((256, 256), (1024, 2048), (128, None), 0),
-          ((256, 128), (1024, 2048), (128, None), 16),
-          ((128, 256), (1024, 2048), (128, None), 16),
-          ((256, 256), (1024, 2048), (128, 256), 0),
-          ((256, 128), (1024, 2048), (128, 256), 0),
-          ((128, 256), (1024, 2048), (128, 256), 16),
-          ((256, 256), (1024, 2048), (None, 256), 0),
-          ((256, 128), (1024, 2048), (None, 256), 32),
-          ((128, 256), (1024, 2048), (None, 256), 32),
-          #
-          ((256, 256), (2048, 1024), (128, None), 0),
-          ((256, 128), (2048, 1024), (128, None), 16),
-          ((128, 256), (2048, 1024), (128, None), 16),
-          ((256, 256), (2048, 1024), (128, 256), 0),
-          ((256, 128), (2048, 1024), (128, 256), 0),
-          ((128, 256), (2048, 1024), (128, 256), 16),
-          ((256, 256), (2048, 1024), (None, 256), 0),
-          ((256, 128), (2048, 1024), (None, 256), 32),
-          ((128, 256), (2048, 1024), (None, 256), 32),
-      ]
-  )
+  @parameterized.parameters([
+      ((256, 256), (1024, 1024), (128, None), 0),
+      ((256, 128), (1024, 1024), (128, None), 16),
+      ((128, 256), (1024, 1024), (128, None), 16),
+      ((256, 256), (1024, 1024), (128, 256), 0),
+      ((256, 128), (1024, 1024), (128, 256), 0),
+      ((128, 256), (1024, 1024), (128, 256), 16),
+      ((256, 256), (1024, 1024), (None, 256), 0),
+      ((256, 128), (1024, 1024), (None, 256), 32),
+      ((128, 256), (1024, 1024), (None, 256), 32),
+      #
+      ((256, 256), (1024, 2048), (128, None), 0),
+      ((256, 128), (1024, 2048), (128, None), 16),
+      ((128, 256), (1024, 2048), (128, None), 16),
+      ((256, 256), (1024, 2048), (128, 256), 0),
+      ((256, 128), (1024, 2048), (128, 256), 0),
+      ((128, 256), (1024, 2048), (128, 256), 16),
+      ((256, 256), (1024, 2048), (None, 256), 0),
+      ((256, 128), (1024, 2048), (None, 256), 32),
+      ((128, 256), (1024, 2048), (None, 256), 32),
+      #
+      ((256, 256), (2048, 1024), (128, None), 0),
+      ((256, 128), (2048, 1024), (128, None), 16),
+      ((128, 256), (2048, 1024), (128, None), 16),
+      ((256, 256), (2048, 1024), (128, 256), 0),
+      ((256, 128), (2048, 1024), (128, 256), 0),
+      ((128, 256), (2048, 1024), (128, 256), 16),
+      ((256, 256), (2048, 1024), (None, 256), 0),
+      ((256, 128), (2048, 1024), (None, 256), 32),
+      ((128, 256), (2048, 1024), (None, 256), 32),
+  ])
   def test_lazy_local_mask_chunking(
       self,
       block_size: tuple[int, int],
@@ -1164,17 +1162,15 @@ def test_two_qseq_shards_causal_local_stacked(self):
 
     expected_num_active_blocks = np.array([10, 10], dtype=np.int32)
 
-    expected_partial_mask_blocks = np.stack(
-        [
-            np.tri(*block_shape, dtype=np.int8),
-            np.triu(
-                np.tri(*block_shape, window_size, dtype=np.int8),
-                -window_size,
-            ),
-            np.tri(*block_shape, -window_size, dtype=np.int8),
-            np.triu(np.ones(block_shape, dtype=np.int8), window_size),
-        ]
-    )
+    expected_partial_mask_blocks = np.stack([
+        np.tri(*block_shape, dtype=np.int8),
+        np.triu(
+            np.tri(*block_shape, window_size, dtype=np.int8),
+            -window_size,
+        ),
+        np.tri(*block_shape, -window_size, dtype=np.int8),
+        np.triu(np.ones(block_shape, dtype=np.int8), window_size),
+    ])
 
     expected_mask_info = mask_info_lib.MaskInfo(
         expected_mask_next,
@@ -1345,20 +1341,18 @@ def test_two_shards_local_wide_local_narrow_stacked(self, q_seq_shards, kv_seq_s
 
     expected_active_rows_dkv = np.concatenate(
         [
-            np.array(
-                [
-                    0,
-                    0,
-                    1,
-                    1,
-                    1,
-                    2,
-                    2,
-                    2,
-                    3,
-                    3,
-                ]
-            ),
+            np.array([
+                0,
+                0,
+                1,
+                1,
+                1,
+                2,
+                2,
+                2,
+                3,
+                3,
+            ]),
             np.array([0, 0, 1, 1, 2, 2, 3, -1, -1, -1]),
         ],
         axis=0,
 
@@ -202,29 +202,27 @@ def setup(self):
         dtype=self.dtype,
         param_dtype=self.weights_dtype,
     )
-    self.img_mlp = nn.Sequential(
-        [
-            nn.Dense(
-                int(self.dim * self.mlp_ratio),
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-            nn.gelu,
-            nn.Dense(
-                self.dim,
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-        ]
-    )
+    self.img_mlp = nn.Sequential([
+        nn.Dense(
+            int(self.dim * self.mlp_ratio),
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+        nn.gelu,
+        nn.Dense(
+            self.dim,
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+    ])
 
     self.txt_norm2 = nn.LayerNorm(
         use_bias=False,
@@ -233,29 +231,27 @@ def setup(self):
         dtype=self.dtype,
         param_dtype=self.weights_dtype,
     )
-    self.txt_mlp = nn.Sequential(
-        [
-            nn.Dense(
-                int(self.dim * self.mlp_ratio),
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-            nn.gelu,
-            nn.Dense(
-                self.dim,
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-        ]
-    )
+    self.txt_mlp = nn.Sequential([
+        nn.Dense(
+            int(self.dim * self.mlp_ratio),
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+        nn.gelu,
+        nn.Dense(
+            self.dim,
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+    ])
 
     # let chunk size default to None
     self._chunk_size = None
 
@@ -1104,7 +1104,6 @@ def __init__(
     )
     self.mesh = mesh
 
-  @nnx.jit
   def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):
     feat_cache.init_cache()
     if x.shape[-1] != 3:
 
@@ -460,13 +460,11 @@ def __call__(
 
     control_hidden_states = self.vace_patch_embedding(control_hidden_states)
     control_hidden_states = jax.lax.collapse(control_hidden_states, 1, -1)
-    control_hidden_states_padding = jnp.zeros(
-        (
-            batch_size,
-            control_hidden_states.shape[1],
-            hidden_states.shape[2] - control_hidden_states.shape[2],
-        )
-    )
+    control_hidden_states_padding = jnp.zeros((
+        batch_size,
+        control_hidden_states.shape[1],
+        hidden_states.shape[2] - control_hidden_states.shape[2],
+    ))
 
     control_hidden_states = jnp.concatenate([control_hidden_states, control_hidden_states_padding], axis=2)
 
 
@@ -54,14 +54,12 @@
 dl_manager = tfds.download.DownloadManager(download_dir="/tmp")
 tmp_dataset = "dataset"
 
-TRANSFORMS = transforms.Compose(
-    [
-        transforms.ToTensor(),
-        transforms.Resize(size=512, interpolation=transforms.InterpolationMode.BICUBIC),
-        transforms.CenterCrop(size=512),
-        transforms.Normalize([0.5], [0.5]),
-    ]
-)
+TRANSFORMS = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Resize(size=512, interpolation=transforms.InterpolationMode.BICUBIC),
+    transforms.CenterCrop(size=512),
+    transforms.Normalize([0.5], [0.5]),
+])
 
 
 def delete_files(path):
Original file line number	Diff line number	Diff line change
`@@ -1104,7 +1104,6 @@ def __init__(`
`1104`	`1104`	`)`
`1105`	`1105`	`self.mesh = mesh`
`1106`	`1106`
`1107`		`- @nnx.jit`
`1108`	`1107`	`def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):`
`1109`	`1108`	`feat_cache.init_cache()`
`1110`	`1109`	`if x.shape[-1] != 3:`