
Commit 0cc19c7

Commit message: push all the changes
1 parent: f7b4145

30 files changed: 492 additions & 441 deletions

src/maxdiffusion/__init__.py

Lines changed: 196 additions & 182 deletions
Large diffs are not rendered by default.

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'tokamax_flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring
 flash_min_seq_length: 0

 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
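
The only change here swaps the default Wan 2.1 attention kernel from 'tokamax_flash' back to 'flash'. As a hedged illustration of how that setting is constrained, the sketch below validates a configured kernel name against the values listed in the comment; the helper is hypothetical and not part of maxdiffusion.

# Hypothetical helper, not part of maxdiffusion: checks an `attention` value
# against the kernel names listed in the config comment above.
SUPPORTED_ATTENTION = {"dot_product", "flash", "tokamax_flash", "cudnn_flash_te", "ring", "tokamax_ring"}

def check_attention_kernel(name: str) -> str:
  if name not in SUPPORTED_ATTENTION:
    raise ValueError(f"Unsupported attention kernel {name!r}; expected one of {sorted(SUPPORTED_ATTENTION)}")
  return name

check_attention_kernel("flash")  # the default restored by this commit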

src/maxdiffusion/configuration_utils.py

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ def load_config(
     proxies=proxies,
     resume_download=resume_download,
     local_files_only=local_files_only,
-    use_auth_token=use_auth_token,
+    token=use_auth_token,
     user_agent=user_agent,
     subfolder=subfolder,
     revision=revision,
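
This rename tracks huggingface_hub, which deprecated the `use_auth_token` keyword in favor of `token`. A minimal sketch of a new-style call, assuming the config file is fetched with `hf_hub_download` (the repo id below is a placeholder):

from huggingface_hub import hf_hub_download

# `token` replaces the deprecated `use_auth_token` keyword; None means
# anonymous access, a string passes a user access token.
config_file = hf_hub_download(
    repo_id="org/some-model",  # placeholder repo id
    filename="config.json",
    token=None,
)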

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel.py

Lines changed: 5 additions & 4 deletions
@@ -895,9 +895,7 @@ def _splash_attention_forward_ring_raw(
   num_kv_heads = k.shape[0]

   if len(k.shape) != expected_kv_rank:
-    raise ValueError(
-        f"Expected {expected_kv_rank}-dim 'key' tensor for MQA. Instead got a {len(k.shape)}-dim one."
-    )
+    raise ValueError(f"Expected {expected_kv_rank}-dim 'key' tensor for MQA. Instead got a {len(k.shape)}-dim one.")

   if k.shape[-1] != head_dim_qk:
     raise ValueError(f"Expected 'key' head dimension to be: {head_dim_qk}. Instead got: {k.shape[-1]}.")

@@ -1054,10 +1052,13 @@ def mask_index_map(h, grid_idx, rows_ref, cols_ref, mask_next_ref=None, *_):
       pl.BlockSpec((None, bq, NUM_LANES), logsumexp_index_map),
   ]

-  kernel_name = f"{get_kernel_name(is_mqa=is_mqa, save_residuals=True, is_segmented=segment_ids is not None, phase='fwd')}_ring_raw"
+  kernel_name = (
+      f"{get_kernel_name(is_mqa=is_mqa, save_residuals=True, is_segmented=segment_ids is not None, phase='fwd')}_ring_raw"
+  )
   metadata = {"xprof_metadata": json.dumps(dataclasses.asdict(config))}

   vmem_inputs = [q, k, v, q_segment_ids, kv_segment_ids, mask_info.partial_mask_blocks]
+
   def _fwd_cost_estimate(
       q: jax.Array,
       k: jax.Array,

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel_test.py

Lines changed: 8 additions & 1 deletion
@@ -290,7 +290,14 @@ def _generate_inputs(
    is_mqa: bool,
    is_segmented: bool,
    use_sinks: bool = False,
-) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array | None, splash.SegmentIds | None, jax.Array,]:
+) -> tuple[
+    jax.Array,
+    jax.Array,
+    jax.Array,
+    jax.Array | None,
+    splash.SegmentIds | None,
+    jax.Array,
+]:
   seed = data.draw(seed_strategy())
   key = random.key(seed)
   k1, k2, k3, k_sinks, k_do = random.split(key, 5)

src/maxdiffusion/kernels/splash_attention/splash_attention_mask.py

Lines changed: 25 additions & 19 deletions
@@ -278,12 +278,14 @@ def __eq__(self, other: object):
     return self.shape == other.shape and self.offset == other.offset and np.array_equal(self.q_sequence, other.q_sequence)

   def __hash__(self):
-    return hash((
-        type(self),
-        self.shape,
-        self.offset,
-        self.q_sequence.tobytes() if self.q_sequence is not None else None,
-    ))
+    return hash(
+        (
+            type(self),
+            self.shape,
+            self.offset,
+            self.q_sequence.tobytes() if self.q_sequence is not None else None,
+        )
+    )


 class ChunkedCausalMask(_ComputableMask):

@@ -338,12 +340,14 @@ def __eq__(self, other: object):
     )

   def __hash__(self):
-    return hash((
-        type(self),
-        self.shape,
-        self.chunk_size,
-        self.q_sequence.tobytes() if self.q_sequence is not None else None,
-    ))
+    return hash(
+        (
+            type(self),
+            self.shape,
+            self.chunk_size,
+            self.q_sequence.tobytes() if self.q_sequence is not None else None,
+        )
+    )


 class LocalMask(_ComputableMask):

@@ -415,13 +419,15 @@ def __eq__(self, other: object):
     )

   def __hash__(self):
-    return hash((
-        type(self),
-        self.shape,
-        self.window_size,
-        self.offset,
-        self.q_sequence.tobytes() if self.q_sequence is not None else None,
-    ))
+    return hash(
+        (
+            type(self),
+            self.shape,
+            self.window_size,
+            self.offset,
+            self.q_sequence.tobytes() if self.q_sequence is not None else None,
+        )
+    )


 @dataclasses.dataclass(slots=True)
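
These hunks only rewrap the existing `__hash__` bodies, but they make the contract being preserved easy to see: every field compared in `__eq__` also feeds the hash, so equal masks hash equally. A simplified stand-in class (not the real mask types) showing the same pattern:

import numpy as np

class ToyMask:
  """Illustrative stand-in for the mask classes above."""

  def __init__(self, shape, offset, q_sequence=None):
    self.shape = shape
    self.offset = offset
    self.q_sequence = q_sequence

  def __eq__(self, other):
    if not isinstance(other, ToyMask):
      return NotImplemented
    return (
        self.shape == other.shape
        and self.offset == other.offset
        and np.array_equal(self.q_sequence, other.q_sequence)
    )

  def __hash__(self):
    # Hash exactly the fields used in __eq__; the array is converted to bytes
    # because ndarrays are not hashable.
    return hash(
        (
            type(self),
            self.shape,
            self.offset,
            self.q_sequence.tobytes() if self.q_sequence is not None else None,
        )
    )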

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_info.py

Lines changed: 6 additions & 4 deletions
@@ -446,10 +446,12 @@ def _process_mask(
   # Partial blocks are deduplicated and stored in unique_chunks to save memory.
   for coords in np.ndindex((q_blocks_count, kv_blocks_count)):
     (q_idx, kv_idx) = coords
-    chunk = mask[(
-        slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
-        slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
-    )]
+    chunk = mask[
+        (
+            slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
+            slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
+        )
+    ]
     if chunk.any():
       if chunk.all():
         state_grid[q_idx, kv_idx] = 2
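
The hunk above only rewraps the tile-slicing expression, but the surrounding loop is the interesting part: `_process_mask` walks the attention mask one (q_block, kv_block) tile at a time and records whether each tile is empty, partially set, or fully set. Below is a simplified sketch of that classification using the same 0/1/2 coding visible in the hunk; `classify_blocks` is a hypothetical name, and the real function also deduplicates partial blocks and builds further metadata.

import numpy as np

def classify_blocks(mask: np.ndarray, q_block_size: int, kv_block_size: int) -> np.ndarray:
  """Mark each (q, kv) tile of a boolean mask: 0 = empty, 1 = partial, 2 = full."""
  q_blocks = mask.shape[0] // q_block_size
  kv_blocks = mask.shape[1] // kv_block_size
  state_grid = np.zeros((q_blocks, kv_blocks), dtype=np.int8)
  for q_idx, kv_idx in np.ndindex(q_blocks, kv_blocks):
    chunk = mask[
        q_idx * q_block_size : (q_idx + 1) * q_block_size,
        kv_idx * kv_block_size : (kv_idx + 1) * kv_block_size,
    ]
    if chunk.any():
      state_grid[q_idx, kv_idx] = 2 if chunk.all() else 1
  return state_grid

# Example: a 4x4 causal mask split into 2x2 tiles yields [[1, 0], [2, 1]].
print(classify_blocks(np.tril(np.ones((4, 4), dtype=bool)), 2, 2))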

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_test.py

Lines changed: 58 additions & 52 deletions
@@ -374,37 +374,39 @@ def test_lazy_causal_mask_chunking(self, block_size: tuple[int, int], shape: tup
         block_size,
     )

-  @parameterized.parameters([
-      ((256, 256), (1024, 1024), (128, None), 0),
-      ((256, 128), (1024, 1024), (128, None), 16),
-      ((128, 256), (1024, 1024), (128, None), 16),
-      ((256, 256), (1024, 1024), (128, 256), 0),
-      ((256, 128), (1024, 1024), (128, 256), 0),
-      ((128, 256), (1024, 1024), (128, 256), 16),
-      ((256, 256), (1024, 1024), (None, 256), 0),
-      ((256, 128), (1024, 1024), (None, 256), 32),
-      ((128, 256), (1024, 1024), (None, 256), 32),
-      #
-      ((256, 256), (1024, 2048), (128, None), 0),
-      ((256, 128), (1024, 2048), (128, None), 16),
-      ((128, 256), (1024, 2048), (128, None), 16),
-      ((256, 256), (1024, 2048), (128, 256), 0),
-      ((256, 128), (1024, 2048), (128, 256), 0),
-      ((128, 256), (1024, 2048), (128, 256), 16),
-      ((256, 256), (1024, 2048), (None, 256), 0),
-      ((256, 128), (1024, 2048), (None, 256), 32),
-      ((128, 256), (1024, 2048), (None, 256), 32),
-      #
-      ((256, 256), (2048, 1024), (128, None), 0),
-      ((256, 128), (2048, 1024), (128, None), 16),
-      ((128, 256), (2048, 1024), (128, None), 16),
-      ((256, 256), (2048, 1024), (128, 256), 0),
-      ((256, 128), (2048, 1024), (128, 256), 0),
-      ((128, 256), (2048, 1024), (128, 256), 16),
-      ((256, 256), (2048, 1024), (None, 256), 0),
-      ((256, 128), (2048, 1024), (None, 256), 32),
-      ((128, 256), (2048, 1024), (None, 256), 32),
-  ])
+  @parameterized.parameters(
+      [
+          ((256, 256), (1024, 1024), (128, None), 0),
+          ((256, 128), (1024, 1024), (128, None), 16),
+          ((128, 256), (1024, 1024), (128, None), 16),
+          ((256, 256), (1024, 1024), (128, 256), 0),
+          ((256, 128), (1024, 1024), (128, 256), 0),
+          ((128, 256), (1024, 1024), (128, 256), 16),
+          ((256, 256), (1024, 1024), (None, 256), 0),
+          ((256, 128), (1024, 1024), (None, 256), 32),
+          ((128, 256), (1024, 1024), (None, 256), 32),
+          #
+          ((256, 256), (1024, 2048), (128, None), 0),
+          ((256, 128), (1024, 2048), (128, None), 16),
+          ((128, 256), (1024, 2048), (128, None), 16),
+          ((256, 256), (1024, 2048), (128, 256), 0),
+          ((256, 128), (1024, 2048), (128, 256), 0),
+          ((128, 256), (1024, 2048), (128, 256), 16),
+          ((256, 256), (1024, 2048), (None, 256), 0),
+          ((256, 128), (1024, 2048), (None, 256), 32),
+          ((128, 256), (1024, 2048), (None, 256), 32),
+          #
+          ((256, 256), (2048, 1024), (128, None), 0),
+          ((256, 128), (2048, 1024), (128, None), 16),
+          ((128, 256), (2048, 1024), (128, None), 16),
+          ((256, 256), (2048, 1024), (128, 256), 0),
+          ((256, 128), (2048, 1024), (128, 256), 0),
+          ((128, 256), (2048, 1024), (128, 256), 16),
+          ((256, 256), (2048, 1024), (None, 256), 0),
+          ((256, 128), (2048, 1024), (None, 256), 32),
+          ((128, 256), (2048, 1024), (None, 256), 32),
+      ]
+  )
   def test_lazy_local_mask_chunking(
       self,
       block_size: tuple[int, int],

@@ -1162,15 +1164,17 @@ def test_two_qseq_shards_causal_local_stacked(self):

     expected_num_active_blocks = np.array([10, 10], dtype=np.int32)

-    expected_partial_mask_blocks = np.stack([
-        np.tri(*block_shape, dtype=np.int8),
-        np.triu(
-            np.tri(*block_shape, window_size, dtype=np.int8),
-            -window_size,
-        ),
-        np.tri(*block_shape, -window_size, dtype=np.int8),
-        np.triu(np.ones(block_shape, dtype=np.int8), window_size),
-    ])
+    expected_partial_mask_blocks = np.stack(
+        [
+            np.tri(*block_shape, dtype=np.int8),
+            np.triu(
+                np.tri(*block_shape, window_size, dtype=np.int8),
+                -window_size,
+            ),
+            np.tri(*block_shape, -window_size, dtype=np.int8),
+            np.triu(np.ones(block_shape, dtype=np.int8), window_size),
+        ]
+    )

     expected_mask_info = mask_info_lib.MaskInfo(
         expected_mask_next,

@@ -1341,18 +1345,20 @@ def test_two_shards_local_wide_local_narrow_stacked(self, q_seq_shards, kv_seq_s

     expected_active_rows_dkv = np.concatenate(
         [
-            np.array([
-                0,
-                0,
-                1,
-                1,
-                1,
-                2,
-                2,
-                2,
-                3,
-                3,
-            ]),
+            np.array(
+                [
+                    0,
+                    0,
+                    1,
+                    1,
+                    1,
+                    2,
+                    2,
+                    2,
+                    3,
+                    3,
+                ]
+            ),
             np.array([0, 0, 1, 1, 2, 2, 3, -1, -1, -1]),
         ],
         axis=0,

src/maxdiffusion/max_utils.py

Lines changed: 14 additions & 2 deletions
@@ -46,7 +46,16 @@
 from flax.linen import partitioning as nn_partitioning
 from flax.training import train_state
 from jax.experimental import mesh_utils
-from transformers import (FlaxCLIPTextModel, FlaxCLIPTextPreTrainedModel)
+
+try:
+  from transformers import (FlaxCLIPTextModel, FlaxCLIPTextPreTrainedModel)
+except ImportError:
+  # For transformers>=5.0, these need different import paths
+  try:
+    from transformers.models.clip.modeling_flax_clip import FlaxCLIPTextModel, FlaxCLIPTextPreTrainedModel
+  except ImportError:
+    FlaxCLIPTextModel = None
+    FlaxCLIPTextPreTrainedModel = None
 from flax import struct
 from typing import (
     Callable,

@@ -336,7 +345,10 @@ def init_train_state(model, tx, weights_init_fn, params=None, training=True, eva
   Args: model_params, model, tx, training
   """
   if not params:
-    if isinstance(model, FlaxCLIPTextModel) or isinstance(model, FlaxCLIPTextPreTrainedModel):
+    is_clip_model = False
+    if FlaxCLIPTextModel is not None and FlaxCLIPTextPreTrainedModel is not None:
+      is_clip_model = isinstance(model, FlaxCLIPTextModel) or isinstance(model, FlaxCLIPTextPreTrainedModel)
+    if is_clip_model:
       params = weights_init_fn()
     else:
       params = weights_init_fn(eval_only=eval_only)
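
The import guard above lets max_utils load even when transformers no longer exposes the Flax CLIP classes, and the second hunk gates the isinstance check on that availability. A minimal, standalone sketch of the same pattern (the helper name is hypothetical):

try:
  from transformers import FlaxCLIPTextModel  # legacy top-level import
except ImportError:
  try:
    from transformers.models.clip.modeling_flax_clip import FlaxCLIPTextModel
  except ImportError:
    FlaxCLIPTextModel = None  # transformers build without Flax CLIP support

def is_clip_text_model(model) -> bool:
  # Only attempt the isinstance check when the class could actually be imported.
  return FlaxCLIPTextModel is not None and isinstance(model, FlaxCLIPTextModel)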

src/maxdiffusion/models/attention_flax.py

Lines changed: 0 additions & 6 deletions
@@ -962,12 +962,6 @@ def __init__(
       mask_padding_tokens: bool = True,
       residual_checkpoint_name: str | None = None,
       enable_jax_named_scopes: bool = False,
-      added_kv_proj_dim: Optional[int] = None,
-      image_seq_len: Optional[int] = None,
-  ):
-    if attention_kernel == "cudnn_flash_te":
-      raise NotImplementedError(f"Wan 2.1 has not been tested with {attention_kernel}")
-
       added_kv_proj_dim: Optional[int] = None,  # New for I2V
       image_seq_len: Optional[int] = None,  # New for I2V
   ):
