
Commit a4e2510

Kevin Wang authored and Google-ML-Automation committed

Add support for data parallelism along the data mesh axis for multi-slice scaling.

PiperOrigin-RevId: 899801953
1 parent fd1ad55 commit a4e2510

8 files changed: 294 additions & 111 deletions

src/maxtext/configs/base.yml

Lines changed: 2 additions & 0 deletions
@@ -987,6 +987,8 @@ optimize_mesh_for_tpu_v6e: False
 
 shardy: True # Whether to use shardy XLA backend (default in Jax starting 0.7.0), or GSPMD (to be fully deprecated ~2026)
 
+remove_size_one_mesh_axis_from_type: True # Whether to remove size one mesh axis from type through jax.config.
+
 use_ragged_attention: False
 ragged_block_size: 256
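The flag defaults to True globally. Its description says it is applied "through jax.config", but the exact JAX option it toggles is not visible in this diff, so the wiring below is a hypothetical sketch (the option name jax_remove_size_one_mesh_axis_from_type is an assumption):

import jax

def apply_mesh_axis_flag(config):
  # Hypothetical wiring for the new YAML flag. The jax.config option name
  # below is an assumption; the diff only states "through jax.config".
  # When enabled, size-one mesh axes are dropped from the sharding carried
  # in array types (e.g. jax.typeof(x).sharding).
  jax.config.update(
      "jax_remove_size_one_mesh_axis_from_type",
      config.remove_size_one_mesh_axis_from_type,
  )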

src/maxtext/configs/models/deepseek3-671b-batchsplit.yml

Lines changed: 13 additions & 14 deletions
@@ -57,14 +57,15 @@ rope_attention_scaling: False
 
 use_batch_split_schedule: True
 shard_mode: "explicit"
+remove_size_one_mesh_axis_from_type: False
 override_logical_axis_rules: True
-mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
-data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
+mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'expert', 'context']
+data_sharding: [['data', 'stage', 'fsdp', 'expert', 'context']]
 logical_axis_rules: [
-  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
-  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
-  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
-  ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
+  ['activation_batch', ['data', 'fsdp', 'expert', 'context']],
+  ['activation_batch_moe', ['data', 'fsdp', 'expert', 'context']],
+  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert', 'context']],
+  ['activation_kv_batch', ['data', 'fsdp', 'expert', 'context']],
   ['activation_norm_length', []],
   ['activation_norm_length_moe', []],
   ['activation_heads', []],
@@ -76,14 +77,12 @@ logical_axis_rules: [
   ['q_lora', ['fsdp']],
   ['kv_lora', ['fsdp']],
   ['layers', 'stage'],
-  ['q_lora_up_proj', ['fsdp_transpose']],
-  ['kv_lora_up_proj', ['fsdp_transpose']],
-  ['q_heads', ['fsdp_transpose']],
-  ['kv_heads', ['fsdp_transpose']],
-  ['heads', ['fsdp_transpose']],
-  ['mlp', ['fsdp_transpose']],
-  ['fsdp_transpose_and_expert', ['fsdp_transpose', 'expert']],
-  ['fsdp_transpose_only', ['fsdp_transpose']],
+  ['q_lora_up_proj', []],
+  ['kv_lora_up_proj', []],
+  ['q_heads', []],
+  ['kv_heads', []],
+  ['heads', []],
+  ['mlp', []],
   ['expert_only', ['expert']],
   ['diloco', 'diloco'],
 ]
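The batchsplit config removes the fsdp_transpose mesh axis outright: it disappears from mesh_axes and data_sharding, and rules that sharded over it now map to [] (replicated). Each logical_axis_rules entry pairs a logical tensor axis with the mesh axes it shards over; a minimal sketch of that resolution, with resolve_logical_axes as an illustrative helper rather than MaxText's implementation:

import jax

# Illustrative subset of the rules above.
LOGICAL_AXIS_RULES = {
    "activation_batch": ("data", "fsdp", "expert", "context"),
    "q_heads": (),  # empty rule: the dimension is replicated
}

def resolve_logical_axes(logical_axes):
  # Turn logical axis names into a PartitionSpec; an empty rule
  # becomes None, i.e. no sharding on that dimension.
  return jax.sharding.PartitionSpec(
      *(LOGICAL_AXIS_RULES.get(name) or None for name in logical_axes)
  )

# PartitionSpec(('data', 'fsdp', 'expert', 'context'), None)
print(resolve_logical_axes(("activation_batch", "q_heads")))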

src/maxtext/configs/types.py

Lines changed: 3 additions & 0 deletions
@@ -832,6 +832,9 @@ class HardwareAndMesh(BaseModel):
   shardy: bool = Field(True, description="Whether to use shardy XLA backend.")
   pure_nnx_decoder: bool = Field(False, description="Whether to enable pure NNX decoder.")
   pure_nnx: bool = Field(False, description="Whether to enable pure NNX mode.")
+  remove_size_one_mesh_axis_from_type: bool = Field(
+      True, description="Whether to remove size one mesh axis from type through jax.config."
+  )
 
 
 class LayoutAndSharding(BaseModel):
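The typed schema mirrors the YAML default of True, which the batchsplit model config above overrides to False. A minimal standalone sketch of the pydantic pattern (the class name here is illustrative):

from pydantic import BaseModel, Field

class HardwareAndMeshSketch(BaseModel):
  # Same shape as the new field: a bool with a default and a description,
  # overridable by per-model YAML configs.
  remove_size_one_mesh_axis_from_type: bool = Field(
      True, description="Whether to remove size one mesh axis from type through jax.config."
  )

cfg = HardwareAndMeshSketch(remove_size_one_mesh_axis_from_type=False)
assert cfg.remove_size_one_mesh_axis_from_type is False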

src/maxtext/layers/moe.py

Lines changed: 2 additions & 2 deletions
@@ -102,8 +102,8 @@ def _sort_activations_custom_bwd(residuals: jax.Array, grads: jax.Array) -> tupl
 
 def get_batchsplit_init_kernel_axes():
   return (
-      ("embed_moe", "fsdp_transpose_only", "expert_only"),
-      ("embed_moe", "fsdp_transpose_and_expert", None),
+      ("embed_moe", None, "expert_only"),
+      ("embed_moe", "expert_only", None),
   )
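With fsdp_transpose removed, the two batch-split kernel layouts now alternate only the expert_only axis between kernel dimensions. Each tuple lists one logical axis name (or None) per kernel dimension; a minimal sketch of how such names are typically bound to an initializer in Flax so the config's logical_axis_rules can resolve them later (the initializer choice is illustrative):

import flax.linen as nn

kernel_axes = ("embed_moe", None, "expert_only")

# Attach logical axis names to an initializer; None leaves that
# kernel dimension without a logical axis (replicated by the rules).
init_fn = nn.with_logical_partitioning(
    nn.initializers.lecun_normal(),
    kernel_axes,
)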

src/maxtext/models/deepseek.py

Lines changed: 7 additions & 8 deletions
@@ -43,7 +43,6 @@
 from maxtext.utils import max_utils
 from maxtext.utils.sharding import create_sharding
 from maxtext.utils.sharding import maybe_shard_with_logical
-from maxtext.utils.sharding import remove_size_one_mesh_axis
 
 import transformers
 
@@ -492,14 +491,13 @@ def __call__(
         return outputs, None
 
       # bf16 code path
-      activation_pspec = remove_size_one_mesh_axis(
-          jax.sharding.PartitionSpec(
-              ("data", "fsdp", "fsdp_transpose", "expert", "context"),
-              None,
-              None,
-          ),
-          self.mesh,
+      input_sharding = jax.typeof(inputs).sharding
+      activation_pspec = jax.sharding.PartitionSpec(
+          ("data", "fsdp", "expert"),
+          None,
+          None,
       )
+      inputs = jax.reshard(inputs, jax.sharding.NamedSharding(self.mesh, activation_pspec))
       yarn_freqs = deepseek_batchsplit.initialize_yarn_freqs(
           decoder_positions,
           embedding_dims=self.config.qk_rope_head_dim,
@@ -571,6 +569,7 @@ def extract_fn(x):
           in_specs=([activation_pspec] * self.config.batch_split_factor,),
           out_specs=activation_pspec,
       )(outputs)
+      outputs = jax.reshard(outputs, input_sharding)
       return outputs, None
 
     x = self.with_logical_constraint(inputs)
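Rather than pruning size-one mesh axes from a hand-built PartitionSpec, the bf16 path now records the incoming sharding from the array's type, reshards onto the batch-split spec for the computation, and reshards the result back on exit. A standalone sketch of that round-trip under JAX explicit sharding; the mesh shape and axis names are illustrative:

import jax
import jax.numpy as jnp
from jax.sharding import AxisType, NamedSharding, PartitionSpec

# Explicit-sharding mesh, so jax.typeof(x).sharding carries a concrete spec.
mesh = jax.make_mesh(
    (jax.device_count(), 1), ("data", "expert"),
    axis_types=(AxisType.Explicit, AxisType.Explicit),
)

with jax.sharding.use_mesh(mesh):
  inputs = jax.device_put(
      jnp.zeros((8, 16)), NamedSharding(mesh, PartitionSpec("data", None)))

  input_sharding = jax.typeof(inputs).sharding  # remember the caller's sharding
  compute_spec = PartitionSpec(("data", "expert"), None)
  inputs = jax.reshard(inputs, NamedSharding(mesh, compute_spec))

  outputs = inputs * 2  # stand-in for the batch-split computation
  outputs = jax.reshard(outputs, input_sharding)  # restore the original sharding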
