Fix unit test

hsuan-lun-chiang · hsuan-lun-chiang · commit 0f94640a9f27 · 2026-04-15T03:10:28.000Z
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -1126,8 +1126,8 @@ position_id_per_seconds: 25
 subslice_shape: ""
 
 # NNX
-enable_nnx: False
-pure_nnx_decoder: False
+enable_nnx: True
+pure_nnx_decoder: True
 pure_nnx: False
 
 ################################## Qwen3-Next Specific Configs ##################################
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -19,6 +19,7 @@
 import functools
 import inspect
 import warnings
+import dataclasses
 from typing import Any
 
 import jax
@@ -472,13 +473,7 @@ def _create_single_layer(self, decoder_layer_class, rngs, **kwargs):
   def _create_scanned_layers(
       self, decoder_layer_class, length: int, metadata_axis_name: str, rngs: nnx.Rngs, **layer_kwargs
   ):
-    """Creates a scanned stack of layers using jax.lax.scan for memory-efficient initialization.
-
-    Uses jax.lax.scan instead of nnx.vmap to reduce peak memory during initialization.
-    With vmap, all layers' parameters are created simultaneously (O(N) peak memory).
-    With scan, parameters are created one layer at a time (O(1) peak intermediate memory),
-    which prevents OOM on memory-constrained devices like TPU v6e-4.
-    """
+    """Creates a scanned stack of layers using jax.lax.scan for memory-efficient initialization."""
     scan_axis = self.config.param_scan_axis
 
     # Fork rngs to get per-layer RNG states for scanning
@@ -489,10 +484,6 @@ def _create_scanned_layers(
 
     rngs_graphdef, rngs_state = nnx.split(forked_rngs)
 
-    # Create a reference layer to capture the module graph structure (graphdef).
-    # This layer's params are discarded — only the structure is kept.
-    # Must use the first slice of the forked rngs (not a dummy Rngs(0)) so the
-    # graphdef has the same number of RNG state leaves as the scan-created layers.
     first_rng_state = jax.tree.map(lambda x: x[0], rngs_state)
     ref_rngs = nnx.merge(rngs_graphdef, first_rng_state)
     ref_layer = decoder_layer_class(
@@ -501,9 +492,6 @@ def _create_scanned_layers(
     layer_graphdef, _, _ = nnx.split(ref_layer, nnx.Param, ...)
     del ref_layer
 
-    # Sequentially create each layer's parameters via jax.lax.scan.
-    # The scan body is traced once; XLA executes it N times with different RNG keys,
-    # keeping only one layer's intermediate state alive at a time.
     def scan_body(carry, rng_state_slice):
       layer_rngs = nnx.merge(rngs_graphdef, rng_state_slice)
       layer = decoder_layer_class(
@@ -519,47 +507,40 @@ def scan_body(carry, rng_state_slice):
 
     _, (stacked_params, stacked_rest) = jax.lax.scan(scan_body, None, rngs_state)
 
-    # jax.lax.scan stacks outputs along axis 0. Move params to the configured scan axis.
     if scan_axis != 0:
       stacked_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), stacked_params)
 
-    # Add partition metadata that nnx.vmap's transform_metadata would normally set.
-    # This metadata is read by variable_to_logically_partitioned() in initializers.py
-    # and by nnx.get_partition_spec() to produce correct sharding specs.
     def _add_scan_metadata(state, axis):
       def _update_leaf(leaf):
-        if isinstance(leaf, nnx.VariableState):
-          metadata = leaf.get_metadata()
-          metadata[nnx.PARTITION_NAME] = metadata_axis_name
-          metadata["param_scan_axis"] = axis
-
-          # Patch all sharding configurations in metadata so that nnx.get_partition_spec
-          # returns a 3D spec matching the actual 3D tensor rank, instead of the original 2D.
-          for key in ["out_sharding", "sharding", "kernel_axes"]:
-            if key in metadata and metadata[key] is not None:
-              val = metadata[key]
+        if hasattr(leaf, "replace") and hasattr(leaf, "value"):
+          replace_kwargs = {}
+          if hasattr(leaf, "get_metadata"):
+            replace_kwargs.update(leaf.get_metadata())
+            
+          replace_kwargs[nnx.PARTITION_NAME] = metadata_axis_name
+          replace_kwargs["param_scan_axis"] = axis
+          
+          for key in ["sharding", "out_sharding", "kernel_axes", "sharding_names"]:
+            val = getattr(leaf, key, None)
+            if val is None and key in replace_kwargs:
+              val = replace_kwargs[key]
+              
+            if val is not None:
               if isinstance(val, str):
                 val = (val,)
               if isinstance(val, tuple):
-                sharding_list = list(val)
-                sharding_list.insert(axis, metadata_axis_name)
-                metadata[key] = tuple(sharding_list)
-
-          # Ensure the native 'sharding' property is also updated if it exists separately
-          replace_kwargs = dict(metadata)
-          if hasattr(leaf, "sharding") and leaf.sharding is not None:
-            val = leaf.sharding
-            if isinstance(val, str):
-              val = (val,)
-            if isinstance(val, tuple):
-              sharding_list = list(val)
-              sharding_list.insert(axis, metadata_axis_name)
-              replace_kwargs["sharding"] = tuple(sharding_list)
+                l = list(val)
+                # Insert the scan axis name into the logical axes tuple once, clamping the index to its length
+                if metadata_axis_name not in l:
+                  insert_idx = min(axis, len(l))
+                  l.insert(insert_idx, metadata_axis_name)
+                  replace_kwargs[key] = tuple(l)
 
           return leaf.replace(**replace_kwargs)
         return leaf
 
-      return jax.tree.map(_update_leaf, state, is_leaf=lambda x: isinstance(x, nnx.VariableState))
+      # Stop at VariableState-like nodes (anything with .replace and .value) so _update_leaf gets the whole variable
+      return jax.tree.map(_update_leaf, state, is_leaf=lambda x: hasattr(x, "replace") and hasattr(x, "value"))
 
     stacked_params = _add_scan_metadata(stacked_params, scan_axis)
     stacked_rest = _add_scan_metadata(stacked_rest, 0)
@@ -811,7 +792,7 @@ def get_norm_layer(self, num_features: int, rngs: nnx.Rngs):
       )
     elif self.config.decoder_block == DecoderBlockType.QWEN3_NEXT:
       return functools.partial(
-          normalizations.Qwen3NextRMSNorm, num_features=num_features, shard_mode=self.config.shard_mode, rngs=rngs
+          normalizations.RMSNorm, num_features=num_features, shard_mode=self.config.shard_mode, rngs=rngs
       )
     else:
       raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")
diff --git a/tests/unit/train_compile_test.py b/tests/unit/train_compile_test.py
@@ -636,8 +636,6 @@ def test_moe_deepseek_pipeline_subset(self):
             "pipeline_parallel_layers=56",
             "ici_expert_parallelism=16",
             "dcn_pipeline_parallelism=8",
-            "first_num_dense_layers=8",
-            "base_num_decoder_layers=72",
         )
     )
 
@@ -655,7 +653,7 @@ def test_pipeline_subset(self):
             "per_device_batch_size=1",
             "max_target_length=1024",
             "pipeline_parallel_layers=56",
-            "base_num_decoder_layers=64",  # Must be divisible by dcn_pipeline_parallelism=8 in NNX scan path.
+            "base_num_decoder_layers=61",  # Remainder of 5 will fail when sharded incorrectly.
             "ici_expert_parallelism=16",
             "dcn_pipeline_parallelism=8",
         )

Original file line number	Diff line number	Diff line change
`@@ -636,8 +636,6 @@ def test_moe_deepseek_pipeline_subset(self):`
`636`	`636`	`"pipeline_parallel_layers=56",`
`637`	`637`	`"ici_expert_parallelism=16",`
`638`	`638`	`"dcn_pipeline_parallelism=8",`
`639`		`- "first_num_dense_layers=8",`
`640`		`- "base_num_decoder_layers=72",`
`641`	`639`	`)`
`642`	`640`	`)`
`643`	`641`
`@@ -655,7 +653,7 @@ def test_pipeline_subset(self):`
`655`	`653`	`"per_device_batch_size=1",`
`656`	`654`	`"max_target_length=1024",`
`657`	`655`	`"pipeline_parallel_layers=56",`
`658`		`- "base_num_decoder_layers=64", # Must be divisible by dcn_pipeline_parallelism=8 in NNX scan path.`
	`656`	`+ "base_num_decoder_layers=61", # Remainder of 5 will fail when sharded incorrectly.`
`659`	`657`	`"ici_expert_parallelism=16",`
`660`	`658`	`"dcn_pipeline_parallelism=8",`
`661`	`659`	`)`