@@ -57,14 +57,15 @@ rope_attention_scaling: False
5757
5858use_batch_split_schedule : True
5959shard_mode : "explicit"
60+ remove_size_one_mesh_axis_from_type : False
6061override_logical_axis_rules : True
61- mesh_axes : ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
62- data_sharding : [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
62+ mesh_axes : ['diloco', 'data', 'stage', 'fsdp', 'expert', 'context']
63+ data_sharding : [['data', 'stage', 'fsdp', 'expert', 'context']]
6364logical_axis_rules : [
64- ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
65- ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
66- ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
67- ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
65+ ['activation_batch', ['data', 'fsdp', 'expert', 'context']],
66+ ['activation_batch_moe', ['data', 'fsdp', 'expert', 'context']],
67+ ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert', 'context']],
68+ ['activation_kv_batch', ['data', 'fsdp', 'expert', 'context']],
6869 ['activation_norm_length', []],
6970 ['activation_norm_length_moe', []],
7071 ['activation_heads', []],
@@ -76,14 +77,12 @@ logical_axis_rules: [
7677 ['q_lora', ['fsdp']],
7778 ['kv_lora', ['fsdp']],
7879 ['layers', 'stage'],
79- ['q_lora_up_proj', ['fsdp_transpose']],
80- ['kv_lora_up_proj', ['fsdp_transpose']],
81- ['q_heads', ['fsdp_transpose']],
82- ['kv_heads', ['fsdp_transpose']],
83- ['heads', ['fsdp_transpose']],
84- ['mlp', ['fsdp_transpose']],
85- ['fsdp_transpose_and_expert', ['fsdp_transpose', 'expert']],
86- ['fsdp_transpose_only', ['fsdp_transpose']],
80+ ['q_lora_up_proj', []],
81+ ['kv_lora_up_proj', []],
82+ ['q_heads', []],
83+ ['kv_heads', []],
84+ ['heads', []],
85+ ['mlp', []],
8786 ['expert_only', ['expert']],
8887 ['diloco', 'diloco'],
8988]
0 commit comments