Skip to content

Commit b303738

Browse files
authored
update decode_update_mla_metadata_v1 for atom dp attention (#2392)
* update `decode_update_mla_metadata_v1` natively_supported logic
* edit `get_mla_metadata_v1_2_device`: `params.num_heads = num_heads;`
1 parent 5b3eb3d commit b303738

2 files changed

Lines changed: 19 additions & 6 deletions

File tree

aiter/ops/attention.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,8 +1211,24 @@ def decode_update_mla_metadata_v1(
12111211
assert kv_granularity >= 16
12121212
assert page_size == 1
12131213
# assert not (dtype_q == dtypes.bf16 and dtype_kv == dtypes.bf16 and num_heads_per_head_k == 128), "In this case, use get_mla_metadata_v1 instead"
1214-
natively_supported = (num_heads_per_head_k == 16) or (
1215-
num_heads_per_head_k == 128 and dtype_q == dtypes.fp8 and dtype_kv == dtypes.fp8
1214+
q_is_fp8 = dtype_q == dtypes.fp8
1215+
kv_is_fp8 = dtype_kv == dtypes.fp8
1216+
arch_id = get_gfx()
1217+
natively_supported = (
1218+
(num_heads_per_head_k == 16)
1219+
or (
1220+
arch_id == "gfx950"
1221+
and num_heads_per_head_k == 32
1222+
and q_is_fp8
1223+
and kv_is_fp8
1224+
and max_seqlen_qo == 4
1225+
)
1226+
or (
1227+
arch_id == "gfx942"
1228+
and num_heads_per_head_k == 128
1229+
and q_is_fp8
1230+
and kv_is_fp8
1231+
)
12161232
)
12171233
cu_num = work_indptr.shape[0] - 1
12181234
tile_reduce_cnt = reduce_indptr.shape[0] - 1

csrc/kernels/mla/metadata/v1_2_device.cuh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -478,8 +478,6 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba
478478
? num_clusters
479479
: min(num_clusters, max_split_per_batch * num_batches);
480480

481-
const bool fold_to_qh16 = !natively_supported && q_is_fp8 && kv_is_fp8;
482-
483481
MlaMetadataV1KernelParameter params = {};
484482
params.p_work_metadata_ptrs = work_metadata_ptrs.data_ptr<uint64_t>();
485483
params.p_work_indptr = work_indptr.data_ptr<int32_t>();
@@ -491,8 +489,7 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba
491489
params.p_seqlens_kv_indptr = seqlens_kv_indptr.data_ptr<int32_t>();
492490
params.p_kv_last_page_lens = kv_last_page_lens.data_ptr<int32_t>();
493491
params.num_batches = num_batches;
494-
params.num_heads = fold_to_qh16 ? num_heads
495-
: num_heads_k * num_heads_per_head_k;
492+
params.num_heads = num_heads;
496493
params.num_cu = num_clusters;
497494
params.num_splits = num_splits;
498495
params.reduce_indptr_size = reduce_indptr.size(0);

0 commit comments

Comments (0)