
Commit d2d5fda

fix: cache modulate_index in QwenImageTransformer2DModel to avoid per-step DtoH sync
When zero_cond_t=True, the modulate_index tensor was being recreated on every transformer forward pass (once per denoising step) using:

    torch.tensor(list_comprehension, device=timestep.device, ...)

Evaluating the list comprehension and then calling torch.tensor() on the resulting Python list issues a cudaMemcpyAsync + cudaStreamSynchronize (DtoH sync), which forces the CPU to wait for all pending GPU kernels.

Since img_shapes (which fully determines modulate_index) is fixed for the entire inference run, the resulting tensor is identical across all steps. We now cache it in _modulate_index_cache keyed by (img_shapes, device), so the tensor is built only on the first step and reused thereafter. This eliminates N-1 unnecessary torch.tensor() constructions and DtoH syncs during inference, where N = num_inference_steps.

This issue was identified in the profiling guide added in huggingface#13356 and referenced in huggingface#13401. It follows the same caching pattern as _compute_video_freqs in QwenEmbedRope.
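
The pattern in isolation, as a minimal runnable sketch (not the diffusers code itself; the img_shapes value and the function names below are invented for illustration):

    # Minimal sketch of the caching pattern this commit applies.
    # img_shapes here is an invented example; in the model it is passed in
    # by the pipeline and stays fixed for the whole inference run.
    from math import prod

    import torch

    img_shapes = [[(1, 64, 64), (1, 32, 32)]]

    def build_modulate_index(img_shapes, device):
        # When called every step, torch.tensor() on a Python list materializes
        # the data on the CPU and then copies it to the GPU with an implicit
        # stream synchronization -- the per-step overhead the commit removes.
        return torch.tensor(
            [[0] * prod(sample[0]) + [1] * sum(prod(s) for s in sample[1:]) for sample in img_shapes],
            device=device,
            dtype=torch.int,
        )

    _modulate_index_cache = {}

    def cached_modulate_index(img_shapes, device):
        # Nested tuples turn the unhashable list-of-lists into a valid dict key.
        key = (tuple(tuple(s) for s in img_shapes), device)
        if key not in _modulate_index_cache:
            _modulate_index_cache[key] = build_modulate_index(img_shapes, device)
        return _modulate_index_cache[key]

Since img_shapes is constant for a given run, the cache typically holds a single entry, so the memory cost is negligible.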
1 parent fbe8a75 commit d2d5fda

1 file changed: 20 additions & 5 deletions

src/diffusers/models/transformers/transformer_qwenimage.py
@@ -832,6 +832,13 @@ def __init__(
         self.gradient_checkpointing = False
         self.zero_cond_t = zero_cond_t
 
+        # Cache for modulate_index tensor to avoid rebuilding it on every forward pass.
+        # The tensor is determined solely by img_shapes (fixed during inference), so it
+        # only needs to be computed once per unique (img_shapes, device) combination.
+        # Without caching, every forward call triggers a Python list comprehension +
+        # torch.tensor() construction which is visible as CPU overhead in profiling traces.
+        self._modulate_index_cache: dict = {}
+
     @apply_lora_scale("attention_kwargs")
     def forward(
         self,
@@ -898,11 +905,19 @@ def forward(
 
         if self.zero_cond_t:
             timestep = torch.cat([timestep, timestep * 0], dim=0)
-            modulate_index = torch.tensor(
-                [[0] * prod(sample[0]) + [1] * sum([prod(s) for s in sample[1:]]) for sample in img_shapes],
-                device=timestep.device,
-                dtype=torch.int,
-            )
+            # Cache modulate_index to avoid rebuilding it on every forward pass.
+            # img_shapes is fixed during inference (same across all denoising steps),
+            # so we can build the tensor once and reuse it, eliminating the CPU overhead
+            # and implicit sync from torch.tensor() on each step.
+            device = timestep.device
+            cache_key = (tuple(tuple(s) for s in img_shapes), device)
+            if cache_key not in self._modulate_index_cache:
+                self._modulate_index_cache[cache_key] = torch.tensor(
+                    [[0] * prod(sample[0]) + [1] * sum([prod(s) for s in sample[1:]]) for sample in img_shapes],
+                    device=device,
+                    dtype=torch.int,
+                )
+            modulate_index = self._modulate_index_cache[cache_key]
         else:
             modulate_index = None
 
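
One hedged way to check the effect on your own setup is to profile a short run and confirm that the modulate_index construction (and its synchronization) no longer appears once per denoising step. This is a sketch only: `pipe` is assumed to be an already-loaded QwenImage pipeline, and the prompt and step count are arbitrary.

    from torch.profiler import ProfilerActivity, profile

    # `pipe` is assumed to be a loaded QwenImage pipeline (not shown here).
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        _ = pipe("a photo of a cat", num_inference_steps=4)

    # If the cache is working, the tensor construction and its sync should
    # show up on the first step only, not on every step of the trace.
    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=20))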
