huggingface · hlky · May 7, 2026
diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py
@@ -135,6 +135,8 @@ class StableCascadeUNetOutput(BaseOutput):
 
 class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     _supports_gradient_checkpointing = True
+    _supports_group_offloading = False
+    _skip_layerwise_casting_patterns = ["norm"]
 
     @register_to_config
     def __init__(
@@ -148,24 +150,24 @@ def __init__(
         num_attention_heads: tuple[int, ...] = (32, 32),
         down_num_layers_per_block: tuple[int, ...] = (8, 24),
         up_num_layers_per_block: tuple[int, ...] = (24, 8),
-        down_blocks_repeat_mappers: tuple[int] | None = (
+        down_blocks_repeat_mappers: tuple[int, ...] | None = (
             1,
             1,
         ),
-        up_blocks_repeat_mappers: tuple[int] | None = (1, 1),
-        block_types_per_layer: tuple[tuple[str]] = (
+        up_blocks_repeat_mappers: tuple[int, ...] | None = (1, 1),
+        block_types_per_layer: tuple[tuple[str, ...], ...] = (
             ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
             ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
         ),
         clip_text_in_channels: int | None = None,
-        clip_text_pooled_in_channels=1280,
+        clip_text_pooled_in_channels: int = 1280,
         clip_image_in_channels: int | None = None,
-        clip_seq=4,
+        clip_seq: int = 4,
         effnet_in_channels: int | None = None,
         pixel_mapper_in_channels: int | None = None,
-        kernel_size=3,
-        dropout: float | tuple[float] = (0.1, 0.1),
-        self_attn: bool | tuple[bool] = True,
+        kernel_size: int = 3,
+        dropout: float | tuple[float, ...] = (0.1, 0.1),
+        self_attn: bool | tuple[bool, ...] = True,
         timestep_conditioning_type: tuple[str, ...] = ("sca", "crp"),
         switch_level: tuple[bool] | None = None,
     ):
@@ -431,20 +433,27 @@ def get_timestep_ratio_embedding(self, timestep_ratio, max_positions=10000):
 
     def get_clip_embeddings(self, clip_txt_pooled, clip_txt=None, clip_img=None):
         if len(clip_txt_pooled.shape) == 2:
-            clip_txt_pool = clip_txt_pooled.unsqueeze(1)
+            clip_txt_pooled = clip_txt_pooled.unsqueeze(1)
         clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view(
             clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.config.clip_seq, -1
         )
-        if clip_txt is not None and clip_img is not None:
+
+        clip = []
+        if clip_txt is not None:
             clip_txt = self.clip_txt_mapper(clip_txt)
+            clip.append(clip_txt)
+
+        clip.append(clip_txt_pool)
+
+        if clip_img is not None:
             if len(clip_img.shape) == 2:
                 clip_img = clip_img.unsqueeze(1)
             clip_img = self.clip_img_mapper(clip_img).view(
                 clip_img.size(0), clip_img.size(1) * self.config.clip_seq, -1
             )
-            clip = torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1)
-        else:
-            clip = clip_txt_pool
+            clip.append(clip_img)
+
+        clip = torch.cat(clip, dim=1)
         return self.clip_norm(clip)
 
     def _down_encode(self, x, r_embed, clip):
@@ -548,8 +557,8 @@ def forward(
         crp=None,
         return_dict=True,
     ):
-        if pixels is None:
-            pixels = sample.new_zeros(sample.size(0), 3, 8, 8)
+        if pixels is None and hasattr(self, "pixels_mapper"):
+            pixels = sample.new_zeros(sample.size(0), self.config.pixel_mapper_in_channels, 8, 8)
 
         # Process the conditioning embeddings
         timestep_ratio_embed = self.get_timestep_ratio_embedding(timestep_ratio)
@@ -560,7 +569,7 @@ def forward(
                 cond = crp
             else:
                 cond = None
-            t_cond = cond or torch.zeros_like(timestep_ratio)
+            t_cond = cond if cond is not None else torch.zeros_like(timestep_ratio)
             timestep_ratio_embed = torch.cat([timestep_ratio_embed, self.get_timestep_ratio_embedding(t_cond)], dim=1)
         clip = self.get_clip_embeddings(clip_txt_pooled=clip_text_pooled, clip_txt=clip_text, clip_img=clip_img)
 

diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py
@@ -125,6 +125,7 @@ def __init__(
         )
 
     def enable_xformers_memory_efficient_attention(self, attention_op: Callable | None = None):
+        self.prior_pipe.enable_xformers_memory_efficient_attention(attention_op)
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
     def enable_model_cpu_offload(self, gpu_id: int | None = None, device: torch.device | str = None):
@@ -160,12 +161,14 @@ def set_progress_bar_config(self, **kwargs):
     def __call__(
         self,
         prompt: str | list[str] | None = None,
-        images: torch.Tensor | PIL.Image.Image | list[torch.Tensor] | list[PIL.Image.Image] = None,
+        images: torch.Tensor | PIL.Image.Image | list[torch.Tensor] | list[PIL.Image.Image] | None = None,
         height: int = 512,
         width: int = 512,
         prior_num_inference_steps: int = 60,
+        prior_timesteps: list[float] | None = None,
         prior_guidance_scale: float = 4.0,
         num_inference_steps: int = 12,
+        timesteps: list[float] | None = None,
         decoder_guidance_scale: float = 0.0,
         negative_prompt: str | list[str] | None = None,
         prompt_embeds: torch.Tensor | None = None,
@@ -175,7 +178,7 @@ def __call__(
         num_images_per_prompt: int = 1,
         generator: torch.Generator | list[torch.Generator] | None = None,
         latents: torch.Tensor | None = None,
-        output_type: str | None = "pil",
+        output_type: str = "pil",
         return_dict: bool = True,
         prior_callback_on_step_end: Callable[[int, int], None] | None = None,
         prior_callback_on_step_end_tensor_inputs: list[str] = ["latents"],
@@ -221,12 +224,16 @@ def __call__(
                 closely linked to the text `prompt`, usually at the expense of lower image quality.
-            prior_num_inference_steps (`int | dict[float, int]`, *optional*, defaults to 60):
+            prior_num_inference_steps (`int`, *optional*, defaults to 60):
                 The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference. For more specific timestep spacing, you can pass customized
-                `prior_timesteps`
+                expense of slower inference.
+            prior_timesteps (`list[float]`, *optional*):
+                Custom timesteps to use for the prior denoising process. If provided, `prior_num_inference_steps` is
+                ignored.
             num_inference_steps (`int`, *optional*, defaults to 12):
                 The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
-                the expense of slower inference. For more specific timestep spacing, you can pass customized
-                `timesteps`
+                the expense of slower inference.
+            timesteps (`list[float]`, *optional*):
+                Custom timesteps to use for the decoder denoising process. If provided, `num_inference_steps` is
+                ignored.
             decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
                 Guidance scale as defined in [Classifier-Free Diffusion
                 Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
@@ -242,7 +249,7 @@ def __call__(
                 tensor will be generated by sampling using the supplied random `generator`.
             output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
-                (`np.array`) or `"pt"` (`torch.Tensor`).
+                (`np.array`), `"pt"` (`torch.Tensor`) or `"latent"` (`torch.Tensor`).
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             prior_callback_on_step_end (`Callable`, *optional*):
@@ -281,6 +288,7 @@ def __call__(
             height=height,
             width=width,
             num_inference_steps=prior_num_inference_steps,
+            timesteps=prior_timesteps,
             guidance_scale=prior_guidance_scale,
             negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
             prompt_embeds=prompt_embeds,
@@ -305,6 +313,7 @@ def __call__(
             image_embeddings=image_embeddings,
             prompt=prompt if prompt_embeds is None else None,
             num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
             guidance_scale=decoder_guidance_scale,
             negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
             prompt_embeds=prompt_embeds,