
Commit ed881a1

Commit message: up up up
1 parent fe2a6a3 commit ed881a1

6 files changed

Lines changed: 187 additions & 159 deletions


src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py

Lines changed: 131 additions & 126 deletions
Large diffs are not rendered by default.

src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -105,9 +105,9 @@ def __call__(self, components, state: PipelineState) -> PipelineState:
         if not block_state.output_type == "latent":
             latents = block_state.latents
             # make sure the VAE is in float32 mode, as it overflows in float16
-            block_state.needs_upcasting = components.vae.dtype == torch.float16 and components.vae.config.force_upcast
+            needs_upcasting = components.vae.dtype == torch.float16 and components.vae.config.force_upcast
 
-            if block_state.needs_upcasting:
+            if needs_upcasting:
                 self.upcast_vae(components)
                 latents = latents.to(next(iter(components.vae.post_quant_conv.parameters())).dtype)
             elif latents.dtype != components.vae.dtype:
@@ -117,21 +117,21 @@ def __call__(self, components, state: PipelineState) -> PipelineState:
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
-            block_state.has_latents_mean = (
+            has_latents_mean = (
                 hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None
             )
-            block_state.has_latents_std = (
+            has_latents_std = (
                 hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None
             )
-            if block_state.has_latents_mean and block_state.has_latents_std:
-                block_state.latents_mean = (
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
                     torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
                 )
-                block_state.latents_std = (
+                latents_std = (
                     torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
                 )
                 latents = (
-                    latents * block_state.latents_std / components.vae.config.scaling_factor + block_state.latents_mean
+                    latents * latents_std / components.vae.config.scaling_factor + latents_mean
                 )
             else:
                 latents = latents / components.vae.config.scaling_factor
```
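These two hunks only move scratch values (`needs_upcasting`, `latents_mean`, `latents_std`) off `block_state` into locals; the decode math is unchanged. For reference, a self-contained sketch of that math (the `0.13025` scaling factor is the stock SDXL value, assumed here; the real values come from `components.vae.config`):

```python
import torch

# Stand-ins for components.vae.config (assumed values; SDXL's VAE uses
# scaling_factor=0.13025 and usually publishes no latents_mean/std).
scaling_factor = 0.13025
latents_mean_cfg = None  # e.g. a list of 4 floats when the model provides it
latents_std_cfg = None

latents = torch.randn(1, 4, 128, 128, dtype=torch.float16)

# fp16 VAEs with config.force_upcast overflow during decode, so the block
# upcasts to float32 first; here we only mimic the dtype switch.
needs_upcasting = latents.dtype == torch.float16
if needs_upcasting:
    latents = latents.float()

# unscale/denormalize as in the hunk: use mean/std when available,
# otherwise just divide by the scaling factor.
if latents_mean_cfg is not None and latents_std_cfg is not None:
    latents_mean = torch.tensor(latents_mean_cfg).view(1, 4, 1, 1).to(latents.device, latents.dtype)
    latents_std = torch.tensor(latents_std_cfg).view(1, 4, 1, 1).to(latents.device, latents.dtype)
    latents = latents * latents_std / scaling_factor + latents_mean
else:
    latents = latents / scaling_factor
```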

src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -67,7 +67,7 @@ def intermediate_inputs(self) -> List[str]:
 
     @torch.no_grad()
     def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
-        block_state.scaled_latents = components.scheduler.scale_model_input(block_state.latents, t)
+        block_state.latent_model_input = components.scheduler.scale_model_input(block_state.latents, t)
 
         return components, block_state
@@ -134,10 +134,10 @@ def check_inputs(components, block_state):
     def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
         self.check_inputs(components, block_state)
 
-        block_state.scaled_latents = components.scheduler.scale_model_input(block_state.latents, t)
+        block_state.latent_model_input = components.scheduler.scale_model_input(block_state.latents, t)
         if components.num_channels_unet == 9:
-            block_state.scaled_latents = torch.cat(
-                [block_state.scaled_latents, block_state.mask, block_state.masked_image_latents], dim=1
+            block_state.latent_model_input = torch.cat(
+                [block_state.latent_model_input, block_state.mask, block_state.masked_image_latents], dim=1
             )
 
         return components, block_state
```
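The rename from `scaled_latents` to `latent_model_input` matches the variable name used in the standard SDXL pipelines. A minimal sketch of what the renamed value holds, including the 9-channel inpainting case (shapes are illustrative):

```python
import torch
from diffusers import EulerDiscreteScheduler

# Scheduler with its default config (assumption: any scheduler implementing
# scale_model_input behaves the same way for this purpose).
scheduler = EulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=10)
t = scheduler.timesteps[0]

latents = torch.randn(1, 4, 128, 128)
latent_model_input = scheduler.scale_model_input(latents, t)

# Inpainting UNets (num_channels_unet == 9) expect 4 latent channels plus
# 1 mask channel plus 4 masked-image-latent channels, concatenated as in the hunk.
mask = torch.zeros(1, 1, 128, 128)
masked_image_latents = torch.randn(1, 4, 128, 128)
latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
print(latent_model_input.shape)  # torch.Size([1, 9, 128, 128])
```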
```diff
@@ -232,7 +232,7 @@ def __call__(
             # Predict the noise residual
             # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
             guider_state_batch.noise_pred = components.unet(
-                block_state.scaled_latents,
+                block_state.latent_model_input,
                 t,
                 encoder_hidden_states=prompt_embeds,
                 timestep_cond=block_state.timestep_cond,
@@ -410,7 +410,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, block_state: Bl
                 mid_block_res_sample = block_state.mid_block_res_sample_zeros
             else:
                 down_block_res_samples, mid_block_res_sample = components.controlnet(
-                    block_state.scaled_latents,
+                    block_state.latent_model_input,
                     t,
                     encoder_hidden_states=guider_state_batch.prompt_embeds,
                     controlnet_cond=block_state.controlnet_cond,
@@ -430,7 +430,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, block_state: Bl
             # Predict the noise
             # store the noise_pred in guider_state_batch so we can apply guidance across all batches
             guider_state_batch.noise_pred = components.unet(
-                block_state.scaled_latents,
+                block_state.latent_model_input,
                 t,
                 encoder_hidden_states=guider_state_batch.prompt_embeds,
                 timestep_cond=block_state.timestep_cond,
```
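These hunks only thread the renamed input through the per-batch UNet and ControlNet calls; the guider then combines the per-batch `noise_pred` tensors. A sketch of the classic classifier-free-guidance combine a guider performs when it has two conditions (the guider API itself is not reproduced here):

```python
import torch

# One noise prediction per guidance batch, as stored on guider_state_batch.noise_pred.
noise_pred_uncond = torch.randn(1, 4, 128, 128)
noise_pred_text = torch.randn(1, 4, 128, 128)
guidance_scale = 5.0

# Standard CFG: push the prediction away from the unconditional direction.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```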

src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py

Lines changed: 15 additions & 17 deletions
```diff
@@ -390,7 +390,6 @@ def encode_prompt(
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         """
-        device = device or components._execution_device
         dtype = components.text_encoder_2.dtype
 
 
@@ -526,7 +525,6 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         self.check_inputs(block_state.prompt, block_state.prompt_2, block_state.negative_prompt, block_state.negative_prompt_2)
 
         device = components._execution_device
-        dtype = components.text_encoder_2.dtype
 
         # Encode input prompt
         lora_scale = (
@@ -542,8 +540,8 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         ) = self.encode_prompt(
             components,
             prompt=block_state.prompt,
-            prompt2=block_state.prompt_2,
-            device = device,
+            prompt_2=block_state.prompt_2,
+            device=device,
             requires_unconditional_embeds=components.requires_unconditional_embeds,
             negative_prompt=block_state.negative_prompt,
             negative_prompt_2=block_state.negative_prompt_2,
@@ -604,11 +602,11 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         device = components._execution_device
         dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
 
-        block_state.processed_image = components.image_processor.preprocess(block_state.image)
+        image = components.image_processor.preprocess(block_state.image)
 
         # Encode image into latents
         block_state.image_latents = encode_vae_image(
-            image=block_state.processed_image,
+            image=image,
             vae=components.vae,
             generator=block_state.generator,
             dtype=dtype,
@@ -681,7 +679,7 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 description="The crop coordinates to use for the preprocess/postprocess of the image and mask",
             ),
             OutputParam(
-                "mask_latents",
+                "mask",
                 type_hint=torch.Tensor,
                 description="The mask to apply on the latents for the inpainting generation.",
             ),
```
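The img2img hunks stop staging the preprocessed image on `block_state` and pass a local into `encode_vae_image`. A sketch of what such a helper typically does (assumption: the real helper also handles per-sample generators and the fp16 upcasting dance):

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")  # any KL VAE works here
image = torch.rand(1, 3, 1024, 1024) * 2 - 1  # image_processor output is in [-1, 1]

with torch.no_grad():
    # Sample from the posterior and scale into the UNet's latent space.
    latents = vae.encode(image).latent_dist.sample(generator=torch.Generator().manual_seed(0))
    latents = latents * vae.config.scaling_factor
```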
```diff
@@ -715,37 +713,37 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
             width = components.default_width
 
         if block_state.padding_mask_crop is not None:
-            crops_coords = components.mask_processor.get_crop_region(
+            block_state.crops_coords = components.mask_processor.get_crop_region(
                 mask_image=block_state.mask_image, width=width, height=height, pad=block_state.padding_mask_crop
             )
             resize_mode = "fill"
         else:
-            crops_coords = None
+            block_state.crops_coords = None
             resize_mode = "default"
 
-        processed_image = components.image_processor.preprocess(
+        image = components.image_processor.preprocess(
             block_state.image,
             height=height,
             width=width,
-            crops_coords=crops_coords,
+            crops_coords=block_state.crops_coords,
             resize_mode=resize_mode,
         )
 
-        processed_image = processed_image.to(dtype=torch.float32)
+        image = image.to(dtype=torch.float32)
 
-        processed_mask_image = components.mask_processor.preprocess(
+        mask = components.mask_processor.preprocess(
             block_state.mask_image,
             height=height,
             width=width,
             resize_mode=resize_mode,
-            crops_coords=crops_coords,
+            crops_coords=block_state.crops_coords,
         )
 
-        masked_image = processed_image * (block_state.mask_latents < 0.5)
+        masked_image = image * (mask < 0.5)
 
         # Prepare image latent variables
         block_state.image_latents = encode_vae_image(
-            image=processed_image,
+            image=image,
             vae=components.vae,
             generator=block_state.generator,
             dtype=dtype,
@@ -763,11 +761,11 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
 
         # resize mask to match the image latents
         _, _, height_latents, width_latents = block_state.image_latents.shape
-        block_state.mask_latents = torch.nn.functional.interpolate(
-            processed_mask_image,
+        block_state.mask = torch.nn.functional.interpolate(
+            mask,
             size=(height_latents, width_latents),
         )
-        block_state.mask_latents = block_state.mask_latents.to(dtype=dtype, device=device)
+        block_state.mask = block_state.mask.to(dtype=dtype, device=device)
 
         self.set_block_state(state, block_state)
```
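With the rename, the block builds the masked image from the full-resolution mask and stores the latent-resolution mask as `block_state.mask`. A sketch of that mask path with illustrative shapes (1024px image, VAE scale factor 8):

```python
import torch
import torch.nn.functional as F

image = torch.rand(1, 3, 1024, 1024) * 2 - 1          # preprocessed image in [-1, 1]
mask = (torch.rand(1, 1, 1024, 1024) > 0.5).float()   # preprocessed binary mask

# Keep only pixels outside the inpaint region (mask >= 0.5 marks the hole).
masked_image = image * (mask < 0.5)

# Downsample the mask to the latent grid so it can be concatenated with the
# 4-channel latents during denoising (1024 / 8 = 128).
height_latents, width_latents = 128, 128
mask_for_latents = F.interpolate(mask, size=(height_latents, width_latents))
print(masked_image.shape, mask_for_latents.shape)  # (1, 3, 1024, 1024) (1, 1, 128, 128)
```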

src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@
     StableDiffusionXLPrepareAdditionalConditioningStep,
     StableDiffusionXLPrepareLatentsStep,
     StableDiffusionXLSetTimestepsStep,
+    StableDiffusionXLLCMStep,
 )
 from .decoders import (
     StableDiffusionXLDecodeStep,
```
```diff
@@ -79,6 +80,16 @@ def description(self):
         return "Run IP Adapter step if `ip_adapter_image` is provided. This step should be placed before the 'input' step.\n"
 
 
+class StableDiffusionXLAutoLCMStep(AutoPipelineBlocks):
+    block_classes = [StableDiffusionXLLCMStep]
+    block_names = ["lcm"]
+    block_trigger_inputs = ["embedded_guidance_scale"]
+
+    @property
+    def description(self):
+        return "Run LCM step if `embedded_guidance_scale` is provided. This step should be placed before the 'input' step.\n"
+
+
 # before_denoise: text2img
 class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
```
```diff
@@ -262,6 +273,7 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
         StableDiffusionXLAutoIPAdapterStep,
         StableDiffusionXLAutoVaeEncoderStep,
         StableDiffusionXLAutoBeforeDenoiseStep,
+        StableDiffusionXLAutoLCMStep,
         StableDiffusionXLAutoControlNetInputStep,
         StableDiffusionXLAutoDenoiseStep,
         StableDiffusionXLAutoDecodeStep,
@@ -271,6 +283,7 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
         "ip_adapter",
         "image_encoder",
         "before_denoise",
+        "lcm",
         "controlnet_input",
         "denoise",
         "decoder",
```
```diff
@@ -286,6 +299,7 @@ def description(self):
             + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
             + "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n"
             + "- for text-to-image generation, all you need to provide is `prompt`"
+            + "\n- to run the latent consistency models workflow, you need to provide `embedded_guidance_scale`"
         )
 
 
```
```diff
@@ -357,6 +371,13 @@ def description(self):
     ]
 )
 
+LCM_BLOCKS = InsertableDict(
+    [
+        ("lcm", StableDiffusionXLAutoLCMStep),
+    ]
+)
+
 AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", StableDiffusionXLTextEncoderStep),
@@ -376,5 +397,6 @@ def description(self):
     "inpaint": INPAINT_BLOCKS,
     "controlnet": CONTROLNET_BLOCKS,
     "ip_adapter": IP_ADAPTER_BLOCKS,
+    "lcm": LCM_BLOCKS,
     "auto": AUTO_BLOCKS,
 }
```
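`AutoPipelineBlocks` gates each sub-block on a trigger input, so the new `lcm` step only runs when `embedded_guidance_scale` is passed. A self-contained toy of that dispatch rule (illustrative only, not the diffusers API):

```python
class ToyAutoBlocks:
    """Run a sub-block only when its trigger input is present."""

    block_names = ["lcm"]
    block_trigger_inputs = ["embedded_guidance_scale"]

    def __call__(self, **inputs):
        for name, trigger in zip(self.block_names, self.block_trigger_inputs):
            if inputs.get(trigger) is not None:
                return f"running {name!r} (triggered by {trigger!r})"
        return "no trigger input; step skipped"

print(ToyAutoBlocks()(embedded_guidance_scale=7.5))  # running 'lcm' ...
print(ToyAutoBlocks()(prompt="a cat"))               # step skipped
```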

src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -95,7 +95,10 @@ def requires_unconditional_embeds(self):
         # by default, always prepare unconditional embeddings
         requires_unconditional_embeds = True
 
-        if hasattr(self, "guider") and self.guider is not None:
+        if hasattr(self, "unet") and self.unet is not None and self.unet.config.time_cond_proj_dim is not None:
+            requires_unconditional_embeds = False
+
+        elif hasattr(self, "guider") and self.guider is not None:
             requires_unconditional_embeds = self.guider.num_conditions > 1
 
         return requires_unconditional_embeds
```
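`time_cond_proj_dim` is only set on guidance-distilled (LCM-style) UNets, which consume the guidance scale as a `timestep_cond` embedding instead of running classifier-free guidance, so no unconditional embeddings are needed for them. A sketch of that embedding, modeled on the `get_guidance_scale_embedding` helper in the standard pipelines:

```python
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 256) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale, following the LCM recipe:
    # callers conventionally pass w = guidance_scale - 1.
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim) * -emb)
    emb = w[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb

# timestep_cond for a batch of one, matching unet.config.time_cond_proj_dim == 256.
timestep_cond = guidance_scale_embedding(torch.tensor([7.5 - 1.0]), embedding_dim=256)
print(timestep_cond.shape)  # torch.Size([1, 256])
```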
