
Commit a2ea45a

rootonchair, sayakpaul, and dg845 authored
LTX2 distilled checkpoint support (#12934)
* add constants for distilled sigma values and allow the LTX pipeline to pass in sigmas
* add time conditioning conversion and token packing for latents
* make style & quality
* remove prenorm
* add sigma param to ltx2 i2v
* fix copies and add pack latents to i2v
* Apply suggestions from code review (Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>)
* Infer latent dims if latents/audio_latents is supplied
* add note for predefined sigmas
* run make style and quality
* revert distilled timesteps & set original_state_dict_repo_id to default None
* add latent normalize
* add create noised state, delete last sigmas
* remove normalize step in latent upsample pipeline and move it to the ltx2 pipeline
* add create noised latent to i2v pipeline
* fix copies
* parse "none" value in weight conversion script
* explicit shape handling
* Apply suggestions from code review (Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>)
* make style
* add two-stage inference tests
* add ltx2 documentation
* update i2v expected_audio_slice
* Apply suggestions from code review (Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>)
* Apply suggestion from @dg845 (Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>)
* Update ltx2.md to remove one-stage example: removed the one-stage generation example code and added comments for noise scale in two-stage generation

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>
Co-authored-by: Daniel Gu <dgu8957@gmail.com>
1 parent a58d0b9 commit a2ea45a

9 files changed

Lines changed: 508 additions & 74 deletions

File tree

docs/source/en/api/pipelines/ltx2.md

Lines changed: 173 additions & 0 deletions
@@ -24,6 +24,179 @@ You can find all the original LTX-Video checkpoints under the [Lightricks](https

The original codebase for LTX-2 can be found [here](https://github.com/Lightricks/LTX-2).

## Two-Stage Generation

This is the recommended pipeline for production-quality generation. It is composed of two stages:

- Stage 1: Generate a video at the target resolution using diffusion sampling with classifier-free guidance (CFG). This stage produces a coherent low-noise video sequence that respects the text/image conditioning.
- Stage 2: Upsample the Stage 1 output by 2x and refine details using a distilled LoRA model to improve fidelity and visual quality. Stage 2 may apply lighter CFG to preserve the structure from Stage 1 while enhancing texture and sharpness.

Sample usage of the two-stage text-to-video pipeline:

```py
import torch

from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video

device = "cuda:0"
width = 768
height = 512

pipe = LTX2Pipeline.from_pretrained(
    "Lightricks/LTX-2", torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)

prompt = "A beautiful sunset over the ocean"
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."

# Stage 1: default (non-distilled) inference
frame_rate = 24.0
video_latent, audio_latent = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    num_frames=121,
    frame_rate=frame_rate,
    num_inference_steps=40,
    sigmas=None,
    guidance_scale=4.0,
    output_type="latent",
    return_dict=False,
)

latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    "Lightricks/LTX-2",
    subfolder="latent_upsampler",
    torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
    latents=video_latent,
    output_type="latent",
    return_dict=False,
)[0]

# Load the Stage 2 distilled LoRA
pipe.load_lora_weights(
    "Lightricks/LTX-2", adapter_name="stage_2_distilled", weight_name="ltx-2-19b-distilled-lora-384.safetensors"
)
pipe.set_adapters("stage_2_distilled", 1.0)
# VAE tiling is usually necessary to avoid OOM errors during VAE decoding
pipe.vae.enable_tiling()
# Reconfigure the scheduler so the Stage 2 distilled sigmas are used as-is
new_scheduler = FlowMatchEulerDiscreteScheduler.from_config(
    pipe.scheduler.config, use_dynamic_shifting=False, shift_terminal=None
)
pipe.scheduler = new_scheduler
# Stage 2: inference with the distilled LoRA and sigmas
video, audio = pipe(
    latents=upscaled_video_latent,
    audio_latents=audio_latent,
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=3,
    # Renoise with the first sigma value; see
    # https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py#L218
    noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
    sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
    guidance_scale=1.0,
    output_type="np",
    return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
    output_path="ltx2_lora_distilled_sample.mp4",
)
```
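
A note on `noise_scale`: Stage 2 starts from the upscaled Stage 1 latents rather than pure noise, so the pipeline first re-noises them at the first distilled sigma. Below is a minimal, hypothetical sketch of that renoising step, assuming the linear flow-matching interpolation convention; the actual implementation lives inside the pipeline (see the `ti2vid_two_stages.py` link above), and `create_noised_latents` is an illustrative name, not a public diffusers API.

```py
import torch

def create_noised_latents(latents: torch.Tensor, noise_scale: float, generator=None) -> torch.Tensor:
    # Blend the clean upscaled latents with fresh Gaussian noise at sigma = noise_scale,
    # so the short Stage 2 schedule only has to remove the noise that was just added.
    noise = torch.randn(latents.shape, generator=generator, device=latents.device, dtype=latents.dtype)
    return (1.0 - noise_scale) * latents + noise_scale * noise
```
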

## Distilled checkpoint generation

The fastest two-stage generation setup, using a distilled checkpoint:

```py
import torch
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video

device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"

pipe = LTX2Pipeline.from_pretrained(
    model_path, torch_dtype=torch.bfloat16
)
pipe.enable_sequential_cpu_offload(device=device)

prompt = "A beautiful sunset over the ocean"
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."

# Stage 1: distilled inference with the predefined distilled sigmas
frame_rate = 24.0
video_latent, audio_latent = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    num_frames=121,
    frame_rate=frame_rate,
    num_inference_steps=8,
    sigmas=DISTILLED_SIGMA_VALUES,
    guidance_scale=1.0,
    generator=generator,
    output_type="latent",
    return_dict=False,
)

latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    model_path,
    subfolder="latent_upsampler",
    torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
    latents=video_latent,
    output_type="latent",
    return_dict=False,
)[0]

# Stage 2: refine the upscaled latents with the Stage 2 distilled sigmas
video, audio = pipe(
    latents=upscaled_video_latent,
    audio_latents=audio_latent,
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=3,
    # Renoise with the first sigma value; see
    # https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/src/ltx_pipelines/distilled.py#L178
    noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
    sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
    generator=generator,
    guidance_scale=1.0,
    output_type="np",
    return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
    output_path="ltx2_distilled_sample.mp4",
)
```
## LTX2Pipeline

[[autodoc]] LTX2Pipeline

scripts/convert_ltx2_to_diffusers.py

Lines changed: 27 additions & 11 deletions
```diff
@@ -63,6 +63,8 @@
     "up_blocks.4": "up_blocks.1",
     "up_blocks.5": "up_blocks.2.upsamplers.0",
     "up_blocks.6": "up_blocks.2",
+    "last_time_embedder": "time_embedder",
+    "last_scale_shift_table": "scale_shift_table",
     # Common
     # For all 3D ResNets
     "res_blocks": "resnets",
@@ -372,7 +374,9 @@ def convert_ltx2_connectors(original_state_dict: Dict[str, Any], version: str) -
     return connectors


-def get_ltx2_video_vae_config(version: str) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
+def get_ltx2_video_vae_config(
+    version: str, timestep_conditioning: bool = False
+) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
     if version == "test":
         config = {
             "model_id": "diffusers-internal-dev/dummy-ltx2",
@@ -396,7 +400,7 @@ def get_ltx2_video_vae_config(version: str) -> Tuple[Dict[str, Any], Dict[str, A
             "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
             "upsample_residual": (True, True, True),
             "upsample_factor": (2, 2, 2),
-            "timestep_conditioning": False,
+            "timestep_conditioning": timestep_conditioning,
             "patch_size": 4,
             "patch_size_t": 1,
             "resnet_norm_eps": 1e-6,
@@ -433,7 +437,7 @@ def get_ltx2_video_vae_config(version: str) -> Tuple[Dict[str, Any], Dict[str, A
             "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
             "upsample_residual": (True, True, True),
             "upsample_factor": (2, 2, 2),
-            "timestep_conditioning": False,
+            "timestep_conditioning": timestep_conditioning,
             "patch_size": 4,
             "patch_size_t": 1,
             "resnet_norm_eps": 1e-6,
@@ -450,8 +454,10 @@ def get_ltx2_video_vae_config(version: str) -> Tuple[Dict[str, Any], Dict[str, A
     return config, rename_dict, special_keys_remap


-def convert_ltx2_video_vae(original_state_dict: Dict[str, Any], version: str) -> Dict[str, Any]:
-    config, rename_dict, special_keys_remap = get_ltx2_video_vae_config(version)
+def convert_ltx2_video_vae(
+    original_state_dict: Dict[str, Any], version: str, timestep_conditioning: bool
+) -> Dict[str, Any]:
+    config, rename_dict, special_keys_remap = get_ltx2_video_vae_config(version, timestep_conditioning)
     diffusers_config = config["diffusers_config"]

     with init_empty_weights():
@@ -659,10 +665,15 @@ def get_model_state_dict_from_combined_ckpt(combined_ckpt: Dict[str, Any], prefi
 def get_args():
     parser = argparse.ArgumentParser()

+    def none_or_str(value: str):
+        if isinstance(value, str) and value.lower() == "none":
+            return None
+        return value
+
     parser.add_argument(
         "--original_state_dict_repo_id",
         default="Lightricks/LTX-2",
-        type=str,
+        type=none_or_str,
         help="HF Hub repo id with LTX 2.0 checkpoint",
     )
     parser.add_argument(
@@ -682,7 +693,7 @@ def get_args():
     parser.add_argument(
         "--combined_filename",
         default="ltx-2-19b-dev.safetensors",
-        type=str,
+        type=none_or_str,
         help="Filename for combined checkpoint with all LTX 2.0 models (VAE, DiT, etc.)",
     )
     parser.add_argument("--vae_prefix", default="vae.", type=str)
@@ -701,22 +712,25 @@ def get_args():
     parser.add_argument(
         "--text_encoder_model_id",
         default="google/gemma-3-12b-it-qat-q4_0-unquantized",
-        type=str,
+        type=none_or_str,
         help="HF Hub id for the LTX 2.0 base text encoder model",
     )
     parser.add_argument(
         "--tokenizer_id",
         default="google/gemma-3-12b-it-qat-q4_0-unquantized",
-        type=str,
+        type=none_or_str,
         help="HF Hub id for the LTX 2.0 text tokenizer",
     )
     parser.add_argument(
         "--latent_upsampler_filename",
         default="ltx-2-spatial-upscaler-x2-1.0.safetensors",
-        type=str,
+        type=none_or_str,
         help="Latent upsampler filename",
     )

+    parser.add_argument(
+        "--timestep_conditioning", action="store_true", help="Whether to add timestep condition to the video VAE model"
+    )
     parser.add_argument("--vae", action="store_true", help="Whether to convert the video VAE model")
     parser.add_argument("--audio_vae", action="store_true", help="Whether to convert the audio VAE model")
     parser.add_argument("--dit", action="store_true", help="Whether to convert the DiT model")
@@ -786,7 +800,9 @@ def main(args):
         original_vae_ckpt = load_hub_or_local_checkpoint(filename=args.vae_filename)
     elif combined_ckpt is not None:
         original_vae_ckpt = get_model_state_dict_from_combined_ckpt(combined_ckpt, args.vae_prefix)
-    vae = convert_ltx2_video_vae(original_vae_ckpt, version=args.version)
+    vae = convert_ltx2_video_vae(
+        original_vae_ckpt, version=args.version, timestep_conditioning=args.timestep_conditioning
+    )
     if not args.full_pipeline and not args.upsample_pipeline:
         vae.to(vae_dtype).save_pretrained(os.path.join(args.output_path, "vae"))
```
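The `none_or_str` helper lets callers pass the literal string `none` on the command line to disable an optional component. A standalone sketch of its behavior (the function body is copied from the script above; the example invocation in the comments is illustrative, not a documented recipe):

```py
def none_or_str(value: str):
    # argparse passes raw strings; map the literal "none" (any casing) to None.
    if isinstance(value, str) and value.lower() == "none":
        return None
    return value

assert none_or_str("none") is None
assert none_or_str("None") is None
assert none_or_str("ltx-2-19b-dev.safetensors") == "ltx-2-19b-dev.safetensors"

# Hypothetical invocation combining the new options, e.g. converting the video VAE
# with timestep conditioning while skipping the combined checkpoint:
#   python scripts/convert_ltx2_to_diffusers.py --vae --timestep_conditioning \
#       --combined_filename none --output_path ./converted
```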

src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -743,8 +743,8 @@ def __init__(

         # Per-channel statistics for normalizing and denormalizing the latent representation. These statistics are
         # computed over the entire dataset and stored in the model's checkpoint under the AudioVAE state_dict.
-        latents_std = torch.zeros((base_channels,))
-        latents_mean = torch.ones((base_channels,))
+        latents_std = torch.ones((base_channels,))
+        latents_mean = torch.zeros((base_channels,))
         self.register_buffer("latents_mean", latents_mean, persistent=True)
         self.register_buffer("latents_std", latents_std, persistent=True)
```
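
The swapped defaults matter because these buffers normalize and denormalize audio latents: with the old initialization (`latents_std` all zeros), any division by the std would blow up before the real checkpoint statistics were loaded, while the corrected defaults (mean 0, std 1) make the transform an identity. A minimal sketch, assuming the usual `(x - mean) / std` convention (the channel count and tensor shape here are illustrative):

```py
import torch

base_channels = 8  # hypothetical channel count
latents_mean = torch.zeros((base_channels,))  # corrected default
latents_std = torch.ones((base_channels,))    # corrected default

latents = torch.randn(2, base_channels, 16)  # (batch, channels, time)
normalized = (latents - latents_mean[None, :, None]) / latents_std[None, :, None]
restored = normalized * latents_std[None, :, None] + latents_mean[None, :, None]

# With mean=0/std=1 this round-trips exactly; the old zero std would divide by zero.
assert torch.allclose(latents, restored)
```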
