AI-Hypercomputer
diff --git a/‎src/maxdiffusion/checkpointing/ltx2_checkpointer.py‎
Lines changed: 3 additions & 3 deletions b/‎src/maxdiffusion/checkpointing/ltx2_checkpointer.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/maxdiffusion/configs/ltx2_video.yml‎
Lines changed: 10 additions & 0 deletions b/‎src/maxdiffusion/configs/ltx2_video.yml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/maxdiffusion/generate_ltx2.py‎
Lines changed: 8 additions & 3 deletions b/‎src/maxdiffusion/generate_ltx2.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎src/maxdiffusion/models/attention_flax.py‎
Lines changed: 1 addition & 1 deletion b/‎src/maxdiffusion/models/attention_flax.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/maxdiffusion/models/ltx2/latent_upsampler_ltx2.py‎
Lines changed: 264 additions & 0 deletions b/‎src/maxdiffusion/models/ltx2/latent_upsampler_ltx2.py‎
Lines changed: 264 additions & 0 deletions
@@ -79,19 +79,19 @@ def load_ltx2_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[di
     return restored_checkpoint, step
 
   def load_checkpoint(
-      self, step=None, vae_only=False, load_transformer=True
+      self, step=None, vae_only=False, load_transformer=True, load_upsampler=False
   ) -> Tuple[LTX2Pipeline, Optional[dict], Optional[int]]:
     restored_checkpoint, step = self.load_ltx2_configs_from_orbax(step)
     opt_state = None
 
     if restored_checkpoint:
       max_logging.log("Loading LTX2 pipeline from checkpoint")
-      pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint, vae_only, load_transformer)
+      pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint, vae_only, load_transformer, load_upsampler)
       if "opt_state" in restored_checkpoint.ltx2_state.keys():
         opt_state = restored_checkpoint.ltx2_state["opt_state"]
     else:
       max_logging.log("No checkpoint found, loading pipeline from pretrained hub")
-      pipeline = LTX2Pipeline.from_pretrained(self.config, vae_only, load_transformer)
+      pipeline = LTX2Pipeline.from_pretrained(self.config, vae_only, load_transformer, load_upsampler)
 
     return pipeline, opt_state, step
 
 
@@ -92,3 +92,13 @@ jit_initializers: True
 enable_single_replica_ckpt_restoring: False
 seed: 0
 audio_format: "s16"
+
+# LTX-2 Latent Upsampler
+run_latent_upsampler: False
+upsampler_model_path: "Lightricks/LTX-2"
+upsampler_spatial_patch_size: 1
+upsampler_temporal_patch_size: 1
+upsampler_adain_factor: 0.0
+upsampler_tone_map_compression_ratio: 0.0
+upsampler_rational_spatial_scale: 2.0
+upsampler_output_type: "pil"
@@ -81,7 +81,6 @@ def get_git_commit_hash():
 
 
 def call_pipeline(config, pipeline, prompt, negative_prompt):
-  # Set default generation arguments
   generator = jax.random.key(config.seed) if hasattr(config, "seed") else jax.random.key(0)
   guidance_scale = config.guidance_scale if hasattr(config, "guidance_scale") else 3.0
 
@@ -99,6 +98,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
       decode_noise_scale=getattr(config, "decode_noise_scale", None),
       max_sequence_length=getattr(config, "max_sequence_length", 1024),
       dtype=jnp.bfloat16 if getattr(config, "activations_dtype", "bfloat16") == "bfloat16" else jnp.float32,
+      output_type=getattr(config, "upsampler_output_type", "pil"),
   )
   return out
 
@@ -114,9 +114,11 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
     else:
       max_logging.log("Could not retrieve Git commit hash.")
 
+  checkpoint_loader = LTX2Checkpointer(config=config)
   if pipeline is None:
-    checkpoint_loader = LTX2Checkpointer(config=config)
-    pipeline, _, _ = checkpoint_loader.load_checkpoint()
+    # Use the config flag to determine if the upsampler should be loaded
+    run_latent_upsampler = getattr(config, "run_latent_upsampler", False)
+    pipeline, _, _ = checkpoint_loader.load_checkpoint(load_upsampler=run_latent_upsampler)
 
   pipeline.enable_vae_slicing()
   pipeline.enable_vae_tiling()
@@ -135,6 +137,7 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
   )
 
   out = call_pipeline(config, pipeline, prompt, negative_prompt)
+
   # out should have .frames and .audio
   videos = out.frames if hasattr(out, "frames") else out[0]
   audios = out.audio if hasattr(out, "audio") else None
@@ -143,6 +146,8 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
   max_logging.log(f"model name: {getattr(config, 'model_name', 'ltx-video')}")
   max_logging.log(f"model path: {config.pretrained_model_name_or_path}")
   max_logging.log(f"model type: {getattr(config, 'model_type', 'T2V')}")
+  if getattr(config, "run_latent_upsampler", False):
+    max_logging.log(f"upsampler model path: {config.upsampler_model_path}")
   max_logging.log(f"hardware: {jax.devices()[0].platform}")
   max_logging.log(f"number of devices: {jax.device_count()}")
   max_logging.log(f"per_device_batch_size: {config.per_device_batch_size}")
 
@@ -128,7 +128,7 @@ def _unflatten_heads(tensor, heads):
   return tensor
 
 
-def _reshape_data_for_flash(tensor, heads, num_context_shards = 1):
+def _reshape_data_for_flash(tensor, heads, num_context_shards=1):
   """
   Reshapes tensors for pallas flash attention adding padding to both seq_len and head_dim.
   Pads seq_len to a multiple of flash_block_size, and ensures the resulting number of
 
@@ -0,0 +1,264 @@
+"""
+Flax/JAX implementation of the LTX-2 Latent Upsampler.
+"""
+
+import os
+import json
+import math
+from typing import Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+import flax.linen as nn
+
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError
+
+# Supported spatial scale factors, each mapped to a (numerator, denominator) pair.
+# SpatialRationalResampler pixel-shuffles up by the numerator and then passes the
+# denominator to BlurDownsample as its stride, e.g. 1.5 -> up by 3, stride 2.
+# Scales missing from this table are rejected with a ValueError by the resampler.
+RATIONAL_RESAMPLER_SCALE_MAPPING = {
+    0.75: (3, 4),
+    1.5: (3, 2),
+    2.0: (2, 1),
+    4.0: (4, 1),
+}
+
+
+class ResBlock(nn.Module):
+  """Residual block built from two convolutions, each followed by a 32-group GroupNorm.
+
+  SiLU is applied after the first normalization and once more after the block
+  input has been added back onto the second normalization's output.
+
+  Attributes:
+    channels: feature count produced by the second convolution.
+    mid_channels: feature count produced by the first convolution; `channels` is used when None.
+    dims: number of convolved axes; determines the rank of the kernel and padding tuples.
+  """
+
+  channels: int
+  mid_channels: Optional[int] = None
+  dims: int = 3
+
+  @nn.compact
+  def __call__(self, hidden_states: jax.Array) -> jax.Array:
+    inner_features = self.channels if self.mid_channels is None else self.mid_channels
+
+    # Both convolutions share the same geometry: width-3 kernel with one element
+    # of padding on each side of every convolved axis.
+    conv_kwargs = {
+        "kernel_size": (3,) * self.dims,
+        "padding": ((1, 1),) * self.dims,
+    }
+
+    skip = hidden_states
+
+    out = nn.Conv(inner_features, name="conv1", **conv_kwargs)(hidden_states)
+    out = nn.GroupNorm(num_groups=32, epsilon=1e-5, name="norm1")(out)
+    out = nn.silu(out)
+
+    out = nn.Conv(self.channels, name="conv2", **conv_kwargs)(out)
+    out = nn.GroupNorm(num_groups=32, epsilon=1e-5, name="norm2")(out)
+
+    # The activation comes after the skip connection, not before it.
+    return nn.silu(out + skip)
+
+
+class PixelShuffleND(nn.Module):
+  """Pixel shuffle for channels-last arrays: folds factors of the channel axis into leading axes.
+
+  The last axis is split as (c, p1, ...) with `c` outermost, and each `p_i` is
+  then interleaved into the axis it upscales.
+
+  Attributes:
+    dims: how many axes are upscaled (1, 2 or 3).
+    upscale_factors: per-axis factors; only the first `dims` entries are read.
+
+  Supported layouts:
+    dims == 3: (b, d, h, w, c * p1 * p2 * p3) -> (b, d * p1, h * p2, w * p3, c)
+    dims == 2: (b, h, w, c * p1 * p2)         -> (b, h * p1, w * p2, c)
+    dims == 1: (b, f, h, w, c * p1)           -> (b, f * p1, h, w, c)
+
+  Raises:
+    ValueError: if `dims` is not 1, 2 or 3.
+  """
+
+  dims: int
+  upscale_factors: Tuple[int, ...] = (2, 2, 2)
+
+  @nn.compact
+  def __call__(self, x: jax.Array) -> jax.Array:
+    if self.dims == 3:
+      p1, p2, p3 = self.upscale_factors[:3]
+      b, d, h, w, c_p = x.shape
+      c = c_p // (p1 * p2 * p3)
+      x = jnp.reshape(x, (b, d, h, w, c, p1, p2, p3))
+      # Place each factor right after the axis it expands, channels last.
+      x = jnp.transpose(x, (0, 1, 5, 2, 6, 3, 7, 4))
+      x = jnp.reshape(x, (b, d * p1, h * p2, w * p3, c))
+      return x
+    if self.dims == 2:
+      p1, p2 = self.upscale_factors[:2]
+      b, h, w, c_p = x.shape
+      c = c_p // (p1 * p2)
+      x = jnp.reshape(x, (b, h, w, c, p1, p2))
+      x = jnp.transpose(x, (0, 1, 4, 2, 5, 3))
+      x = jnp.reshape(x, (b, h * p1, w * p2, c))
+      return x
+    if self.dims == 1:
+      # Only axis 1 is upscaled; the input is still unpacked as a 5-D array.
+      p1 = self.upscale_factors[0]
+      b, f, h, w, c_p = x.shape
+      c = c_p // p1
+      x = jnp.reshape(x, (b, f, h, w, c, p1))
+      x = jnp.transpose(x, (0, 1, 5, 2, 3, 4))
+      x = jnp.reshape(x, (b, f * p1, h, w, c))
+      return x
+
+    # Previously an unsupported `dims` fell through and silently returned None,
+    # which only surfaced later as an unrelated error in the caller.
+    raise ValueError(f"`dims` must be 1, 2 or 3 but is {self.dims}")
+
+
+class BlurDownsample(nn.Module):
+  """Depthwise blur with a normalized binomial kernel, subsampled by `stride`.
+
+  The filter is applied over the two axes that precede the channel axis. For
+  `dims == 3` the 5-D input is unpacked as (b, f, h, w, c) and the first two
+  axes are merged so the same 2-D convolution runs on every frame.
+
+  Attributes:
+    dims: 2 for 4-D inputs, 3 for 5-D inputs.
+    stride: subsampling step along both filtered axes; 1 returns the input untouched.
+    kernel_size: odd filter width >= 3; taps are the binomial coefficients of `kernel_size - 1`.
+
+  Raises:
+    ValueError: from `setup` when `dims` or `kernel_size` is out of range.
+  """
+
+  dims: int
+  stride: int
+  kernel_size: int = 5
+
+  def setup(self):
+    if self.dims not in (2, 3):
+      raise ValueError(f"`dims` must be either 2 or 3 but is {self.dims}")
+    if self.kernel_size < 3 or self.kernel_size % 2 != 1:
+      raise ValueError(f"`kernel_size` must be an odd number >= 3 but is {self.kernel_size}")
+
+    taps = self.kernel_size
+    # 1-D binomial row; its outer product with itself gives the 2-D filter.
+    binomial = jnp.array([math.comb(taps - 1, i) for i in range(taps)], dtype=jnp.float32)
+    blur = jnp.outer(binomial, binomial)
+    blur = blur / jnp.sum(blur)
+    # Stored as HWIO with a single input and a single output channel.
+    self.kernel = jnp.reshape(blur, (taps, taps, 1, 1))
+
+  def __call__(self, x: jax.Array) -> jax.Array:
+    if self.stride == 1:
+      return x
+
+    half = self.kernel_size // 2
+    channels = x.shape[-1]
+
+    # Depthwise filtering: one copy of the kernel per channel, i.e. shape
+    # [H, W, 1, C], paired with feature_group_count == C below.
+    depthwise_kernel = jnp.tile(self.kernel, (1, 1, 1, channels))
+
+    fold_frames = self.dims != 2
+    if fold_frames:
+      b, f, h, w, _ = x.shape
+      x = jnp.reshape(x, (b * f, h, w, channels))
+
+    x = jax.lax.conv_general_dilated(
+        lhs=x,
+        rhs=depthwise_kernel,
+        window_strides=(self.stride, self.stride),
+        padding=((half, half), (half, half)),
+        feature_group_count=channels,
+        dimension_numbers=("NHWC", "HWIO", "NHWC"),
+    )
+
+    if fold_frames:
+      # Restore the separate leading axes using the downsampled extents.
+      x = jnp.reshape(x, (b, f, x.shape[1], x.shape[2], channels))
+
+    return x
+
+
+class SpatialRationalResampler(nn.Module):
+  """Resamples a 4-D channels-last array by a rational factor num/den.
+
+  A 3x3 convolution expands the features by num**2, a 2-D pixel shuffle turns
+  that into a num-times upscale, and BlurDownsample is then applied with
+  stride den.
+
+  Attributes:
+    mid_channels: feature count left after the pixel shuffle.
+    scale: requested factor; must be a key of RATIONAL_RESAMPLER_SCALE_MAPPING.
+
+  Raises:
+    ValueError: if `scale` is not in RATIONAL_RESAMPLER_SCALE_MAPPING.
+  """
+
+  mid_channels: int = 1024
+  scale: float = 2.0
+
+  @nn.compact
+  def __call__(self, x: jax.Array) -> jax.Array:
+    ratio = RATIONAL_RESAMPLER_SCALE_MAPPING.get(self.scale)
+    if ratio is None:
+      raise ValueError(f"scale {self.scale} not supported.")
+    up_factor, down_stride = ratio
+
+    expanded = nn.Conv(
+        features=up_factor * up_factor * self.mid_channels,
+        kernel_size=(3, 3),
+        padding=((1, 1), (1, 1)),
+        name="conv",
+    )(x)
+    shuffled = PixelShuffleND(dims=2, upscale_factors=(up_factor, up_factor))(expanded)
+    return BlurDownsample(dims=2, stride=down_stride)(shuffled)
+
+
+class LTX2LatentUpsamplerModel(nn.Module):
+  """Latent upsampler: conv stem, residual blocks, an upsampling stage, more residual blocks, output conv.
+
+  The input is unpacked as a 5-D array (b, f, h, w, c). With `dims == 2` the
+  first two axes are merged and every layer is 2-D; otherwise 3-D layers are
+  used and only the spatial-only upsampling stage temporarily merges them.
+  """
+
+  # Feature count produced by the final convolution.
+  in_channels: int = 128
+  # Feature count used by the stem, the residual blocks and the upsampling stage.
+  mid_channels: int = 1024
+  # Number of ResBlocks before the upsampling stage, and again after it.
+  num_blocks_per_stage: int = 4
+  # 2 selects the all-2-D path; any other value selects the 3-D path.
+  dims: int = 3
+  spatial_upsample: bool = True
+  # Only consulted on the 3-D path.
+  temporal_upsample: bool = False
+  # When not None, spatial upsampling uses SpatialRationalResampler with this scale;
+  # when None, a fixed conv + 2-D pixel shuffle is used instead.
+  rational_spatial_scale: Optional[float] = 2.0
+
+  @classmethod
+  def load_config(cls, pretrained_model_name_or_path: str, subfolder: str = "", **kwargs):
+    """Dynamically loads config.json from a local path or the Hugging Face Hub.
+
+    Args:
+      pretrained_model_name_or_path: an existing local directory, or otherwise a
+        value passed to `hf_hub_download` as `repo_id`.
+      subfolder: folder containing `config.json`, relative to the directory or repo.
+      **kwargs: overrides merged on top of the loaded values.
+
+    Returns:
+      The parsed config dict updated with `kwargs`. If the file cannot be fetched,
+      read or parsed, a warning is printed and `kwargs` alone is returned.
+    """
+    try:
+      if os.path.isdir(pretrained_model_name_or_path):
+        config_file = os.path.join(pretrained_model_name_or_path, subfolder, "config.json")
+      else:
+        config_file = hf_hub_download(repo_id=pretrained_model_name_or_path, filename="config.json", subfolder=subfolder)
+
+      with open(config_file, "r") as f:
+        config_dict = json.load(f)
+
+      # Apply any runtime overrides passed in via kwargs
+      config_dict.update(kwargs)
+      return config_dict
+
+    except (OSError, json.JSONDecodeError, EntryNotFoundError, HfHubHTTPError) as e:
+      # Best-effort: a missing or unreadable config is reported, not raised.
+      print(f"Warning: Could not load upsampler config.json (using defaults). Reason: {e}")
+      return kwargs
+
+  @nn.compact
+  def __call__(self, hidden_states: jax.Array) -> jax.Array:
+    """Runs the upsampler on a 5-D array and returns a 5-D array with `in_channels` features."""
+    b, f, h, w, c = hidden_states.shape
+
+    if self.dims == 2:
+      # 2-D path: merge the first two axes so each frame is processed independently.
+      hidden_states = jnp.reshape(hidden_states, (b * f, h, w, c))
+
+      hidden_states = nn.Conv(self.mid_channels, kernel_size=(3, 3), padding=((1, 1), (1, 1)), name="initial_conv")(
+          hidden_states
+      )
+      hidden_states = nn.GroupNorm(epsilon=1e-5, num_groups=32, name="initial_norm")(hidden_states)
+      hidden_states = nn.silu(hidden_states)
+
+      for i in range(self.num_blocks_per_stage):
+        hidden_states = ResBlock(channels=self.mid_channels, dims=2, name=f"res_blocks_{i}")(hidden_states)
+
+      if self.spatial_upsample:
+        if self.rational_spatial_scale is not None:
+          hidden_states = SpatialRationalResampler(self.mid_channels, self.rational_spatial_scale, name="upsampler")(
+              hidden_states
+          )
+        else:
+          # Fixed 2x: expand features 4x, then shuffle them into both spatial axes.
+          hidden_states = nn.Conv(4 * self.mid_channels, kernel_size=(3, 3), padding=((1, 1), (1, 1)), name="upsampler_0")(
+              hidden_states
+          )
+          hidden_states = PixelShuffleND(dims=2)(hidden_states)
+
+      for i in range(self.num_blocks_per_stage):
+        hidden_states = ResBlock(channels=self.mid_channels, dims=2, name=f"post_upsample_res_blocks_{i}")(hidden_states)
+
+      hidden_states = nn.Conv(self.in_channels, kernel_size=(3, 3), padding=((1, 1), (1, 1)), name="final_conv")(
+          hidden_states
+      )
+
+      # Split the merged leading axis back out, using the (possibly resized) spatial extents.
+      h2, w2 = hidden_states.shape[1], hidden_states.shape[2]
+      hidden_states = jnp.reshape(hidden_states, (b, f, h2, w2, self.in_channels))
+
+    else:
+      # 3-D path: the stem and residual blocks convolve over all three middle axes.
+      hidden_states = nn.Conv(
+          self.mid_channels, kernel_size=(3, 3, 3), padding=((1, 1), (1, 1), (1, 1)), name="initial_conv"
+      )(hidden_states)
+      hidden_states = nn.GroupNorm(epsilon=1e-5, num_groups=32, name="initial_norm")(hidden_states)
+      hidden_states = nn.silu(hidden_states)
+
+      for i in range(self.num_blocks_per_stage):
+        hidden_states = ResBlock(channels=self.mid_channels, dims=3, name=f"res_blocks_{i}")(hidden_states)
+
+      # Joint spatio-temporal upsampling: expand features 8x and shuffle them into
+      # all three middle axes, then drop the first entry along axis 1.
+      # NOTE(review): the first-frame drop presumably mirrors the reference implementation — confirm.
+      if self.spatial_upsample and self.temporal_upsample:
+        hidden_states = nn.Conv(
+            8 * self.mid_channels, kernel_size=(3, 3, 3), padding=((1, 1), (1, 1), (1, 1)), name="upsampler_0"
+        )(hidden_states)
+        hidden_states = PixelShuffleND(dims=3)(hidden_states)
+        hidden_states = hidden_states[:, 1:, :, :, :]
+      elif self.temporal_upsample:
+        # Temporal only: expand features 2x and shuffle them into axis 1, again dropping its first entry.
+        hidden_states = nn.Conv(
+            2 * self.mid_channels, kernel_size=(3, 3, 3), padding=((1, 1), (1, 1), (1, 1)), name="upsampler_0"
+        )(hidden_states)
+        hidden_states = PixelShuffleND(dims=1)(hidden_states)
+        hidden_states = hidden_states[:, 1:, :, :, :]
+      elif self.spatial_upsample:
+        # Spatial only: merge the first two axes so the 2-D upsampler runs per frame.
+        hidden_states = jnp.reshape(hidden_states, (b * f, h, w, self.mid_channels))
+        if self.rational_spatial_scale is not None:
+          hidden_states = SpatialRationalResampler(self.mid_channels, self.rational_spatial_scale, name="upsampler")(
+              hidden_states
+          )
+        else:
+          hidden_states = nn.Conv(4 * self.mid_channels, kernel_size=(3, 3), padding=((1, 1), (1, 1)), name="upsampler_0")(
+              hidden_states
+          )
+          hidden_states = PixelShuffleND(dims=2)(hidden_states)
+        h2, w2 = hidden_states.shape[1], hidden_states.shape[2]
+        hidden_states = jnp.reshape(hidden_states, (b, f, h2, w2, self.mid_channels))
+
+      for i in range(self.num_blocks_per_stage):
+        hidden_states = ResBlock(channels=self.mid_channels, dims=3, name=f"post_upsample_res_blocks_{i}")(hidden_states)
+
+      hidden_states = nn.Conv(self.in_channels, kernel_size=(3, 3, 3), padding=((1, 1), (1, 1), (1, 1)), name="final_conv")(
+          hidden_states
+      )
+
+    return hidden_states