Spaces: Running on Zero
update to 0.9.8 (#23)
- update to 0.9.8 (0135c48cef03cb1cbdefffc47d83a85095518e10)
- Update app.py (e1c8d1db758dbb5663f2b31af5be19c7aaf5b5c3)
- update code base with main (27e0be50ac121b4b63239a5b5dc6a11eddac9209)
- Update app.py (c3db712c6a92b978a2feb71ccfd70038b6ee3108)
- Update app.py (ab760cfbc5339b4551b6e5a02020981b4c8affe3)
Co-authored-by: Linoy Tsaban <linoyts@users.noreply.huggingface.co>
- app.py +5 -5
- configs/ltxv-13b-0.9.8-dev-fp8.yaml +34 -0
- configs/ltxv-13b-0.9.8-dev.yaml +34 -0
- configs/ltxv-13b-0.9.8-distilled-fp8.yaml +29 -0
- configs/ltxv-13b-0.9.8-distilled.yaml +29 -0
- configs/ltxv-2b-0.9.8-distilled-fp8.yaml +28 -0
- configs/ltxv-2b-0.9.8-distilled.yaml +28 -0
- ltx_video/models/autoencoders/causal_video_autoencoder.py +2 -7
- ltx_video/models/transformers/attention.py +0 -1
- ltx_video/models/transformers/transformer3d.py +1 -1
- ltx_video/pipelines/pipeline_ltx_video.py +113 -63
- ltx_video/schedulers/rf.py +1 -1
app.py
CHANGED
@@ -24,7 +24,7 @@ from inference import (
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline, LTXVideoPipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 
-config_file_path = "configs/ltxv-13b-0.9.
+config_file_path = "configs/ltxv-13b-0.9.8-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 
@@ -374,8 +374,8 @@ css="""
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# LTX Video 0.9.
-    gr.Markdown("Fast high quality video generation. [Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.
+    gr.Markdown("# LTX Video 0.9.8 13B Distilled")
+    gr.Markdown("Fast high quality video generation. [Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.8-distilled.safetensors) [GitHub](https://github.com/Lightricks/LTX-Video) [Diffusers](https://huggingface.co/Lightricks/LTX-Video-0.9.8-13B-distilled#diffusers-🧨)")
 
     with gr.Row():
         with gr.Column():
@@ -404,7 +404,7 @@ with gr.Blocks(css=css) as demo:
                 step=0.1,
                 info=f"Target video duration (0.3s to 8.5s)"
             )
-            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True, info="Uses a two-pass generation for better quality, but is slower. Recommended for final output.")
+            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True,visible=False, info="Uses a two-pass generation for better quality, but is slower. Recommended for final output.")
 
         with gr.Column():
             output_video = gr.Video(label="Generated Video", interactive=False)
@@ -416,7 +416,7 @@ with gr.Blocks(css=css) as demo:
         with gr.Row():
             seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
             randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=True)
-        with gr.Row():
+        with gr.Row(visible=False):
             guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
         with gr.Row():
             height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
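The Gradio demo's defaults come straight out of that YAML: the distilled 0.9.8 config is loaded once at import time, and the UI falls back to safe defaults when a key is missing. Below is a minimal sketch of the pattern (standalone, not the full app; the printed value assumes the ltxv-13b-0.9.8-distilled.yaml shown further down).

import yaml

# Load the pipeline config the same way app.py does above.
config_file_path = "configs/ltxv-13b-0.9.8-distilled.yaml"
with open(config_file_path, "r") as file:
    PIPELINE_CONFIG_YAML = yaml.safe_load(file)

# The CFG slider default falls back to 1.0 if first_pass or guidance_scale
# is absent, mirroring the gr.Slider value in the diff above.
default_cfg = PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0)
print(default_cfg)  # -> 1 for the distilled config (CFG effectively disabled)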
configs/ltxv-13b-0.9.8-dev-fp8.yaml
ADDED
@@ -0,0 +1,34 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.8-dev-fp8.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
+  stg_scale: [0, 0, 4, 4, 4, 2, 1]
+  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
+  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
+  num_inference_steps: 30
+  skip_final_inference_steps: 3
+  cfg_star_rescale: true
+
+second_pass:
+  guidance_scale: [1]
+  stg_scale: [1]
+  rescaling_scale: [1]
+  guidance_timesteps: [1.0]
+  skip_block_list: [27]
+  num_inference_steps: 30
+  skip_initial_inference_steps: 17
+  cfg_star_rescale: true
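The first_pass block above is phase-wise guidance: guidance_scale, stg_scale, rescaling_scale and skip_block_list each carry one entry per guidance_timesteps boundary, and the pipeline applies the entry whose boundary bracket contains the current noise level. The sketch below restates that lookup with a hypothetical helper; pick_guidance_index is illustrative only and is not the repo's guidance_mapping code.

from typing import List

def pick_guidance_index(t: float, guidance_timesteps: List[float]) -> int:
    # Walk the descending boundaries and keep the last phase whose
    # boundary is still >= t, i.e. the phase that brackets noise level t.
    idx = 0
    for i, boundary in enumerate(guidance_timesteps):
        if t <= boundary:
            idx = i
    return idx

guidance_timesteps = [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
guidance_scale = [1, 1, 6, 8, 6, 1, 1]
stg_scale = [0, 0, 4, 4, 4, 2, 1]

t = 0.95  # a mid-schedule noise level
i = pick_guidance_index(t, guidance_timesteps)
print(guidance_scale[i], stg_scale[i])  # -> 6 4 (fifth phase of the config above)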
configs/ltxv-13b-0.9.8-dev.yaml
ADDED
@@ -0,0 +1,34 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.8-dev.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
+  stg_scale: [0, 0, 4, 4, 4, 2, 1]
+  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
+  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
+  num_inference_steps: 30
+  skip_final_inference_steps: 3
+  cfg_star_rescale: true
+
+second_pass:
+  guidance_scale: [1]
+  stg_scale: [1]
+  rescaling_scale: [1]
+  guidance_timesteps: [1.0]
+  skip_block_list: [27]
+  num_inference_steps: 30
+  skip_initial_inference_steps: 17
+  cfg_star_rescale: true
configs/ltxv-13b-0.9.8-distilled-fp8.yaml
ADDED
@@ -0,0 +1,29 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.8-distilled-fp8.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+
+second_pass:
+  timesteps: [0.9094, 0.7250, 0.4219]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+  tone_map_compression_ratio: 0.6
configs/ltxv-13b-0.9.8-distilled.yaml
ADDED
@@ -0,0 +1,29 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.8-distilled.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+
+second_pass:
+  timesteps: [0.9094, 0.7250, 0.4219]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+  tone_map_compression_ratio: 0.6
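Unlike the dev configs, the distilled configs pin the schedule explicitly: first_pass lists the noise levels run at the downscaled resolution, second_pass lists the refinement levels after spatial upscaling, and tone_map_compression_ratio is forwarded to the new tone_map_latents step in the pipeline. A small sketch, assuming the file above is on disk and that the pipeline consumes the timesteps lists directly (which is what these configs suggest):

import yaml

with open("configs/ltxv-13b-0.9.8-distilled.yaml") as f:
    cfg = yaml.safe_load(f)

# The distilled schedule is fully explicit: one denoising step per listed
# timestep in each pass, instead of num_inference_steps plus skip counts.
first = cfg["first_pass"]["timesteps"]    # noise levels for the low-res pass
second = cfg["second_pass"]["timesteps"]  # refinement levels after upscaling
print(len(first), len(second))            # -> 7 3

# second_pass also carries the new tone-mapping ratio, which the updated
# pipeline applies to latents right before VAE decoding.
print(cfg["second_pass"]["tone_map_compression_ratio"])  # -> 0.6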
configs/ltxv-2b-0.9.8-distilled-fp8.yaml
ADDED
@@ -0,0 +1,28 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-2b-0.9.8-distilled-fp8.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+
+second_pass:
+  timesteps: [0.9094, 0.7250, 0.4219]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
configs/ltxv-2b-0.9.8-distilled.yaml
ADDED
@@ -0,0 +1,28 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-2b-0.9.8-distilled.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
+
+second_pass:
+  timesteps: [0.9094, 0.7250, 0.4219]
+  guidance_scale: 1
+  stg_scale: 0
+  rescaling_scale: 1
+  skip_block_list: [42]
ltx_video/models/autoencoders/causal_video_autoencoder.py
CHANGED
@@ -235,7 +235,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             "compress_time",
             "compress_all",
             "compress_all_res",
-            "
+            "compress_time_res",
         ]
     ]
 )
@@ -608,7 +608,7 @@ class Decoder(nn.Module):
             block_params = block_params if isinstance(block_params, dict) else {}
             if block_name == "res_x_y":
                 output_channel = output_channel * block_params.get("multiplier", 2)
-            if block_name
+            if block_name.startswith("compress"):
                 output_channel = output_channel * block_params.get("multiplier", 1)
 
         self.conv_in = make_conv_nd(
@@ -1303,20 +1303,15 @@ def create_video_autoencoder_demo_config(
     encoder_blocks = [
         ("res_x", {"num_layers": 2}),
         ("compress_space_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 2}),
         ("compress_time_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
-        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
         ("res_x", {"num_layers": 1}),
     ]
     decoder_blocks = [
         ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
-        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
-        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
         ("res_x", {"num_layers": 2, "inject_noise": False}),
     ]
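The Decoder fix above replaces an exact block-name match with a startswith("compress") check, so every compress_* block contributes its multiplier to the running channel count. Below is an illustrative sketch of that accumulation over the new demo decoder_blocks; the base width of 128 is made up for the example, and this is not the repo's Decoder code.

# Hypothetical base width; the point is how the multipliers accumulate.
decoder_blocks = [
    ("res_x", {"num_layers": 2, "inject_noise": False}),
    ("compress_all", {"residual": True, "multiplier": 2}),
    ("compress_all", {"residual": True, "multiplier": 2}),
    ("compress_all", {"residual": True, "multiplier": 2}),
    ("res_x", {"num_layers": 2, "inject_noise": False}),
]

output_channel = 128
for block_name, block_params in decoder_blocks:
    if block_name == "res_x_y":
        output_channel = output_channel * block_params.get("multiplier", 2)
    if block_name.startswith("compress"):
        output_channel = output_channel * block_params.get("multiplier", 1)

print(output_channel)  # -> 1024: the three compress_all blocks each double the width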
ltx_video/models/transformers/attention.py
CHANGED
@@ -205,7 +205,6 @@ class BasicTransformerBlock(nn.Module):
         timestep: Optional[torch.LongTensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         class_labels: Optional[torch.LongTensor] = None,
-        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
         skip_layer_mask: Optional[torch.Tensor] = None,
         skip_layer_strategy: Optional[SkipLayerStrategy] = None,
     ) -> torch.FloatTensor:
ltx_video/models/transformers/transformer3d.py
CHANGED
@@ -268,7 +268,7 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             for key, value in state_dict.items()
             if key.startswith("model.diffusion_model.")
         }
-        super().load_state_dict(state_dict, **kwargs)
+        super().load_state_dict(state_dict, *args, **kwargs)
 
     @classmethod
     def from_pretrained(
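The load_state_dict change forwards *args as well as **kwargs to the parent class, which matters for callers that pass strict positionally. A self-contained sketch of the failure mode, using a hypothetical Wrapped module rather than Transformer3DModel:

import torch
from torch import nn

class Wrapped(nn.Linear):
    # Same shape of override as in transformer3d.py: strip a prefix,
    # then forward all remaining arguments unchanged to the parent.
    def load_state_dict(self, state_dict, *args, **kwargs):
        state_dict = {
            k.removeprefix("model.diffusion_model."): v for k, v in state_dict.items()
        }
        return super().load_state_dict(state_dict, *args, **kwargs)

m = Wrapped(4, 4)
sd = {"model.diffusion_model.weight": torch.zeros(4, 4)}  # bias intentionally missing

# The positional False is strict=False. If the override dropped *args,
# strict would silently revert to True and this call would raise on the
# missing bias; with forwarding it just reports the missing key.
m.load_state_dict(sd, False)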
ltx_video/pipelines/pipeline_ltx_video.py
CHANGED
@@ -45,11 +45,6 @@ from ltx_video.models.autoencoders.vae_encode import (
 )
 
 
-try:
-    import torch_xla.distributed.spmd as xs
-except ImportError:
-    xs = None
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -795,6 +790,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         text_encoder_max_tokens: int = 256,
         stochastic_sampling: bool = False,
         media_items: Optional[torch.Tensor] = None,
+        tone_map_compression_ratio: float = 0.0,
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -876,6 +872,8 @@ class LTXVideoPipeline(DiffusionPipeline):
                 If set to `True`, the sampling is stochastic. If set to `False`, the sampling is deterministic.
             media_items ('torch.Tensor', *optional*):
                 The input media item used for image-to-image / video-to-video.
+            tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0.
+                If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied.
         Examples:
 
         Returns:
@@ -978,10 +976,6 @@ class LTXVideoPipeline(DiffusionPipeline):
                 guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
-        # For simplicity, we are using a constant num_conds for all timesteps, so we need to zero
-        # out cases where the guidance scale should not be applied.
-        guidance_scale = [x if x > 1.0 else 0.0 for x in guidance_scale]
-
         if not isinstance(stg_scale, List):
             stg_scale = [stg_scale] * len(timesteps)
         else:
@@ -994,16 +988,6 @@ class LTXVideoPipeline(DiffusionPipeline):
                 rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
-        do_classifier_free_guidance = any(x > 1.0 for x in guidance_scale)
-        do_spatio_temporal_guidance = any(x > 0.0 for x in stg_scale)
-        do_rescaling = any(x != 1.0 for x in rescaling_scale)
-
-        num_conds = 1
-        if do_classifier_free_guidance:
-            num_conds += 1
-        if do_spatio_temporal_guidance:
-            num_conds += 1
-
         # Normalize skip_block_list to always be None or a list of lists matching timesteps
         if skip_block_list is not None:
             # Convert single list to list of lists if needed
@@ -1015,17 +999,6 @@ class LTXVideoPipeline(DiffusionPipeline):
                 new_skip_block_list.append(skip_block_list[guidance_mapping[i]])
             skip_block_list = new_skip_block_list
 
-        # Prepare skip layer masks
-        skip_layer_masks: Optional[List[torch.Tensor]] = None
-        if do_spatio_temporal_guidance:
-            if skip_block_list is not None:
-                skip_layer_masks = [
-                    self.transformer.create_skip_layer_mask(
-                        batch_size, num_conds, num_conds - 1, skip_blocks
-                    )
-                    for skip_blocks in skip_block_list
-                ]
-
         if enhance_prompt:
             self.prompt_enhancer_image_caption_model = (
                 self.prompt_enhancer_image_caption_model.to(self._execution_device)
@@ -1055,7 +1028,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             negative_prompt_attention_mask,
         ) = self.encode_prompt(
             prompt,
-
+            True,
             negative_prompt=negative_prompt,
             num_images_per_prompt=num_images_per_prompt,
             device=device,
@@ -1073,23 +1046,28 @@ class LTXVideoPipeline(DiffusionPipeline):
 
         prompt_embeds_batch = prompt_embeds
         prompt_attention_mask_batch = prompt_attention_mask
-
-
-
-
-
-
-            )
-
-
-
-                [
-                    prompt_attention_mask_batch,
-                    prompt_attention_mask,
-                ],
-                dim=0,
-            )
+        negative_prompt_embeds = (
+            torch.zeros_like(prompt_embeds)
+            if negative_prompt_embeds is None
+            else negative_prompt_embeds
+        )
+        negative_prompt_attention_mask = (
+            torch.zeros_like(prompt_attention_mask)
+            if negative_prompt_attention_mask is None
+            else negative_prompt_attention_mask
+        )
+
+        prompt_embeds_batch = torch.cat(
+            [negative_prompt_embeds, prompt_embeds, prompt_embeds], dim=0
+        )
+        prompt_attention_mask_batch = torch.cat(
+            [
+                negative_prompt_attention_mask,
+                prompt_attention_mask,
+                prompt_attention_mask,
+            ],
+            dim=0,
+        )
         # 4. Prepare the initial latents using the provided media and conditioning items
 
         # Prepare the initial latents tensor, shape = (b, c, f, h, w)
@@ -1098,7 +1076,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             media_items=media_items,
             timestep=timesteps[0],
             latent_shape=latent_shape,
-            dtype=
+            dtype=prompt_embeds.dtype,
             device=device,
             generator=generator,
             vae_per_channel_normalize=vae_per_channel_normalize,
@@ -1118,14 +1096,6 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
         init_latents = latents.clone()  # Used for image_cond_noise_update
 
-        pixel_coords = torch.cat([pixel_coords] * num_conds)
-        orig_conditioning_mask = conditioning_mask
-        if conditioning_mask is not None and is_video:
-            assert num_images_per_prompt == 1
-            conditioning_mask = torch.cat([conditioning_mask] * num_conds)
-        fractional_coords = pixel_coords.to(torch.float32)
-        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
-
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
@@ -1134,8 +1104,50 @@ class LTXVideoPipeline(DiffusionPipeline):
             len(timesteps) - num_inference_steps * self.scheduler.order, 0
         )
 
+        orig_conditioning_mask = conditioning_mask
+
+        # Befor compiling this code please be aware:
+        # This code might generate different input shapes if some timesteps have no STG or CFG.
+        # This means that the codes might need to be compiled mutliple times.
+        # To avoid that, use the same STG and CFG values for all timesteps.
+
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
+                do_classifier_free_guidance = guidance_scale[i] > 1.0
+                do_spatio_temporal_guidance = stg_scale[i] > 0
+                do_rescaling = rescaling_scale[i] != 1.0
+
+                num_conds = 1
+                if do_classifier_free_guidance:
+                    num_conds += 1
+                if do_spatio_temporal_guidance:
+                    num_conds += 1
+
+                if do_classifier_free_guidance and do_spatio_temporal_guidance:
+                    indices = slice(batch_size * 0, batch_size * 3)
+                elif do_classifier_free_guidance:
+                    indices = slice(batch_size * 0, batch_size * 2)
+                elif do_spatio_temporal_guidance:
+                    indices = slice(batch_size * 1, batch_size * 3)
+                else:
+                    indices = slice(batch_size * 1, batch_size * 2)
+
+                # Prepare skip layer masks
+                skip_layer_mask: Optional[torch.Tensor] = None
+                if do_spatio_temporal_guidance:
+                    if skip_block_list is not None:
+                        skip_layer_mask = self.transformer.create_skip_layer_mask(
+                            batch_size, num_conds, num_conds - 1, skip_block_list[i]
+                        )
+
+                batch_pixel_coords = torch.cat([pixel_coords] * num_conds)
+                conditioning_mask = orig_conditioning_mask
+                if conditioning_mask is not None and is_video:
+                    assert num_images_per_prompt == 1
+                    conditioning_mask = torch.cat([conditioning_mask] * num_conds)
+                fractional_coords = batch_pixel_coords.to(torch.float32)
+                fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+
                 if conditioning_mask is not None and image_cond_noise_scale > 0.0:
                     latents = self.add_noise_to_image_conditioning_latents(
                         t,
@@ -1194,16 +1206,12 @@ class LTXVideoPipeline(DiffusionPipeline):
                 noise_pred = self.transformer(
                     latent_model_input.to(self.transformer.dtype),
                     indices_grid=fractional_coords,
-                    encoder_hidden_states=prompt_embeds_batch.to(
+                    encoder_hidden_states=prompt_embeds_batch[indices].to(
                         self.transformer.dtype
                     ),
-                    encoder_attention_mask=prompt_attention_mask_batch,
+                    encoder_attention_mask=prompt_attention_mask_batch[indices],
                     timestep=current_timestep,
-                    skip_layer_mask=
-                        skip_layer_masks[i]
-                        if skip_layer_masks is not None
-                        else None
-                    ),
+                    skip_layer_mask=skip_layer_mask,
                     skip_layer_strategy=skip_layer_strategy,
                     return_dict=False,
                 )[0]
@@ -1315,6 +1323,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             )
         else:
             decode_timestep = None
+        latents = self.tone_map_latents(latents, tone_map_compression_ratio)
         image = vae_decode(
             latents,
             self.vae,
@@ -1736,6 +1745,47 @@ class LTXVideoPipeline(DiffusionPipeline):
         num_frames = (num_frames - 1) // scale_factor * scale_factor + 1
         return num_frames
 
+    @staticmethod
+    def tone_map_latents(
+        latents: torch.Tensor,
+        compression: float,
+    ) -> torch.Tensor:
+        """
+        Applies a non-linear tone-mapping function to latent values to reduce their dynamic range
+        in a perceptually smooth way using a sigmoid-based compression.
+
+        This is useful for regularizing high-variance latents or for conditioning outputs
+        during generation, especially when controlling dynamic behavior with a `compression` factor.
+
+        Parameters:
+        ----------
+        latents : torch.Tensor
+            Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
+        compression : float
+            Compression strength in the range [0, 1].
+            - 0.0: No tone-mapping (identity transform)
+            - 1.0: Full compression effect
+
+        Returns:
+        -------
+        torch.Tensor
+            The tone-mapped latent tensor of the same shape as input.
+        """
+        if not (0 <= compression <= 1):
+            raise ValueError("Compression must be in the range [0, 1]")
+
+        # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
+        scale_factor = compression * 0.75
+        abs_latents = torch.abs(latents)
+
+        # Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
+        # When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
+        sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
+        scales = 1.0 - 0.8 * scale_factor * sigmoid_term
+
+        filtered = latents * scales
+        return filtered
+
 
 def adain_filter_latent(
     latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
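The biggest behavioural change in this file is that CFG and STG are now decided per timestep: the text embeddings are always stacked as [negative, positive, positive], and each step slices out only the rows it needs, so distilled configs (guidance_scale 1, stg_scale 0) pay for a single forward pass per step. Below is a shape-only sketch of that selection; rows_for_step is a hypothetical helper restating the slice logic from the loop above, and the tensors are toys.

import torch

batch_size = 1
neg = torch.zeros(batch_size, 3)   # stands in for negative_prompt_embeds
pos = torch.ones(batch_size, 3)    # stands in for prompt_embeds
prompt_embeds_batch = torch.cat([neg, pos, pos], dim=0)  # shape (3 * batch_size, 3)

def rows_for_step(do_cfg: bool, do_stg: bool) -> slice:
    # Same slice arithmetic as the per-timestep branch in the diff above.
    if do_cfg and do_stg:
        return slice(batch_size * 0, batch_size * 3)  # neg + pos + pos
    elif do_cfg:
        return slice(batch_size * 0, batch_size * 2)  # neg + pos
    elif do_stg:
        return slice(batch_size * 1, batch_size * 3)  # pos + pos
    return slice(batch_size * 1, batch_size * 2)      # pos only

# A distilled step (guidance_scale == 1, stg_scale == 0) needs one condition:
print(prompt_embeds_batch[rows_for_step(False, False)].shape)  # torch.Size([1, 3])
# A dev-config step with both CFG and STG active runs all three at once:
print(prompt_embeds_batch[rows_for_step(True, True)].shape)    # torch.Size([3, 3])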
ltx_video/schedulers/rf.py
CHANGED
@@ -314,7 +314,7 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         """
         Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
         process from the learned model outputs (most often the predicted noise).
-        z_{t_1} = z_t -
+        z_{t_1} = z_t - Delta_t * v
         The method finds the next timestep that is lower than the input timestep(s) and denoises the latents
         to that level. The input timestep(s) are not required to be one of the predefined timesteps.
 
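The restored docstring line is the plain Euler step of rectified flow: the predicted velocity is integrated from the current noise level down to the next lower timestep. A toy numeric illustration with made-up latents and two adjacent timesteps taken from the distilled configs above:

import torch

# z_{t_prev} = z_t - delta_t * v, with delta_t the gap to the next lower timestep.
z_t = torch.tensor([1.00, -0.50])   # current latents (made up)
v = torch.tensor([2.00, 1.00])      # model's velocity prediction at t (made up)
t, t_prev = 0.9094, 0.7250          # adjacent second_pass timesteps from the configs
delta_t = t - t_prev                # 0.1844

z_prev = z_t - delta_t * v
print(z_prev)  # approximately tensor([0.6312, -0.6844])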