Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2cc7e11
LTX2 condition pipeline initial commit
dg845 Jan 30, 2026
02c750b
Fix pipeline import error
dg845 Feb 3, 2026
ed52c0d
Implement LTX-2-style general image conditioning
dg845 Feb 3, 2026
5368d73
Blend denoising output and clean latents in sample space instead of v…
dg845 Feb 4, 2026
5577e08
make style and make quality
dg845 Feb 4, 2026
e0bd6a0
make fix-copies
dg845 Feb 4, 2026
70dff16
Merge branch 'main' into ltx2-add-condition-pipeline
sayakpaul Feb 4, 2026
45051e1
Rename LTX2VideoCondition image to frames
dg845 Feb 5, 2026
d39d89f
Update LTX2ConditionPipeline example
dg845 Feb 5, 2026
2e824f5
Remove support for image and video in __call__
dg845 Feb 5, 2026
33e6ec1
Put latent_idx_from_index logic inline
dg845 Feb 5, 2026
98f74b2
Improve comment on using the conditioning mask in denoising loop
dg845 Feb 5, 2026
83c8ae6
Apply suggestions from code review
dg845 Feb 14, 2026
8ba350c
Merge branch 'main' into ltx2-add-condition-pipeline
dg845 Feb 14, 2026
1cdea99
make fix-copies
dg845 Feb 14, 2026
1c120c6
Migrate to Python 3.9+ style type annotations without explicit typing…
dg845 Feb 14, 2026
e8c5ee0
Merge branch 'main' into ltx2-add-condition-pipeline
dg845 Feb 16, 2026
ca931c6
Forward kwargs from preprocess/postprocess_video to preprocess/postpr…
dg845 Feb 16, 2026
df2ca6e
Center crop LTX-2 conditions following original code
dg845 Feb 16, 2026
49ef4c5
Duplicate video and audio position ids if using CFG
dg845 Feb 16, 2026
b4e7815
make style and make quality
dg845 Feb 16, 2026
6559765
Remove unused index_type arg to preprocess_conditions
dg845 Feb 16, 2026
47ebd92
Add # Copied from for _normalize_latents
dg845 Mar 4, 2026
a531ce0
Merge branch 'main' into ltx2-add-condition-pipeline
dg845 Mar 4, 2026
5a0cf67
Fix _normalize_latents # Copied from statement
dg845 Mar 4, 2026
2d42573
Add LTX-2 condition pipeline docs
dg845 Mar 4, 2026
1b84440
Remove TODOs
dg845 Mar 5, 2026
4b6168a
Support only unpacked latents (5D for video, 4D for audio)
dg845 Mar 5, 2026
56e5057
Remove # Copied from for prepare_audio_latents
dg845 Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions docs/source/en/api/pipelines/ltx2.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,179 @@ encode_video(
)
```

## Condition Pipeline Generation

You can use `LTX2ConditionPipeline` to specify image and/or video conditions at arbitrary latent indices. For example, we can specify both a first-frame and a last-frame condition to perform first-last-frame-to-video (FLF2V) generation:

```py
import torch
from diffusers import LTX2ConditionPipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image

device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"

pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload(device=device)
pipe.vae.enable_tiling()

prompt = (
"CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are "
"delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright "
"sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, "
"low-angle perspective."
)

first_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png",
)
last_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png",
)
first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
conditions = [first_cond, last_cond]

frame_rate = 24.0
video_latent, audio_latent = pipe(
conditions=conditions,
prompt=prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=8,
sigmas=DISTILLED_SIGMA_VALUES,
guidance_scale=1.0,
generator=generator,
output_type="latent",
return_dict=False,
)

latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
model_path,
subfolder="latent_upsampler",
torch_dtype=torch.bfloat16,
)
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
upscaled_video_latent = upsample_pipe(
latents=video_latent,
output_type="latent",
return_dict=False,
)[0]

video, audio = pipe(
latents=upscaled_video_latent,
audio_latents=audio_latent,
prompt=prompt,
width=width * 2,
height=height * 2,
num_inference_steps=3,
sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
generator=generator,
guidance_scale=1.0,
output_type="np",
return_dict=False,
)

encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_distilled_flf2v.mp4",
)
```

You can also combine image and video conditions in the same call by passing them together in `conditions`:

```py
import torch
from diffusers import LTX2ConditionPipeline
from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image, load_video

device = "cuda"
width = 768
height = 512
random_seed = 42
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"

pipe = LTX2ConditionPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload(device=device)
pipe.vae.enable_tiling()

prompt = (
"The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is "
"divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features "
"dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered "
"clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, "
"with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The "
"landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the "
"solitude and beauty of a winter drive through a mountainous region."
)
negative_prompt = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
"off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)

cond_video = load_video(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
)
cond_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)
video_cond = LTX2VideoCondition(frames=cond_video, index=0, strength=1.0)
image_cond = LTX2VideoCondition(frames=cond_image, index=8, strength=1.0)
conditions = [video_cond, image_cond]

frame_rate = 24.0
video, audio = pipe(
conditions=conditions,
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=121,
frame_rate=frame_rate,
num_inference_steps=40,
guidance_scale=4.0,
generator=generator,
output_type="np",
return_dict=False,
)

encode_video(
video[0],
fps=frame_rate,
audio=audio[0].float().cpu(),
audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
output_path="ltx2_cond_video.mp4",
)
```

Because the conditioning is applied to latent frames, the 8 data-space frames that correspond to the latent frame specified for an image condition will tend to be static.

## LTX2Pipeline

[[autodoc]] LTX2Pipeline
Expand All @@ -205,6 +378,12 @@ encode_video(
- all
- __call__

## LTX2ConditionPipeline

[[autodoc]] LTX2ConditionPipeline
- all
- __call__

## LTX2LatentUpsamplePipeline

[[autodoc]] LTX2LatentUpsamplePipeline
Expand Down
2 changes: 2 additions & 0 deletions src/diffusers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@
"LEditsPPPipelineStableDiffusionXL",
"LongCatImageEditPipeline",
"LongCatImagePipeline",
"LTX2ConditionPipeline",
"LTX2ImageToVideoPipeline",
"LTX2LatentUpsamplePipeline",
"LTX2Pipeline",
Expand Down Expand Up @@ -1308,6 +1309,7 @@
LEditsPPPipelineStableDiffusionXL,
LongCatImageEditPipeline,
LongCatImagePipeline,
LTX2ConditionPipeline,
LTX2ImageToVideoPipeline,
LTX2LatentUpsamplePipeline,
LTX2Pipeline,
Expand Down
9 changes: 7 additions & 2 deletions src/diffusers/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,12 @@
"LTXLatentUpsamplePipeline",
"LTXI2VLongMultiPromptPipeline",
]
_import_structure["ltx2"] = ["LTX2Pipeline", "LTX2ImageToVideoPipeline", "LTX2LatentUpsamplePipeline"]
_import_structure["ltx2"] = [
"LTX2Pipeline",
"LTX2ConditionPipeline",
"LTX2ImageToVideoPipeline",
"LTX2LatentUpsamplePipeline",
]
_import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"]
_import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"]
_import_structure["lucy"] = ["LucyEditPipeline"]
Expand Down Expand Up @@ -729,7 +734,7 @@
LTXLatentUpsamplePipeline,
LTXPipeline,
)
from .ltx2 import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
from .ltx2 import LTX2ConditionPipeline, LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2Pipeline
from .lucy import LucyEditPipeline
from .lumina import LuminaPipeline, LuminaText2ImgPipeline
from .lumina2 import Lumina2Pipeline, Lumina2Text2ImgPipeline
Expand Down
2 changes: 2 additions & 0 deletions src/diffusers/pipelines/ltx2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_import_structure["connectors"] = ["LTX2TextConnectors"]
_import_structure["latent_upsampler"] = ["LTX2LatentUpsamplerModel"]
_import_structure["pipeline_ltx2"] = ["LTX2Pipeline"]
_import_structure["pipeline_ltx2_condition"] = ["LTX2ConditionPipeline"]
_import_structure["pipeline_ltx2_image2video"] = ["LTX2ImageToVideoPipeline"]
_import_structure["pipeline_ltx2_latent_upsample"] = ["LTX2LatentUpsamplePipeline"]
_import_structure["vocoder"] = ["LTX2Vocoder"]
Expand All @@ -40,6 +41,7 @@
from .connectors import LTX2TextConnectors
from .latent_upsampler import LTX2LatentUpsamplerModel
from .pipeline_ltx2 import LTX2Pipeline
from .pipeline_ltx2_condition import LTX2ConditionPipeline
from .pipeline_ltx2_image2video import LTX2ImageToVideoPipeline
from .pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline
from .vocoder import LTX2Vocoder
Expand Down
Loading