Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@
title: HeliosTransformer3DModel
- local: api/models/hidream_image_transformer
title: HiDreamImageTransformer2DModel
- local: api/models/hidream_o1_transformer
title: HiDreamO1Transformer2DModel
- local: api/models/hunyuan_transformer2d
title: HunyuanDiT2DModel
- local: api/models/hunyuanimage_transformer_2d
Expand Down Expand Up @@ -556,6 +558,8 @@
title: GLM-Image
- local: api/pipelines/hidream
title: HiDream-I1
- local: api/pipelines/hidream_o1
title: HiDream-O1
- local: api/pipelines/hunyuandit
title: Hunyuan-DiT
- local: api/pipelines/hunyuanimage21
Expand Down
34 changes: 34 additions & 0 deletions docs/source/en/api/models/hidream_o1_transformer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<!-- Copyright 2026 chinoll and The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# HiDreamO1Transformer2DModel

A Qwen3-VL based raw pixel patch transformer for
[HiDream-O1-Image](https://huggingface.co/HiDream-ai/HiDream-O1-Image).

HiDream-O1 does not use a VAE. The transformer predicts raw RGB pixel patches through the O1 denoising path added on
top of Qwen3-VL.

The model can be loaded with the following code snippet.

```python
import torch
from diffusers import HiDreamO1Transformer2DModel

transformer = HiDreamO1Transformer2DModel.from_pretrained(
"HiDream-ai/HiDream-O1-Image",
torch_dtype=torch.bfloat16,
)
```

## HiDreamO1Transformer2DModel

[[autodoc]] HiDreamO1Transformer2DModel
15 changes: 15 additions & 0 deletions docs/source/en/api/pipelines/hidream_o1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# HiDream-O1

HiDream-O1 is a Qwen3-VL based image generation model that predicts raw RGB image patches directly. Unlike HiDream-I1,
it does not use a VAE component.

The following models are available for the [`HiDreamO1ImagePipeline`] pipeline:

| Model | Hugging Face Hub |
|---|---|
| HiDream-O1-Image | [`HiDream-ai/HiDream-O1-Image`](https://huggingface.co/HiDream-ai/HiDream-O1-Image) |
| HiDream-O1-Image-Dev | [`HiDream-ai/HiDream-O1-Image-Dev`](https://huggingface.co/HiDream-ai/HiDream-O1-Image-Dev) |

## HiDreamO1ImagePipeline

[[autodoc]] HiDreamO1ImagePipeline
193 changes: 193 additions & 0 deletions scripts/generate_hidream_o1_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# Copyright 2026 chinoll and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import argparse
import os

import torch
from transformers import AutoProcessor

from diffusers import HiDreamO1ImagePipeline, HiDreamO1Transformer2DModel, UniPCMultistepScheduler


# Fixed 28-entry denoising timestep schedule used when --dev_defaults is set
# (the generation defaults of the public dev checkpoint).
DEV_TIMESTEPS = [
    999, 987, 974, 960, 945, 929, 913, 895, 877, 857,
    836, 814, 790, 764, 737, 707, 675, 640, 602, 560,
    515, 464, 409, 347, 278, 199, 110, 8,
]


def parse_args():
    """Parse command-line options for HiDream-O1 image generation.

    Returns:
        argparse.Namespace with model location, prompt, output path, sampling
        controls (steps/guidance/shift/schedules/noise), and device placement.
    """
    # Bug fix: the first positional argument of ArgumentParser is `prog`, not
    # the description, so the help text must be passed as `description=`.
    parser = argparse.ArgumentParser(description="Generate an image with HiDream-O1")
    parser.add_argument("--model_path", default="HiDream-ai/HiDream-O1-Image")
    parser.add_argument(
        "--prompt",
        default=(
            "A cinematic portrait of a glass astronaut standing in a neon-lit botanical garden, "
            "highly detailed, sharp focus, natural skin tones, 35mm film still."
        ),
    )
    parser.add_argument("--output_image", default="hidream_o1_output.png")
    parser.add_argument("--height", type=int, default=2048)
    parser.add_argument("--width", type=int, default=2048)
    parser.add_argument("--seed", type=int, default=32)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--guidance_scale", type=float, default=5.0)
    parser.add_argument("--shift", type=float, default=3.0)
    parser.add_argument("--timesteps", default=None, help="Comma-separated custom timestep schedule.")
    parser.add_argument("--sigmas", default=None, help="Comma-separated custom sigma schedule.")
    parser.add_argument("--noise_scale_start", type=float, default=8.0)
    parser.add_argument("--noise_scale_end", type=float, default=None)
    parser.add_argument("--noise_clip_std", type=float, default=0.0)
    parser.add_argument(
        "--dev_defaults",
        action="store_true",
        help="Use the public dev checkpoint generation defaults: 28 steps, no guidance, shift 1.0, and dev timesteps.",
    )
    parser.add_argument("--torch_dtype", choices=["bfloat16", "float16", "float32"], default="bfloat16")
    parser.add_argument("--device", default="cuda")
    parser.add_argument(
        "--device_map",
        default=None,
        help="Optional device_map passed to HiDreamO1Transformer2DModel.from_pretrained, for example `cuda` or `auto`.",
    )
    parser.add_argument("--local_files_only", action="store_true")
    parser.add_argument(
        "--use_resolution_binning",
        # BooleanOptionalAction also generates --no-use_resolution_binning.
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Snap the requested size to the official predefined high-resolution buckets.",
    )
    return parser.parse_args()


def get_torch_dtype(dtype_name: str):
    """Map a CLI dtype name to the corresponding torch dtype.

    Raises KeyError for names outside the supported choices.
    """
    name_to_dtype = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    return name_to_dtype[dtype_name]


def parse_schedule(schedule: str, value_type):
    """Parse a comma-separated schedule string into a list of `value_type`.

    Returns None when no schedule string was provided; empty tokens (from
    stray commas or whitespace) are skipped.
    """
    if schedule is None:
        return None
    values = []
    for token in schedule.split(","):
        token = token.strip()
        if token:
            values.append(value_type(token))
    return values


def main():
    """CLI entry point: build the HiDream-O1 pipeline and generate one image."""
    args = parse_args()

    # The schedule overrides are mutually exclusive.
    if args.timesteps is not None and args.sigmas is not None:
        raise ValueError("Only one of --timesteps or --sigmas can be passed.")
    if args.dev_defaults and (args.timesteps is not None or args.sigmas is not None):
        raise ValueError("--dev_defaults cannot be combined with --timesteps or --sigmas.")

    dtype = get_torch_dtype(args.torch_dtype)

    processor = AutoProcessor.from_pretrained(args.model_path, local_files_only=args.local_files_only)

    load_kwargs = {"torch_dtype": dtype, "local_files_only": args.local_files_only}
    if args.device_map is not None:
        load_kwargs["device_map"] = args.device_map
    transformer = HiDreamO1Transformer2DModel.from_pretrained(args.model_path, **load_kwargs).eval()

    # NOTE(review): the scheduler is always built with args.shift even when
    # --dev_defaults later selects shift 1.0; presumably the `shift` kwarg of
    # the pipeline call takes precedence — confirm against the pipeline.
    scheduler = UniPCMultistepScheduler(
        prediction_type="sample",
        use_flow_sigmas=True,
        flow_shift=args.shift,
    )
    pipe = HiDreamO1ImagePipeline(processor=processor, transformer=transformer, scheduler=scheduler)
    if args.device_map is None:
        pipe.to(args.device)

    timestep_schedule = parse_schedule(args.timesteps, int)
    sigma_schedule = parse_schedule(args.sigmas, float)
    steps = args.num_inference_steps
    cfg_scale = args.guidance_scale
    flow_shift = args.shift
    noise_start = args.noise_scale_start
    noise_end = args.noise_scale_end
    noise_clip = args.noise_clip_std

    if args.dev_defaults:
        # Public dev checkpoint defaults: fixed 28-step schedule, no guidance.
        timestep_schedule = DEV_TIMESTEPS
        steps = len(DEV_TIMESTEPS)
        cfg_scale = 0.0
        flow_shift = 1.0
        noise_start = 7.5
        noise_end = 7.5
        noise_clip = 2.5
    elif timestep_schedule is not None:
        steps = len(timestep_schedule)
    elif sigma_schedule is not None:
        steps = len(sigma_schedule)

    # With a device_map the model placement is managed elsewhere, so the RNG
    # is seeded on CPU; otherwise it lives on the requested device.
    seed_device = "cpu" if args.device_map is not None else args.device
    generator = torch.Generator(device=seed_device).manual_seed(args.seed)

    result = pipe(
        args.prompt,
        height=args.height,
        width=args.width,
        num_inference_steps=steps,
        guidance_scale=cfg_scale,
        shift=flow_shift,
        timesteps=timestep_schedule,
        sigmas=sigma_schedule,
        noise_scale_start=noise_start,
        noise_scale_end=noise_end,
        noise_clip_std=noise_clip,
        use_resolution_binning=args.use_resolution_binning,
        generator=generator,
    )
    image = result.images[0]

    os.makedirs(os.path.dirname(os.path.abspath(args.output_image)), exist_ok=True)
    image.save(args.output_image)
    print(f"Saved image to {args.output_image}")


# Script entry point: only run generation when executed directly, not on import.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions src/diffusers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@
]

else:
_import_structure["models"].append("HiDreamO1Transformer2DModel")
_import_structure["modular_pipelines"].extend(
[
"ErnieImageAutoBlocks",
Expand Down Expand Up @@ -565,6 +566,7 @@
"HeliosPipeline",
"HeliosPyramidPipeline",
"HiDreamImagePipeline",
"HiDreamO1ImagePipeline",
"HunyuanDiTControlNetPipeline",
"HunyuanDiTPAGPipeline",
"HunyuanDiTPipeline",
Expand Down Expand Up @@ -1245,6 +1247,7 @@
except OptionalDependencyNotAvailable:
from .utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .models import HiDreamO1Transformer2DModel
from .modular_pipelines import (
ErnieImageAutoBlocks,
ErnieImageModularPipeline,
Expand Down Expand Up @@ -1362,6 +1365,7 @@
HeliosPipeline,
HeliosPyramidPipeline,
HiDreamImagePipeline,
HiDreamO1ImagePipeline,
HunyuanDiTControlNetPipeline,
HunyuanDiTPAGPipeline,
HunyuanDiTPipeline,
Expand Down
5 changes: 5 additions & 0 deletions src/diffusers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_LazyModule,
is_flax_available,
is_torch_available,
is_transformers_available,
)


Expand Down Expand Up @@ -109,6 +110,8 @@
_import_structure["transformers.transformer_glm_image"] = ["GlmImageTransformer2DModel"]
_import_structure["transformers.transformer_helios"] = ["HeliosTransformer3DModel"]
_import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
if is_transformers_available():
_import_structure["transformers.transformer_hidream_o1"] = ["HiDreamO1Transformer2DModel"]
_import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
_import_structure["transformers.transformer_hunyuan_video15"] = ["HunyuanVideo15Transformer3DModel"]
_import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
Expand Down Expand Up @@ -267,6 +270,8 @@
WanVACETransformer3DModel,
ZImageTransformer2DModel,
)
if is_transformers_available():
from .transformers.transformer_hidream_o1 import HiDreamO1Transformer2DModel
from .unets import (
I2VGenXLUNet,
Kandinsky3UNet,
Expand Down
4 changes: 3 additions & 1 deletion src/diffusers/models/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ...utils import is_torch_available
from ...utils import is_torch_available, is_transformers_available


if is_torch_available():
Expand Down Expand Up @@ -32,6 +32,8 @@
from .transformer_glm_image import GlmImageTransformer2DModel
from .transformer_helios import HeliosTransformer3DModel
from .transformer_hidream_image import HiDreamImageTransformer2DModel
if is_transformers_available():
from .transformer_hidream_o1 import HiDreamO1Transformer2DModel
from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
from .transformer_hunyuan_video15 import HunyuanVideo15Transformer3DModel
from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
Expand Down
Loading
Loading