Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@
title: HeliosTransformer3DModel
- local: api/models/hidream_image_transformer
title: HiDreamImageTransformer2DModel
- local: api/models/hidream_o1_transformer
title: HiDreamO1Transformer2DModel
- local: api/models/hunyuan_transformer2d
title: HunyuanDiT2DModel
- local: api/models/hunyuanimage_transformer_2d
Expand Down Expand Up @@ -556,6 +558,8 @@
title: GLM-Image
- local: api/pipelines/hidream
title: HiDream-I1
- local: api/pipelines/hidream_o1
title: HiDream-O1
- local: api/pipelines/hunyuandit
title: Hunyuan-DiT
- local: api/pipelines/hunyuanimage21
Expand Down
34 changes: 34 additions & 0 deletions docs/source/en/api/models/hidream_o1_transformer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<!-- Copyright 2026 chinoll and The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# HiDreamO1Transformer2DModel

A Qwen3-VL based raw pixel patch transformer for
[HiDream-O1-Image](https://huggingface.co/HiDream-ai/HiDream-O1-Image).

HiDream-O1 does not use a VAE. The transformer predicts raw RGB pixel patches through the O1 denoising path added on
top of Qwen3-VL.

The model can be loaded with the following code snippet.

```python
import torch
from diffusers import HiDreamO1Transformer2DModel

transformer = HiDreamO1Transformer2DModel.from_pretrained(
"HiDream-ai/HiDream-O1-Image",
torch_dtype=torch.bfloat16,
)
```

## HiDreamO1Transformer2DModel

[[autodoc]] HiDreamO1Transformer2DModel
15 changes: 15 additions & 0 deletions docs/source/en/api/pipelines/hidream_o1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# HiDream-O1

HiDream-O1 is a Qwen3-VL based image generation model that predicts raw RGB image patches directly. Unlike HiDream-I1,
it does not use a VAE component.

The following models are available for the [`HiDreamO1ImagePipeline`] pipeline:

| Model | Hugging Face Hub |
|---|---|
| HiDream-O1-Image | [`HiDream-ai/HiDream-O1-Image`](https://huggingface.co/HiDream-ai/HiDream-O1-Image) |
| HiDream-O1-Image-Dev | [`HiDream-ai/HiDream-O1-Image-Dev`](https://huggingface.co/HiDream-ai/HiDream-O1-Image-Dev) |

## HiDreamO1ImagePipeline

[[autodoc]] HiDreamO1ImagePipeline
193 changes: 193 additions & 0 deletions scripts/generate_hidream_o1_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# Copyright 2026 chinoll and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import argparse
import os

import torch
from transformers import AutoProcessor

from diffusers import HiDreamO1ImagePipeline, HiDreamO1Transformer2DModel, UniPCMultistepScheduler


# Fixed 28-entry denoising timestep schedule used when --dev_defaults is set
# (the generation defaults of the public dev checkpoint).
DEV_TIMESTEPS = [
    999, 987, 974, 960, 945, 929, 913, 895, 877, 857,
    836, 814, 790, 764, 737, 707, 675, 640, 602, 560,
    515, 464, 409, 347, 278, 199, 110, 8,
]


def parse_args():
    """Parse command-line options for HiDream-O1 image generation.

    Returns:
        argparse.Namespace with model location, prompt, output path, sampling
        controls (steps/guidance/shift/schedules/noise), and device placement.
    """
    # Bug fix: the first positional argument of ArgumentParser is `prog`, not
    # the description, so the help text must be passed as `description=`.
    parser = argparse.ArgumentParser(description="Generate an image with HiDream-O1")
    parser.add_argument("--model_path", default="HiDream-ai/HiDream-O1-Image")
    parser.add_argument(
        "--prompt",
        default=(
            "A cinematic portrait of a glass astronaut standing in a neon-lit botanical garden, "
            "highly detailed, sharp focus, natural skin tones, 35mm film still."
        ),
    )
    parser.add_argument("--output_image", default="hidream_o1_output.png")
    parser.add_argument("--height", type=int, default=2048)
    parser.add_argument("--width", type=int, default=2048)
    parser.add_argument("--seed", type=int, default=32)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--guidance_scale", type=float, default=5.0)
    parser.add_argument("--shift", type=float, default=3.0)
    parser.add_argument("--timesteps", default=None, help="Comma-separated custom timestep schedule.")
    parser.add_argument("--sigmas", default=None, help="Comma-separated custom sigma schedule.")
    parser.add_argument("--noise_scale_start", type=float, default=8.0)
    parser.add_argument("--noise_scale_end", type=float, default=None)
    parser.add_argument("--noise_clip_std", type=float, default=0.0)
    parser.add_argument(
        "--dev_defaults",
        action="store_true",
        help="Use the public dev checkpoint generation defaults: 28 steps, no guidance, shift 1.0, and dev timesteps.",
    )
    parser.add_argument("--torch_dtype", choices=["bfloat16", "float16", "float32"], default="bfloat16")
    parser.add_argument("--device", default="cuda")
    parser.add_argument(
        "--device_map",
        default=None,
        help="Optional device_map passed to HiDreamO1Transformer2DModel.from_pretrained, for example `cuda` or `auto`.",
    )
    parser.add_argument("--local_files_only", action="store_true")
    parser.add_argument(
        "--use_resolution_binning",
        # BooleanOptionalAction also generates --no-use_resolution_binning.
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Snap the requested size to the official predefined high-resolution buckets.",
    )
    return parser.parse_args()


def get_torch_dtype(dtype_name: str):
    """Map a CLI dtype name to the corresponding torch dtype.

    Raises KeyError for names outside the supported choices.
    """
    name_to_dtype = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    return name_to_dtype[dtype_name]


def parse_schedule(schedule: str, value_type):
    """Parse a comma-separated schedule string into a list of `value_type`.

    Returns None when no schedule string was provided; empty tokens (from
    stray commas or whitespace) are skipped.
    """
    if schedule is None:
        return None
    values = []
    for token in schedule.split(","):
        token = token.strip()
        if token:
            values.append(value_type(token))
    return values


def main():
    """CLI entry point: build the HiDream-O1 pipeline and generate one image."""
    args = parse_args()

    # The schedule overrides are mutually exclusive.
    if args.timesteps is not None and args.sigmas is not None:
        raise ValueError("Only one of --timesteps or --sigmas can be passed.")
    if args.dev_defaults and (args.timesteps is not None or args.sigmas is not None):
        raise ValueError("--dev_defaults cannot be combined with --timesteps or --sigmas.")

    dtype = get_torch_dtype(args.torch_dtype)

    processor = AutoProcessor.from_pretrained(args.model_path, local_files_only=args.local_files_only)

    load_kwargs = {"torch_dtype": dtype, "local_files_only": args.local_files_only}
    if args.device_map is not None:
        load_kwargs["device_map"] = args.device_map
    transformer = HiDreamO1Transformer2DModel.from_pretrained(args.model_path, **load_kwargs).eval()

    # NOTE(review): the scheduler is always built with args.shift even when
    # --dev_defaults later selects shift 1.0; presumably the `shift` kwarg of
    # the pipeline call takes precedence — confirm against the pipeline.
    scheduler = UniPCMultistepScheduler(
        prediction_type="sample",
        use_flow_sigmas=True,
        flow_shift=args.shift,
    )
    pipe = HiDreamO1ImagePipeline(processor=processor, transformer=transformer, scheduler=scheduler)
    if args.device_map is None:
        pipe.to(args.device)

    timestep_schedule = parse_schedule(args.timesteps, int)
    sigma_schedule = parse_schedule(args.sigmas, float)
    steps = args.num_inference_steps
    cfg_scale = args.guidance_scale
    flow_shift = args.shift
    noise_start = args.noise_scale_start
    noise_end = args.noise_scale_end
    noise_clip = args.noise_clip_std

    if args.dev_defaults:
        # Public dev checkpoint defaults: fixed 28-step schedule, no guidance.
        timestep_schedule = DEV_TIMESTEPS
        steps = len(DEV_TIMESTEPS)
        cfg_scale = 0.0
        flow_shift = 1.0
        noise_start = 7.5
        noise_end = 7.5
        noise_clip = 2.5
    elif timestep_schedule is not None:
        steps = len(timestep_schedule)
    elif sigma_schedule is not None:
        steps = len(sigma_schedule)

    # With a device_map the model placement is managed elsewhere, so the RNG
    # is seeded on CPU; otherwise it lives on the requested device.
    seed_device = "cpu" if args.device_map is not None else args.device
    generator = torch.Generator(device=seed_device).manual_seed(args.seed)

    result = pipe(
        args.prompt,
        height=args.height,
        width=args.width,
        num_inference_steps=steps,
        guidance_scale=cfg_scale,
        shift=flow_shift,
        timesteps=timestep_schedule,
        sigmas=sigma_schedule,
        noise_scale_start=noise_start,
        noise_scale_end=noise_end,
        noise_clip_std=noise_clip,
        use_resolution_binning=args.use_resolution_binning,
        generator=generator,
    )
    image = result.images[0]

    os.makedirs(os.path.dirname(os.path.abspath(args.output_image)), exist_ok=True)
    image.save(args.output_image)
    print(f"Saved image to {args.output_image}")


# Script entry point: only run generation when executed directly, not on import.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions src/diffusers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@
]

else:
_import_structure["models"].append("HiDreamO1Transformer2DModel")
_import_structure["modular_pipelines"].extend(
[
"ErnieImageAutoBlocks",
Expand Down Expand Up @@ -565,6 +566,7 @@
"HeliosPipeline",
"HeliosPyramidPipeline",
"HiDreamImagePipeline",
"HiDreamO1ImagePipeline",
"HunyuanDiTControlNetPipeline",
"HunyuanDiTPAGPipeline",
"HunyuanDiTPipeline",
Expand Down Expand Up @@ -1245,6 +1247,7 @@
except OptionalDependencyNotAvailable:
from .utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .models import HiDreamO1Transformer2DModel
from .modular_pipelines import (
ErnieImageAutoBlocks,
ErnieImageModularPipeline,
Expand Down Expand Up @@ -1362,6 +1365,7 @@
HeliosPipeline,
HeliosPyramidPipeline,
HiDreamImagePipeline,
HiDreamO1ImagePipeline,
HunyuanDiTControlNetPipeline,
HunyuanDiTPAGPipeline,
HunyuanDiTPipeline,
Expand Down
5 changes: 5 additions & 0 deletions src/diffusers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_LazyModule,
is_flax_available,
is_torch_available,
is_transformers_available,
)


Expand Down Expand Up @@ -109,6 +110,8 @@
_import_structure["transformers.transformer_glm_image"] = ["GlmImageTransformer2DModel"]
_import_structure["transformers.transformer_helios"] = ["HeliosTransformer3DModel"]
_import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
if is_transformers_available():
_import_structure["transformers.transformer_hidream_o1"] = ["HiDreamO1Transformer2DModel"]
_import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
_import_structure["transformers.transformer_hunyuan_video15"] = ["HunyuanVideo15Transformer3DModel"]
_import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
Expand Down Expand Up @@ -267,6 +270,8 @@
WanVACETransformer3DModel,
ZImageTransformer2DModel,
)
if is_transformers_available():
from .transformers.transformer_hidream_o1 import HiDreamO1Transformer2DModel
from .unets import (
I2VGenXLUNet,
Kandinsky3UNet,
Expand Down
4 changes: 3 additions & 1 deletion src/diffusers/models/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ...utils import is_torch_available
from ...utils import is_torch_available, is_transformers_available


if is_torch_available():
Expand Down Expand Up @@ -32,6 +32,8 @@
from .transformer_glm_image import GlmImageTransformer2DModel
from .transformer_helios import HeliosTransformer3DModel
from .transformer_hidream_image import HiDreamImageTransformer2DModel
if is_transformers_available():
from .transformer_hidream_o1 import HiDreamO1Transformer2DModel
from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
from .transformer_hunyuan_video15 import HunyuanVideo15Transformer3DModel
from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
Expand Down
Loading
Loading