From 181e66148a9526f7ff0a47c11546d95e35e4115c Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Thu, 16 Apr 2026 14:48:41 -0700
Subject: [PATCH 1/4] Add Ernie-Image modular pipeline

---
 src/diffusers/__init__.py                     |   4 +
 src/diffusers/modular_pipelines/__init__.py   |   5 +
 .../modular_pipelines/ernie_image/__init__.py |  47 +++
 .../ernie_image/before_denoise.py             | 269 ++++++++++++++++
 .../modular_pipelines/ernie_image/decoders.py | 100 ++++++
 .../modular_pipelines/ernie_image/denoise.py  | 242 +++++++++++++++
 .../modular_pipelines/ernie_image/encoders.py | 286 ++++++++++++++++++
 .../ernie_image/modular_blocks_ernie_image.py | 153 ++++++++++
 .../ernie_image/modular_pipeline.py           | 109 +++++++
 .../modular_pipelines/modular_pipeline.py     |   1 +
 .../dummy_torch_and_transformers_objects.py   |  30 ++
 .../modular_pipelines/ernie_image/__init__.py |   0
 .../test_modular_pipeline_ernie_image.py      |  60 ++++
 13 files changed, 1306 insertions(+)
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/__init__.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/before_denoise.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/decoders.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/denoise.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/encoders.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
 create mode 100644 src/diffusers/modular_pipelines/ernie_image/modular_pipeline.py
 create mode 100644 tests/modular_pipelines/ernie_image/__init__.py
 create mode 100644 tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 2cbfd6e29305..2e70f602348e 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -449,6 +449,8 @@
             "Flux2KleinModularPipeline",
             "Flux2ModularPipeline",
             "FluxAutoBlocks",
+            "ErnieImageAutoBlocks",
+            "ErnieImageModularPipeline",
             "FluxKontextAutoBlocks",
             "FluxKontextModularPipeline",
             "FluxModularPipeline",
@@ -1237,6 +1239,8 @@
             Flux2KleinBaseModularPipeline,
             Flux2KleinModularPipeline,
             Flux2ModularPipeline,
+            ErnieImageAutoBlocks,
+            ErnieImageModularPipeline,
             FluxAutoBlocks,
             FluxKontextAutoBlocks,
             FluxKontextModularPipeline,
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
index b7137249fe16..c3a3515cccc3 100644
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -88,6 +88,10 @@
         "QwenImageLayeredModularPipeline",
         "QwenImageLayeredAutoBlocks",
     ]
+    _import_structure["ernie_image"] = [
+        "ErnieImageAutoBlocks",
+        "ErnieImageModularPipeline",
+    ]
     _import_structure["hunyuan_video1_5"] = [
         "HunyuanVideo15AutoBlocks",
         "HunyuanVideo15ModularPipeline",
@@ -110,6 +114,7 @@
         from ..utils.dummy_pt_objects import *  # noqa F403
     else:
         from .components_manager import ComponentsManager
+        from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline
         from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
         from .flux2 import (
             Flux2AutoBlocks,
diff --git a/src/diffusers/modular_pipelines/ernie_image/__init__.py b/src/diffusers/modular_pipelines/ernie_image/__init__.py
new file mode 100644
index 000000000000..68ed723c590c
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}  # placeholder objects registered when torch/transformers are unavailable
+_import_structure = {}  # submodule name -> public names, handed to `_LazyModule` below
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modular_blocks_ernie_image"] = ["ErnieImageAutoBlocks"]
+    _import_structure["modular_pipeline"] = ["ErnieImageModularPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:  # import the real names eagerly in these two modes
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .modular_blocks_ernie_image import ErnieImageAutoBlocks
+        from .modular_pipeline import ErnieImageModularPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(  # replace this module object with a `_LazyModule` instance
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():  # attach the collected placeholders to the replacement module
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
new file mode 100644
index 000000000000..1c13c50f2db3
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
@@ -0,0 +1,269 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...models import ErnieImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ErnieImageModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _pad_text(
+    text_hiddens: list[torch.Tensor], device: torch.device, dtype: torch.dtype, text_in_dim: int
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Right-pad variable-length text hidden states with zeros; return the padded batch and the true lengths."""
+    if len(text_hiddens) == 0:
+        # Nothing to pad: hand back empty tensors with the expected ranks.
+        empty_batch = torch.zeros((0, 0, text_in_dim), device=device, dtype=dtype)
+        return empty_batch, torch.zeros((0,), device=device, dtype=torch.long)
+    # 3-D entries are squeezed on dim 1 (presumably a singleton axis -- confirm against the text encoder output).
+    rows = [(h.squeeze(1) if h.dim() == 3 else h).to(device).to(dtype) for h in text_hiddens]
+    seq_lens = [row.shape[0] for row in rows]
+    lengths = torch.tensor(seq_lens, device=device, dtype=torch.long)
+    # Zero canvas sized for the longest sequence; each row overwrites only its own prefix.
+    padded = torch.zeros((len(rows), max(seq_lens), text_in_dim), device=device, dtype=dtype)
+    for idx, (row, seq_len) in enumerate(zip(rows, seq_lens)):
+        padded[idx, :seq_len, :] = row
+    return padded, lengths
+
+
+class ErnieImageTextInputStep(ModularPipelineBlocks):  # pads text embeddings and expands them per image
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Input processing step that pads the variable-length text hidden states to a common length and "
+            "produces `text_bth` / `text_lens` tensors consumed by the denoiser."
+        )
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("transformer", ErnieImageTransformer2DModel)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                "prompt_embeds",
+                required=True,
+                type_hint=list,
+                description="List of per-prompt text embeddings from the text encoder step.",
+            ),
+            InputParam(
+                "negative_prompt_embeds",
+                type_hint=list,
+                description="List of per-prompt negative text embeddings from the text encoder step.",
+            ),
+            InputParam(
+                "num_images_per_prompt",
+                type_hint=int,
+                default=1,
+                description="Number of images to generate per prompt.",
+            ),
+            InputParam(
+                "batch_size",
+                type_hint=int,
+                default=None,
+                description="Prompt batch size. Resolved from `prompt_embeds` when not provided.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam("batch_size", type_hint=int, description="The number of prompts in the batch."),
+            OutputParam(
+                "text_bth",
+                type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
+                description="Padded text hidden states of shape (B, T_max, H) fed into the transformer.",
+            ),
+            OutputParam(
+                "text_lens",
+                type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
+                description="Actual per-prompt text lengths used to build the transformer attention mask.",
+            ),
+            OutputParam(
+                "negative_text_bth",
+                type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
+                description="Padded negative text hidden states, when classifier-free guidance is enabled.",
+            ),
+            OutputParam(
+                "negative_text_lens",
+                type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
+                description="Actual per-prompt negative text lengths, when classifier-free guidance is enabled.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Pad the text embeddings, repeat them `num_images_per_prompt` times and store them on the state."""
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        dtype = components.transformer.dtype
+        text_in_dim = components.text_in_dim
+        repeats = block_state.num_images_per_prompt
+        prompt_embeds = block_state.prompt_embeds
+        block_state.batch_size = getattr(block_state, "batch_size", None) or len(prompt_embeds)
+        # Latents hold `batch_size * num_images_per_prompt` samples; repeat every prompt row to match.
+        text_bth, text_lens = _pad_text(prompt_embeds, device, dtype, text_in_dim)
+        block_state.text_bth = text_bth.repeat_interleave(repeats, dim=0)
+        block_state.text_lens = text_lens.repeat_interleave(repeats, dim=0)
+
+        negative_prompt_embeds = getattr(block_state, "negative_prompt_embeds", None)
+        if negative_prompt_embeds is not None:
+            negative_text_bth, negative_text_lens = _pad_text(negative_prompt_embeds, device, dtype, text_in_dim)
+            block_state.negative_text_bth = negative_text_bth.repeat_interleave(repeats, dim=0)
+            block_state.negative_text_lens = negative_text_lens.repeat_interleave(repeats, dim=0)
+        else:
+            block_state.negative_text_bth = block_state.negative_text_lens = None
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImageSetTimestepsStep(ModularPipelineBlocks):
+    # Configures the scheduler once, ahead of the denoising loop.
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return "Step that sets the scheduler's timesteps for inference using a linear sigma schedule."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam("num_inference_steps", type_hint=int, default=50, description="Number of denoising steps."),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference."),
+            OutputParam("num_inference_steps", type_hint=int, description="The number of denoising steps."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Configure the scheduler with a linear sigma ramp and publish the resulting timesteps.
+
+        Reads `num_inference_steps` from the pipeline state and writes `timesteps` and
+        `num_inference_steps` back to it.
+        """
+        block_state = self.get_block_state(state)
+        scheduler = components.scheduler
+        step_count = block_state.num_inference_steps
+        # `step_count` evenly spaced sigmas from 1.0 downward; the final 0.0 endpoint is dropped.
+        linear_sigmas = torch.linspace(1.0, 0.0, step_count + 1)[:-1]
+        scheduler.set_timesteps(sigmas=linear_sigmas, device=components._execution_device)
+
+        block_state.timesteps = scheduler.timesteps
+        block_state.num_inference_steps = step_count
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImagePrepareLatentsStep(ModularPipelineBlocks):  # creates the initial latents for text-to-image
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return "Prepare random noise latents for the ErnieImage text-to-image denoising process."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("transformer", ErnieImageTransformer2DModel)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam("height", type_hint=int, description="The height in pixels of the generated image."),
+            InputParam("width", type_hint=int, description="The width in pixels of the generated image."),
+            InputParam(
+                "latents",
+                type_hint=torch.Tensor,
+                description="Pre-generated noisy latents. If provided, skips noise sampling.",
+            ),
+            InputParam(
+                "num_images_per_prompt",
+                type_hint=int,
+                default=1,
+                description="Number of images to generate per prompt.",
+            ),
+            InputParam("generator", description="Torch generator for deterministic noise sampling."),
+            InputParam(
+                "batch_size",
+                required=True,
+                type_hint=int,
+                description="Prompt batch size resolved by the text input step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam("latents", type_hint=torch.Tensor, description="The initial noise latents to denoise."),
+            OutputParam("height", type_hint=int, description="The resolved image height in pixels."),
+            OutputParam("width", type_hint=int, description="The resolved image width in pixels."),
+        ]
+
+    @staticmethod
+    def _check_inputs(components: ErnieImageModularPipeline, height: int, width: int) -> None:
+        """Raise `ValueError` unless both dimensions are multiples of the VAE scale factor."""
+        vae_scale_factor = components.vae_scale_factor
+        if any(dim % vae_scale_factor != 0 for dim in (height, width)):
+            message = f"`height` and `width` must be divisible by {vae_scale_factor}, got {height} and {width}."
+            raise ValueError(message)
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Resolve the image size and create (or move) the starting latents for the denoising loop."""
+        block_state = self.get_block_state(state)
+        target = {"device": components._execution_device, "dtype": components.transformer.dtype}
+
+        # Unset (falsy) dimensions fall back to the pipeline defaults.
+        img_h = block_state.height or components.default_height
+        img_w = block_state.width or components.default_width
+        self._check_inputs(components, img_h, img_w)
+
+        scale = components.vae_scale_factor
+        noise_shape = (
+            block_state.batch_size * block_state.num_images_per_prompt,
+            components.num_channels_latents,
+            img_h // scale,
+            img_w // scale,
+        )
+        if block_state.latents is not None:
+            block_state.latents = block_state.latents.to(**target)
+        else:
+            block_state.latents = randn_tensor(noise_shape, generator=block_state.generator, **target)
+
+        block_state.height, block_state.width = img_h, img_w
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/ernie_image/decoders.py b/src/diffusers/modular_pipelines/ernie_image/decoders.py
new file mode 100644
index 000000000000..fb65e80f112f
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/decoders.py
@@ -0,0 +1,100 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+from PIL import Image
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKLFlux2
+from ...utils import logging
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ErnieImageModularPipeline, ErnieImagePachifier
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ErnieImageVaeDecoderStep(ModularPipelineBlocks):  # turns denoised latents into output images
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return "Step that decodes the denoised latents into images (unpachify, BN denormalization, VAE decode)."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKLFlux2),
+            ComponentSpec(
+                "pachifier",
+                ErnieImagePachifier,
+                config=FrozenDict({"patch_size": 2}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The latents to decode into images.",
+            ),
+            InputParam(
+                "output_type",
+                type_hint=str,
+                default="pil",
+                description="Output format: 'pil', 'np', or 'pt'.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [OutputParam("images", type_hint=list, description="The generated images.")]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Denormalize, unpack and VAE-decode `latents`, then convert the images to `output_type`."""
+        block_state = self.get_block_state(state)
+        output_type = block_state.output_type
+        # Fail fast: reject an unknown format before spending time on the VAE decode.
+        if output_type not in ("pt", "np", "pil"):
+            raise ValueError(f"Unsupported `output_type`: {output_type!r}. Expected one of 'pil', 'np', 'pt'.")
+
+        vae = components.vae
+        latents = block_state.latents
+        # Undo the latent normalization using the running statistics of the VAE's `bn` module.
+        bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(device=latents.device, dtype=latents.dtype)
+        bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to(
+            device=latents.device, dtype=latents.dtype
+        )
+        latents = components.pachifier.unpack_latents(latents * bn_std + bn_mean)
+
+        images = vae.decode(latents.to(vae.dtype), return_dict=False)[0]
+        images = (images.clamp(-1, 1) + 1) / 2  # map [-1, 1] to [0, 1]
+
+        if output_type != "pt":
+            # Channels-last float32 NumPy array, shared by the "np" and "pil" outputs.
+            images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+        if output_type == "pil":
+            # Round before the uint8 cast; a bare cast truncates and biases every pixel downward.
+            images = [Image.fromarray((img * 255).round().astype(np.uint8)) for img in images]
+        block_state.images = images
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/ernie_image/denoise.py b/src/diffusers/modular_pipelines/ernie_image/denoise.py
new file mode 100644
index 000000000000..d3ab9c78f60a
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/denoise.py
@@ -0,0 +1,242 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...configuration_utils import FrozenDict
+from ...guiders import ClassifierFreeGuidance
+from ...models import ErnieImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ..modular_pipeline import (
+    BlockState,
+    LoopSequentialPipelineBlocks,
+    ModularPipelineBlocks,
+    PipelineState,
+)
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ErnieImageModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ErnieImageLoopBeforeDenoiser(ModularPipelineBlocks):
+    # Loop sub-block: stages the transformer inputs for the current timestep.
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that prepares the latent model input and timestep tensor. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `ErnieImageDenoiseLoopWrapper`)."
+        )
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("transformer", ErnieImageTransformer2DModel)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to denoise.")]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        """Stage the transformer inputs for the current loop iteration.
+
+        Writes `latent_model_input` (the latents) and `timestep` (`t` expanded to one entry per latent
+        sample) to `block_state`, both cast to the transformer dtype.
+        """
+        model_dtype = components.transformer.dtype
+        num_samples = block_state.latents.shape[0]
+        block_state.latent_model_input = block_state.latents.to(model_dtype)
+        block_state.timestep = t.expand(num_samples).to(model_dtype)
+        return components, block_state
+
+
+class ErnieImageLoopDenoiser(ModularPipelineBlocks):  # loop sub-block: guided transformer prediction
+    model_name = "ernie-image"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("transformer", ErnieImageTransformer2DModel),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that runs the ErnieImage transformer with classifier-free guidance via "
+            "the configured guider."
+        )
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                "text_bth",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Padded text hidden states fed into the transformer.",
+            ),
+            InputParam(
+                "text_lens",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Per-prompt text lengths used by the transformer attention mask.",
+            ),
+            InputParam(
+                "negative_text_bth",
+                type_hint=torch.Tensor,
+                description="Padded negative text hidden states for classifier-free guidance.",
+            ),
+            InputParam(
+                "negative_text_lens",
+                type_hint=torch.Tensor,
+                description="Per-prompt negative text lengths for classifier-free guidance.",
+            ),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="Total number of denoising steps. Used by the guider for step-aware scheduling.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        """Run the transformer once per guider batch and store the guider's result in `noise_pred`."""
+        guider, transformer = components.guider, components.transformer
+        # Each field pairs the conditional value with its `negative_*` counterpart (None when missing).
+        field_names = ("text_bth", "text_lens")
+        guider_inputs = {
+            name: (getattr(block_state, name, None), getattr(block_state, f"negative_{name}", None))
+            for name in field_names
+        }
+
+        guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+        guider_state = guider.prepare_inputs(guider_inputs)
+
+        # One transformer forward pass per batch prepared by the guider.
+        for guider_state_batch in guider_state:
+            guider.prepare_models(transformer)
+            cond_kwargs = {name: getattr(guider_state_batch, name) for name in field_names}
+            model_output = transformer(
+                hidden_states=block_state.latent_model_input,
+                timestep=block_state.timestep,
+                return_dict=False,
+                **cond_kwargs,
+            )
+            guider_state_batch.noise_pred = model_output[0]
+            guider.cleanup_models(transformer)
+
+        # The guider receives every batch; the first element of its result becomes the noise prediction.
+        block_state.noise_pred = guider(guider_state)[0]
+        return components, block_state
+
+
+class ErnieImageLoopAfterDenoiser(ModularPipelineBlocks):  # loop sub-block: scheduler update of the latents
+    model_name = "ernie-image"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step within the denoising loop that updates the latents using the scheduler step."
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        # One scheduler step; when MPS is available the result is cast back if the step changed the dtype.
+        prev_latents = block_state.latents
+        stepped = components.scheduler.step(block_state.noise_pred, t, prev_latents, return_dict=False)[0]
+        if stepped.dtype != prev_latents.dtype and torch.backends.mps.is_available():
+            stepped = stepped.to(prev_latents.dtype)
+        block_state.latents = stepped
+        return components, block_state
+
+
+class ErnieImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):  # drives `loop_step` over all timesteps
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Pipeline block that iteratively denoises the latents over `timesteps`. "
+            "The specific steps within each iteration can be customized with `sub_blocks` attribute."
+        )
+
+    @property
+    def loop_expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
+            ComponentSpec("transformer", ErnieImageTransformer2DModel),
+        ]
+
+    @property
+    def loop_inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                "timesteps",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The timesteps to use for inference.",
+            ),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="The number of denoising steps.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents.")]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        with self.progress_bar(total=block_state.num_inference_steps) as pbar:  # one tick per timestep
+            for step_index, timestep in enumerate(block_state.timesteps):
+                components, block_state = self.loop_step(components, block_state, i=step_index, t=timestep)
+                pbar.update()
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImageDenoiseStep(ErnieImageDenoiseLoopWrapper):  # concrete loop assembled from the three sub-blocks
+    block_classes = [
+        ErnieImageLoopBeforeDenoiser,
+        ErnieImageLoopDenoiser,
+        ErnieImageLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]  # presumably matched to block_classes by index
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents. At each iteration it runs:\n"
+            " - `ErnieImageLoopBeforeDenoiser`\n"
+            " - `ErnieImageLoopDenoiser`\n"
+            " - `ErnieImageLoopAfterDenoiser`"
+        )
diff --git a/src/diffusers/modular_pipelines/ernie_image/encoders.py b/src/diffusers/modular_pipelines/ernie_image/encoders.py
new file mode 100644
index 000000000000..a1e65b53d90b
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/encoders.py
@@ -0,0 +1,286 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import torch
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...guiders import ClassifierFreeGuidance
+from ...utils import logging
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ErnieImageModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ErnieImagePromptEnhancerStep(ModularPipelineBlocks):
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Prompt enhancer step that rewrites the input prompt using a causal language model (PE). "
+            "If `use_pe` is False or the PE components are not loaded, the step is a no-op."
+        )
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("pe", AutoModelForCausalLM),
+            ComponentSpec("pe_tokenizer", AutoTokenizer),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam("prompt", required=True, description="The prompt or prompts to guide image generation."),
+            InputParam("height", type_hint=int, description="The height in pixels of the generated image."),
+            InputParam("width", type_hint=int, description="The width in pixels of the generated image."),
+            InputParam(
+                "use_pe",
+                type_hint=bool,
+                default=True,
+                description="Whether to use the prompt enhancer to rewrite the prompt before encoding.",
+            ),
+            InputParam(
+                "pe_system_prompt",
+                type_hint=str,
+                default=None,
+                description="Optional system prompt passed to the prompt enhancer.",
+            ),
+            InputParam(
+                "pe_temperature",
+                type_hint=float,
+                default=0.6,
+                description="Sampling temperature used when generating with the prompt enhancer.",
+            ),
+            InputParam(
+                "pe_top_p",
+                type_hint=float,
+                default=0.95,
+                description="Nucleus sampling `top_p` used when generating with the prompt enhancer.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                "prompt",
+                type_hint=list,
+                description="The prompt list after optional prompt-enhancer rewriting.",
+            ),
+            OutputParam(
+                "revised_prompts",
+                type_hint=list,
+                description="The prompts returned by the prompt enhancer when it ran, else None.",
+            ),
+        ]
+
+    @staticmethod
+    def _enhance_prompt(
+        pe: AutoModelForCausalLM,
+        pe_tokenizer: AutoTokenizer,
+        prompt: str,
+        device: torch.device,
+        width: int,
+        height: int,
+        system_prompt: str | None,
+        temperature: float,
+        top_p: float,
+    ) -> str:
+        user_content = json.dumps({"prompt": prompt, "width": width, "height": height}, ensure_ascii=False)
+        messages = []
+        if system_prompt is not None:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": user_content})
+
+        input_text = pe_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        inputs = pe_tokenizer(input_text, return_tensors="pt").to(device)
+        output_ids = pe.generate(
+            **inputs,
+            max_new_tokens=pe_tokenizer.model_max_length,
+            do_sample=temperature != 1.0 or top_p != 1.0,
+            temperature=temperature,
+            top_p=top_p,
+            pad_token_id=pe_tokenizer.pad_token_id,
+            eos_token_id=pe_tokenizer.eos_token_id,
+        )
+        generated_ids = output_ids[0][inputs["input_ids"].shape[1] :]
+        return pe_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+
+        prompt = block_state.prompt
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        pe = getattr(components, "pe", None)
+        pe_tokenizer = getattr(components, "pe_tokenizer", None)
+        if not block_state.use_pe or pe is None or pe_tokenizer is None:
+            block_state.prompt = prompt
+            block_state.revised_prompts = None
+            self.set_block_state(state, block_state)
+            return components, state
+
+        height = block_state.height or components.default_height
+        width = block_state.width or components.default_width
+
+        revised = [
+            self._enhance_prompt(
+                pe=pe,
+                pe_tokenizer=pe_tokenizer,
+                prompt=p,
+                device=device,
+                width=width,
+                height=height,
+                system_prompt=block_state.pe_system_prompt,
+                temperature=block_state.pe_temperature,
+                top_p=block_state.pe_top_p,
+            )
+            for p in prompt
+        ]
+
+        block_state.prompt = revised
+        block_state.revised_prompts = list(revised)
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImageTextEncoderStep(ModularPipelineBlocks):
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Text encoder step that encodes prompts into variable-length hidden states for the ErnieImage transformer."
+        )
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", AutoModel),
+            ComponentSpec("tokenizer", AutoTokenizer),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam("prompt", description="The prompt or prompts to guide image generation."),
+            InputParam("negative_prompt", description="The prompt or prompts to avoid during image generation."),
+            InputParam(
+                "num_images_per_prompt",
+                type_hint=int,
+                default=1,
+                description="Number of images to generate per prompt.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                "prompt_embeds",
+                type_hint=list,
+                kwargs_type="denoiser_input_fields",
+                description="List of per-prompt text embeddings of shape (T, H) used as conditioning for the transformer.",
+            ),
+            OutputParam(
+                "negative_prompt_embeds",
+                type_hint=list,
+                kwargs_type="denoiser_input_fields",
+                description="List of per-prompt negative text embeddings used for classifier-free guidance.",
+            ),
+        ]
+
+    @staticmethod
+    def _encode(
+        text_encoder: AutoModel,
+        tokenizer: AutoTokenizer,
+        prompt: list[str],
+        device: torch.device,
+        num_images_per_prompt: int,
+    ) -> list[torch.Tensor]:
+        text_hiddens = []
+        for p in prompt:
+            ids = tokenizer(p, add_special_tokens=True, truncation=True, padding=False)["input_ids"]
+            if len(ids) == 0:
+                ids = [tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 0]
+            input_ids = torch.tensor([ids], device=device)
+            outputs = text_encoder(input_ids=input_ids, output_hidden_states=True)
+            # Second-to-last hidden state matches ErnieImage training
+            hidden = outputs.hidden_states[-2][0]
+            for _ in range(num_images_per_prompt):
+                text_hiddens.append(hidden)
+        return text_hiddens
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+
+        prompt = block_state.prompt
+        if prompt is None:
+            prompt = [""]
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        num_images_per_prompt = block_state.num_images_per_prompt
+
+        block_state.prompt_embeds = self._encode(
+            text_encoder=components.text_encoder,
+            tokenizer=components.tokenizer,
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+        )
+
+        if components.requires_unconditional_embeds:
+            negative_prompt = block_state.negative_prompt
+            if negative_prompt is None:
+                negative_prompt = ""
+            if isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * len(prompt)
+            if len(negative_prompt) != len(prompt):
+                raise ValueError(
+                    f"`negative_prompt` must have the same length as `prompt` ({len(prompt)}), "
+                    f"got {len(negative_prompt)}."
+                )
+            block_state.negative_prompt_embeds = self._encode(
+                text_encoder=components.text_encoder,
+                tokenizer=components.tokenizer,
+                prompt=negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+        else:
+            block_state.negative_prompt_embeds = None
+
+        state.set("batch_size", len(prompt))
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
new file mode 100644
index 000000000000..83a8bb5988bc
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
@@ -0,0 +1,153 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import logging
+from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
+from .before_denoise import (
+    ErnieImagePrepareLatentsStep,
+    ErnieImageSetTimestepsStep,
+    ErnieImageTextInputStep,
+)
+from .decoders import ErnieImageVaeDecoderStep
+from .denoise import ErnieImageDenoiseStep
+from .encoders import ErnieImagePromptEnhancerStep, ErnieImageTextEncoderStep
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# auto_docstring
+class ErnieImageCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Denoise block that takes encoded conditions and runs the denoising process for ErnieImage.
+
+      Components:
+          transformer (`ErnieImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`)
+
+      Inputs:
+          prompt_embeds (`list`):
+              List of per-prompt text embeddings from the text encoder step.
+          negative_prompt_embeds (`list`, *optional*):
+              List of per-prompt negative text embeddings from the text encoder step.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              Number of images to generate per prompt.
+          batch_size (`int`, *optional*):
+              Prompt batch size. Resolved from `prompt_embeds` when not provided.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              Number of denoising steps.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents. If provided, skips noise sampling.
+          generator (`None`, *optional*):
+              Torch generator for deterministic noise sampling.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
+    model_name = "ernie-image"
+    block_classes = [
+        ErnieImageTextInputStep,
+        ErnieImageSetTimestepsStep,
+        ErnieImagePrepareLatentsStep,
+        ErnieImageDenoiseStep,
+    ]
+    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
+
+    @property
+    def description(self):
+        return "Denoise block that takes encoded conditions and runs the denoising process for ErnieImage."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class ErnieImageAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto modular pipeline for ErnieImage text-to-image generation. Supports an optional prompt enhancer when the `pe`
+    components are loaded and `use_pe=True`.
+
+      Supported workflows:
+        - `text2image`: requires `prompt`
+
+      Components:
+          pe (`AutoModelForCausalLM`) pe_tokenizer (`AutoTokenizer`) text_encoder (`AutoModel`) tokenizer
+          (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer (`ErnieImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLFlux2`) pachifier (`ErnieImagePachifier`)
+
+      Inputs:
+          prompt (`None`):
+              The prompt or prompts to guide image generation.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          use_pe (`bool`, *optional*, defaults to True):
+              Whether to use the prompt enhancer to rewrite the prompt before encoding.
+          pe_system_prompt (`str`, *optional*):
+              Optional system prompt passed to the prompt enhancer.
+          pe_temperature (`float`, *optional*, defaults to 0.6):
+              Sampling temperature used when generating with the prompt enhancer.
+          pe_top_p (`float`, *optional*, defaults to 0.95):
+              Nucleus sampling `top_p` used when generating with the prompt enhancer.
+          negative_prompt (`None`, *optional*):
+              The prompt or prompts to avoid during image generation.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              Number of images to generate per prompt.
+          batch_size (`int`, *optional*):
+              Prompt batch size. Resolved from `prompt_embeds` when not provided.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              Number of denoising steps.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents. If provided, skips noise sampling.
+          generator (`None`, *optional*):
+              Torch generator for deterministic noise sampling.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', or 'pt'.
+
+      Outputs:
+          images (`list`):
+              Generated images.
+    """
+
+    model_name = "ernie-image"
+    block_classes = [
+        ErnieImagePromptEnhancerStep,
+        ErnieImageTextEncoderStep,
+        ErnieImageCoreDenoiseStep,
+        ErnieImageVaeDecoderStep,
+    ]
+    block_names = ["prompt_enhancer", "text_encoder", "denoise", "decode"]
+    _workflow_map = {
+        "text2image": {"prompt": True},
+    }
+
+    @property
+    def description(self):
+        return (
+            "Auto modular pipeline for ErnieImage text-to-image generation. Supports an optional prompt enhancer "
+            "when the `pe` components are loaded and `use_pe=True`."
+        )
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("images")]
diff --git a/src/diffusers/modular_pipelines/ernie_image/modular_pipeline.py b/src/diffusers/modular_pipelines/ernie_image/modular_pipeline.py
new file mode 100644
index 000000000000..cf4497fe9138
--- /dev/null
+++ b/src/diffusers/modular_pipelines/ernie_image/modular_pipeline.py
@@ -0,0 +1,129 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import logging
+from ..modular_pipeline import ModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ErnieImagePachifier(ConfigMixin):
+    """
+    A class to pack and unpack latents for ErnieImage.
+    """
+
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(self, patch_size: int = 2):
+        super().__init__()
+
+    def pack_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """
+        Fold each `p x p` spatial patch (`p = patch_size`) into the channel dimension.
+
+        Maps `(B, C, H, W)` to `(B, C * p * p, H // p, W // p)`; `H` and `W` must be multiples of `p`.
+        """
+        batch_size, num_channels, height, width = latents.shape
+        patch_size = self.config.patch_size
+
+        if height % patch_size != 0 or width % patch_size != 0:
+            raise ValueError(
+                f"Latent height and width must be divisible by {patch_size}, but got {height} and {width}"
+            )
+
+        latents = latents.view(
+            batch_size, num_channels, height // patch_size, patch_size, width // patch_size, patch_size
+        )
+        latents = latents.permute(0, 1, 3, 5, 2, 4)
+        return latents.reshape(
+            batch_size, num_channels * patch_size * patch_size, height // patch_size, width // patch_size
+        )
+
+    def unpack_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """
+        Inverse of `pack_latents`, with `p = patch_size`.
+
+        Maps `(B, C * p * p, H, W)` to `(B, C, H * p, W * p)`; the channel count must be a multiple of `p * p`.
+        """
+        batch_size, num_channels, height, width = latents.shape
+        patch_size = self.config.patch_size
+
+        # Same kind of explicit check as `pack_latents`; otherwise `reshape` fails with an opaque shape error.
+        if num_channels % (patch_size * patch_size) != 0:
+            raise ValueError(
+                f"Number of latent channels must be divisible by {patch_size * patch_size}, but got {num_channels}"
+            )
+
+        latents = latents.reshape(
+            batch_size, num_channels // (patch_size * patch_size), patch_size, patch_size, height, width
+        )
+        latents = latents.permute(0, 1, 4, 2, 5, 3)
+        return latents.reshape(
+            batch_size, num_channels // (patch_size * patch_size), height * patch_size, width * patch_size
+        )
+
+
+class ErnieImageModularPipeline(ModularPipeline):
+    """
+    A ModularPipeline for ErnieImage.
+
+    > [!WARNING] > This is an experimental feature and is likely to change in the future.
+    """
+
+    default_blocks_name = "ErnieImageAutoBlocks"
+
+    @property
+    def default_height(self):
+        return 1024
+
+    @property
+    def default_width(self):
+        return 1024
+
+    @property
+    def vae_scale_factor(self):
+        # Falls back to 16 when no VAE is loaded; otherwise 2 ** (number of entries in `block_out_channels`).
+        vae_scale_factor = 16
+        if hasattr(self, "vae") and self.vae is not None:
+            vae_scale_factor = 2 ** len(self.vae.config.block_out_channels)
+        return vae_scale_factor
+
+    @property
+    def num_channels_latents(self):
+        # Falls back to 128 when no transformer is loaded; otherwise read from the transformer config.
+        num_channels_latents = 128
+        if hasattr(self, "transformer") and self.transformer is not None:
+            num_channels_latents = self.transformer.config.in_channels
+        return num_channels_latents
+
+    @property
+    def text_in_dim(self):
+        # Falls back to 3584 when no transformer is loaded; otherwise read from the transformer config.
+        text_in_dim = 3584
+        if hasattr(self, "transformer") and self.transformer is not None:
+            text_in_dim = self.transformer.config.text_in_dim
+        return text_in_dim
+
+    @property
+    def requires_unconditional_embeds(self):
+        # True only when a guider is loaded, enabled, and uses more than one condition.
+        requires_unconditional_embeds = False
+        if hasattr(self, "guider") and self.guider is not None:
+            requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
+        return requires_unconditional_embeds
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index d00bf716a78f..820f7a9b13fa 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -134,6 +134,7 @@ def _helios_pyramid_map_fn(config_dict=None):
         ("helios-pyramid", _helios_pyramid_map_fn),
         ("hunyuan-video-1.5", _create_default_map_fn("HunyuanVideo15ModularPipeline")),
         ("ltx", _create_default_map_fn("LTXModularPipeline")),
+        ("ernie-image", _create_default_map_fn("ErnieImageModularPipeline")),
     ]
 )
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index c95c56789e37..829be9b66a2d 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -92,6 +92,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class ErnieImageAutoBlocks(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ErnieImageModularPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class FluxAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/modular_pipelines/ernie_image/__init__.py b/tests/modular_pipelines/ernie_image/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py b/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py
new file mode 100644
index 000000000000..23be10abc073
--- /dev/null
+++ b/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py
@@ -0,0 +1,60 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from diffusers.modular_pipelines import ErnieImageAutoBlocks, ErnieImageModularPipeline
+
+from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+
+
+ERNIE_IMAGE_WORKFLOWS = {
+    "text2image": [
+        ("prompt_enhancer", "ErnieImagePromptEnhancerStep"),
+        ("text_encoder", "ErnieImageTextEncoderStep"),
+        ("denoise.input", "ErnieImageTextInputStep"),
+        ("denoise.set_timesteps", "ErnieImageSetTimestepsStep"),
+        ("denoise.prepare_latents", "ErnieImagePrepareLatentsStep"),
+        ("denoise.denoise", "ErnieImageDenoiseStep"),
+        ("decode", "ErnieImageVaeDecoderStep"),
+    ],
+}
+
+
+class TestErnieImageModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = ErnieImageModularPipeline
+    pipeline_blocks_class = ErnieImageAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-ernie-image-modular-pipe"
+
+    params = frozenset(["prompt", "height", "width"])
+    batch_params = frozenset(["prompt"])
+    optional_params = frozenset(["num_inference_steps", "num_images_per_prompt", "latents"])
+    expected_workflow_blocks = ERNIE_IMAGE_WORKFLOWS
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        return {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 32,
+            "width": 32,
+            "use_pe": False,
+            "output_type": "pt",
+        }
+
+    @pytest.mark.skip(reason="PE generation is non-deterministic on CPU")
+    def test_float16_inference(self):
+        pass

From c137fa199cf52c3cd28009ab16a14f79f681604a Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Mon, 20 Apr 2026 17:49:52 -0700
Subject: [PATCH 2/4] Address review: move embed expansion to input step, drop use_pe

---
 .../ernie_image/before_denoise.py             | 13 +++-
 .../modular_pipelines/ernie_image/encoders.py | 65 +++++--------------
 .../ernie_image/modular_blocks_ernie_image.py | 55 ++++++++++++++--
 .../test_modular_pipeline_ernie_image.py      |  4 +-
 4 files changed, 79 insertions(+), 58 deletions(-)

diff --git a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
index 1c13c50f2db3..a8013571ccba 100644
--- a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
@@ -117,22 +117,31 @@ def intermediate_outputs(self) -> list[OutputParam]:
             ),
         ]
 
+    @staticmethod
+    def _expand(hiddens: list[torch.Tensor], num_images_per_prompt: int) -> list[torch.Tensor]:
+        if num_images_per_prompt == 1:
+            return list(hiddens)
+        return [h for h in hiddens for _ in range(num_images_per_prompt)]
+
     @torch.no_grad()
     def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         device = components._execution_device
         dtype = components.transformer.dtype
         text_in_dim = components.text_in_dim
+        num_images_per_prompt = block_state.num_images_per_prompt
 
         prompt_embeds = block_state.prompt_embeds
-        block_state.batch_size = getattr(block_state, "batch_size", None) or len(prompt_embeds)
+        block_state.batch_size = block_state.batch_size or len(prompt_embeds)
 
+        prompt_embeds = self._expand(prompt_embeds, num_images_per_prompt)
         text_bth, text_lens = _pad_text(prompt_embeds, device, dtype, text_in_dim)
         block_state.text_bth = text_bth
         block_state.text_lens = text_lens
 
-        negative_prompt_embeds = getattr(block_state, "negative_prompt_embeds", None)
+        negative_prompt_embeds = block_state.negative_prompt_embeds
         if negative_prompt_embeds is not None:
+            negative_prompt_embeds = self._expand(negative_prompt_embeds, num_images_per_prompt)
             negative_text_bth, negative_text_lens = _pad_text(negative_prompt_embeds, device, dtype, text_in_dim)
             block_state.negative_text_bth = negative_text_bth
             block_state.negative_text_lens = negative_text_lens
diff --git a/src/diffusers/modular_pipelines/ernie_image/encoders.py b/src/diffusers/modular_pipelines/ernie_image/encoders.py
index a1e65b53d90b..d4298765fd67 100644
--- a/src/diffusers/modular_pipelines/ernie_image/encoders.py
+++ b/src/diffusers/modular_pipelines/ernie_image/encoders.py
@@ -33,10 +33,7 @@ class ErnieImagePromptEnhancerStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return (
-            "Prompt enhancer step that rewrites the input prompt using a causal language model (PE). "
-            "If `use_pe` is False or the PE components are not loaded, the step is a no-op."
-        )
+        return "Prompt enhancer step that rewrites the input prompt using a causal language model (PE)."
 
     @property
     def expected_components(self) -> list[ComponentSpec]:
@@ -48,15 +45,14 @@ def expected_components(self) -> list[ComponentSpec]:
     @property
     def inputs(self) -> list[InputParam]:
         return [
-            InputParam("prompt", required=True, description="The prompt or prompts to guide image generation."),
-            InputParam("height", type_hint=int, description="The height in pixels of the generated image."),
-            InputParam("width", type_hint=int, description="The width in pixels of the generated image."),
             InputParam(
-                "use_pe",
-                type_hint=bool,
-                default=True,
-                description="Whether to use the prompt enhancer to rewrite the prompt before encoding.",
+                "prompt",
+                required=True,
+                type_hint=str,
+                description="The prompt or prompts to guide image generation.",
             ),
+            InputParam("height", type_hint=int, description="The height in pixels of the generated image."),
+            InputParam("width", type_hint=int, description="The width in pixels of the generated image."),
             InputParam(
                 "pe_system_prompt",
                 type_hint=str,
@@ -80,16 +76,8 @@ def inputs(self) -> list[InputParam]:
     @property
     def intermediate_outputs(self) -> list[OutputParam]:
         return [
-            OutputParam(
-                "prompt",
-                type_hint=list,
-                description="The prompt list after optional prompt-enhancer rewriting.",
-            ),
-            OutputParam(
-                "revised_prompts",
-                type_hint=list,
-                description="The prompts returned by the prompt enhancer when it ran, else None.",
-            ),
+            OutputParam("prompt", type_hint=list, description="The prompt list after prompt-enhancer rewriting."),
+            OutputParam("revised_prompts", type_hint=list, description="The prompts returned by the prompt enhancer."),
         ]
 
     @staticmethod
@@ -133,21 +121,13 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
         if isinstance(prompt, str):
             prompt = [prompt]
 
-        pe = getattr(components, "pe", None)
-        pe_tokenizer = getattr(components, "pe_tokenizer", None)
-        if not block_state.use_pe or pe is None or pe_tokenizer is None:
-            block_state.prompt = prompt
-            block_state.revised_prompts = None
-            self.set_block_state(state, block_state)
-            return components, state
-
         height = block_state.height or components.default_height
         width = block_state.width or components.default_width
 
         revised = [
             self._enhance_prompt(
-                pe=pe,
-                pe_tokenizer=pe_tokenizer,
+                pe=components.pe,
+                pe_tokenizer=components.pe_tokenizer,
                 prompt=p,
                 device=device,
                 width=width,
@@ -191,13 +171,11 @@ def expected_components(self) -> list[ComponentSpec]:
     @property
     def inputs(self) -> list[InputParam]:
         return [
-            InputParam("prompt", description="The prompt or prompts to guide image generation."),
-            InputParam("negative_prompt", description="The prompt or prompts to avoid during image generation."),
+            InputParam("prompt", type_hint=str, description="The prompt or prompts to guide image generation."),
             InputParam(
-                "num_images_per_prompt",
-                type_hint=int,
-                default=1,
-                description="Number of images to generate per prompt.",
+                "negative_prompt",
+                type_hint=str,
+                description="The prompt or prompts to avoid during image generation.",
             ),
         ]
 
@@ -208,13 +186,13 @@ def intermediate_outputs(self) -> list[OutputParam]:
                 "prompt_embeds",
                 type_hint=list,
                 kwargs_type="denoiser_input_fields",
-                description="List of per-prompt text embeddings of shape (T, H) used as conditioning for the transformer.",
+                description="List of per-prompt text embeddings of shape (T, H).",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=list,
                 kwargs_type="denoiser_input_fields",
-                description="List of per-prompt negative text embeddings used for classifier-free guidance.",
+                description="List of per-prompt negative text embeddings for classifier-free guidance.",
             ),
         ]
 
@@ -224,7 +202,6 @@ def _encode(
         tokenizer: AutoTokenizer,
         prompt: list[str],
         device: torch.device,
-        num_images_per_prompt: int,
     ) -> list[torch.Tensor]:
         text_hiddens = []
         for p in prompt:
@@ -233,10 +210,7 @@ def _encode(
                 ids = [tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 0]
             input_ids = torch.tensor([ids], device=device)
             outputs = text_encoder(input_ids=input_ids, output_hidden_states=True)
-            # Second-to-last hidden state matches ErnieImage training
-            hidden = outputs.hidden_states[-2][0]
-            for _ in range(num_images_per_prompt):
-                text_hiddens.append(hidden)
+            text_hiddens.append(outputs.hidden_states[-2][0])
         return text_hiddens
 
     @torch.no_grad()
@@ -249,14 +223,12 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
             prompt = [""]
         if isinstance(prompt, str):
             prompt = [prompt]
-        num_images_per_prompt = block_state.num_images_per_prompt
 
         block_state.prompt_embeds = self._encode(
             text_encoder=components.text_encoder,
             tokenizer=components.tokenizer,
             prompt=prompt,
             device=device,
-            num_images_per_prompt=num_images_per_prompt,
         )
 
         if components.requires_unconditional_embeds:
@@ -275,7 +247,6 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
                 tokenizer=components.tokenizer,
                 prompt=negative_prompt,
                 device=device,
-                num_images_per_prompt=num_images_per_prompt,
             )
         else:
             block_state.negative_prompt_embeds = None
diff --git a/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
index 83a8bb5988bc..dcb304eed582 100644
--- a/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
+++ b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from ...utils import logging
-from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
 from ..modular_pipeline_utils import OutputParam
 from .before_denoise import (
     ErnieImagePrepareLatentsStep,
@@ -28,6 +28,51 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
+# auto_docstring
+class ErnieImageAutoPromptEnhancerStep(AutoPipelineBlocks):
+    """
+    Auto block that runs the optional prompt enhancer when `use_pe` is provided.
+       - `ErnieImagePromptEnhancerStep` is used when `use_pe` is set.
+       - If `use_pe` is not provided, the step is skipped.
+
+      Components:
+          pe (`AutoModelForCausalLM`) pe_tokenizer (`AutoTokenizer`)
+
+      Inputs:
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          pe_system_prompt (`str`, *optional*):
+              Optional system prompt passed to the prompt enhancer.
+          pe_temperature (`float`, *optional*, defaults to 0.6):
+              Sampling temperature used when generating with the prompt enhancer.
+          pe_top_p (`float`, *optional*, defaults to 0.95):
+              Nucleus sampling `top_p` used when generating with the prompt enhancer.
+
+      Outputs:
+          prompt (`list`):
+              The prompt list after prompt-enhancer rewriting.
+          revised_prompts (`list`):
+              The prompts returned by the prompt enhancer.
+    """
+
+    model_name = "ernie-image"
+    block_classes = [ErnieImagePromptEnhancerStep]
+    block_names = ["prompt_enhancer"]
+    block_trigger_inputs = ["use_pe"]  # NOTE(review): `use_pe` is absent from the Inputs above; confirm it triggers
+
+    @property
+    def description(self):
+        return (
+            "Auto block that runs the optional prompt enhancer when `use_pe` is provided.\n"
+            " - `ErnieImagePromptEnhancerStep` is used when `use_pe` is set.\n"
+            " - If `use_pe` is not provided, the step is skipped."
+        )
+
+
 # auto_docstring
 class ErnieImageCoreDenoiseStep(SequentialPipelineBlocks):
     """
@@ -95,21 +140,19 @@ class ErnieImageAutoBlocks(SequentialPipelineBlocks):
           (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLFlux2`) pachifier (`ErnieImagePachifier`)
 
       Inputs:
-          prompt (`None`):
+          prompt (`str`, *optional*):
               The prompt or prompts to guide image generation.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
-          use_pe (`bool`, *optional*, defaults to True):
-              Whether to use the prompt enhancer to rewrite the prompt before encoding.
           pe_system_prompt (`str`, *optional*):
               Optional system prompt passed to the prompt enhancer.
           pe_temperature (`float`, *optional*, defaults to 0.6):
               Sampling temperature used when generating with the prompt enhancer.
           pe_top_p (`float`, *optional*, defaults to 0.95):
               Nucleus sampling `top_p` used when generating with the prompt enhancer.
-          negative_prompt (`None`, *optional*):
+          negative_prompt (`str`, *optional*):
               The prompt or prompts to avoid during image generation.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               Number of images to generate per prompt.
@@ -131,7 +174,7 @@ class ErnieImageAutoBlocks(SequentialPipelineBlocks):
 
     model_name = "ernie-image"
     block_classes = [
-        ErnieImagePromptEnhancerStep,
+        ErnieImageAutoPromptEnhancerStep,
         ErnieImageTextEncoderStep,
         ErnieImageCoreDenoiseStep,
         ErnieImageVaeDecoderStep,
diff --git a/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py b/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py
index 23be10abc073..511a5dc1b3eb 100644
--- a/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py
+++ b/tests/modular_pipelines/ernie_image/test_modular_pipeline_ernie_image.py
@@ -22,7 +22,6 @@
 
 ERNIE_IMAGE_WORKFLOWS = {
     "text2image": [
-        ("prompt_enhancer", "ErnieImagePromptEnhancerStep"),
         ("text_encoder", "ErnieImageTextEncoderStep"),
         ("denoise.input", "ErnieImageTextInputStep"),
         ("denoise.set_timesteps", "ErnieImageSetTimestepsStep"),
@@ -36,7 +35,7 @@
 class TestErnieImageModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = ErnieImageModularPipeline
     pipeline_blocks_class = ErnieImageAutoBlocks
-    pretrained_model_name_or_path = "hf-internal-testing/tiny-ernie-image-modular-pipe"
+    pretrained_model_name_or_path = "akshan-main/tiny-ernie-image-modular-pipe"
 
     params = frozenset(["prompt", "height", "width"])
     batch_params = frozenset(["prompt"])
@@ -51,7 +50,6 @@ def get_dummy_inputs(self, seed=0):
             "num_inference_steps": 2,
             "height": 32,
             "width": 32,
-            "use_pe": False,
             "output_type": "pt",
         }
 

From d4943161bdae20960071c26f6477425d1fe31198 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Mon, 20 Apr 2026 18:25:54 -0700
Subject: [PATCH 3/4] Fix alphabetical ordering and generator type_hint

---
 src/diffusers/__init__.py                        |  8 ++++----
 .../ernie_image/before_denoise.py                |  6 +++++-
 .../dummy_torch_and_transformers_objects.py      | 16 ++++++++--------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 2e70f602348e..470d18e860a7 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -442,6 +442,8 @@
 else:
     _import_structure["modular_pipelines"].extend(
         [
+            "ErnieImageAutoBlocks",
+            "ErnieImageModularPipeline",
             "Flux2AutoBlocks",
             "Flux2KleinAutoBlocks",
             "Flux2KleinBaseAutoBlocks",
@@ -449,8 +451,6 @@
             "Flux2KleinModularPipeline",
             "Flux2ModularPipeline",
             "FluxAutoBlocks",
-            "ErnieImageAutoBlocks",
-            "ErnieImageModularPipeline",
             "FluxKontextAutoBlocks",
             "FluxKontextModularPipeline",
             "FluxModularPipeline",
@@ -1233,14 +1233,14 @@
         from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
         from .modular_pipelines import (
+            ErnieImageAutoBlocks,
+            ErnieImageModularPipeline,
             Flux2AutoBlocks,
             Flux2KleinAutoBlocks,
             Flux2KleinBaseAutoBlocks,
             Flux2KleinBaseModularPipeline,
             Flux2KleinModularPipeline,
             Flux2ModularPipeline,
-            ErnieImageAutoBlocks,
-            ErnieImageModularPipeline,
             FluxAutoBlocks,
             FluxKontextAutoBlocks,
             FluxKontextModularPipeline,
diff --git a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
index a8013571ccba..3e2f398bbb03 100644
--- a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
@@ -225,7 +225,11 @@ def inputs(self) -> list[InputParam]:
                 default=1,
                 description="Number of images to generate per prompt.",
             ),
-            InputParam("generator", description="Torch generator for deterministic noise sampling."),
+            InputParam(
+                "generator",
+                type_hint=torch.Generator,
+                description="Torch generator for deterministic noise sampling.",
+            ),
             InputParam(
                 "batch_size",
                 required=True,
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 829be9b66a2d..b5dbf7840e6f 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -2,7 +2,7 @@
 from ..utils import DummyObject, requires_backends
 
 
-class Flux2AutoBlocks(metaclass=DummyObject):
+class ErnieImageAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -17,7 +17,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class Flux2KleinAutoBlocks(metaclass=DummyObject):
+class ErnieImageModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -32,7 +32,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class Flux2KleinBaseAutoBlocks(metaclass=DummyObject):
+class Flux2AutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -47,7 +47,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class Flux2KleinBaseModularPipeline(metaclass=DummyObject):
+class Flux2KleinAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -62,7 +62,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class Flux2KleinModularPipeline(metaclass=DummyObject):
+class Flux2KleinBaseAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -77,7 +77,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class Flux2ModularPipeline(metaclass=DummyObject):
+class Flux2KleinBaseModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -92,7 +92,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class ErnieImageAutoBlocks(metaclass=DummyObject):
+class Flux2KleinModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -107,7 +107,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class ErnieImageModularPipeline(metaclass=DummyObject):
+class Flux2ModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):

From e4df67a562b6be2c38ba58058f05695cb2bb8a75 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Wed, 22 Apr 2026 16:15:04 -0700
Subject: [PATCH 4/4] Address review: drop batch_size input and revised_prompts output

---
 .../ernie_image/before_denoise.py             | 22 +++++--------------
 .../modular_pipelines/ernie_image/denoise.py  | 10 ++-------
 .../modular_pipelines/ernie_image/encoders.py |  4 ----
 .../ernie_image/modular_blocks_ernie_image.py | 10 ++-------
 4 files changed, 9 insertions(+), 37 deletions(-)

diff --git a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
index 3e2f398bbb03..034230632396 100644
--- a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
@@ -79,12 +79,6 @@ def inputs(self) -> list[InputParam]:
                 default=1,
                 description="Number of images to generate per prompt.",
             ),
-            InputParam(
-                "batch_size",
-                type_hint=int,
-                default=None,
-                description="Prompt batch size. Resolved from `prompt_embeds` when not provided.",
-            ),
         ]
 
     @property
@@ -132,7 +126,7 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
         num_images_per_prompt = block_state.num_images_per_prompt
 
         prompt_embeds = block_state.prompt_embeds
-        block_state.batch_size = block_state.batch_size or len(prompt_embeds)
+        block_state.batch_size = len(prompt_embeds)
 
         prompt_embeds = self._expand(prompt_embeds, num_images_per_prompt)
         text_bth, text_lens = _pad_text(prompt_embeds, device, dtype, text_in_dim)
@@ -219,22 +213,16 @@ def inputs(self) -> list[InputParam]:
                 type_hint=torch.Tensor,
                 description="Pre-generated noisy latents. If provided, skips noise sampling.",
             ),
-            InputParam(
-                "num_images_per_prompt",
-                type_hint=int,
-                default=1,
-                description="Number of images to generate per prompt.",
-            ),
             InputParam(
                 "generator",
                 type_hint=torch.Generator,
                 description="Torch generator for deterministic noise sampling.",
             ),
             InputParam(
-                "batch_size",
+                "text_bth",
                 required=True,
-                type_hint=int,
-                description="Prompt batch size resolved by the text input step.",
+                type_hint=torch.Tensor,
+                description="Padded text hidden states; used to derive the total batch size for the latents.",
             ),
         ]
 
@@ -264,7 +252,7 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
         width = block_state.width or components.default_width
         self._check_inputs(components, height, width)
 
-        total_batch_size = block_state.batch_size * block_state.num_images_per_prompt
+        total_batch_size = block_state.text_bth.shape[0]
         latent_h = height // components.vae_scale_factor
         latent_w = width // components.vae_scale_factor
         num_channels_latents = components.num_channels_latents
diff --git a/src/diffusers/modular_pipelines/ernie_image/denoise.py b/src/diffusers/modular_pipelines/ernie_image/denoise.py
index d3ab9c78f60a..3a2a2e312486 100644
--- a/src/diffusers/modular_pipelines/ernie_image/denoise.py
+++ b/src/diffusers/modular_pipelines/ernie_image/denoise.py
@@ -124,14 +124,8 @@ def inputs(self) -> list[InputParam]:
     @torch.no_grad()
     def __call__(self, components: ErnieImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
         guider_inputs = {
-            "text_bth": (
-                getattr(block_state, "text_bth", None),
-                getattr(block_state, "negative_text_bth", None),
-            ),
-            "text_lens": (
-                getattr(block_state, "text_lens", None),
-                getattr(block_state, "negative_text_lens", None),
-            ),
+            "text_bth": (block_state.text_bth, block_state.negative_text_bth),
+            "text_lens": (block_state.text_lens, block_state.negative_text_lens),
         }
 
         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
diff --git a/src/diffusers/modular_pipelines/ernie_image/encoders.py b/src/diffusers/modular_pipelines/ernie_image/encoders.py
index d4298765fd67..39560965f64b 100644
--- a/src/diffusers/modular_pipelines/ernie_image/encoders.py
+++ b/src/diffusers/modular_pipelines/ernie_image/encoders.py
@@ -77,7 +77,6 @@ def inputs(self) -> list[InputParam]:
     def intermediate_outputs(self) -> list[OutputParam]:
         return [
             OutputParam("prompt", type_hint=list, description="The prompt list after prompt-enhancer rewriting."),
-            OutputParam("revised_prompts", type_hint=list, description="The prompts returned by the prompt enhancer."),
         ]
 
     @staticmethod
@@ -140,7 +139,6 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
         ]
 
         block_state.prompt = revised
-        block_state.revised_prompts = list(revised)
 
         self.set_block_state(state, block_state)
         return components, state
@@ -251,7 +249,5 @@ def __call__(self, components: ErnieImageModularPipeline, state: PipelineState)
         else:
             block_state.negative_prompt_embeds = None
 
-        state.set("batch_size", len(prompt))
-
         self.set_block_state(state, block_state)
         return components, state
diff --git a/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
index dcb304eed582..94a1cb5fb73f 100644
--- a/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
+++ b/src/diffusers/modular_pipelines/ernie_image/modular_blocks_ernie_image.py
@@ -55,8 +55,6 @@ class ErnieImageAutoPromptEnhancerStep(AutoPipelineBlocks):
       Outputs:
           prompt (`list`):
               The prompt list after prompt-enhancer rewriting.
-          revised_prompts (`list`):
-              The prompts returned by the prompt enhancer.
     """
 
     model_name = "ernie-image"
@@ -89,8 +87,6 @@ class ErnieImageCoreDenoiseStep(SequentialPipelineBlocks):
               List of per-prompt negative text embeddings from the text encoder step.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               Number of images to generate per prompt.
-          batch_size (`int`, *optional*):
-              Prompt batch size. Resolved from `prompt_embeds` when not provided.
           num_inference_steps (`int`, *optional*, defaults to 50):
               Number of denoising steps.
           height (`int`, *optional*):
@@ -99,7 +95,7 @@ class ErnieImageCoreDenoiseStep(SequentialPipelineBlocks):
               The width in pixels of the generated image.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents. If provided, skips noise sampling.
-          generator (`None`, *optional*):
+          generator (`Generator`, *optional*):
               Torch generator for deterministic noise sampling.
 
       Outputs:
@@ -156,13 +152,11 @@ class ErnieImageAutoBlocks(SequentialPipelineBlocks):
               The prompt or prompts to avoid during image generation.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               Number of images to generate per prompt.
-          batch_size (`int`, *optional*):
-              Prompt batch size. Resolved from `prompt_embeds` when not provided.
           num_inference_steps (`int`, *optional*, defaults to 50):
               Number of denoising steps.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents. If provided, skips noise sampling.
-          generator (`None`, *optional*):
+          generator (`Generator`, *optional*):
               Torch generator for deterministic noise sampling.
           output_type (`str`, *optional*, defaults to pil):
               Output format: 'pil', 'np', or 'pt'.