diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
index ebbee97a9..3111d5837 100644
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@@ -1378,6 +1378,101 @@ struct T5CLIPEmbedder : public Conditioner {
     }
 };
 
+struct MiniT2IConditioner : public Conditioner {
+    T5UniGramTokenizer tokenizer;
+    std::shared_ptr<T5Runner> t5;
+    size_t prompt_length = 256;
+
+    MiniT2IConditioner(ggml_backend_t backend,
+                       const String2TensorStorage& tensor_storage_map      = {},
+                       std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
+        bool use_t5 = false;
+        for (const auto& pair : tensor_storage_map) {
+            if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+                use_t5 = true;
+                break;
+            }
+        }
+        if (!use_t5) {
+            LOG_WARN("IMPORTANT NOTICE: No MiniT2I T5 text encoder provided, cannot process prompts!");
+            return;
+        }
+        t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
+    }
+
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
+        if (t5) {
+            t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        }
+    }
+
+    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
+        if (t5) {
+            t5->set_max_graph_vram_bytes(max_vram_bytes);
+        }
+    }
+
+    void set_stream_layers_enabled(bool enabled) override {
+        if (t5) {
+            t5->set_stream_layers_enabled(enabled);
+        }
+    }
+
+    void set_flash_attention_enabled(bool enabled) override {
+        if (t5) {
+            t5->set_flash_attention_enabled(enabled);
+        }
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
+    void runner_done() override {
+        if (t5) {
+            t5->runner_done();
+        }
+    }
+
+    SDCondition get_learned_condition(int n_threads,
+                                      const ConditionerParams& conditioner_params) override {
+        SDCondition result;
+        if (!t5) {
+            result.c_crossattn = sd::Tensor<float>::zeros({1024, static_cast<int64_t>(prompt_length)});
+            result.c_vector    = sd::Tensor<float>::zeros({static_cast<int64_t>(prompt_length)});
+            return result;
+        }
+
+        std::vector<int> tokens = tokenizer.encode(conditioner_params.text);
+        if (tokens.size() > prompt_length) {
+            tokens.resize(prompt_length);
+        }
+        std::vector<float> mask(tokens.size(), 1.0f);
+        while (tokens.size() < prompt_length) {
+            tokens.push_back(tokenizer.PAD_TOKEN_ID);
+            mask.push_back(0.0f);
+        }
+
+        sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
+        std::vector<float> t5_mask(mask.size(), 0.0f);
+        for (size_t i = 0; i < mask.size(); ++i) {
+            t5_mask[i] = mask[i] > 0.0f ? 0.0f : -HUGE_VALF;
+        }
+        sd::Tensor<float> hidden_states = t5->compute(n_threads,
+                                                      input_ids,
+                                                      sd::Tensor<float>::from_vector(t5_mask),
+                                                      false,
+                                                      true,
+                                                      true);
+        GGML_ASSERT(!hidden_states.empty());
+        result.c_crossattn = std::move(hidden_states);
+        result.c_vector    = sd::Tensor<float>::from_vector(mask);
+        return result;
+    }
+};
+
 struct AnimaConditioner : public Conditioner {
     std::shared_ptr<BPETokenizer> qwen_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp
index f3e2cceba..2eb62d3a3 100644
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@@ -110,7 +110,67 @@ static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type)
     if (dev == nullptr) {
         return "";
     }
-    return ggml_backend_dev_name(dev);
+    const char* dev_name = ggml_backend_dev_name(dev);
+    if (dev_name != nullptr && dev_name[0] != '\0') {
+        return dev_name;
+    }
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    const char* reg_name   = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr;
+    return reg_name != nullptr ? reg_name : "";
+}
+
+static ggml_backend_dev_t resolve_first_device_by_registry_name(const std::string& name) {
+    std::string lower = lower_copy(trim_copy(name));
+    if (lower == "metal") {
+        lower = "mtl";
+    }
+    if (lower.empty()) {
+        return nullptr;
+    }
+
+    const size_t device_count = ggml_backend_dev_count();
+    for (size_t i = 0; i < device_count; ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        if (reg == nullptr) {
+            continue;
+        }
+        const char* reg_name = ggml_backend_reg_name(reg);
+        if (reg_name != nullptr && lower_copy(reg_name) == lower) {
+            return dev;
+        }
+    }
+    return nullptr;
+}
+
+static ggml_backend_dev_t resolve_device_by_name(const std::string& name) {
+    const std::string lower = lower_copy(trim_copy(name));
+    if (lower.empty()) {
+        return nullptr;
+    }
+
+    const size_t device_count = ggml_backend_dev_count();
+    for (size_t i = 0; i < device_count; ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        const char* dev_name   = ggml_backend_dev_name(dev);
+        if (dev_name != nullptr && lower_copy(dev_name) == lower) {
+            return dev;
+        }
+    }
+    return nullptr;
+}
+
+static std::string backend_device_name(ggml_backend_dev_t dev) {
+    if (dev == nullptr) {
+        return "";
+    }
+    const char* name = ggml_backend_dev_name(dev);
+    if (name != nullptr && name[0] != '\0') {
+        return name;
+    }
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    const char* reg_name   = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr;
+    return reg_name != nullptr ? reg_name : "";
 }
 
 static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
@@ -296,6 +356,10 @@ std::string sd_backend_resolve_name(const std::string& name) {
         return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
     }
 
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(requested)) {
+        return backend_device_name(dev);
+    }
+
     const size_t device_count = ggml_backend_dev_count();
     for (size_t i = 0; i < device_count; ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -328,7 +392,20 @@ static ggml_backend_t init_named_backend(const std::string& name) {
         return ggml_backend_init_best();
     }
 
+    if (ggml_backend_dev_t dev = resolve_device_by_name(name)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(name)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+
     std::string resolved = sd_backend_resolve_name(name);
+    if (ggml_backend_dev_t dev = resolve_device_by_name(resolved)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(resolved)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
     if (resolved.empty()) {
         return nullptr;
     }
@@ -599,7 +676,7 @@ bool SDBackendManager::validate(std::string* error) const {
             }
             return false;
         }
-        if (!sd_backend_resolve_name(name).empty()) {
+        if (!sd_backend_resolve_name(name).empty() || resolve_first_device_by_registry_name(name) != nullptr) {
             return true;
         }
         if (error != nullptr) {
diff --git a/src/model.h b/src/model.h
index a62c4d1bf..bfc97ddef 100644
--- a/src/model.h
+++ b/src/model.h
@@ -45,6 +45,7 @@ enum SDVersion {
     VERSION_OVIS_IMAGE,
     VERSION_ERNIE_IMAGE,
     VERSION_LENS,
+    VERSION_MINIT2I,
     VERSION_LONGCAT,
     VERSION_PID,
     VERSION_IDEOGRAM4,
@@ -164,6 +165,13 @@ static inline bool sd_version_is_lens(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_minit2i(SDVersion version) {
+    if (version == VERSION_MINIT2I) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_pid(SDVersion version) {
     if (version == VERSION_PID) {
         return true;
@@ -208,6 +216,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_z_image(version) ||
         sd_version_is_ernie_image(version) ||
         sd_version_is_lens(version) ||
+        sd_version_is_minit2i(version) ||
         sd_version_is_longcat(version) ||
         sd_version_is_pid(version) ||
         sd_version_is_ideogram4(version)) {
diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
new file mode 100644
index 000000000..e3c9dd538
--- /dev/null
+++ b/src/model/diffusion/minit2i.hpp
@@ -0,0 +1,627 @@
+#ifndef __SD_MODEL_DIFFUSION_MINIT2I_HPP__
+#define __SD_MODEL_DIFFUSION_MINIT2I_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"
+
+namespace MiniT2I {
+    constexpr int MINIT2I_GRAPH_SIZE = 196608;
+
+    struct MiniT2IConfig {
+        int64_t image_size         = 512;
+        int64_t patch_size         = 16;
+        int64_t in_channels        = 3;
+        int64_t txt_input_size     = 1024;
+        int64_t hidden_size        = 768;
+        int64_t txt_hidden_size    = 768;
+        int64_t cond_vec_size      = 768;
+        int64_t depth_double       = 17;
+        int64_t txt_preamble_depth = 2;
+        int64_t num_heads          = 12;
+        int64_t head_dim           = 64;
+        float mlp_ratio            = 2.6667f;
+        int64_t pca_channels       = 128;
+        int64_t prompt_length      = 256;
+        int64_t n_T                = 100;
+        float cfg_interval_start   = 0.0f;
+        float cfg_interval_end     = 1.0f;
+
+        static MiniT2IConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            MiniT2IConfig config;
+            config.depth_double       = 0;
+            config.txt_preamble_depth = 0;
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) {
+                    config.patch_size  = tensor_storage.ne[0];
+                    config.in_channels = tensor_storage.ne[2];
+                    config.pca_channels = tensor_storage.ne[3];
+                } else if (ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) {
+                    config.pca_channels = tensor_storage.ne[2];
+                    config.hidden_size  = tensor_storage.ne[3];
+                } else if (ends_with(name, "txt_embedder.weight") && tensor_storage.n_dims == 2) {
+                    config.txt_input_size  = tensor_storage.ne[0];
+                    config.txt_hidden_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) {
+                    config.cond_vec_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) {
+                    int64_t inner3 = tensor_storage.ne[1];
+                    int64_t inner  = inner3 / 3;
+                    config.hidden_size = tensor_storage.ne[0];
+                    if (config.hidden_size == 768) {
+                        config.num_heads = 12;
+                        config.head_dim  = 64;
+                    } else if (config.hidden_size == 1248) {
+                        config.num_heads = 24;
+                        config.head_dim  = 52;
+                    } else if (inner > 0) {
+                        config.head_dim  = 64;
+                        config.num_heads = std::max<int64_t>(1, inner / config.head_dim);
+                    }
+                } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) {
+                    int64_t patch_area  = config.patch_size * config.patch_size;
+                    config.hidden_size  = tensor_storage.ne[0];
+                    config.in_channels  = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels;
+                } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) {
+                    config.prompt_length = tensor_storage.ne[1];
+                }
+
+                size_t pos = name.find("double_blocks.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int64_t idx         = atoi(items[1].c_str());
+                        config.depth_double = std::max<int64_t>(config.depth_double, idx + 1);
+                    }
+                }
+                pos = name.find("txt_preamble_blocks.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int64_t idx                 = atoi(items[1].c_str());
+                        config.txt_preamble_depth   = std::max<int64_t>(config.txt_preamble_depth, idx + 1);
+                    }
+                }
+            }
+
+            if (config.depth_double <= 0) {
+                config.depth_double = config.hidden_size == 1248 ? 23 : 17;
+            }
+            if (config.txt_preamble_depth <= 0) {
+                config.txt_preamble_depth = 2;
+            }
+            if (config.head_dim <= 0 || config.num_heads <= 0) {
+                config.head_dim  = config.hidden_size == 1248 ? 52 : 64;
+                config.num_heads = config.hidden_size / config.head_dim;
+            }
+            LOG_DEBUG("minit2i: hidden_size=%" PRId64 ", txt_hidden_size=%" PRId64 ", heads=%" PRId64 ", head_dim=%" PRId64 ", double_blocks=%" PRId64 ", txt_blocks=%" PRId64 ", patch=%" PRId64 ", in_channels=%" PRId64,
+                      config.hidden_size,
+                      config.txt_hidden_size,
+                      config.num_heads,
+                      config.head_dim,
+                      config.depth_double,
+                      config.txt_preamble_depth,
+                      config.patch_size,
+                      config.in_channels);
+            return config;
+        }
+    };
+
+    inline std::vector<float> make_2d_sincos_pos_embed(int grid_size, int dim) {
+        GGML_ASSERT(dim % 4 == 0);
+        int half_dim = dim / 2;
+        int quarter  = half_dim / 2;
+        std::vector<float> out(static_cast<size_t>(grid_size) * grid_size * dim);
+        std::vector<float> omega(quarter);
+        for (int i = 0; i < quarter; ++i) {
+            omega[i] = 1.0f / std::pow(10000.0f, static_cast<float>(i) / static_cast<float>(quarter));
+        }
+        for (int y = 0; y < grid_size; ++y) {
+            for (int x = 0; x < grid_size; ++x) {
+                size_t base = static_cast<size_t>(y * grid_size + x) * dim;
+                for (int i = 0; i < quarter; ++i) {
+                    float ay = y * omega[i];
+                    float ax = x * omega[i];
+                    out[base + i]                      = std::sin(ax);
+                    out[base + quarter + i]            = std::cos(ax);
+                    out[base + half_dim + i]           = std::sin(ay);
+                    out[base + half_dim + quarter + i] = std::cos(ay);
+                }
+            }
+        }
+        return out;
+    }
+
+    inline std::vector<float> make_text_rope(int length, int head_dim) {
+        return Rope::flatten(Rope::rope(Rope::linspace(0.f, static_cast<float>(length - 1), length), head_dim, 10000.f));
+    }
+
+    inline std::vector<float> make_vision_rope(int side, int head_dim) {
+        GGML_ASSERT(head_dim % 4 == 0);
+        int dim      = head_dim / 2;
+        int quarter  = dim / 2;
+        int length   = side * side;
+        std::vector<float> out(static_cast<size_t>(length) * (head_dim / 2) * 4);
+        std::vector<float> freqs(quarter);
+        for (int i = 0; i < quarter; ++i) {
+            freqs[i] = 1.0f / std::pow(10000.0f, static_cast<float>(2 * i) / static_cast<float>(dim));
+        }
+        for (int y = 0; y < side; ++y) {
+            for (int x = 0; x < side; ++x) {
+                int pos     = y * side + x;
+                size_t base = static_cast<size_t>(pos) * (head_dim / 2) * 4;
+                for (int i = 0; i < quarter; ++i) {
+                    float ay = y * freqs[i];
+                    float ax = x * freqs[i];
+                    float angles[2] = {ay, ax};
+                    for (int axis = 0; axis < 2; ++axis) {
+                        int j                   = axis * quarter + i;
+                        out[base + 4 * j]       = std::cos(angles[axis]);
+                        out[base + 4 * j + 1]   = -std::sin(angles[axis]);
+                        out[base + 4 * j + 2]   = std::sin(angles[axis]);
+                        out[base + 4 * j + 3]   = std::cos(angles[axis]);
+                    }
+                }
+            }
+        }
+        return out;
+    }
+
+    struct SwiGLUMlp : public GGMLBlock {
+        SwiGLUMlp(int64_t in_features, int64_t hidden_features) {
+            int64_t hidden_dim = ((hidden_features + 7) / 8) * 8;
+            blocks["w1"] = std::make_shared<Linear>(in_features, hidden_dim, false);
+            blocks["w3"] = std::make_shared<Linear>(in_features, hidden_dim, false);
+            blocks["w2"] = std::make_shared<Linear>(hidden_dim, in_features, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+            auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x));
+            auto up   = w3->forward(ctx, x);
+            return w2->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up));
+        }
+    };
+
+    struct BottleneckPatchEmbed : public GGMLBlock {
+        int64_t patch_size;
+
+        BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size)
+            : patch_size(patch_size) {
+            blocks["proj1"] = std::make_shared<Conv2d>(in_channels,
+                                                        pca_channels,
+                                                        std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
+                                                        std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
+                                                        std::pair<int, int>{0, 0},
+                                                        std::pair<int, int>{1, 1},
+                                                        false);
+            blocks["proj2"] = std::make_shared<Conv2d>(pca_channels,
+                                                        hidden_size,
+                                                        std::pair<int, int>{1, 1},
+                                                        std::pair<int, int>{1, 1},
+                                                        std::pair<int, int>{0, 0},
+                                                        std::pair<int, int>{1, 1},
+                                                        true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto proj1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj1"]);
+            auto proj2 = std::dynamic_pointer_cast<Conv2d>(blocks["proj2"]);
+            x = proj1->forward(ctx, x);
+            x = proj2->forward(ctx, x);
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
+            return x;
+        }
+    };
+
+    struct TimestepEmbedder : public GGMLBlock {
+        int frequency_embedding_size;
+
+        TimestepEmbedder(int64_t hidden_size, int frequency_embedding_size = 256)
+            : frequency_embedding_size(frequency_embedding_size) {
+            blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true, true);
+            blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
+            auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+            auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+            auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1.0f);
+            t_emb      = mlp_0->forward(ctx, t_emb);
+            t_emb      = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
+            return mlp_2->forward(ctx, t_emb);
+        }
+    };
+
+    inline std::vector<ggml_tensor*> split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) {
+        int64_t N = qkv->ne[2];
+        int64_t L = qkv->ne[1];
+        auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
+        auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads);
+        auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2);
+        return {q, k, v};
+    }
+
+    struct PlainTextTransformerBlock : public GGMLBlock {
+        int64_t num_heads;
+        int64_t head_dim;
+
+        PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
+            : num_heads(num_heads), head_dim(head_dim) {
+            int64_t inner_dim = num_heads * head_dim;
+            blocks["norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["qkv"]       = std::make_shared<Linear>(hidden_size, inner_dim * 3, true);
+            blocks["attn_proj"] = std::make_shared<Linear>(inner_dim, hidden_size, true);
+            blocks["mlp"]       = std::make_shared<SwiGLUMlp>(hidden_size, static_cast<int64_t>(hidden_size * mlp_ratio));
+            blocks["q_norm"]    = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["k_norm"]    = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* txt, ggml_tensor* pe) {
+            auto norm1     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
+            auto norm2     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
+            auto qkv_proj  = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto attn_proj = std::dynamic_pointer_cast<Linear>(blocks["attn_proj"]);
+            auto mlp       = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["mlp"]);
+            auto q_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+
+            auto qkv = split_qkv(ctx->ggml_ctx, qkv_proj->forward(ctx, norm1->forward(ctx, txt)), num_heads, head_dim);
+            auto q   = q_norm->forward(ctx, qkv[0]);
+            auto k   = k_norm->forward(ctx, qkv[1]);
+            auto v   = qkv[2];
+            auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);
+            txt      = ggml_add(ctx->ggml_ctx, txt, attn_proj->forward(ctx, out));
+            txt      = ggml_add(ctx->ggml_ctx, txt, mlp->forward(ctx, norm2->forward(ctx, txt)));
+            return txt;
+        }
+    };
+
+    struct DoubleStreamDiTBlock : public GGMLBlock {
+        int64_t num_heads;
+        int64_t head_dim;
+
+        DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
+            : num_heads(num_heads), head_dim(head_dim) {
+            int64_t inner_dim = num_heads * head_dim;
+            blocks["img_norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["img_norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["txt_norm1"]     = std::make_shared<RMSNorm>(txt_hidden_size, 1e-6f);
+            blocks["txt_norm2"]     = std::make_shared<RMSNorm>(txt_hidden_size, 1e-6f);
+            blocks["img_qkv"]       = std::make_shared<Linear>(hidden_size, inner_dim * 3, true);
+            blocks["txt_qkv"]       = std::make_shared<Linear>(txt_hidden_size, inner_dim * 3, true);
+            blocks["q_norm"]        = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["k_norm"]        = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["img_attn_proj"] = std::make_shared<Linear>(inner_dim, hidden_size, true);
+            blocks["txt_attn_proj"] = std::make_shared<Linear>(inner_dim, txt_hidden_size, true);
+            blocks["img_mlp"]       = std::make_shared<SwiGLUMlp>(hidden_size, static_cast<int64_t>(hidden_size * mlp_ratio));
+            blocks["txt_mlp"]       = std::make_shared<SwiGLUMlp>(txt_hidden_size, static_cast<int64_t>(txt_hidden_size * mlp_ratio));
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* pe) {
+            auto img_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
+            auto img_qkv_p = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
+            auto txt_qkv_p = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
+            auto q_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+            auto img_proj  = std::dynamic_pointer_cast<Linear>(blocks["img_attn_proj"]);
+            auto txt_proj  = std::dynamic_pointer_cast<Linear>(blocks["txt_attn_proj"]);
+            auto img_mlp   = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["img_mlp"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["txt_mlp"]);
+
+            int64_t li = img->ne[1];
+            int64_t lt = txt->ne[1];
+
+            auto img_qkv = split_qkv(ctx->ggml_ctx, img_qkv_p->forward(ctx, img_norm1->forward(ctx, img)), num_heads, head_dim);
+            auto txt_qkv = split_qkv(ctx->ggml_ctx, txt_qkv_p->forward(ctx, txt_norm1->forward(ctx, txt)), num_heads, head_dim);
+
+            auto q = ggml_concat(ctx->ggml_ctx, q_norm->forward(ctx, txt_qkv[0]), q_norm->forward(ctx, img_qkv[0]), 2);
+            auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2);
+            auto v = ggml_concat(ctx->ggml_ctx, txt_qkv[2], img_qkv[2], 2);
+
+            auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);
+            auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt);
+            auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li);
+
+            img = ggml_add(ctx->ggml_ctx, img, img_proj->forward(ctx, out_img));
+            txt = ggml_add(ctx->ggml_ctx, txt, txt_proj->forward(ctx, out_txt));
+            img = ggml_add(ctx->ggml_ctx, img, img_mlp->forward(ctx, img_norm2->forward(ctx, img)));
+            txt = ggml_add(ctx->ggml_ctx, txt, txt_mlp->forward(ctx, txt_norm2->forward(ctx, txt)));
+            return {img, txt};
+        }
+    };
+
+    struct FinalLayer : public GGMLBlock {
+        FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) {
+            blocks["norm_final"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["linear"]     = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto norm_final = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_final"]);
+            auto linear     = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            return linear->forward(ctx, norm_final->forward(ctx, x));
+        }
+    };
+
+    struct MMJiT : public GGMLBlock {
+        MiniT2IConfig config;
+
+        MMJiT(const MiniT2IConfig& config)
+            : config(config) {
+            blocks["img_embedder"]    = std::make_shared<BottleneckPatchEmbed>(config.patch_size, config.in_channels, config.pca_channels, config.hidden_size);
+            blocks["txt_embedder"]    = std::make_shared<Linear>(config.txt_input_size, config.txt_hidden_size, false);
+            blocks["t_embedder"]      = std::make_shared<TimestepEmbedder>(config.cond_vec_size);
+            blocks["pooled_embedder"] = std::make_shared<Linear>(config.txt_input_size, config.cond_vec_size, false);
+            for (int64_t i = 0; i < config.txt_preamble_depth; ++i) {
+                blocks["txt_preamble_blocks." + std::to_string(i)] = std::make_shared<PlainTextTransformerBlock>(config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio);
+            }
+            for (int64_t i = 0; i < config.depth_double; ++i) {
+                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamDiTBlock>(config.hidden_size, config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio);
+            }
+            blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.in_channels);
+        }
+
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            GGMLBlock::init_params(ctx, tensor_storage_map, prefix);
+            enum ggml_type wtype = get_type(prefix + "mask_token", tensor_storage_map, GGML_TYPE_F32);
+            params["mask_token"] = ggml_new_tensor_3d(ctx, wtype, config.txt_input_size, 1, 1);
+        }
+
+        ggml_tensor* apply_text_mask(GGMLRunnerContext* ctx, ggml_tensor* context, ggml_tensor* mask) {
+            if (mask == nullptr) {
+                return context;
+            }
+            mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]);
+            mask = ggml_repeat(ctx->ggml_ctx, mask, context);
+            auto keep = ggml_mul(ctx->ggml_ctx, context, mask);
+            auto inv  = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask);
+            auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context);
+            return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv));
+        }
+
+        ggml_tensor* pool_context(GGMLRunnerContext* ctx, ggml_tensor* context) {
+            int64_t dim = context->ne[0];
+            int64_t len = context->ne[1];
+            int64_t N   = context->ne[2];
+            auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3));
+            x      = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N);
+            x      = ggml_mean(ctx->ggml_ctx, x);
+            x      = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N);
+            return x;
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* img,
+                             ggml_tensor* context,
+                             ggml_tensor* mask,
+                             ggml_tensor* pos_embed,
+                             ggml_tensor* txt_pe,
+                             ggml_tensor* joint_pe) {
+            auto img_embedder = std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["img_embedder"]);
+            auto txt_embedder = std::dynamic_pointer_cast<Linear>(blocks["txt_embedder"]);
+            auto final_layer  = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+
+            int64_t W  = img->ne[0];
+            int64_t H  = img->ne[1];
+            int64_t hp = H / config.patch_size;
+            int64_t wp = W / config.patch_size;
+
+            context = apply_text_mask(ctx, context, mask);
+            auto x  = img_embedder->forward(ctx, img);
+            x       = ggml_add(ctx->ggml_ctx, x, pos_embed);
+
+            auto txt = txt_embedder->forward(ctx, context);
+            for (int64_t i = 0; i < config.txt_preamble_depth; ++i) {
+                auto block = std::dynamic_pointer_cast<PlainTextTransformerBlock>(blocks["txt_preamble_blocks." + std::to_string(i)]);
+                txt        = block->forward(ctx, txt, txt_pe);
+                sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.txt_preamble_blocks." + std::to_string(i), "txt");
+            }
+            for (int64_t i = 0; i < config.depth_double; ++i) {
+                auto block = std::dynamic_pointer_cast<DoubleStreamDiTBlock>(blocks["double_blocks." + std::to_string(i)]);
+                auto out   = block->forward(ctx, x, txt, joint_pe);
+                x          = out.first;
+                txt        = out.second;
+                sd::ggml_graph_cut::mark_graph_cut(x, "minit2i.double_blocks." + std::to_string(i), "x");
+                sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.double_blocks." + std::to_string(i), "txt");
+            }
+            auto combined = ggml_concat(ctx->ggml_ctx, txt, x, 1);
+            auto out      = final_layer->forward(ctx, combined);
+            auto img_out  = ggml_ext_slice(ctx->ggml_ctx, out, 1, txt->ne[1], txt->ne[1] + x->ne[1]);
+            return DiT::unpatchify(ctx->ggml_ctx, img_out, hp, wp, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size), false);
+        }
+    };
+
+    inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) {
+        if (!requested.empty() && tensor_storage_map.find(requested + ".img_embedder.proj1.weight") != tensor_storage_map.end()) {
+            return requested;
+        }
+        static const std::vector<std::string> candidates = {
+            "model.net",
+            "model.diffusion_model.net",
+            "model.diffusion_model.model.net",
+        };
+        for (const auto& candidate : candidates) {
+            if (tensor_storage_map.find(candidate + ".img_embedder.proj1.weight") != tensor_storage_map.end()) {
+                return candidate;
+            }
+        }
+        return requested.empty() ? "model.net" : requested;
+    }
+
+    struct MiniT2IRunner : public DiffusionModelRunner {
+        MiniT2IConfig config;
+        MMJiT model;
+        ggml_context* position_cache_ctx            = nullptr;
+        ggml_backend_buffer_t position_cache_buffer = nullptr;
+        ggml_tensor* cached_pos_embed               = nullptr;
+        ggml_tensor* cached_txt_pe                  = nullptr;
+        ggml_tensor* cached_joint_pe                = nullptr;
+        int64_t cached_img_side                     = -1;
+        int64_t cached_txt_len                      = -1;
+        int64_t cached_hidden_size                  = -1;
+        int64_t cached_head_dim                     = -1;
+
+        MiniT2IRunner(ggml_backend_t backend,
+                      const String2TensorStorage& tensor_storage_map      = {},
+                      const std::string prefix                            = "",
+                      std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager),
+              config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)),
+              model(config) {
+            model.init(params_ctx, tensor_storage_map, this->prefix);
+        }
+
+        ~MiniT2IRunner() override {
+            free_position_cache();
+        }
+
+        std::string get_desc() override {
+            return "MiniT2I";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            model.get_param_tensors(tensors, prefix);
+        }
+
+        void free_position_cache() {
+            if (position_cache_buffer != nullptr) {
+                ggml_backend_buffer_free(position_cache_buffer);
+                position_cache_buffer = nullptr;
+            }
+            if (position_cache_ctx != nullptr) {
+                ggml_free(position_cache_ctx);
+                position_cache_ctx = nullptr;
+            }
+            cached_pos_embed   = nullptr;
+            cached_txt_pe      = nullptr;
+            cached_joint_pe    = nullptr;
+            cached_img_side    = -1;
+            cached_txt_len     = -1;
+            cached_hidden_size = -1;
+            cached_head_dim    = -1;
+        }
+
+        void ensure_position_cache(int64_t img_side, int64_t txt_len) {
+            if (cached_img_side == img_side &&
+                cached_txt_len == txt_len &&
+                cached_hidden_size == config.hidden_size &&
+                cached_head_dim == config.head_dim &&
+                cached_pos_embed != nullptr &&
+                cached_txt_pe != nullptr &&
+                cached_joint_pe != nullptr) {
+                return;
+            }
+
+            free_position_cache();
+
+            auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast<int>(img_side), static_cast<int>(config.hidden_size));
+            auto txt_pe_vec    = make_text_rope(static_cast<int>(txt_len), static_cast<int>(config.head_dim));
+            auto img_pe_vec = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
+            auto joint_pe_vec = txt_pe_vec;
+            joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end());
+
+            ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(3 * ggml_tensor_overhead());
+            params.mem_buffer = nullptr;
+            params.no_alloc   = true;
+            position_cache_ctx = ggml_init(params);
+            GGML_ASSERT(position_cache_ctx != nullptr);
+
+            cached_pos_embed = ggml_new_tensor_3d(position_cache_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1);
+            ggml_set_name(cached_pos_embed, "minit2i.pos_embed");
+            cached_txt_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len);
+            ggml_set_name(cached_txt_pe, "minit2i.txt_pe");
+            cached_joint_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side);
+            ggml_set_name(cached_joint_pe, "minit2i.joint_pe");
+
+            position_cache_buffer = ggml_backend_alloc_ctx_tensors(position_cache_ctx, runtime_backend);
+            GGML_ASSERT(position_cache_buffer != nullptr);
+            ggml_backend_buffer_set_usage(position_cache_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            ggml_backend_tensor_set(cached_pos_embed, pos_embed_vec.data(), 0, ggml_nbytes(cached_pos_embed));
+            ggml_backend_tensor_set(cached_txt_pe, txt_pe_vec.data(), 0, ggml_nbytes(cached_txt_pe));
+            ggml_backend_tensor_set(cached_joint_pe, joint_pe_vec.data(), 0, ggml_nbytes(cached_joint_pe));
+            ggml_backend_synchronize(runtime_backend);
+
+            cached_img_side    = img_side;
+            cached_txt_len     = txt_len;
+            cached_hidden_size = config.hidden_size;
+            cached_head_dim    = config.head_dim;
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const sd::Tensor<float>& mask_tensor) {
+            ggml_cgraph* gf        = new_graph_custom(MINIT2I_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* context   = make_input(context_tensor);
+            ggml_tensor* mask      = make_input(mask_tensor);
+            SD_UNUSED(timesteps_tensor);
+
+            int64_t W        = x->ne[0];
+            int64_t H        = x->ne[1];
+            int64_t img_side = H / config.patch_size;
+            int64_t txt_len  = context->ne[1];
+            ensure_position_cache(img_side, txt_len);
+
+            auto runner_ctx = get_context();
+            auto out        = model.forward(&runner_ctx, x, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const sd::Tensor<float>& mask) {
+            auto get_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context, mask);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            GGML_ASSERT(diffusion_params.context != nullptr);
+            GGML_ASSERT(diffusion_params.y != nullptr);
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           *diffusion_params.context,
+                           *diffusion_params.y);
+        }
+    };
+}  // namespace MiniT2I
+
+#endif  // __SD_MODEL_DIFFUSION_MINIT2I_HPP__
diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp
index 23da08222..da12a0447 100644
--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@@ -23,19 +23,72 @@ struct T5Config {
     int64_t vocab_size      = 32128;
     bool relative_attention = true;
 
-    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                        const std::string& prefix,
-                                        bool is_umt5 = false) {
-        (void)tensor_storage_map;
-        (void)prefix;
-        T5Config config;
-        if (is_umt5) {
-            config.vocab_size         = 256384;
-            config.relative_attention = false;
-        }
-        return config;
-    }
-};
+    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                        const std::string& prefix,
+                                        bool is_umt5 = false) {
+        T5Config config;
+        if (is_umt5) {
+            config.vocab_size         = 256384;
+            config.relative_attention = false;
+        }
+        auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* {
+            auto it = tensor_storage_map.find(prefix + "." + suffix);
+            if (it != tensor_storage_map.end()) {
+                return &it->second;
+            }
+            it = tensor_storage_map.find(prefix + suffix);
+            if (it != tensor_storage_map.end()) {
+                return &it->second;
+            }
+            return nullptr;
+        };
+
+        if (const TensorStorage* shared = find_tensor("shared.weight")) {
+            if (shared->n_dims == 2) {
+                config.vocab_size = shared->ne[1];
+                config.model_dim  = shared->ne[0];
+            }
+        }
+        if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) {
+            if (q->n_dims == 2) {
+                config.model_dim = q->ne[0];
+                int64_t inner_dim = q->ne[1];
+                // Flan-T5/T5 uses d_kv=64 for common sizes.
+                if (inner_dim % 64 == 0) {
+                    config.num_heads = inner_dim / 64;
+                }
+            }
+        }
+        if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) {
+            if (wi->n_dims == 2) {
+                config.model_dim = wi->ne[0];
+                config.ff_dim    = wi->ne[1];
+            }
+        }
+        int64_t detected_layers = 0;
+        for (const auto& [name, _] : tensor_storage_map) {
+            std::string base = prefix;
+            if (!base.empty() && base.back() != '.') {
+                base += ".";
+            }
+            std::string layer_prefix = base + "encoder.block.";
+            if (!starts_with(name, layer_prefix)) {
+                continue;
+            }
+            size_t pos = layer_prefix.size();
+            size_t dot = name.find('.', pos);
+            if (dot == std::string::npos) {
+                continue;
+            }
+            int64_t layer = atoi(name.substr(pos, dot - pos).c_str());
+            detected_layers = std::max(detected_layers, layer + 1);
+        }
+        if (detected_layers > 0) {
+            config.num_layers = detected_layers;
+        }
+        return config;
+    }
+};
 
 class T5LayerNorm : public UnaryBlock {
 protected:
diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
index af091bb57..e065b2467 100644
--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@@ -78,7 +78,7 @@ struct VAE : public GGMLRunner {
             scale_factor = 16;
         } else if (sd_version_uses_flux2_vae(version)) {
             scale_factor = 16;
-        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
+        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) {
             scale_factor = 1;
         }
         return scale_factor;
diff --git a/src/model_loader.cpp b/src/model_loader.cpp
index 587b01f4d..6e5f48321 100644
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@@ -467,6 +467,17 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
             return VERSION_LENS;
         }
+        if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
+            (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
+            (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != tensor_storage_map.end())) {
+            return VERSION_MINIT2I;
+        }
         if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
             return VERSION_QWEN_IMAGE;
         }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 4df047dd5..df3642c0b 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -27,6 +27,7 @@
 #include "model/diffusion/ideogram4.hpp"
 #include "model/diffusion/lens.hpp"
 #include "model/diffusion/ltxv.hpp"
+#include "model/diffusion/minit2i.hpp"
 #include "model/diffusion/mmdit.hpp"
 #include "model/diffusion/model.hpp"
 #include "model/diffusion/pid.hpp"
@@ -90,6 +91,7 @@ const char* model_version_to_str[] = {
     "Ovis Image",
     "Ernie Image",
     "Lens",
+    "MiniT2I",
     "Longcat-Image",
     "PiD",
     "Ideogram 4",
@@ -764,6 +766,14 @@ class StableDiffusionGGML {
                                                                                tensor_storage_map,
                                                                                "model",
                                                                                model_manager);
+            } else if (sd_version_is_minit2i(version)) {
+                cond_stage_model = std::make_shared<MiniT2IConditioner>(backend_for(SDBackendModule::TE),
+                                                                        tensor_storage_map,
+                                                                        model_manager);
+                diffusion_model  = std::make_shared<MiniT2I::MiniT2IRunner>(backend_for(SDBackendModule::DIFFUSION),
+                                                                            tensor_storage_map,
+                                                                            "",
+                                                                            model_manager);
             } else if (sd_version_is_anima(version)) {
                 cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),
                                                                       tensor_storage_map,
@@ -930,7 +940,7 @@ class StableDiffusionGGML {
                 }
             };
 
-            if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
+            if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) {
                 LOG_INFO("using FakeVAE");
                 first_stage_model = std::make_shared<FakeVAE>(version,
                                                               backend_for(SDBackendModule::VAE),
@@ -1951,11 +1961,93 @@ class StableDiffusionGGML {
         }
 
         int64_t last_progress_us     = ggml_time_us();
+        SamplePreviewContext preview = prepare_sample_preview_context();
+
+        if (sd_version_is_minit2i(version)) {
+            if (noise.empty()) {
+                LOG_ERROR("MiniT2I sampling requires initial noise");
+                return {};
+            }
+            if (cond.c_crossattn.empty() || cond.c_vector.empty()) {
+                LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask");
+                return {};
+            }
+            size_t minit2i_steps = steps > 0 ? steps : 100;
+            sd::Tensor<float> x_t = noise * 2.0f;
+            sd::Tensor<float> denoised = x_t;
+            sd::Tensor<float> uncond_mask = sd::Tensor<float>::zeros_like(cond.c_vector);
+
+            auto run_minit2i = [&](const sd::Tensor<float>& x,
+                                   float t_value,
+                                   const sd::Tensor<float>& mask) -> sd::Tensor<float> {
+                int64_t batch = x.dim() >= 4 ? x.shape()[3] : 1;
+                if (batch <= 0) {
+                    LOG_ERROR("MiniT2I got invalid input shape for sampling");
+                    return {};
+                }
+                LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64,
+                          x.dim(),
+                          batch);
+                std::vector<float> t_vec(static_cast<size_t>(batch), t_value);
+                const int64_t t_vec_size = static_cast<int64_t>(t_vec.size());
+                sd::Tensor<float> timesteps_tensor({t_vec_size}, std::move(t_vec));
+                DiffusionParams diffusion_params;
+                diffusion_params.x         = &x;
+                diffusion_params.timesteps = &timesteps_tensor;
+                diffusion_params.context   = &cond.c_crossattn;
+                diffusion_params.y         = &mask;
+                auto out = work_diffusion_model->compute(n_threads, diffusion_params);
+                if (out.empty()) {
+                    LOG_ERROR("MiniT2I diffusion model compute failed");
+                    return {};
+                }
+                return out;
+            };
+
+            pretty_progress(0, static_cast<int>(minit2i_steps), 0);
+            last_progress_us = ggml_time_us();
+            for (size_t i = 0; i < minit2i_steps; ++i) {
+                if (get_cancel_flag() == SD_CANCEL_ALL) {
+                    LOG_DEBUG("cancelling generation");
+                    return {};
+                }
+                float t_cur  = static_cast<float>(i) / static_cast<float>(minit2i_steps);
+                float t_next = static_cast<float>(i + 1) / static_cast<float>(minit2i_steps);
+
+                if (sd_should_preview_noisy() && preview.callback != nullptr) {
+                    preview_image(static_cast<int>(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true);
+                }
+
+                auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector);
+                if (cond_x0.empty()) {
+                    return {};
+                }
+                auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask);
+                if (uncond_x0.empty()) {
+                    return {};
+                }
+                float denom = std::max(1.0f - t_cur, 0.001f);
+                auto cond_v = (cond_x0 - x_t) / denom;
+                auto uncond_v = (uncond_x0 - x_t) / denom;
+                auto v = uncond_v + (cond_v - uncond_v) * cfg_scale;
+                x_t += v * (t_next - t_cur);
+                denoised = x_t;
+
+                if (sd_should_preview_denoised() && preview.callback != nullptr) {
+                    preview_image(static_cast<int>(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false);
+                }
+                report_sample_progress(static_cast<int>(i + 1), minit2i_steps, &last_progress_us);
+            }
+            if (work_diffusion_model) {
+                work_diffusion_model->free_compute_buffer();
+            }
+            return denoised;
+        }
+
         sd::Tensor<float> x_t        = !noise.empty()
                                            ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
                                            : init_latent;
         sd::Tensor<float> denoised   = x_t;
-        SamplePreviewContext preview = prepare_sample_preview_context();
 
         auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::guidance::GuiderOutput {
             if (get_cancel_flag() == SD_CANCEL_ALL) {
@@ -2254,6 +2346,8 @@ class StableDiffusionGGML {
                 latent_channel = 3;
             } else if (version == VERSION_CHROMA_RADIANCE) {
                 latent_channel = 3;
+            } else if (sd_version_is_minit2i(version)) {
+                latent_channel = 3;
             } else if (sd_version_is_pid(version)) {
                 latent_channel = 3;
             } else if (sd_version_uses_flux2_vae(version)) {
@@ -2333,7 +2427,7 @@ class StableDiffusionGGML {
     }
 
     sd::Tensor<float> decode_first_stage(const sd::Tensor<float>& x, bool decode_video = false) {
-        if (sd_version_is_pid(version)) {
+        if (sd_version_is_pid(version) || sd_version_is_minit2i(version)) {
             return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f);
         }
         auto latents = first_stage_model->diffusion_to_vae_latents(x);