diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index ebbee97a9..3111d5837 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -1378,6 +1378,101 @@ struct T5CLIPEmbedder : public Conditioner { } }; +struct MiniT2IConditioner : public Conditioner { + T5UniGramTokenizer tokenizer; + std::shared_ptr t5; + size_t prompt_length = 256; + + MiniT2IConditioner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) { + bool use_t5 = false; + for (const auto& pair : tensor_storage_map) { + if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5 = true; + break; + } + } + if (!use_t5) { + LOG_WARN("IMPORTANT NOTICE: No MiniT2I T5 text encoder provided, cannot process prompts!"); + return; + } + t5 = std::make_shared(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager); + } + + void get_param_tensors(std::map& tensors) override { + if (t5) { + t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + } + } + + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { + if (t5) { + t5->set_max_graph_vram_bytes(max_vram_bytes); + } + } + + void set_stream_layers_enabled(bool enabled) override { + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + + void set_flash_attention_enabled(bool enabled) override { + if (t5) { + t5->set_flash_attention_enabled(enabled); + } + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (t5) { + t5->set_weight_adapter(adapter); + } + } + + void runner_done() override { + if (t5) { + t5->runner_done(); + } + } + + SDCondition get_learned_condition(int n_threads, + const ConditionerParams& conditioner_params) override { + SDCondition result; + if (!t5) { + result.c_crossattn = sd::Tensor::zeros({1024, static_cast(prompt_length)}); + result.c_vector = sd::Tensor::zeros({static_cast(prompt_length)}); + return result; + } + + std::vector tokens = tokenizer.encode(conditioner_params.text); + if (tokens.size() > prompt_length) { + tokens.resize(prompt_length); + } + std::vector mask(tokens.size(), 1.0f); + while (tokens.size() < prompt_length) { + tokens.push_back(tokenizer.PAD_TOKEN_ID); + mask.push_back(0.0f); + } + + sd::Tensor input_ids({static_cast(tokens.size())}, tokens); + std::vector t5_mask(mask.size(), 0.0f); + for (size_t i = 0; i < mask.size(); ++i) { + t5_mask[i] = mask[i] > 0.0f ? 0.0f : -HUGE_VALF; + } + sd::Tensor hidden_states = t5->compute(n_threads, + input_ids, + sd::Tensor::from_vector(t5_mask), + false, + true, + true); + GGML_ASSERT(!hidden_states.empty()); + result.c_crossattn = std::move(hidden_states); + result.c_vector = sd::Tensor::from_vector(mask); + return result; + } +}; + struct AnimaConditioner : public Conditioner { std::shared_ptr qwen_tokenizer; T5UniGramTokenizer t5_tokenizer; diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp index f3e2cceba..2eb62d3a3 100644 --- a/src/core/ggml_extend_backend.cpp +++ b/src/core/ggml_extend_backend.cpp @@ -110,7 +110,67 @@ static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) if (dev == nullptr) { return ""; } - return ggml_backend_dev_name(dev); + const char* dev_name = ggml_backend_dev_name(dev); + if (dev_name != nullptr && dev_name[0] != '\0') { + return dev_name; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + const char* reg_name = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr; + return reg_name != nullptr ? reg_name : ""; +} + +static ggml_backend_dev_t resolve_first_device_by_registry_name(const std::string& name) { + std::string lower = lower_copy(trim_copy(name)); + if (lower == "metal") { + lower = "mtl"; + } + if (lower.empty()) { + return nullptr; + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (reg == nullptr) { + continue; + } + const char* reg_name = ggml_backend_reg_name(reg); + if (reg_name != nullptr && lower_copy(reg_name) == lower) { + return dev; + } + } + return nullptr; +} + +static ggml_backend_dev_t resolve_device_by_name(const std::string& name) { + const std::string lower = lower_copy(trim_copy(name)); + if (lower.empty()) { + return nullptr; + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* dev_name = ggml_backend_dev_name(dev); + if (dev_name != nullptr && lower_copy(dev_name) == lower) { + return dev; + } + } + return nullptr; +} + +static std::string backend_device_name(ggml_backend_dev_t dev) { + if (dev == nullptr) { + return ""; + } + const char* name = ggml_backend_dev_name(dev); + if (name != nullptr && name[0] != '\0') { + return name; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + const char* reg_name = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr; + return reg_name != nullptr ? reg_name : ""; } static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { @@ -296,6 +356,10 @@ std::string sd_backend_resolve_name(const std::string& name) { return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(requested)) { + return backend_device_name(dev); + } + const size_t device_count = ggml_backend_dev_count(); for (size_t i = 0; i < device_count; ++i) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); @@ -328,7 +392,20 @@ static ggml_backend_t init_named_backend(const std::string& name) { return ggml_backend_init_best(); } + if (ggml_backend_dev_t dev = resolve_device_by_name(name)) { + return ggml_backend_dev_init(dev, nullptr); + } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(name)) { + return ggml_backend_dev_init(dev, nullptr); + } + std::string resolved = sd_backend_resolve_name(name); + if (ggml_backend_dev_t dev = resolve_device_by_name(resolved)) { + return ggml_backend_dev_init(dev, nullptr); + } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(resolved)) { + return ggml_backend_dev_init(dev, nullptr); + } if (resolved.empty()) { return nullptr; } @@ -599,7 +676,7 @@ bool SDBackendManager::validate(std::string* error) const { } return false; } - if (!sd_backend_resolve_name(name).empty()) { + if (!sd_backend_resolve_name(name).empty() || resolve_first_device_by_registry_name(name) != nullptr) { return true; } if (error != nullptr) { diff --git a/src/model.h b/src/model.h index a62c4d1bf..bfc97ddef 100644 --- a/src/model.h +++ b/src/model.h @@ -45,6 +45,7 @@ enum SDVersion { VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, VERSION_LENS, + VERSION_MINIT2I, VERSION_LONGCAT, VERSION_PID, VERSION_IDEOGRAM4, @@ -164,6 +165,13 @@ static inline bool sd_version_is_lens(SDVersion version) { return false; } +static inline bool sd_version_is_minit2i(SDVersion version) { + if (version == VERSION_MINIT2I) { + return true; + } + return false; +} + static inline bool sd_version_is_pid(SDVersion version) { if (version == VERSION_PID) { return true; @@ -208,6 +216,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_z_image(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || + sd_version_is_minit2i(version) || sd_version_is_longcat(version) || sd_version_is_pid(version) || sd_version_is_ideogram4(version)) { diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp new file mode 100644 index 000000000..e3c9dd538 --- /dev/null +++ b/src/model/diffusion/minit2i.hpp @@ -0,0 +1,627 @@ +#ifndef __SD_MODEL_DIFFUSION_MINIT2I_HPP__ +#define __SD_MODEL_DIFFUSION_MINIT2I_HPP__ + +#include +#include +#include +#include +#include +#include +#include + +#include "core/ggml_extend.hpp" +#include "model/common/rope.hpp" +#include "model/diffusion/dit.hpp" +#include "model/diffusion/model.hpp" +#include "model_loader.h" + +namespace MiniT2I { + constexpr int MINIT2I_GRAPH_SIZE = 196608; + + struct MiniT2IConfig { + int64_t image_size = 512; + int64_t patch_size = 16; + int64_t in_channels = 3; + int64_t txt_input_size = 1024; + int64_t hidden_size = 768; + int64_t txt_hidden_size = 768; + int64_t cond_vec_size = 768; + int64_t depth_double = 17; + int64_t txt_preamble_depth = 2; + int64_t num_heads = 12; + int64_t head_dim = 64; + float mlp_ratio = 2.6667f; + int64_t pca_channels = 128; + int64_t prompt_length = 256; + int64_t n_T = 100; + float cfg_interval_start = 0.0f; + float cfg_interval_end = 1.0f; + + static MiniT2IConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) { + MiniT2IConfig config; + config.depth_double = 0; + config.txt_preamble_depth = 0; + + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) { + config.patch_size = tensor_storage.ne[0]; + config.in_channels = tensor_storage.ne[2]; + config.pca_channels = tensor_storage.ne[3]; + } else if (ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) { + config.pca_channels = tensor_storage.ne[2]; + config.hidden_size = tensor_storage.ne[3]; + } else if (ends_with(name, "txt_embedder.weight") && tensor_storage.n_dims == 2) { + config.txt_input_size = tensor_storage.ne[0]; + config.txt_hidden_size = tensor_storage.ne[1]; + } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) { + config.cond_vec_size = tensor_storage.ne[1]; + } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) { + int64_t inner3 = tensor_storage.ne[1]; + int64_t inner = inner3 / 3; + config.hidden_size = tensor_storage.ne[0]; + if (config.hidden_size == 768) { + config.num_heads = 12; + config.head_dim = 64; + } else if (config.hidden_size == 1248) { + config.num_heads = 24; + config.head_dim = 52; + } else if (inner > 0) { + config.head_dim = 64; + config.num_heads = std::max(1, inner / config.head_dim); + } + } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) { + int64_t patch_area = config.patch_size * config.patch_size; + config.hidden_size = tensor_storage.ne[0]; + config.in_channels = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels; + } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) { + config.prompt_length = tensor_storage.ne[1]; + } + + size_t pos = name.find("double_blocks."); + if (pos != std::string::npos) { + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 1) { + int64_t idx = atoi(items[1].c_str()); + config.depth_double = std::max(config.depth_double, idx + 1); + } + } + pos = name.find("txt_preamble_blocks."); + if (pos != std::string::npos) { + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 1) { + int64_t idx = atoi(items[1].c_str()); + config.txt_preamble_depth = std::max(config.txt_preamble_depth, idx + 1); + } + } + } + + if (config.depth_double <= 0) { + config.depth_double = config.hidden_size == 1248 ? 23 : 17; + } + if (config.txt_preamble_depth <= 0) { + config.txt_preamble_depth = 2; + } + if (config.head_dim <= 0 || config.num_heads <= 0) { + config.head_dim = config.hidden_size == 1248 ? 52 : 64; + config.num_heads = config.hidden_size / config.head_dim; + } + LOG_DEBUG("minit2i: hidden_size=%" PRId64 ", txt_hidden_size=%" PRId64 ", heads=%" PRId64 ", head_dim=%" PRId64 ", double_blocks=%" PRId64 ", txt_blocks=%" PRId64 ", patch=%" PRId64 ", in_channels=%" PRId64, + config.hidden_size, + config.txt_hidden_size, + config.num_heads, + config.head_dim, + config.depth_double, + config.txt_preamble_depth, + config.patch_size, + config.in_channels); + return config; + } + }; + + inline std::vector make_2d_sincos_pos_embed(int grid_size, int dim) { + GGML_ASSERT(dim % 4 == 0); + int half_dim = dim / 2; + int quarter = half_dim / 2; + std::vector out(static_cast(grid_size) * grid_size * dim); + std::vector omega(quarter); + for (int i = 0; i < quarter; ++i) { + omega[i] = 1.0f / std::pow(10000.0f, static_cast(i) / static_cast(quarter)); + } + for (int y = 0; y < grid_size; ++y) { + for (int x = 0; x < grid_size; ++x) { + size_t base = static_cast(y * grid_size + x) * dim; + for (int i = 0; i < quarter; ++i) { + float ay = y * omega[i]; + float ax = x * omega[i]; + out[base + i] = std::sin(ax); + out[base + quarter + i] = std::cos(ax); + out[base + half_dim + i] = std::sin(ay); + out[base + half_dim + quarter + i] = std::cos(ay); + } + } + } + return out; + } + + inline std::vector make_text_rope(int length, int head_dim) { + return Rope::flatten(Rope::rope(Rope::linspace(0.f, static_cast(length - 1), length), head_dim, 10000.f)); + } + + inline std::vector make_vision_rope(int side, int head_dim) { + GGML_ASSERT(head_dim % 4 == 0); + int dim = head_dim / 2; + int quarter = dim / 2; + int length = side * side; + std::vector out(static_cast(length) * (head_dim / 2) * 4); + std::vector freqs(quarter); + for (int i = 0; i < quarter; ++i) { + freqs[i] = 1.0f / std::pow(10000.0f, static_cast(2 * i) / static_cast(dim)); + } + for (int y = 0; y < side; ++y) { + for (int x = 0; x < side; ++x) { + int pos = y * side + x; + size_t base = static_cast(pos) * (head_dim / 2) * 4; + for (int i = 0; i < quarter; ++i) { + float ay = y * freqs[i]; + float ax = x * freqs[i]; + float angles[2] = {ay, ax}; + for (int axis = 0; axis < 2; ++axis) { + int j = axis * quarter + i; + out[base + 4 * j] = std::cos(angles[axis]); + out[base + 4 * j + 1] = -std::sin(angles[axis]); + out[base + 4 * j + 2] = std::sin(angles[axis]); + out[base + 4 * j + 3] = std::cos(angles[axis]); + } + } + } + } + return out; + } + + struct SwiGLUMlp : public GGMLBlock { + SwiGLUMlp(int64_t in_features, int64_t hidden_features) { + int64_t hidden_dim = ((hidden_features + 7) / 8) * 8; + blocks["w1"] = std::make_shared(in_features, hidden_dim, false); + blocks["w3"] = std::make_shared(in_features, hidden_dim, false); + blocks["w2"] = std::make_shared(hidden_dim, in_features, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto w1 = std::dynamic_pointer_cast(blocks["w1"]); + auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + auto w2 = std::dynamic_pointer_cast(blocks["w2"]); + auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x)); + auto up = w3->forward(ctx, x); + return w2->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up)); + } + }; + + struct BottleneckPatchEmbed : public GGMLBlock { + int64_t patch_size; + + BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size) + : patch_size(patch_size) { + blocks["proj1"] = std::make_shared(in_channels, + pca_channels, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{0, 0}, + std::pair{1, 1}, + false); + blocks["proj2"] = std::make_shared(pca_channels, + hidden_size, + std::pair{1, 1}, + std::pair{1, 1}, + std::pair{0, 0}, + std::pair{1, 1}, + true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]); + auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]); + x = proj1->forward(ctx, x); + x = proj2->forward(ctx, x); + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); + return x; + } + }; + + struct TimestepEmbedder : public GGMLBlock { + int frequency_embedding_size; + + TimestepEmbedder(int64_t hidden_size, int frequency_embedding_size = 256) + : frequency_embedding_size(frequency_embedding_size) { + blocks["mlp.0"] = std::make_shared(frequency_embedding_size, hidden_size, true, true); + blocks["mlp.2"] = std::make_shared(hidden_size, hidden_size, true, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) { + auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); + auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); + auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1.0f); + t_emb = mlp_0->forward(ctx, t_emb); + t_emb = ggml_silu_inplace(ctx->ggml_ctx, t_emb); + return mlp_2->forward(ctx, t_emb); + } + }; + + inline std::vector split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) { + int64_t N = qkv->ne[2]; + int64_t L = qkv->ne[1]; + auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0); + auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads); + auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2); + return {q, k, v}; + } + + struct PlainTextTransformerBlock : public GGMLBlock { + int64_t num_heads; + int64_t head_dim; + + PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) + : num_heads(num_heads), head_dim(head_dim) { + int64_t inner_dim = num_heads * head_dim; + blocks["norm1"] = std::make_shared(hidden_size, 1e-6f); + blocks["norm2"] = std::make_shared(hidden_size, 1e-6f); + blocks["qkv"] = std::make_shared(hidden_size, inner_dim * 3, true); + blocks["attn_proj"] = std::make_shared(inner_dim, hidden_size, true); + blocks["mlp"] = std::make_shared(hidden_size, static_cast(hidden_size * mlp_ratio)); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* txt, ggml_tensor* pe) { + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); + auto attn_proj = std::dynamic_pointer_cast(blocks["attn_proj"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + + auto qkv = split_qkv(ctx->ggml_ctx, qkv_proj->forward(ctx, norm1->forward(ctx, txt)), num_heads, head_dim); + auto q = q_norm->forward(ctx, qkv[0]); + auto k = k_norm->forward(ctx, qkv[1]); + auto v = qkv[2]; + auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); + txt = ggml_add(ctx->ggml_ctx, txt, attn_proj->forward(ctx, out)); + txt = ggml_add(ctx->ggml_ctx, txt, mlp->forward(ctx, norm2->forward(ctx, txt))); + return txt; + } + }; + + struct DoubleStreamDiTBlock : public GGMLBlock { + int64_t num_heads; + int64_t head_dim; + + DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) + : num_heads(num_heads), head_dim(head_dim) { + int64_t inner_dim = num_heads * head_dim; + blocks["img_norm1"] = std::make_shared(hidden_size, 1e-6f); + blocks["img_norm2"] = std::make_shared(hidden_size, 1e-6f); + blocks["txt_norm1"] = std::make_shared(txt_hidden_size, 1e-6f); + blocks["txt_norm2"] = std::make_shared(txt_hidden_size, 1e-6f); + blocks["img_qkv"] = std::make_shared(hidden_size, inner_dim * 3, true); + blocks["txt_qkv"] = std::make_shared(txt_hidden_size, inner_dim * 3, true); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["img_attn_proj"] = std::make_shared(inner_dim, hidden_size, true); + blocks["txt_attn_proj"] = std::make_shared(inner_dim, txt_hidden_size, true); + blocks["img_mlp"] = std::make_shared(hidden_size, static_cast(hidden_size * mlp_ratio)); + blocks["txt_mlp"] = std::make_shared(txt_hidden_size, static_cast(txt_hidden_size * mlp_ratio)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* pe) { + auto img_norm1 = std::dynamic_pointer_cast(blocks["img_norm1"]); + auto img_norm2 = std::dynamic_pointer_cast(blocks["img_norm2"]); + auto txt_norm1 = std::dynamic_pointer_cast(blocks["txt_norm1"]); + auto txt_norm2 = std::dynamic_pointer_cast(blocks["txt_norm2"]); + auto img_qkv_p = std::dynamic_pointer_cast(blocks["img_qkv"]); + auto txt_qkv_p = std::dynamic_pointer_cast(blocks["txt_qkv"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + auto img_proj = std::dynamic_pointer_cast(blocks["img_attn_proj"]); + auto txt_proj = std::dynamic_pointer_cast(blocks["txt_attn_proj"]); + auto img_mlp = std::dynamic_pointer_cast(blocks["img_mlp"]); + auto txt_mlp = std::dynamic_pointer_cast(blocks["txt_mlp"]); + + int64_t li = img->ne[1]; + int64_t lt = txt->ne[1]; + + auto img_qkv = split_qkv(ctx->ggml_ctx, img_qkv_p->forward(ctx, img_norm1->forward(ctx, img)), num_heads, head_dim); + auto txt_qkv = split_qkv(ctx->ggml_ctx, txt_qkv_p->forward(ctx, txt_norm1->forward(ctx, txt)), num_heads, head_dim); + + auto q = ggml_concat(ctx->ggml_ctx, q_norm->forward(ctx, txt_qkv[0]), q_norm->forward(ctx, img_qkv[0]), 2); + auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2); + auto v = ggml_concat(ctx->ggml_ctx, txt_qkv[2], img_qkv[2], 2); + + auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); + auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt); + auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li); + + img = ggml_add(ctx->ggml_ctx, img, img_proj->forward(ctx, out_img)); + txt = ggml_add(ctx->ggml_ctx, txt, txt_proj->forward(ctx, out_txt)); + img = ggml_add(ctx->ggml_ctx, img, img_mlp->forward(ctx, img_norm2->forward(ctx, img))); + txt = ggml_add(ctx->ggml_ctx, txt, txt_mlp->forward(ctx, txt_norm2->forward(ctx, txt))); + return {img, txt}; + } + }; + + struct FinalLayer : public GGMLBlock { + FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) { + blocks["norm_final"] = std::make_shared(hidden_size, 1e-6f); + blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto norm_final = std::dynamic_pointer_cast(blocks["norm_final"]); + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + return linear->forward(ctx, norm_final->forward(ctx, x)); + } + }; + + struct MMJiT : public GGMLBlock { + MiniT2IConfig config; + + MMJiT(const MiniT2IConfig& config) + : config(config) { + blocks["img_embedder"] = std::make_shared(config.patch_size, config.in_channels, config.pca_channels, config.hidden_size); + blocks["txt_embedder"] = std::make_shared(config.txt_input_size, config.txt_hidden_size, false); + blocks["t_embedder"] = std::make_shared(config.cond_vec_size); + blocks["pooled_embedder"] = std::make_shared(config.txt_input_size, config.cond_vec_size, false); + for (int64_t i = 0; i < config.txt_preamble_depth; ++i) { + blocks["txt_preamble_blocks." + std::to_string(i)] = std::make_shared(config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio); + } + for (int64_t i = 0; i < config.depth_double; ++i) { + blocks["double_blocks." + std::to_string(i)] = std::make_shared(config.hidden_size, config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio); + } + blocks["final_layer"] = std::make_shared(config.hidden_size, config.patch_size, config.in_channels); + } + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + enum ggml_type wtype = get_type(prefix + "mask_token", tensor_storage_map, GGML_TYPE_F32); + params["mask_token"] = ggml_new_tensor_3d(ctx, wtype, config.txt_input_size, 1, 1); + } + + ggml_tensor* apply_text_mask(GGMLRunnerContext* ctx, ggml_tensor* context, ggml_tensor* mask) { + if (mask == nullptr) { + return context; + } + mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]); + mask = ggml_repeat(ctx->ggml_ctx, mask, context); + auto keep = ggml_mul(ctx->ggml_ctx, context, mask); + auto inv = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask); + auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context); + return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv)); + } + + ggml_tensor* pool_context(GGMLRunnerContext* ctx, ggml_tensor* context) { + int64_t dim = context->ne[0]; + int64_t len = context->ne[1]; + int64_t N = context->ne[2]; + auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3)); + x = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N); + x = ggml_mean(ctx->ggml_ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N); + return x; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* context, + ggml_tensor* mask, + ggml_tensor* pos_embed, + ggml_tensor* txt_pe, + ggml_tensor* joint_pe) { + auto img_embedder = std::dynamic_pointer_cast(blocks["img_embedder"]); + auto txt_embedder = std::dynamic_pointer_cast(blocks["txt_embedder"]); + auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); + + int64_t W = img->ne[0]; + int64_t H = img->ne[1]; + int64_t hp = H / config.patch_size; + int64_t wp = W / config.patch_size; + + context = apply_text_mask(ctx, context, mask); + auto x = img_embedder->forward(ctx, img); + x = ggml_add(ctx->ggml_ctx, x, pos_embed); + + auto txt = txt_embedder->forward(ctx, context); + for (int64_t i = 0; i < config.txt_preamble_depth; ++i) { + auto block = std::dynamic_pointer_cast(blocks["txt_preamble_blocks." + std::to_string(i)]); + txt = block->forward(ctx, txt, txt_pe); + sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.txt_preamble_blocks." + std::to_string(i), "txt"); + } + for (int64_t i = 0; i < config.depth_double; ++i) { + auto block = std::dynamic_pointer_cast(blocks["double_blocks." + std::to_string(i)]); + auto out = block->forward(ctx, x, txt, joint_pe); + x = out.first; + txt = out.second; + sd::ggml_graph_cut::mark_graph_cut(x, "minit2i.double_blocks." + std::to_string(i), "x"); + sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.double_blocks." + std::to_string(i), "txt"); + } + auto combined = ggml_concat(ctx->ggml_ctx, txt, x, 1); + auto out = final_layer->forward(ctx, combined); + auto img_out = ggml_ext_slice(ctx->ggml_ctx, out, 1, txt->ne[1], txt->ne[1] + x->ne[1]); + return DiT::unpatchify(ctx->ggml_ctx, img_out, hp, wp, static_cast(config.patch_size), static_cast(config.patch_size), false); + } + }; + + inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) { + if (!requested.empty() && tensor_storage_map.find(requested + ".img_embedder.proj1.weight") != tensor_storage_map.end()) { + return requested; + } + static const std::vector candidates = { + "model.net", + "model.diffusion_model.net", + "model.diffusion_model.model.net", + }; + for (const auto& candidate : candidates) { + if (tensor_storage_map.find(candidate + ".img_embedder.proj1.weight") != tensor_storage_map.end()) { + return candidate; + } + } + return requested.empty() ? "model.net" : requested; + } + + struct MiniT2IRunner : public DiffusionModelRunner { + MiniT2IConfig config; + MMJiT model; + ggml_context* position_cache_ctx = nullptr; + ggml_backend_buffer_t position_cache_buffer = nullptr; + ggml_tensor* cached_pos_embed = nullptr; + ggml_tensor* cached_txt_pe = nullptr; + ggml_tensor* cached_joint_pe = nullptr; + int64_t cached_img_side = -1; + int64_t cached_txt_len = -1; + int64_t cached_hidden_size = -1; + int64_t cached_head_dim = -1; + + MiniT2IRunner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager), + config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)), + model(config) { + model.init(params_ctx, tensor_storage_map, this->prefix); + } + + ~MiniT2IRunner() override { + free_position_cache(); + } + + std::string get_desc() override { + return "MiniT2I"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) override { + model.get_param_tensors(tensors, prefix); + } + + void free_position_cache() { + if (position_cache_buffer != nullptr) { + ggml_backend_buffer_free(position_cache_buffer); + position_cache_buffer = nullptr; + } + if (position_cache_ctx != nullptr) { + ggml_free(position_cache_ctx); + position_cache_ctx = nullptr; + } + cached_pos_embed = nullptr; + cached_txt_pe = nullptr; + cached_joint_pe = nullptr; + cached_img_side = -1; + cached_txt_len = -1; + cached_hidden_size = -1; + cached_head_dim = -1; + } + + void ensure_position_cache(int64_t img_side, int64_t txt_len) { + if (cached_img_side == img_side && + cached_txt_len == txt_len && + cached_hidden_size == config.hidden_size && + cached_head_dim == config.head_dim && + cached_pos_embed != nullptr && + cached_txt_pe != nullptr && + cached_joint_pe != nullptr) { + return; + } + + free_position_cache(); + + auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast(img_side), static_cast(config.hidden_size)); + auto txt_pe_vec = make_text_rope(static_cast(txt_len), static_cast(config.head_dim)); + auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); + auto joint_pe_vec = txt_pe_vec; + joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end()); + + ggml_init_params params; + params.mem_size = static_cast(3 * ggml_tensor_overhead()); + params.mem_buffer = nullptr; + params.no_alloc = true; + position_cache_ctx = ggml_init(params); + GGML_ASSERT(position_cache_ctx != nullptr); + + cached_pos_embed = ggml_new_tensor_3d(position_cache_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1); + ggml_set_name(cached_pos_embed, "minit2i.pos_embed"); + cached_txt_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len); + ggml_set_name(cached_txt_pe, "minit2i.txt_pe"); + cached_joint_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side); + ggml_set_name(cached_joint_pe, "minit2i.joint_pe"); + + position_cache_buffer = ggml_backend_alloc_ctx_tensors(position_cache_ctx, runtime_backend); + GGML_ASSERT(position_cache_buffer != nullptr); + ggml_backend_buffer_set_usage(position_cache_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_tensor_set(cached_pos_embed, pos_embed_vec.data(), 0, ggml_nbytes(cached_pos_embed)); + ggml_backend_tensor_set(cached_txt_pe, txt_pe_vec.data(), 0, ggml_nbytes(cached_txt_pe)); + ggml_backend_tensor_set(cached_joint_pe, joint_pe_vec.data(), 0, ggml_nbytes(cached_joint_pe)); + ggml_backend_synchronize(runtime_backend); + + cached_img_side = img_side; + cached_txt_len = txt_len; + cached_hidden_size = config.hidden_size; + cached_head_dim = config.head_dim; + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const sd::Tensor& mask_tensor) { + ggml_cgraph* gf = new_graph_custom(MINIT2I_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* context = make_input(context_tensor); + ggml_tensor* mask = make_input(mask_tensor); + SD_UNUSED(timesteps_tensor); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t img_side = H / config.patch_size; + int64_t txt_len = context->ne[1]; + ensure_position_cache(img_side, txt_len); + + auto runner_ctx = get_context(); + auto out = model.forward(&runner_ctx, x, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const sd::Tensor& mask) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context, mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + GGML_ASSERT(diffusion_params.context != nullptr); + GGML_ASSERT(diffusion_params.y != nullptr); + return compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + *diffusion_params.context, + *diffusion_params.y); + } + }; +} // namespace MiniT2I + +#endif // __SD_MODEL_DIFFUSION_MINIT2I_HPP__ diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 23da08222..da12a0447 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -23,19 +23,72 @@ struct T5Config { int64_t vocab_size = 32128; bool relative_attention = true; - static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, - const std::string& prefix, - bool is_umt5 = false) { - (void)tensor_storage_map; - (void)prefix; - T5Config config; - if (is_umt5) { - config.vocab_size = 256384; - config.relative_attention = false; - } - return config; - } -}; + static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + bool is_umt5 = false) { + T5Config config; + if (is_umt5) { + config.vocab_size = 256384; + config.relative_attention = false; + } + auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* { + auto it = tensor_storage_map.find(prefix + "." + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + it = tensor_storage_map.find(prefix + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + return nullptr; + }; + + if (const TensorStorage* shared = find_tensor("shared.weight")) { + if (shared->n_dims == 2) { + config.vocab_size = shared->ne[1]; + config.model_dim = shared->ne[0]; + } + } + if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) { + if (q->n_dims == 2) { + config.model_dim = q->ne[0]; + int64_t inner_dim = q->ne[1]; + // Flan-T5/T5 uses d_kv=64 for common sizes. + if (inner_dim % 64 == 0) { + config.num_heads = inner_dim / 64; + } + } + } + if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) { + if (wi->n_dims == 2) { + config.model_dim = wi->ne[0]; + config.ff_dim = wi->ne[1]; + } + } + int64_t detected_layers = 0; + for (const auto& [name, _] : tensor_storage_map) { + std::string base = prefix; + if (!base.empty() && base.back() != '.') { + base += "."; + } + std::string layer_prefix = base + "encoder.block."; + if (!starts_with(name, layer_prefix)) { + continue; + } + size_t pos = layer_prefix.size(); + size_t dot = name.find('.', pos); + if (dot == std::string::npos) { + continue; + } + int64_t layer = atoi(name.substr(pos, dot - pos).c_str()); + detected_layers = std::max(detected_layers, layer + 1); + } + if (detected_layers > 0) { + config.num_layers = detected_layers; + } + return config; + } +}; class T5LayerNorm : public UnaryBlock { protected: diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index af091bb57..e065b2467 100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -78,7 +78,7 @@ struct VAE : public GGMLRunner { scale_factor = 16; } else if (sd_version_uses_flux2_vae(version)) { scale_factor = 16; - } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { + } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) { scale_factor = 1; } return scale_factor; diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 587b01f4d..6e5f48321 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -467,6 +467,17 @@ SDVersion ModelLoader::get_sd_version() { tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) { return VERSION_LENS; } + if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) || + (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) || + (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != tensor_storage_map.end())) { + return VERSION_MINIT2I; + } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 4df047dd5..df3642c0b 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -27,6 +27,7 @@ #include "model/diffusion/ideogram4.hpp" #include "model/diffusion/lens.hpp" #include "model/diffusion/ltxv.hpp" +#include "model/diffusion/minit2i.hpp" #include "model/diffusion/mmdit.hpp" #include "model/diffusion/model.hpp" #include "model/diffusion/pid.hpp" @@ -90,6 +91,7 @@ const char* model_version_to_str[] = { "Ovis Image", "Ernie Image", "Lens", + "MiniT2I", "Longcat-Image", "PiD", "Ideogram 4", @@ -764,6 +766,14 @@ class StableDiffusionGGML { tensor_storage_map, "model", model_manager); + } else if (sd_version_is_minit2i(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + tensor_storage_map, + model_manager); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "", + model_manager); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, @@ -930,7 +940,7 @@ class StableDiffusionGGML { } }; - if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { + if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, backend_for(SDBackendModule::VAE), @@ -1951,11 +1961,93 @@ class StableDiffusionGGML { } int64_t last_progress_us = ggml_time_us(); + SamplePreviewContext preview = prepare_sample_preview_context(); + + if (sd_version_is_minit2i(version)) { + if (noise.empty()) { + LOG_ERROR("MiniT2I sampling requires initial noise"); + return {}; + } + if (cond.c_crossattn.empty() || cond.c_vector.empty()) { + LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask"); + return {}; + } + size_t minit2i_steps = steps > 0 ? steps : 100; + sd::Tensor x_t = noise * 2.0f; + sd::Tensor denoised = x_t; + sd::Tensor uncond_mask = sd::Tensor::zeros_like(cond.c_vector); + + auto run_minit2i = [&](const sd::Tensor& x, + float t_value, + const sd::Tensor& mask) -> sd::Tensor { + int64_t batch = x.dim() >= 4 ? x.shape()[3] : 1; + if (batch <= 0) { + LOG_ERROR("MiniT2I got invalid input shape for sampling"); + return {}; + } + LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64, + x.dim(), + batch); + std::vector t_vec(static_cast(batch), t_value); + const int64_t t_vec_size = static_cast(t_vec.size()); + sd::Tensor timesteps_tensor({t_vec_size}, std::move(t_vec)); + DiffusionParams diffusion_params; + diffusion_params.x = &x; + diffusion_params.timesteps = ×teps_tensor; + diffusion_params.context = &cond.c_crossattn; + diffusion_params.y = &mask; + auto out = work_diffusion_model->compute(n_threads, diffusion_params); + if (out.empty()) { + LOG_ERROR("MiniT2I diffusion model compute failed"); + return {}; + } + return out; + }; + + pretty_progress(0, static_cast(minit2i_steps), 0); + last_progress_us = ggml_time_us(); + for (size_t i = 0; i < minit2i_steps; ++i) { + if (get_cancel_flag() == SD_CANCEL_ALL) { + LOG_DEBUG("cancelling generation"); + return {}; + } + float t_cur = static_cast(i) / static_cast(minit2i_steps); + float t_next = static_cast(i + 1) / static_cast(minit2i_steps); + + if (sd_should_preview_noisy() && preview.callback != nullptr) { + preview_image(static_cast(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true); + } + + auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector); + if (cond_x0.empty()) { + return {}; + } + auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask); + if (uncond_x0.empty()) { + return {}; + } + float denom = std::max(1.0f - t_cur, 0.001f); + auto cond_v = (cond_x0 - x_t) / denom; + auto uncond_v = (uncond_x0 - x_t) / denom; + auto v = uncond_v + (cond_v - uncond_v) * cfg_scale; + x_t += v * (t_next - t_cur); + denoised = x_t; + + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(static_cast(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false); + } + report_sample_progress(static_cast(i + 1), minit2i_steps, &last_progress_us); + } + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return denoised; + } + sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) : init_latent; sd::Tensor denoised = x_t; - SamplePreviewContext preview = prepare_sample_preview_context(); auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::guidance::GuiderOutput { if (get_cancel_flag() == SD_CANCEL_ALL) { @@ -2254,6 +2346,8 @@ class StableDiffusionGGML { latent_channel = 3; } else if (version == VERSION_CHROMA_RADIANCE) { latent_channel = 3; + } else if (sd_version_is_minit2i(version)) { + latent_channel = 3; } else if (sd_version_is_pid(version)) { latent_channel = 3; } else if (sd_version_uses_flux2_vae(version)) { @@ -2333,7 +2427,7 @@ class StableDiffusionGGML { } sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) { - if (sd_version_is_pid(version)) { + if (sd_version_is_pid(version) || sd_version_is_minit2i(version)) { return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f); } auto latents = first_stage_model->diffusion_to_vae_latents(x);