leejet · leejet · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/README.md b/README.md
@@ -50,12 +50,14 @@ API and command-line option may change frequently.***
     - [Ovis-Image](./docs/ovis_image.md)
     - [Anima](./docs/anima.md)
     - [ERNIE-Image](./docs/ernie_image.md)
+    - [Boogu Image](./docs/boogu_image.md)
     - [HiDream-O1-Image](./docs/hidream_o1_image.md)
     - [Ideogram4](./docs/ideogram4.md)
   - Image Edit Models
     - [FLUX.1-Kontext-dev](./docs/kontext.md)
     - [Qwen Image Edit series](./docs/qwen_image_edit.md)
     - [LongCat Image Edit](./docs/longcat_image.md)
+    - [Boogu Image Edit](./docs/boogu_image.md)
   - Video Models
     - [Wan2.1/Wan2.2](./docs/wan.md)
     - [LTX-2.3](./docs/ltx2.md)

diff --git a/assets/boogu/edit_example.png b/assets/boogu/edit_example.png
diff --git a/assets/boogu/example.png b/assets/boogu/example.png
diff --git a/docs/boogu_image.md b/docs/boogu_image.md
@@ -0,0 +1,31 @@
+# How to Use
+
+Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.
+
+## Download weights
+
+- Download Boogu Image
+    - safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
+- Download VAE
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download Qwen3-VL 8B
+    - gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
+        - For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
+
+## Examples
+
+### Boogu Image Base
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
+```
+
+<img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />
+
+### Boogu Image Edit
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
+```
+
+<img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
@@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
             arch = LLM::LLMArch::GPT_OSS_20B;
         } else if (sd_version_is_pid(version)) {
             arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
             arch = LLM::LLMArch::QWEN3_VL;
         } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
             arch = LLM::LLMArch::QWEN3;
@@ -1778,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {
 
                 prompt += "<|im_end|>\n<|im_start|>assistant\n";
             }
+        } else if (sd_version_is_boogu_image(version)) {
+            prompt_template_encode_start_idx = 0;
+
+            const std::string t2i_system_prompt =
+                "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
+            const std::string edit_system_prompt =
+                "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
+            const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
+            const bool text_empty     = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
+
+            if (has_ref_images) {
+                LOG_INFO("BooguImageEditPipeline");
+                const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
+                std::string img_prompt;
+                const std::string placeholder = "<|image_pad|>";
+
+                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
+                    const auto& image = (*conditioner_params.ref_images)[i];
+                    double factor     = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
+                    int height        = static_cast<int>(image.shape()[1]);
+                    int width         = static_cast<int>(image.shape()[0]);
+                    double beta       = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
+                    int h_bar         = std::max(static_cast<int>(factor),
+                                                 static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
+                    int w_bar         = std::max(static_cast<int>(factor),
+                                                 static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
+
+                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
+
+                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
+                    auto image_embed   = llm->encode_image(n_threads, resized_image, false, true, true);
+                    GGML_ASSERT(!image_embed.empty());
+
+                    std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
+                    int image_embed_idx      = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
+                    image_embeds.emplace_back(image_embed_idx, image_embed);
+
+                    img_prompt += "<|vision_start|>";
+                    int64_t num_image_tokens = image_embed.shape()[1];
+                    img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
+                    for (int j = 0; j < num_image_tokens; j++) {
+                        img_prompt += placeholder;
+                    }
+                    img_prompt += "<|vision_end|>";
+                }
+
+                prompt                  = prompt_prefix + img_prompt;
+                prompt_attn_range.first = static_cast<int>(prompt.size());
+                prompt += conditioner_params.text;
+                prompt_attn_range.second = static_cast<int>(prompt.size());
+                prompt += "<|im_end|>\n";
+            } else {
+                const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
+                prompt                           = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
+                prompt_attn_range.first          = static_cast<int>(prompt.size());
+                prompt += conditioner_params.text;
+                prompt_attn_range.second = static_cast<int>(prompt.size());
+                prompt += "<|im_end|>\n";
+            }
         } else if (sd_version_is_longcat(version)) {
             spell_quotes = true;
 

diff --git a/src/model.h b/src/model.h
@@ -42,6 +42,7 @@ enum SDVersion {
     VERSION_LTXAV,
     VERSION_HIDREAM_O1,
     VERSION_Z_IMAGE,
+    VERSION_BOOGU_IMAGE,
     VERSION_OVIS_IMAGE,
     VERSION_ERNIE_IMAGE,
     VERSION_LENS,
@@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_boogu_image(SDVersion version) {  // true only when version == VERSION_BOOGU_IMAGE
+    if (version == VERSION_BOOGU_IMAGE) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_longcat(SDVersion version) {
     if (version == VERSION_LONGCAT) {
         return true;
@@ -206,6 +214,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
         version == VERSION_HIDREAM_O1 ||
         sd_version_is_anima(version) ||
         sd_version_is_z_image(version) ||
+        sd_version_is_boogu_image(version) ||
         sd_version_is_ernie_image(version) ||
         sd_version_is_lens(version) ||
         sd_version_is_longcat(version) ||

diff --git a/src/model/common/rope.hpp b/src/model/common/rope.hpp
@@ -899,10 +899,12 @@ namespace Rope {
         // q,k,v: [N, L, n_head, d_head]
         // pe: [L, d_head/2, 2, 2]
         // return: [N, L, n_head*d_head]
+        int64_t n_head = q->ne[1];
+
         q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
         k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
 
-        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
         return x;
     }
 };  // namespace Rope