Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,14 @@ API and command-line option may change frequently.***
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [Boogu Image](./docs/boogu_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [Ideogram4](./docs/ideogram4.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
- [LongCat Image Edit](./docs/longcat_image.md)
- [Boogu Image Edit](./docs/boogu_image.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [LTX-2.3](./docs/ltx2.md)
Expand Down
Binary file added assets/boogu/edit_example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/boogu/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions docs/boogu_image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# How to Use

Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.

## Download weights

- Download Boogu Image
    - safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
- Download VAE
    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download Qwen3-VL 8B
    - gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
    - For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.

## Examples

### Boogu Image Base

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
```

<img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />

### Boogu Image Edit

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
```

<img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
61 changes: 60 additions & 1 deletion src/conditioning/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
arch = LLM::LLMArch::GPT_OSS_20B;
} else if (sd_version_is_pid(version)) {
arch = LLM::LLMArch::GEMMA2_2B;
} else if (sd_version_is_ideogram4(version)) {
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
arch = LLM::LLMArch::QWEN3_VL;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3;
Expand Down Expand Up @@ -1778,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {

prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (sd_version_is_boogu_image(version)) {
prompt_template_encode_start_idx = 0;

const std::string t2i_system_prompt =
"You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
const std::string edit_system_prompt =
"Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
const bool text_empty = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;

if (has_ref_images) {
LOG_INFO("BooguImageEditPipeline");
const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
std::string img_prompt;
const std::string placeholder = "<|image_pad|>";

for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
const auto& image = (*conditioner_params.ref_images)[i];
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
int height = static_cast<int>(image.shape()[1]);
int width = static_cast<int>(image.shape()[0]);
double beta = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
int h_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
int w_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));

LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);

auto resized_image = clip_preprocess(image, w_bar, h_bar);
auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true);
GGML_ASSERT(!image_embed.empty());

std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
int image_embed_idx = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
image_embeds.emplace_back(image_embed_idx, image_embed);

img_prompt += "<|vision_start|>";
int64_t num_image_tokens = image_embed.shape()[1];
img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
for (int j = 0; j < num_image_tokens; j++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
}

prompt = prompt_prefix + img_prompt;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n";
} else {
const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
prompt = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n";
}
} else if (sd_version_is_longcat(version)) {
spell_quotes = true;

Expand Down
9 changes: 9 additions & 0 deletions src/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ enum SDVersion {
VERSION_LTXAV,
VERSION_HIDREAM_O1,
VERSION_Z_IMAGE,
VERSION_BOOGU_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_ERNIE_IMAGE,
VERSION_LENS,
Expand Down Expand Up @@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
return false;
}

// Returns true only when `version` is VERSION_BOOGU_IMAGE; false for every
// other SDVersion value.
static inline bool sd_version_is_boogu_image(SDVersion version) {
    return version == VERSION_BOOGU_IMAGE;
}

static inline bool sd_version_is_longcat(SDVersion version) {
if (version == VERSION_LONGCAT) {
return true;
Expand Down Expand Up @@ -206,6 +214,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
version == VERSION_HIDREAM_O1 ||
sd_version_is_anima(version) ||
sd_version_is_z_image(version) ||
sd_version_is_boogu_image(version) ||
sd_version_is_ernie_image(version) ||
sd_version_is_lens(version) ||
sd_version_is_longcat(version) ||
Expand Down
4 changes: 3 additions & 1 deletion src/model/common/rope.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -899,10 +899,12 @@ namespace Rope {
// q,k,v: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2]
// return: [N, L, n_head*d_head]
int64_t n_head = q->ne[1];

q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]

auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
return x;
}
}; // namespace Rope
Expand Down
Loading
Loading