diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index ae1a5b5b3..71367c5db 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -1918,9 +1918,60 @@ struct LLMEmbedder : public Conditioner { prompt_template_encode_start_idx = 0; out_layers = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 36}; + // If the loaded LLM has a vision encoder and reference images were provided, + // encode images into LLM image-embeds and insert vision placeholders into the prompt. prompt = "<|im_start|>user\n"; + if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) { + LOG_INFO("Experimental Ideogram4 vision Pipeline"); + std::string placeholder = "<|image_pad|>"; + std::string img_prompt; + + int min_pixels = 384 * 384; + int max_pixels = 560 * 560; + + for (int i = 0; i < conditioner_params.ref_images->size(); i++) { + const auto& image = (*conditioner_params.ref_images)[i]; + double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size; + int height = static_cast(image.shape()[1]); + int width = static_cast(image.shape()[0]); + int h_bar = static_cast(std::round(height / factor) * factor); + int w_bar = static_cast(std::round(width / factor) * factor); + + if (static_cast(h_bar) * w_bar > max_pixels) { + double beta = std::sqrt((height * width) / static_cast(max_pixels)); + h_bar = std::max(static_cast(factor), + static_cast(std::floor(height / beta / factor)) * static_cast(factor)); + w_bar = std::max(static_cast(factor), + static_cast(std::floor(width / beta / factor)) * static_cast(factor)); + } else if (static_cast(h_bar) * w_bar < min_pixels) { + double beta = std::sqrt(static_cast(min_pixels) / (height * width)); + h_bar = static_cast(std::ceil(height * beta / factor)) * static_cast(factor); + w_bar = static_cast(std::ceil(width * beta / factor)) * static_cast(factor); + } + + auto resized_image = clip_preprocess(image, w_bar, h_bar); + + auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true); + GGML_ASSERT(!image_embed.empty()); + + img_prompt += "Picture " + std::to_string(i) + ": <|vision_start|>"; + + int image_embed_idx = static_cast(tokenizer->encode(img_prompt, nullptr).size()); + image_embeds.emplace_back(image_embed_idx, image_embed); + + int64_t num_image_tokens = image_embed.shape()[1]; + img_prompt.reserve(img_prompt.size() + static_cast(num_image_tokens) * placeholder.size() + 32); + for (int j = 0; j < num_image_tokens; j++) { + img_prompt += placeholder; + } + img_prompt += "<|vision_end|>"; + } + + prompt += img_prompt; + } prompt += conditioner_params.text; prompt += "<|im_end|>\n<|im_start|>assistant\n"; + prompt_attn_range = {0, 0}; } else if (sd_version_is_ernie_image(version)) { prompt_template_encode_start_idx = 0; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index cb1a3f6d9..051e90ac8 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -635,7 +635,7 @@ class StableDiffusionGGML { tensor_storage_map, version, "", - false, + (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0), model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), tensor_storage_map,