I am integrating stable diffusion in my rendering software. I made a test with this 512x512 image:

I execute this command using the Vulkan build from the master-707-5a34bc7 release:
```
sd-cli.exe --backend vulkan0 --diffusion-model flux-2-klein-4b-Q8_0.gguf --llm Qwen3-4B-UD-Q4_K_XL.gguf --vae flux2.full_encoder_small_decoder.safetensors -p "turn the image into a high quality photograph" -r Albedo.png -o output_photo.png --cfg-scale 1 --steps 4
```
It takes 10 seconds to complete. When I execute the same generation from my C++ code, it takes 1 minute 20 seconds. The difference is huge!
I am linking against the Vulkan variant of the library that I built myself. Maybe I am missing some CMake settings?
Here is my C++ code:
`
// Most recent sampling progress handed to ProgressCallback.
static uint32_t g_CurrentStep{0u};
static uint32_t g_TotalSteps{0u};

/// Progress hook: stores the reported step and step count in the file-local
/// progress globals.
///
/// @param step       Step index reported by the caller.
/// @param steps      Total number of steps reported by the caller.
/// @param time       Unused.
/// @param user_data  Unused.
void ProgressCallback(int step, int steps, float time, void* user_data)
{
    // Neither the timing value nor the user pointer is consumed here.
    static_cast<void>(time);
    static_cast<void>(user_data);

    g_TotalSteps  = static_cast<uint32_t>(steps);
    g_CurrentStep = static_cast<uint32_t>(step);
}
void OfflineRenderer::performRenderImageEnhancing()
{
_ASSERT(m_inputImage->isFloatingPointImage());
_ASSERT(m_inputImage->getNbChannels() == 3);
// First build the RGB8 image from RGB32F to feed the enhancer.
const auto imageSrc = Graphics::Texture::Helper::create_RGB8FromRGB32F(*m_inputImage);
// Build the stable diffusion Api native input image.
sd_image_t nativeInputImage;
{
nativeInputImage.width = getRenderingWidth();
nativeInputImage.height = getRenderingHeight();
nativeInputImage.channel = 3u;
nativeInputImage.data = imageSrc->getRawData();
}
// Init the context parameters.
sd_ctx_t* ctx = nullptr;
{
sd_ctx_params_t sd_ctx_params = {};
sd_ctx_params_init(&sd_ctx_params);
sd_ctx_params.diffusion_model_path = "D:\\Stable-diffusion\\Models\\flux-2-klein-4b-Q8_0.gguf";
sd_ctx_params.llm_path = "D:\\Stable-diffusion\\Models\\Qwen3-4B-UD-Q4_K_XL.gguf";
sd_ctx_params.vae_path = "D:\\Stable-diffusion\\Models\\flux2.full_encoder_small_decoder.safetensors";
//sd_ctx_params.vae_conv_direct = true;
//sd_ctx_params.flash_attn = true;
sd_ctx_params.n_threads = 12;
sd_ctx_params.backend = "vulkan0";
sd_ctx_params.wtype = sd_type_t::SD_TYPE_F16;
ctx = new_sd_ctx(&sd_ctx_params);
_ASSERT(ctx != nullptr);
}
// Configure generation parameters.
sd_img_gen_params_t sd_img_gen_params;
{
sd_img_gen_params_init(&sd_img_gen_params);
sd_img_gen_params.prompt = "turn the image into a high quality photograph";
sd_img_gen_params.negative_prompt = "";
sd_img_gen_params.sample_params.sample_method = EULER_SAMPLE_METHOD;
sd_img_gen_params.width = getRenderingWidth();
sd_img_gen_params.height = getRenderingHeight();
sd_img_gen_params.strength = 0.75f;
sd_img_gen_params.seed = 42;
sd_img_gen_params.control_strength = 0.9f;
sd_img_gen_params.ref_images = &nativeInputImage;
sd_img_gen_params.ref_images_count = 1;
sd_img_gen_params.sample_params.guidance.txt_cfg = 1.0f;
sd_img_gen_params.sample_params.sample_steps = 4;
}
// Set the generation callback.
sd_set_progress_callback(ProgressCallback, &m_stopRender);
sd_image_t* result = generate_image(ctx, &sd_img_gen_params);
_ASSERT(result && result[0].data);
_ASSERT(result->width == imageSrc->getWidth());
_ASSERT(result->height == imageSrc->getHeight());
_ASSERT(result->channel == imageSrc->getNbChannels());
// Read back result.
const uint32_t nbTasks = getRenderingWidth() * getRenderingHeight();
tbb::parallel_for(size_t(0), size_t(nbTasks), [&](size_t tbbIdx) {
if (m_stopRender)
{
tbb::task::current_context()->cancel_group_execution();
return;
}
const uint32_t pixelLinearIdx = (uint32_t)tbbIdx;
const uint32_t pixelStartInArray = (uint32_t)tbbIdx * 3u;
const uint32_t pixelPosX = (uint32_t)(pixelLinearIdx % getRenderingWidth());
const uint32_t pixelPosY = (uint32_t)(pixelLinearIdx / getRenderingWidth());
const Math::Uvec2 pixelPos = Math::Uvec2(pixelPosX, pixelPosY);
const uint8_t R = result[0].data[pixelStartInArray];
const uint8_t G = result[0].data[pixelStartInArray + 1u];
const uint8_t B = result[0].data[pixelStartInArray + 2u];
const RGBFColor outputColor = RGBFColor((float)R/ 255.0f, (float)G/255.0f, (float)B/ 255.0f);
m_outputImage->setPixelFromPosition(outputColor, pixelPos);
});
// Cleanup resources.
free_sd_ctx(ctx);
free(result[0].data);
free(result);
}
`
I downloaded the models from the internet:
https://huggingface.co/unsloth/Qwen3-4B-GGUF/blob/main/Qwen3-4B-UD-Q4_K_XL.gguf
https://huggingface.co/unsloth/FLUX.2-klein-4B-GGUF/blob/main/flux-2-klein-4b-Q8_0.gguf
https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/blob/main/full_encoder_small_decoder.safetensors
Thanks for helping!
I am integrating stable diffusion in my rendering software. I made a test with this 512x512 image:

I execute this command using the Vulkan build from the master-707-5a34bc7 release:
```
sd-cli.exe --backend vulkan0 --diffusion-model flux-2-klein-4b-Q8_0.gguf --llm Qwen3-4B-UD-Q4_K_XL.gguf --vae flux2.full_encoder_small_decoder.safetensors -p "turn the image into a high quality photograph" -r Albedo.png -o output_photo.png --cfg-scale 1 --steps 4
```
It takes 10 seconds to complete. When I execute the same generation from my C++ code, it takes 1 minute 20 seconds. The difference is huge!
I am linking against the Vulkan variant of the library that I built myself. Maybe I am missing some CMake settings?
Here is my C++ code:
`
}
`
I downloaded the models from the internet:
https://huggingface.co/unsloth/Qwen3-4B-GGUF/blob/main/Qwen3-4B-UD-Q4_K_XL.gguf
https://huggingface.co/unsloth/FLUX.2-klein-4B-GGUF/blob/main/flux-2-klein-4b-Q8_0.gguf
https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/blob/main/full_encoder_small_decoder.safetensors
Thanks for helping!