Add --eager-load: load all params into the params backend at model-load time

Plumbs a new boolean option (default false) from the CLI through
SDContextParams and sd_ctx_params_t into StableDiffusionGGML. When it is set,
ModelManager::load_all_params_eagerly() is called at the point where the
"model metadata validated" message is logged; if it returns false the load
logs "model params eager load failed" and returns false.

Review notes:
  * The reviewed copy of this patch had lost its line breaks and indentation.
    Line structure is restored from the hunk headers; leading whitespace and
    the position of blank context lines are best-effort, so check the context
    against the tree (or apply with whitespace-insensitive matching).
  * Template arguments look stripped in the reviewed copy. The two
    "const std::vector& tensors" context lines in src/model_manager.h are
    reproduced as found and still need their element type restored before
    this patch can match the tree.
  * The new all_states vector was declared as "std::vector all_states;",
    which cannot compile (nothing to deduce the element type from). Its
    element type is now derived from tensor_states_.
  * The hunks for src/model_manager.cpp and src/model_manager.h gained
    comment lines, so their "+start,count" values differ from the original
    patch and the "index" blob ids on those two files are no longer exact.

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index ad3f97a08..53a755dc6 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -496,6 +496,10 @@ ArgOptions SDContextParams::get_options() {
          "--stream-layers",
          "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
          true, &stream_layers},
+        {"",
+         "--eager-load",
+         "load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
+         true, &eager_load},
         {"",
          "--force-sdxl-vae-conv-scale",
          "force use of conv scale on sdxl vae",
@@ -799,6 +803,7 @@ std::string SDContextParams::to_string() const {
         << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
         << " max_vram: \"" << max_vram << "\",\n"
         << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
+        << " eager_load: " << (eager_load ? "true" : "false") << ",\n"
         << " backend: \"" << backend << "\",\n"
         << " params_backend: \"" << params_backend << "\",\n"
         << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@@ -878,6 +883,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
     sd_ctx_params.vae_format = str_to_vae_format(vae_format);
     sd_ctx_params.max_vram = max_vram.c_str();
     sd_ctx_params.stream_layers = stream_layers;
+    sd_ctx_params.eager_load = eager_load;
     sd_ctx_params.backend = effective_backend.c_str();
     sd_ctx_params.params_backend = effective_params_backend.c_str();
     sd_ctx_params.rpc_servers = rpc_servers.c_str();
diff --git a/examples/common/common.h b/examples/common/common.h
index 587cad29f..e7c25015b 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -148,6 +148,7 @@ struct SDContextParams {
     bool offload_params_to_cpu = false;
     std::string max_vram = "0";
     bool stream_layers = false;
+    bool eager_load = false;
     std::string backend;
     std::string params_backend;
     std::string rpc_servers;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 1c04367b1..730794e6b 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -219,6 +219,7 @@ typedef struct {
     enum sd_vae_format_t vae_format;
     const char* max_vram;  // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
+    bool eager_load;  // Load all params into the params backend at model-load time instead of lazily on first use
     const char* backend;
     const char* params_backend;
     const char* rpc_servers;
diff --git a/src/model_manager.cpp b/src/model_manager.cpp
index 5287e1069..bb87e0105 100644
--- a/src/model_manager.cpp
+++ b/src/model_manager.cpp
@@ -147,6 +147,22 @@ bool ModelManager::register_param_tensors(const std::string& desc,
     return true;
 }
 
+// Gathers every non-null entry of tensor_states_ and hands the whole set to
+// load_tensors_to_params_backend() in one call; returns that call's result.
+bool ModelManager::load_all_params_eagerly() {
+    // Raw-pointer element type is derived from tensor_states_ so it always
+    // equals what s.get() yields. NOTE(review): confirm it matches the vector
+    // type load_tensors_to_params_backend() expects.
+    std::vector<decltype(tensor_states_)::value_type::element_type*> all_states;
+    all_states.reserve(tensor_states_.size());
+    for (const auto& s : tensor_states_) {
+        if (s != nullptr) {
+            all_states.push_back(s.get());
+        }
+    }
+    return load_tensors_to_params_backend(all_states);
+}
+
 bool ModelManager::validate_registered_tensors() {
     bool ok = true;
     for (const auto& state : tensor_states_) {
diff --git a/src/model_manager.h b/src/model_manager.h
index 1a414c15c..80fa5e73c 100644
--- a/src/model_manager.h
+++ b/src/model_manager.h
@@ -158,6 +158,8 @@ class ModelManager : public RunnerWeightManager {
     }
 
     bool validate_registered_tensors();
+    // Passes every non-null entry of tensor_states_ to load_tensors_to_params_backend() in one call.
+    bool load_all_params_eagerly();
 
     bool prepare_params(const std::vector& tensors) override;
     void release_compute_backend_params(const std::vector& tensors) override;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index cb1a3f6d9..a1a007d59 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -199,6 +199,7 @@ class StableDiffusionGGML {
     bool enable_mmap = false;
     sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
     bool stream_layers = false;
+    bool eager_load = false;
     std::string backend_spec;
     std::string params_backend_spec;
 
@@ -342,6 +343,7 @@ class StableDiffusionGGML {
         n_threads = sd_ctx_params->n_threads;
         enable_mmap = sd_ctx_params->enable_mmap;
         stream_layers = sd_ctx_params->stream_layers;
+        eager_load = sd_ctx_params->eager_load;
         backend_spec = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
         max_vram_assignment.reset(0.f);
@@ -1153,7 +1155,15 @@ class StableDiffusionGGML {
             return false;
         }
 
-        LOG_DEBUG("model metadata validated; weights will be prepared lazily");
+        if (eager_load) {
+            if (!model_manager->load_all_params_eagerly()) {
+                LOG_ERROR("model params eager load failed");
+                return false;
+            }
+            LOG_DEBUG("model metadata validated; weights pre-loaded to params backend");
+        } else {
+            LOG_DEBUG("model metadata validated; weights will be prepared lazily");
+        }
 
         {
             size_t total_params_ram_size = 0;
@@ -2696,6 +2706,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
     sd_ctx_params->max_vram = nullptr;
     sd_ctx_params->stream_layers = false;
+    sd_ctx_params->eager_load = false;
     sd_ctx_params->enable_mmap = false;
     sd_ctx_params->diffusion_flash_attn = false;
     sd_ctx_params->circular_x = false;
@@ -2742,6 +2753,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "prediction: %s\n"
              "max_vram: %s\n"
              "stream_layers: %s\n"
+             "eager_load: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
              "flash_attn: %s\n"
@@ -2777,6 +2789,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              sd_prediction_name(sd_ctx_params->prediction),
              SAFE_STR(sd_ctx_params->max_vram),
              BOOL_STR(sd_ctx_params->stream_layers),
+             BOOL_STR(sd_ctx_params->eager_load),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
              BOOL_STR(sd_ctx_params->flash_attn),