Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
[](common_params &, int) { /* unused */ }
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());

// args.push_back(common_arg(
// {"pin"},
// "in server router mode, do not unload this model if models_max is exceeded",
// [](common_params &) { /* unused */ }
// ).set_preset_only());
args.push_back(common_arg(
{"pin"},
nullptr,
"in server router mode, do not unload this model if models_max is exceeded",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
}
1 change: 1 addition & 0 deletions common/arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
// pseudo-env variables used to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
#define COMMON_ARG_PRESET_PIN "__PRESET_PIN"

//
// CLI argument parsing
Expand Down
1 change: 1 addition & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows:
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
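- `pin` (boolean): Prevents the router from unloading this model when the `models_max` limit is exceeded; the model remains loaded until explicitly unloaded or the server restarts. If the limit is reached and every active model is pinned, loading another model fails with an error.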

### Routing requests

Expand Down
17 changes: 16 additions & 1 deletion tools/server/server-models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,14 @@ void server_models::load_models() {
}
}

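// handle custom pin option: a model is marked pinned whenever its preset defines the option
// (the option's value itself is not inspected here)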
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
inst.meta.pinned = true;
}
}

// load any autoload models
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
Expand Down Expand Up @@ -383,7 +391,9 @@ void server_models::unload_lru() {
for (const auto & m : mapping) {
if (m.second.meta.is_active()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
// Pinned models are never candidates for LRU eviction. If every active model is pinned,
// no candidate is selected here, and an error is thrown further down rather than letting the server exceed models_max.
if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
Expand All @@ -400,6 +410,11 @@ void server_models::unload_lru() {
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
});
}
} else if (count_active >= (size_t)base_params.models_max) {
throw std::runtime_error(string_format(
"models_max limit (%d) reached, but no unpinned models available for LRU eviction - cannot load more models",
base_params.models_max
));
}
}

Expand Down
1 change: 1 addition & 0 deletions tools/server/server-models.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct server_model_meta {
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
bool pinned = false; // if true, this model will not be unloaded by LRU

bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
Expand Down