From 0f70e3e0cd90326b768bb6b212e47dd5987e20ce Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 1 Jan 2026 12:49:19 +0100
Subject: [PATCH 1/8] arg: support remote preset

---
 common/arg.cpp      | 151 +++++++++++++++++++++++++++++---------------
 common/download.cpp |  15 +++--
 common/download.h   |   6 ++
 common/preset.cpp   |  77 +++++++++++++++++++++-
 common/preset.h     |  11 +++-
 docs/preset.md      |  50 +++++++++++++++
 6 files changed, 253 insertions(+), 57 deletions(-)
 create mode 100644 docs/preset.md

diff --git a/common/arg.cpp b/common/arg.cpp
index 62d31393c43..4d4f91e96f5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -6,6 +6,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "download.h"
+#include "preset.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -268,6 +269,42 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
     }
 }
 
+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "/");
+    string_replace_all(clean_fname, "/", "");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    bool has_preset = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global; // unused for now
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
+            common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+        }
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
@@ -309,9 +346,7 @@ static handle_model_result common_params_handle_model(
             // make sure model path is present (for caching purposes)
             if (model.path.empty()) {
                 // this is to avoid different repo having same file name, or same file name in different subdirs
-                std::string filename = model.hf_repo + "_" + model.hf_file;
-                // to make sure we don't have any slashes in the filename
-                string_replace_all(filename, "/", "_");
+                std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
                 model.path = fs_get_cache_file(filename);
             }
 
@@ -425,61 +460,75 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
-    std::set<std::string> seen_args;
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
 
-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";
 
-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
             }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
             }
-
-            // arg with single value
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
             }
-            if (opt.handler_string) {
-                opt.handler_string(params, val);
-                continue;
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
             }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }
 
-            // arg with 2 values
-            check_arg(i);
-            std::string val2 = argv[++i];
-            if (opt.handler_str_str) {
-                opt.handler_str_str(params, val, val2);
-                continue;
-            }
-        } catch (std::exception & e) {
-            throw std::invalid_argument(string_format(
-                "error while handling argument \"%s\": %s\n\n"
-                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }
+
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
+            }
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
         }
     }
 
diff --git a/common/download.cpp b/common/download.cpp
index ef874725607..d0aa3860812 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -769,10 +769,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
 
 #if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
 
-static bool common_download_file_single(const std::string & url,
-                                        const std::string & path,
-                                        const std::string & bearer_token,
-                                        bool                offline) {
+bool common_download_file_single(const std::string & url,
+                                 const std::string & path,
+                                 const std::string & bearer_token,
+                                 bool                offline) {
     if (!offline) {
         return common_download_file_single_online(url, path, bearer_token);
     }
@@ -1096,6 +1096,13 @@ std::string common_docker_resolve_model(const std::string &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }
 
+bool common_download_file_single(const std::string &,
+                                 const std::string &,
+                                 const std::string &,
+                                 bool) {
+    throw std::runtime_error("download functionality is not enabled in this build");
+}
+
 #endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
 
 std::vector<common_cached_model_info> common_list_cached_models() {
diff --git a/common/download.h b/common/download.h
index d1321e6e90e..5f42527af8d 100644
--- a/common/download.h
+++ b/common/download.h
@@ -52,6 +52,12 @@ bool common_download_model(
 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();
 
+// download single file from url to local path
+bool common_download_file_single(const std::string & url,
+                                 const std::string & path,
+                                 const std::string & bearer_token,
+                                 bool                offline);
+
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
diff --git a/common/preset.cpp b/common/preset.cpp
index e2fc18c5dad..949fe001109 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -16,6 +16,46 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
+// only allow a subset of args for remote presets for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(std::map<std::string, common_arg> & key_to_opt) {
+    static const std::set<std::string> allowed_options = {
+        "model-url",
+        "hf-repo",
+        "hf-repo-draft",
+        "hf-repo-v", // vocoder
+        "hf-file-v", // vocoder
+        "mmproj-url",
+        "pooling",
+        "jinja",
+        "batch-size",
+        "ubatch-size",
+        "cache-reuse",
+        // note: sampling params are automatically allowed by default
+        // negated args will be added automatically
+    };
+
+    std::set<std::string> allowed_keys;
+
+    for (const auto & it : key_to_opt) {
+        const std::string & key = it.first;
+        const common_arg & opt = it.second;
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+            allowed_keys.insert(key);
+            // also add variant keys (args without leading dashes and env vars)
+            for (const auto & arg : opt.get_args()) {
+                allowed_keys.insert(rm_leading_dashes(arg));
+            }
+            for (const auto & env : opt.get_env()) {
+                allowed_keys.insert(env);
+            }
+        }
+    }
+
+    return allowed_keys;
+}
+
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
@@ -121,6 +161,29 @@ void common_preset::merge(const common_preset & other) {
     }
 }
 
+void common_preset::apply_to_params(common_params & params) const {
+    for (const auto & [opt, val] : options) {
+        // apply each option to params
+        if (opt.handler_string) {
+            opt.handler_string(params, val);
+        } else if (opt.handler_int) {
+            opt.handler_int(params, std::stoi(val));
+        } else if (opt.handler_bool) {
+            opt.handler_bool(params, common_arg_utils::is_truthy(val));
+        } else if (opt.handler_str_str) {
+            // not supported yet
+            throw std::runtime_error(string_format(
+                "%s: option with two values is not supported yet",
+                __func__
+            ));
+        } else if (opt.handler_void) {
+            opt.handler_void(params);
+        } else {
+            GGML_ABORT("unknown handler type");
+        }
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
@@ -230,10 +293,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
         : ctx_params(common_params_parser_init(default_params, ex)) {
     common_params_add_preset_options(ctx_params.options);
     key_to_opt = get_map_key_opt(ctx_params);
+
+    // setup allowed keys if only_remote_allowed is true
+    if (only_remote_allowed) {
+        filter_allowed_keys = true;
+        allowed_keys = get_remote_preset_whitelist(key_to_opt);
+    }
 }
 
 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -250,6 +319,12 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
         LOG_DBG("loading preset: %s\n", preset.name.c_str());
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
+                throw std::runtime_error(string_format(
+                    "option '%s' is not allowed in remote presets",
+                    key.c_str()
+                ));
+            }
             if (key_to_opt.find(key) != key_to_opt.end()) {
                 const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
diff --git a/common/preset.h b/common/preset.h
index 3a84d1be29c..11ba6ef8124 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <map>
+#include <set>
 
 //
 // INI preset parser and writer
@@ -40,6 +41,9 @@ struct common_preset {
 
     // merge another preset into this one, overwriting existing options
     void merge(const common_preset & other);
+
+    // apply preset options to common_params
+    void apply_to_params(common_params & params) const;
 };
 
 // interface for multiple presets in one file
@@ -50,7 +54,12 @@ struct common_preset_context {
     common_params default_params; // unused for now
     common_params_context ctx_params;
     std::map<std::string, common_arg> key_to_opt;
-    common_preset_context(llama_example ex);
+
+    bool filter_allowed_keys = false;
+    std::set<std::string> allowed_keys;
+
+    // if only_remote_allowed is true, only accept whitelisted keys
+    common_preset_context(llama_example ex, bool only_remote_allowed = false);
 
     // load presets from INI file
     common_presets load_from_ini(const std::string & path, common_preset & global) const;
diff --git a/docs/preset.md b/docs/preset.md
new file mode 100644
index 00000000000..daea562cc55
--- /dev/null
+++ b/docs/preset.md
@@ -0,0 +1,50 @@
+# llama.cpp INI preset
+
+## Introduction
+
+INI preset is a feature that was added in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859). The goal is to allow writing reusable and sharable parameter presets in llama.cpp
+
+### Using preset on server
+
+When using multiple models on server (router mode), INI preset file can be used to configure model-specific parameters. Please refer to [server documentations](../tools/server/README.md) for more.
+
+### Using a remote preset
+
+> [!NOTE]
+>
+> This feature is currently only supported via the `-hf` option
+
+For GGUF models stored on Hugging Face, you can create a file named `preset.ini` in the root directory of the repository that contains specific configurations for the current model.
+
+Example:
+
+```ini
+hf-repo-draft = username/my-draft-model-GGUF
+temp = 0.5
+top-k = 20
+top-p = 0.95
+```
+
+For security reason, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the list of allowed options.
+
+Example usage:
+
+Provided your repo is `username/my-model-with-preset` having a `preset.ini` with the content above.
+
+```sh
+llama-cli -hf username/my-model-with-preset
+
+# equivalent to
+llama-cli -hf username/my-model-with-preset \
+  --hf-repo-draft username/my-draft-model-GGUF \
+  --temp 0.5 \
+  --top-k 20 \
+  --top-p 0.95
+```
+
+You can also optionally override preset args by specifying them in the arguments:
+
+```sh
+# forcing temp = 0.1
+llama-cli -hf username/my-model-with-preset --temp 0.1
+```

From f9a97375cb539c7f195b473871a3a1eccf9da3d8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 1 Jan 2026 12:50:45 +0100
Subject: [PATCH 2/8] proof reading

---
 docs/preset.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/preset.md b/docs/preset.md
index daea562cc55..c11c5dc22fe 100644
--- a/docs/preset.md
+++ b/docs/preset.md
@@ -1,20 +1,20 @@
-# llama.cpp INI preset
+# llama.cpp INI Presets
 
 ## Introduction
 
-INI preset is a feature that was added in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859). The goal is to allow writing reusable and sharable parameter presets in llama.cpp
+The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859), allows users to create reusable and shareable parameter configurations for llama.cpp.
 
-### Using preset on server
+### Using Presets with the Server
 
-When using multiple models on server (router mode), INI preset file can be used to configure model-specific parameters. Please refer to [server documentations](../tools/server/README.md) for more.
+When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
 
-### Using a remote preset
+### Using a Remote Preset
 
 > [!NOTE]
 >
-> This feature is currently only supported via the `-hf` option
+> This feature is currently only supported via the `-hf` option.
 
-For GGUF models stored on Hugging Face, you can create a file named `preset.ini` in the root directory of the repository that contains specific configurations for the current model.
+For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
 
 Example:
 
@@ -25,16 +25,16 @@ top-k = 20
 top-p = 0.95
 ```
 
-For security reason, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the list of allowed options.
+For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
 
 Example usage:
 
-Provided your repo is `username/my-model-with-preset` having a `preset.ini` with the content above.
+Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
 
 ```sh
 llama-cli -hf username/my-model-with-preset
 
-# equivalent to
+# This is equivalent to:
 llama-cli -hf username/my-model-with-preset \
   --hf-repo-draft username/my-draft-model-GGUF \
   --temp 0.5 \
@@ -42,9 +42,9 @@ llama-cli -hf username/my-model-with-preset \
   --top-p 0.95
 ```
 
-You can also optionally override preset args by specifying them in the arguments:
+You can also override preset arguments by specifying them on the command line:
 
 ```sh
-# forcing temp = 0.1
+# Force temp = 0.1, overriding the preset value
 llama-cli -hf username/my-model-with-preset --temp 0.1
 ```

From 9935820cb881f83e1eda2f37cf44ded6930a2cb2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 6 Jan 2026 16:48:40 +0100
Subject: [PATCH 3/8] allow one HF repo to point to multiple HF repos

---
 common/arg.cpp      | 13 +++++++++++++
 common/download.cpp |  2 +-
 common/preset.cpp   |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4d4f91e96f5..d8ece34a3b7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -296,6 +296,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
         auto remote_presets = ctx.load_from_ini(preset_path, global);
         if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
             common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
             preset.apply_to_params(params);
         } else {
             throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
@@ -525,11 +526,23 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // maybe handle remote preset
     if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
         bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
         if (has_preset) {
             // re-parse CLI args to override preset values
             parse_cli_args();
         }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
+        }
     }
 
     postprocess_cpu_params(params.cpuparams,       nullptr);
diff --git a/common/download.cpp b/common/download.cpp
index d0aa3860812..3bff968623e 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -952,7 +952,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
     } else if (res_code == 401) {
         throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
     } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+        throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
     }
 
     // check response
diff --git a/common/preset.cpp b/common/preset.cpp
index 949fe001109..aec14e07692 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -19,7 +19,7 @@ static std::string rm_leading_dashes(const std::string & str) {
 // only allow a subset of args for remote presets for security reasons
 // do not add more args unless absolutely necessary
 // args that output to files are strictly prohibited
-static std::set<std::string> get_remote_preset_whitelist(std::map<std::string, common_arg> & key_to_opt) {
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
     static const std::set<std::string> allowed_options = {
         "model-url",
         "hf-repo",

From 9e173f9061e181ea5d07e29731c6863269db4629 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 6 Jan 2026 17:00:59 +0100
Subject: [PATCH 4/8] docs: mention about multiple GGUF use case

---
 docs/preset.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/preset.md b/docs/preset.md
index c11c5dc22fe..be50bb99266 100644
--- a/docs/preset.md
+++ b/docs/preset.md
@@ -48,3 +48,13 @@ You can also override preset arguments by specifying them on the command line:
 # Force temp = 0.1, overriding the preset value
 llama-cli -hf username/my-model-with-preset --temp 0.1
 ```
+
+If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
+
+```ini
+hf-repo = user/my-model-main
+hf-repo-draft = user/my-model-draft
+temp = 0.8
+batch-size = 1024
+; (and other configurations)
+```

From 74a33726868de463966233e9fba10a6b44cb1d77 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 8 Jan 2026 14:47:50 +0100
Subject: [PATCH 5/8] correct clean_file_name

---
 common/arg.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 64da5d293e9..1dcf7e86b29 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -271,8 +271,8 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
 
 static std::string clean_file_name(const std::string & fname) {
     std::string clean_fname = fname;
-    string_replace_all(clean_fname, "\\", "/");
-    string_replace_all(clean_fname, "/", "");
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
     return clean_fname;
 }
 

From 7fccd041f33cef619920f7ee4dc238d7cb410386 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 8 Jan 2026 15:29:06 +0100
Subject: [PATCH 6/8] download: also return HTTP status code

---
 common/arg.cpp      |  5 ++-
 common/download.cpp | 82 +++++++++++++++++++++++++++------------------
 common/download.h   | 10 +++---
 3 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a80f336a67d..72750a3cba0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -286,7 +286,8 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
     // prepare local path for caching
     auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
     auto preset_path = fs_get_cache_file(preset_fname);
-    bool has_preset = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
 
     // remote preset is optional, so we don't error out if not found
     if (has_preset) {
@@ -301,6 +302,8 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
         } else {
             throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
         }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
     }
 
     return has_preset;
diff --git a/common/download.cpp b/common/download.cpp
index eca43e23afe..4b3e7930602 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -157,6 +157,10 @@ static std::string read_etag(const std::string & path) {
     return none;
 }
 
+static bool is_http_status_ok(int status) {
+    return status >= 200 && status < 400;
+}
+
 #ifdef LLAMA_USE_CURL
 
 //
@@ -306,12 +310,14 @@ static bool common_download_head(CURL *              curl,
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
                                                const std::string & path,
                                                const std::string & bearer_token,
                                                const common_header_list & custom_headers) {
     static const int max_attempts        = 3;
     static const int retry_delay_seconds = 2;
+
     for (int i = 0; i < max_attempts; ++i) {
         std::string etag;
 
@@ -371,7 +377,7 @@ static bool common_download_file_single_online(const std::string & url,
                 LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
                 if (remove(path.c_str()) != 0) {
                     LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                    return false;
+                    return -1;
                 }
             }
 
@@ -380,14 +386,14 @@ static bool common_download_file_single_online(const std::string & url,
                 if (std::filesystem::exists(path_temporary)) {
                     if (remove(path_temporary.c_str()) != 0) {
                         LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                        return false;
+                        return -1;
                     }
                 }
 
                 if (std::filesystem::exists(path)) {
                     if (remove(path.c_str()) != 0) {
                         LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                        return false;
+                        return -1;
                     }
                 }
             }
@@ -414,23 +420,27 @@ static bool common_download_file_single_online(const std::string & url,
 
             long http_code = 0;
             curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-            if (http_code < 200 || http_code >= 400) {
+
+            int status = static_cast<int>(http_code);
+            if (!is_http_status_ok(http_code)) {
                 LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-                return false;
+                return status; // TODO: maybe only return on certain codes
             }
 
             if (rename(path_temporary.c_str(), path.c_str()) != 0) {
                 LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-                return false;
+                return -1;
             }
+
+            return static_cast<int>(http_code);
         } else {
             LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-        }
 
-        break;
+            return 304; // Not Modified - fake cached response
+        }
     }
 
-    return true;
+    return -1; // max attempts reached
 }
 
 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
@@ -625,7 +635,8 @@ static bool common_pull_file(httplib::Client & cli,
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
                                                const std::string & path,
                                                const std::string & bearer_token,
                                                const common_header_list & custom_headers) {
@@ -659,8 +670,10 @@ static bool common_download_file_single_online(const std::string & url,
             LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
             if (file_exists) {
                 LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
-                return true;
+                return head->status;
             }
+            return head->status; // cannot use cached file, return raw status code
+            // TODO: maybe retry only on certain codes
         }
 
         std::string etag;
@@ -692,12 +705,12 @@ static bool common_download_file_single_online(const std::string & url,
         if (file_exists) {
             if (!should_download_from_scratch) {
                 LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-                return true;
+                return 304; // 304 Not Modified - fake cached response
             }
             LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
             if (remove(path.c_str()) != 0) {
                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
+                return -1;
             }
         }
 
@@ -709,7 +722,7 @@ static bool common_download_file_single_online(const std::string & url,
                 existing_size = std::filesystem::file_size(path_temporary);
             } else if (remove(path_temporary.c_str()) != 0) {
                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                return false;
+                return -1;
             }
         }
 
@@ -730,15 +743,16 @@ static bool common_download_file_single_online(const std::string & url,
 
         if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
             LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
+            return -1;
         }
         if (!etag.empty()) {
             write_etag(path, etag);
         }
-        break;
+
+        return head->status; // TODO: use actual GET status?
     }
 
-    return true;
+    return -1; // max attempts reached
 }
 
 std::pair<long, std::vector<char>> common_remote_get_content(const std::string          & url,
@@ -777,22 +791,22 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
 
 #if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
 
-static bool common_download_file_single(const std::string & url,
-                                        const std::string & path,
-                                        const std::string & bearer_token,
-                                        bool                offline,
-                                        const common_header_list & headers) {
+int common_download_file_single(const std::string & url,
+                                const std::string & path,
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers) {
     if (!offline) {
         return common_download_file_single_online(url, path, bearer_token, headers);
     }
 
     if (!std::filesystem::exists(path)) {
         LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-        return false;
+        return -1;
     }
 
     LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return true;
+    return -1;
 }
 
 // download multiple files from remote URLs to local paths
@@ -810,7 +824,8 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
             std::async(
                 std::launch::async,
                 [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
-                    return common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+                    const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+                    return is_http_status_ok(http_status);
                 },
                 item
             )
@@ -837,7 +852,8 @@ bool common_download_model(const common_params_model & model,
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token, offline, headers)) {
+    const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
+    if (!is_http_status_ok(http_status)) {
         return false;
     }
 
@@ -1094,7 +1110,8 @@ std::string common_docker_resolve_model(const std::string & docker) {
         std::string local_path = fs_get_cache_file(model_filename);
 
         const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
-        if (!common_download_file_single(blob_url, local_path, token, false, {})) {
+        const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
+        if (!is_http_status_ok(http_status)) {
             throw std::runtime_error("Failed to download Docker Model");
         }
 
@@ -1120,10 +1137,11 @@ std::string common_docker_resolve_model(const std::string &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }
 
-bool common_download_file_single(const std::string &,
-                                 const std::string &,
-                                 const std::string &,
-                                 const common_header_list &) {
+int common_download_file_single(const std::string &,
+                                const std::string &,
+                                const std::string &,
+                                bool,
+                                const common_header_list &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }
 
diff --git a/common/download.h b/common/download.h
index a4803eee0b6..c79be2f90eb 100644
--- a/common/download.h
+++ b/common/download.h
@@ -66,10 +66,12 @@ bool common_download_model(
 std::vector<common_cached_model_info> common_list_cached_models();
 
 // download single file from url to local path
-bool common_download_file_single(const std::string & url,
-                                 const std::string & path,
-                                 const std::string & bearer_token,
-                                 const common_header_list & headers = {});
+// returns the HTTP status code (304 if the cached file is reused), or -1 on a non-HTTP error
+int common_download_file_single(const std::string & url,
+                                const std::string & path,
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers = {});
 
 // resolve and download model from Docker registry
 // return local path to downloaded model file

From 36b6b9856dd5a26f90faf55fcdbab64b9d80d2a2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 8 Jan 2026 15:56:48 +0100
Subject: [PATCH 7/8] fix case with cache file used

---
 common/download.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/download.cpp b/common/download.cpp
index 4b3e7930602..b8b9b45b852 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -670,7 +670,7 @@ static int common_download_file_single_online(const std::string & url,
             LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
             if (file_exists) {
                 LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
-                return head->status;
+                return 304; // 304 Not Modified - fake cached response
             }
-            return head->status; // cannot use cached file, return raw status code
+            return head ? head->status : -1; // no cached file: raw status code, or -1 when HEAD got no response (head is empty)
             // TODO: maybe retry only on certain codes

From 38cd7fb4f665c6bae7c355260d2c87371584297c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 8 Jan 2026 17:51:06 +0100
Subject: [PATCH 8/8] fix --offline option

---
 common/download.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/download.cpp b/common/download.cpp
index b8b9b45b852..a1e0e518e9a 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -806,7 +806,7 @@ int common_download_file_single(const std::string & url,
     }
 
     LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return -1;
+    return 304; // Not Modified - fake cached response
 }
 
 // download multiple files from remote URLs to local paths