From 26d0e9b2eb6e30c137b442ffe65c9642b31ca2e0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 19 Mar 2026 00:00:52 +0100 Subject: [PATCH 1/8] wip: server_tools --- common/arg.cpp | 8 + common/common.h | 3 + tools/server/CMakeLists.txt | 2 + tools/server/README-dev.md | 16 + tools/server/README.md | 8 + tools/server/server-tools.cpp | 596 ++++++++++++++++++++++++++++++++++ tools/server/server-tools.h | 7 + tools/server/server.cpp | 10 + 8 files changed, 650 insertions(+) create mode 100644 tools/server/server-tools.cpp create mode 100644 tools/server/server-tools.h diff --git a/common/arg.cpp b/common/arg.cpp index 666339a0945..f54cab4449d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2848,6 +2848,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.webui_mcp_proxy = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); + add_opt(common_arg( + {"--tools"}, + {"--no-tools"}, + string_format("experimental: whether to enable tools for AI agents - do not enable in untrusted environments (default: %s)", params.server_tools ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.server_tools = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); add_opt(common_arg( {"--webui"}, {"--no-webui"}, diff --git a/common/common.h b/common/common.h index 073ef566d2d..25f798ff0ff 100644 --- a/common/common.h +++ b/common/common.h @@ -569,6 +569,9 @@ struct common_params { bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; + // enable built-in tools + bool server_tools = false; + // router server configs std::string models_dir = ""; // directory containing models for the router server std::string models_preset = ""; // directory containing model presets for the router server diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 5621a51b226..fc4cb5dcc8a 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -13,6 +13,8 @@ add_library(${TARGET} STATIC server-common.h server-context.cpp server-context.h + server-tools.cpp + server-tools.h ) if (BUILD_SHARED_LIBS) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index 3fea3042f72..8318c5852d8 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -95,6 +95,22 @@ The framework automatically starts a `llama-server` instance, sends requests, an For detailed instructions, see the [test documentation](./tests/README.md). +### API for tools + +This endpoint is intended to be used internally by the Web UI and subject to change or to be removed in the future. + +**GET /tools** + +Get a list of tools, the tool definition is in OAI-compat format. + +**POST /tools** + +Invoke a tool call, request body is a JSON object with: +- `tool` (string): the name of the tool +- `params` (object): a mapping from argument name (string) to argument value + +Returns JSON object, the schema depends on the tool itself. 
+ ### Notable Related PRs - Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443 diff --git a/tools/server/README.md b/tools/server/README.md index da16ddc756e..c0fd8b06610 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1433,6 +1433,14 @@ curl http://localhost:8080/v1/messages/count_tokens \ {"input_tokens": 10} ``` +## Server built-in tools + +The server exposes a REST API under `/tools` that allows the Web UI to call built-in tools. This endpoint is intended to be used internally by the Web UI and subject to change or to be removed in the future. + +**Please do NOT use this endpoint in a downstream application** + +For further documentation about this endpoint, please refer to [server internal documentation](./README-dev.md) + ## Using multiple models `llama-server` can be launched in a **router mode** that exposes an API for dynamically loading and unloading models. The main process (the "router") automatically forwards each request to the appropriate model instance. 
diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp new file mode 100644 index 00000000000..a040c42051f --- /dev/null +++ b/tools/server/server-tools.cpp @@ -0,0 +1,596 @@ +#include "server-tools.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +// +// internal helpers +// + +static std::vector to_cstr_vec(const std::vector & v) { + std::vector r; + r.reserve(v.size() + 1); + for (const auto & s : v) { + r.push_back(const_cast(s.c_str())); + } + r.push_back(nullptr); + return r; +} + +struct run_proc_result { + std::string output; + int exit_code = -1; + bool timed_out = false; +}; + +static run_proc_result run_process( + const std::vector & args, + size_t max_output, + int timeout_secs) { + run_proc_result res; + + subprocess_s proc; + auto argv = to_cstr_vec(args); + + int options = subprocess_option_no_window + | subprocess_option_combined_stdout_stderr + | subprocess_option_inherit_environment + | subprocess_option_search_user_path; + + if (subprocess_create(argv.data(), options, &proc) != 0) { + res.output = "failed to spawn process"; + return res; + } + + std::atomic done{false}; + std::atomic timed_out{false}; + + std::thread timeout_thread([&]() { + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_secs); + while (!done.load()) { + if (std::chrono::steady_clock::now() >= deadline) { + timed_out.store(true); + subprocess_terminate(&proc); + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + }); + + FILE * f = subprocess_stdout(&proc); + std::string output; + bool truncated = false; + if (f) { + char buf[4096]; + while (fgets(buf, sizeof(buf), f) != nullptr) { + if (!truncated) { + size_t len = strlen(buf); + if (output.size() + len <= max_output) { + output.append(buf, len); + } else { + output.append(buf, max_output - output.size()); + truncated = true; + } + } + } + } + + done.store(true); + 
if (timeout_thread.joinable()) { + timeout_thread.join(); + } + + subprocess_join(&proc, &res.exit_code); + subprocess_destroy(&proc); + + res.output = output; + res.timed_out = timed_out.load(); + if (truncated) { + res.output += "\n[output truncated]"; + } + return res; +} + +// simple glob: * matches non-/ chars, ** matches anything including / +static bool glob_match(const char * pattern, const char * str) { + if (*pattern == '\0') { + return *str == '\0'; + } + if (pattern[0] == '*' && pattern[1] == '*') { + const char * p = pattern + 2; + if (*p == '/') p++; + if (glob_match(p, str)) return true; + if (*str != '\0') return glob_match(pattern, str + 1); + return false; + } + if (*pattern == '*') { + const char * p = pattern + 1; + for (; *str != '\0' && *str != '/'; str++) { + if (glob_match(p, str)) return true; + } + return glob_match(p, str); + } + if (*pattern == '?' && *str != '\0' && *str != '/') { + return glob_match(pattern + 1, str + 1); + } + if (*pattern == *str) { + return glob_match(pattern + 1, str + 1); + } + return false; +} + +static bool glob_match(const std::string & pattern, const std::string & str) { + return glob_match(pattern.c_str(), str.c_str()); +} + +// +// base struct +// + +struct server_tool { + std::string name; + json definition; + bool permission_write = false; + virtual ~server_tool() = default; + virtual json to_json() = 0; + virtual json invoke(json params) = 0; +}; + +// +// read_file: read a file with optional line range and line-number prefix +// + +static constexpr size_t SERVER_TOOL_READ_FILE_MAX_SIZE = 16 * 1024; // 16 KB + +struct server_tool_read_file : server_tool { + server_tool_read_file() { name = "read_file"; permission_write = false; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", "Read the contents of a file. Optionally specify a 1-based line range. " + "If append_loc is true, each line is prefixed with its line number (e.g. 
\"1\u2192 ...\")."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"path", {{"type", "string"}, {"description", "Path to the file"}}}, + {"start_line", {{"type", "integer"}, {"description", "First line to read, 1-based (default: 1)"}}}, + {"end_line", {{"type", "integer"}, {"description", "Last line to read, 1-based inclusive (default: end of file)"}}}, + {"append_loc", {{"type", "boolean"}, {"description", "Prefix each line with its line number"}}}, + }}, + {"required", json::array({"path"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string path = params.at("path").get(); + int start_line = json_value(params, "start_line", 1); + int end_line = json_value(params, "end_line", -1); // -1 = no limit + bool append_loc = json_value(params, "append_loc", false); + + std::error_code ec; + uintmax_t file_size = fs::file_size(path, ec); + if (ec) { + return {{"error", "cannot stat file: " + ec.message()}}; + } + if (file_size > SERVER_TOOL_READ_FILE_MAX_SIZE && end_line == -1) { + return {{"error", string_format( + "file too large (%zu bytes, max %zu). 
Use start_line/end_line to read a portion.", + (size_t)file_size, SERVER_TOOL_READ_FILE_MAX_SIZE)}}; + } + + std::ifstream f(path); + if (!f) { + return {{"error", "failed to open file: " + path}}; + } + + std::string result; + std::string line; + int lineno = 0; + + while (std::getline(f, line)) { + lineno++; + if (lineno < start_line) continue; + if (end_line != -1 && lineno > end_line) break; + + std::string out_line; + if (append_loc) { + out_line = std::to_string(lineno) + "\u2192 " + line + "\n"; + } else { + out_line = line + "\n"; + } + + if (result.size() + out_line.size() > SERVER_TOOL_READ_FILE_MAX_SIZE) { + result += "[output truncated]"; + break; + } + result += out_line; + } + + return {{"content", result}}; + } +}; + +// +// file_glob_search: find files matching a glob pattern under a base directory +// + +static constexpr size_t SERVER_TOOL_FILE_SEARCH_MAX_RESULTS = 100; + +struct server_tool_file_glob_search : server_tool { + server_tool_file_glob_search() { name = "file_glob_search"; permission_write = false; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", "Recursively search for files matching a glob pattern under a directory."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"path", {{"type", "string"}, {"description", "Base directory to search in"}}}, + {"include", {{"type", "string"}, {"description", "Glob pattern for files to include (e.g. \"**/*.cpp\"). 
Default: **"}}}, + {"exclude", {{"type", "string"}, {"description", "Glob pattern for files to exclude"}}}, + }}, + {"required", json::array({"path"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string base = params.at("path").get(); + std::string include = json_value(params, "include", std::string("**")); + std::string exclude = json_value(params, "exclude", std::string("")); + + json files = json::array(); + + std::error_code ec; + for (const auto & entry : fs::recursive_directory_iterator(base, + fs::directory_options::skip_permission_denied, ec)) { + if (!entry.is_regular_file()) continue; + + std::string rel = fs::relative(entry.path(), base, ec).string(); + if (ec) continue; + std::replace(rel.begin(), rel.end(), '\\', '/'); + + if (!glob_match(include, rel)) continue; + if (!exclude.empty() && glob_match(exclude, rel)) continue; + + files.push_back(entry.path().string()); + if (files.size() >= SERVER_TOOL_FILE_SEARCH_MAX_RESULTS) { + break; + } + } + + return {{"files", files}, {"count", files.size()}}; + } +}; + +// +// grep_search: search for a regex pattern in files +// + +static constexpr size_t SERVER_TOOL_GREP_SEARCH_MAX_RESULTS = 100; + +struct server_tool_grep_search : server_tool { + server_tool_grep_search() { name = "grep_search"; permission_write = false; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", "Search for a regex pattern in files under a path. 
Returns matching lines."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"path", {{"type", "string"}, {"description", "File or directory to search in"}}}, + {"pattern", {{"type", "string"}, {"description", "Regular expression pattern to search for"}}}, + {"include", {{"type", "string"}, {"description", "Glob pattern to filter files (default: **)"}}}, + {"exclude", {{"type", "string"}, {"description", "Glob pattern to exclude files"}}}, + {"return_line_numbers", {{"type", "boolean"}, {"description", "If true, include line numbers in results"}}}, + }}, + {"required", json::array({"path", "pattern"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string path = params.at("path").get(); + std::string pat_str = params.at("pattern").get(); + std::string include = json_value(params, "include", std::string("**")); + std::string exclude = json_value(params, "exclude", std::string("")); + bool show_lineno = json_value(params, "return_line_numbers", false); + + std::regex pattern; + try { + pattern = std::regex(pat_str); + } catch (const std::regex_error & e) { + return {{"error", std::string("invalid regex: ") + e.what()}}; + } + + json matches = json::array(); + size_t total = 0; + + auto search_file = [&](const fs::path & fpath) { + std::ifstream f(fpath); + if (!f) return; + std::string line; + int lineno = 0; + while (std::getline(f, line) && total < SERVER_TOOL_GREP_SEARCH_MAX_RESULTS) { + lineno++; + if (std::regex_search(line, pattern)) { + json match = {{"file", fpath.string()}, {"content", line}}; + if (show_lineno) { + match["line"] = lineno; + } + matches.push_back(match); + total++; + } + } + }; + + std::error_code ec; + if (fs::is_regular_file(path, ec)) { + search_file(path); + } else if (fs::is_directory(path, ec)) { + for (const auto & entry : fs::recursive_directory_iterator(path, + fs::directory_options::skip_permission_denied, ec)) { + if (!entry.is_regular_file()) continue; + if (total >= 
SERVER_TOOL_GREP_SEARCH_MAX_RESULTS) break; + + std::string rel = fs::relative(entry.path(), path, ec).string(); + if (ec) continue; + std::replace(rel.begin(), rel.end(), '\\', '/'); + + if (!glob_match(include, rel)) continue; + if (!exclude.empty() && glob_match(exclude, rel)) continue; + + search_file(entry.path()); + } + } else { + return {{"error", "path does not exist: " + path}}; + } + + return {{"matches", matches}, {"count", total}}; + } +}; + +// +// exec_shell_command: run an arbitrary shell command +// + +static constexpr size_t SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE = 16 * 1024; // 16 KB +static constexpr int SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT = 60; // seconds + +struct server_tool_exec_shell_command : server_tool { + server_tool_exec_shell_command() { name = "exec_shell_command"; permission_write = true; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", "Execute a shell command and return its output (stdout and stderr combined)."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"command", {{"type", "string"}, {"description", "Shell command to execute"}}}, + {"timeout", {{"type", "integer"}, {"description", string_format("Timeout in seconds (default 10, max %d)", SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT)}}}, + {"max_output_size", {{"type", "integer"}, {"description", string_format("Maximum output size in bytes (default %zu)", SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE)}}}, + }}, + {"required", json::array({"command"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string command = params.at("command").get(); + int timeout = json_value(params, "timeout", 10); + size_t max_output = (size_t) json_value(params, "max_output_size", (int) SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE); + + timeout = std::min(timeout, SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT); + max_output = std::min(max_output, 
SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE); + +#ifdef _WIN32 + std::vector args = {"cmd", "/c", command}; +#else + std::vector args = {"sh", "-c", command}; +#endif + + auto res = run_process(args, max_output, timeout); + + json out = {{"output", res.output}, {"exit_code", res.exit_code}}; + if (res.timed_out) { + out["timed_out"] = true; + } + return out; + } +}; + +// +// write_file: create or overwrite a file +// + +struct server_tool_write_file : server_tool { + server_tool_write_file() { name = "write_file"; permission_write = true; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", "Write content to a file, creating it (including parent directories) if it does not exist."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"path", {{"type", "string"}, {"description", "Path of the file to write"}}}, + {"content", {{"type", "string"}, {"description", "Content to write"}}}, + }}, + {"required", json::array({"path", "content"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string path = params.at("path").get(); + std::string content = params.at("content").get(); + + std::error_code ec; + fs::path fpath(path); + if (fpath.has_parent_path()) { + fs::create_directories(fpath.parent_path(), ec); + if (ec) { + return {{"error", "failed to create directories: " + ec.message()}}; + } + } + + std::ofstream f(path, std::ios::binary); + if (!f) { + return {{"error", "failed to open file for writing: " + path}}; + } + f << content; + if (!f) { + return {{"error", "failed to write file: " + path}}; + } + + return {{"result", "file written successfully"}, {"path", path}, {"bytes", content.size()}}; + } +}; + +// +// edit_file: apply a unified diff via git apply +// + +struct server_tool_edit_file : server_tool { + server_tool_edit_file() { name = "edit_file"; permission_write = true; } + + json to_json() override { + return { + {"type", "function"}, + {"function", { 
+ {"name", name}, + {"description", "Apply a unified diff to edit one or more files using git apply."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"diff", {{"type", "string"}, {"description", "Unified diff content in git diff format"}}}, + }}, + {"required", json::array({"diff"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string diff = params.at("diff").get(); + + // write diff to a temporary file + static std::atomic counter{0}; + std::string tmp_path = (fs::temp_directory_path() / + ("llama_patch_" + std::to_string(++counter) + ".patch")).string(); + + { + std::ofstream f(tmp_path, std::ios::binary); + if (!f) { + return {{"error", "failed to create temp patch file"}}; + } + f << diff; + } + + auto res = run_process({"git", "apply", tmp_path}, 4096, 10); + + std::error_code ec; + fs::remove(tmp_path, ec); + + if (res.exit_code != 0) { + return {{"error", "git apply failed (exit " + std::to_string(res.exit_code) + "): " + res.output}}; + } + return {{"result", "patch applied successfully"}}; + } +}; + +// +// public API +// + +static std::vector> build_tools() { + std::vector> tools; + tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); + return tools; +} + +static json server_tools_list() { + auto tools = build_tools(); + json result = json::array(); + for (const auto & t : tools) { + result.push_back(t->to_json()); + } + return result; +} + +static json server_tool_call(const std::string & name, const json & params) { + auto tools = build_tools(); + for (auto & t : tools) { + if (t->name == name) { + return t->invoke(params); + } + } + return {{"error", "unknown tool: " + name}}; +} + +server_http_context::handler_t server_tools_get = [](const server_http_req &) -> server_http_res_ptr { + auto res = std::make_unique(); + try { + json tools 
= server_tools_list(); + res->data = safe_json_to_str(tools); + } catch (const std::exception & e) { + SRV_ERR("got exception: %s\n", e.what()); + res->status = 500; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); + } + return res; +}; + +server_http_context::handler_t server_tools_post = [](const server_http_req & req) -> server_http_res_ptr { + auto res = std::make_unique(); + try { + json body = json::parse(req.body); + std::string tool_name = body.at("tool").get(); + json params = body.value("params", json::object()); + json result = server_tool_call(tool_name, params); + res->data = safe_json_to_str(result); + } catch (const json::exception & e) { + res->status = 400; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + } catch (const std::exception & e) { + SRV_ERR("got exception: %s\n", e.what()); + res->status = 500; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); + } + return res; +}; diff --git a/tools/server/server-tools.h b/tools/server/server-tools.h new file mode 100644 index 00000000000..141235d7993 --- /dev/null +++ b/tools/server/server-tools.h @@ -0,0 +1,7 @@ +#pragma once + +#include "server-common.h" +#include "server-http.h" + +extern server_http_context::handler_t server_tools_get; +extern server_http_context::handler_t server_tools_post; \ No newline at end of file diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0bd6fda17d2..d1db1ed1ea0 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,6 +2,7 @@ #include "server-http.h" #include "server-models.h" #include "server-cors-proxy.h" +#include "server-tools.h" #include "arg.h" #include "common.h" @@ -211,6 +212,15 @@ int main(int argc, char ** argv) { ctx_http.get ("/cors-proxy", ex_wrapper(proxy_handler_get)); ctx_http.post("/cors-proxy", ex_wrapper(proxy_handler_post)); } + // EXPERIMENTAL built-in tools + if (params.server_tools) { + 
SRV_WRN("%s", "-----------------\n"); + SRV_WRN("%s", "Built-in tools are enabled, do not expose server to untrusted environments\n"); + SRV_WRN("%s", "This feature is EXPERIMENTAL and may be changed in the future\n"); + SRV_WRN("%s", "-----------------\n"); + ctx_http.get ("/tools", ex_wrapper(server_tools_get)); + ctx_http.post("/tools", ex_wrapper(server_tools_post)); + } // // Start the server From 7f9f53124bf1a0ed0110332ed1377a728328e99c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 19 Mar 2026 23:03:33 +0100 Subject: [PATCH 2/8] refactor --- tools/server/README-dev.md | 43 +++++- tools/server/server-tools.cpp | 270 ++++++++++++++++++++++++++++++---- 2 files changed, 282 insertions(+), 31 deletions(-) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index 8318c5852d8..326cb357b44 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -101,7 +101,12 @@ This endpoint is intended to be used internally by the Web UI and subject to cha **GET /tools** -Get a list of tools, the tool definition is in OAI-compat format. +Get a list of tools, each tool has these fields: +- `tool` (string): the ID name of the tool, to be used in POST call. Example: `read_file` +- `displayName` (string): the name to be displayed on UI. Example: `Read file` +- `type` (string): always be `"builtin"` for now +- `permissions` (object): a mapping string --> boolean that indicates the permission required by this tool. This is useful for the UI to ask the user before calling the tool. For now, the only permission supported is `"write"` +- `definition` (object): the OAI-compat definition of this tool **POST /tools** @@ -109,7 +114,41 @@ Invoke a tool call, request body is a JSON object with: - `tool` (string): the name of the tool - `params` (object): a mapping from argument name (string) to argument value -Returns JSON object, the schema depends on the tool itself. +Returns JSON object. There are two response formats: + +Format 1: Plain text. 
The text will be placed into a field called `plain_text_response`, example: + +```json +{ + "plain_text_response": "this is a text response" +} +``` + +The client should extract this value and place it inside message content (note: content is no longer a JSON), example + +```json +{ + "role": "tool", + "content": "this is a text response" +} +``` + +Format 2: Normal JSON response, example: + +```json +{ + "error": "cannot open this file" +} +``` + +That requires `JSON.stringify` when formatted to message content: + +```json +{ + "role": "tool", + "content": "{\"error\":\"cannot open this file\"}" +} +``` ### Notable Related PRs diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index a040c42051f..d5c7921efd2 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -139,11 +139,25 @@ static bool glob_match(const std::string & pattern, const std::string & str) { struct server_tool { std::string name; + std::string displayName; json definition; bool permission_write = false; + virtual ~server_tool() = default; - virtual json to_json() = 0; + virtual json get_definition() = 0; virtual json invoke(json params) = 0; + + json to_json() { + return { + {"displayName", displayName}, + {"tool", name}, + {"type", "builtin"}, + {"permissions", json{ + {"write", permission_write} + }}, + {"definition", get_definition()}, + }; + } }; // @@ -153,9 +167,13 @@ struct server_tool { static constexpr size_t SERVER_TOOL_READ_FILE_MAX_SIZE = 16 * 1024; // 16 KB struct server_tool_read_file : server_tool { - server_tool_read_file() { name = "read_file"; permission_write = false; } + server_tool_read_file() { + name = "read_file"; + displayName = "Read file"; + permission_write = false; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { @@ -221,7 +239,7 @@ struct server_tool_read_file : server_tool { result += out_line; } - return {{"content", result}}; + return 
{{"plain_text_response", result}}; } }; @@ -232,9 +250,13 @@ struct server_tool_read_file : server_tool { static constexpr size_t SERVER_TOOL_FILE_SEARCH_MAX_RESULTS = 100; struct server_tool_file_glob_search : server_tool { - server_tool_file_glob_search() { name = "file_glob_search"; permission_write = false; } + server_tool_file_glob_search() { + name = "file_glob_search"; + displayName = "File search"; + permission_write = false; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { @@ -258,7 +280,8 @@ struct server_tool_file_glob_search : server_tool { std::string include = json_value(params, "include", std::string("**")); std::string exclude = json_value(params, "exclude", std::string("")); - json files = json::array(); + std::ostringstream output_text; + size_t count = 0; std::error_code ec; for (const auto & entry : fs::recursive_directory_iterator(base, @@ -272,13 +295,15 @@ struct server_tool_file_glob_search : server_tool { if (!glob_match(include, rel)) continue; if (!exclude.empty() && glob_match(exclude, rel)) continue; - files.push_back(entry.path().string()); - if (files.size() >= SERVER_TOOL_FILE_SEARCH_MAX_RESULTS) { + output_text << entry.path().string() << "\n"; + if (++count >= SERVER_TOOL_FILE_SEARCH_MAX_RESULTS) { break; } } - return {{"files", files}, {"count", files.size()}}; + output_text << "\n---\nTotal matches: " << count << "\n"; + + return {{"plain_text_response", output_text.str()}}; } }; @@ -289,9 +314,13 @@ struct server_tool_file_glob_search : server_tool { static constexpr size_t SERVER_TOOL_GREP_SEARCH_MAX_RESULTS = 100; struct server_tool_grep_search : server_tool { - server_tool_grep_search() { name = "grep_search"; permission_write = false; } + server_tool_grep_search() { + name = "grep_search"; + displayName = "Grep search"; + permission_write = false; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { @@ 
-326,7 +355,7 @@ struct server_tool_grep_search : server_tool { return {{"error", std::string("invalid regex: ") + e.what()}}; } - json matches = json::array(); + std::ostringstream output_text; size_t total = 0; auto search_file = [&](const fs::path & fpath) { @@ -337,11 +366,11 @@ struct server_tool_grep_search : server_tool { while (std::getline(f, line) && total < SERVER_TOOL_GREP_SEARCH_MAX_RESULTS) { lineno++; if (std::regex_search(line, pattern)) { - json match = {{"file", fpath.string()}, {"content", line}}; + output_text << fpath.string() << ":"; if (show_lineno) { - match["line"] = lineno; + output_text << lineno << ":"; } - matches.push_back(match); + output_text << line << "\n"; total++; } } @@ -369,7 +398,9 @@ struct server_tool_grep_search : server_tool { return {{"error", "path does not exist: " + path}}; } - return {{"matches", matches}, {"count", total}}; + output_text << "\n\n---\nTotal matches: " << total << "\n"; + + return {{"plain_text_response", output_text.str()}}; } }; @@ -381,9 +412,13 @@ static constexpr size_t SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE = 16 * 10 static constexpr int SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT = 60; // seconds struct server_tool_exec_shell_command : server_tool { - server_tool_exec_shell_command() { name = "exec_shell_command"; permission_write = true; } + server_tool_exec_shell_command() { + name = "exec_shell_command"; + displayName = "Execute shell command"; + permission_write = true; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { @@ -418,11 +453,13 @@ struct server_tool_exec_shell_command : server_tool { auto res = run_process(args, max_output, timeout); - json out = {{"output", res.output}, {"exit_code", res.exit_code}}; + std::string text_output = res.output; + text_output += string_format("\n[exit code: %d]", res.exit_code); if (res.timed_out) { - out["timed_out"] = true; + text_output += " [exit due to timed out]"; } - return 
out; + + return {{"plain_text_response", text_output}}; } }; @@ -431,14 +468,18 @@ struct server_tool_exec_shell_command : server_tool { // struct server_tool_write_file : server_tool { - server_tool_write_file() { name = "write_file"; permission_write = true; } + server_tool_write_file() { + name = "write_file"; + displayName = "Write file"; + permission_write = true; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { {"name", name}, - {"description", "Write content to a file, creating it (including parent directories) if it does not exist."}, + {"description", "Write content to a file, creating it (including parent directories) if it does not exist. May use with edit_file for more complex edits."}, {"parameters", { {"type", "object"}, {"properties", { @@ -478,18 +519,188 @@ struct server_tool_write_file : server_tool { }; // -// edit_file: apply a unified diff via git apply +// edit_file: edit file content via line-based changes // struct server_tool_edit_file : server_tool { - server_tool_edit_file() { name = "edit_file"; permission_write = true; } + server_tool_edit_file() { + name = "edit_file"; + displayName = "Edit file"; + permission_write = true; + } + + json get_definition() override { + return { + {"type", "function"}, + {"function", { + {"name", name}, + {"description", + "Edit a file by applying a list of line-based changes. " + "Each change targets a 1-based inclusive line range and has a mode: " + "\"replace\" (replace lines with content), " + "\"delete\" (remove lines, content must be empty string), " + "\"append\" (insert content after lineEnd). " + "Set lineStart to -1 to target the end of file (lineEnd is ignored in that case). " + "Changes must not overlap. 
They are applied in reverse line order automatically."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"path", {{"type", "string"}, {"description", "Path to the file to edit"}}}, + {"changes", { + {"type", "array"}, + {"description", "List of changes to apply"}, + {"items", { + {"type", "object"}, + {"properties", { + {"mode", {{"type", "string"}, {"description", "\"replace\", \"delete\", or \"append\""}}}, + {"lineStart", {{"type", "integer"}, {"description", "First line of the range (1-based); use -1 for end of file"}}}, + {"lineEnd", {{"type", "integer"}, {"description", "Last line of the range (1-based, inclusive); ignored when lineStart is -1"}}}, + {"content", {{"type", "string"}, {"description", "Content to insert; must be empty string for delete mode"}}}, + }}, + {"required", json::array({"mode", "lineStart", "lineEnd", "content"})}, + }}, + }}, + }}, + {"required", json::array({"path", "changes"})}, + }}, + }}, + }; + } + + json invoke(json params) override { + std::string path = params.at("path").get(); + const json & changes = params.at("changes"); + + if (!changes.is_array()) { + return {{"error", "\"changes\" must be an array"}}; + } + + // read file into lines + std::ifstream fin(path); + if (!fin) { + return {{"error", "failed to open file: " + path}}; + } + std::vector lines; + { + std::string line; + while (std::getline(fin, line)) { + lines.push_back(line); + } + } + fin.close(); + + // validate and collect changes, then sort descending by lineStart + struct change_entry { + std::string mode; + int line_start; // 1-based + int line_end; // 1-based inclusive + std::string content; + }; + std::vector entries; + entries.reserve(changes.size()); + + for (const auto & ch : changes) { + change_entry e; + e.mode = ch.at("mode").get(); + e.line_start = ch.at("lineStart").get(); + e.line_end = ch.at("lineEnd").get(); + e.content = ch.at("content").get(); + + if (e.mode != "replace" && e.mode != "delete" && e.mode != "append") { + return 
{{"error", "invalid mode \"" + e.mode + "\"; must be replace, delete, or append"}}; + } + if (e.mode == "delete" && !e.content.empty()) { + return {{"error", "content must be empty string for delete mode"}}; + } + int n = (int) lines.size(); + if (e.line_start == -1) { + // -1 means end of file; lineEnd is ignored — normalize to point past last line + e.line_start = n + 1; + e.line_end = n + 1; + } else { + if (e.line_start < 1 || e.line_end < e.line_start) { + return {{"error", string_format("invalid line range [%d, %d]", e.line_start, e.line_end)}}; + } + if (e.line_end > n) { + return {{"error", string_format("lineEnd %d exceeds file length %d", e.line_end, n)}}; + } + } + entries.push_back(std::move(e)); + } + + // sort descending so earlier-indexed changes don't shift later ones + std::sort(entries.begin(), entries.end(), [](const change_entry & a, const change_entry & b) { + return a.line_start > b.line_start; + }); + + // apply changes (0-based indices internally) + for (const auto & e : entries) { + int idx_start = e.line_start - 1; // 0-based + int idx_end = e.line_end - 1; // 0-based inclusive + + // split content into lines (preserve trailing newline awareness) + std::vector new_lines; + if (!e.content.empty()) { + std::istringstream ss(e.content); + std::string ln; + while (std::getline(ss, ln)) { + new_lines.push_back(ln); + } + // if content ends with \n, getline consumed it — no extra empty line needed + // if content does NOT end with \n, last line is still captured correctly + } + + if (e.mode == "replace") { + // erase [idx_start, idx_end] and insert new_lines + lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1); + lines.insert(lines.begin() + idx_start, new_lines.begin(), new_lines.end()); + } else if (e.mode == "delete") { + lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1); + } else { // append + // idx_end + 1 may equal lines.size() when lineStart == -1 (end of file) + lines.insert(lines.begin() + idx_end + 
1, new_lines.begin(), new_lines.end()); + } + } + + // write file back + std::ofstream fout(path, std::ios::binary); + if (!fout) { + return {{"error", "failed to open file for writing: " + path}}; + } + for (size_t i = 0; i < lines.size(); i++) { + fout << lines[i]; + if (i + 1 < lines.size()) { + fout << "\n"; + } + } + if (!lines.empty()) { + fout << "\n"; + } + if (!fout) { + return {{"error", "failed to write file: " + path}}; + } + + return {{"result", "file edited successfully"}, {"path", path}, {"lines", (int) lines.size()}}; + } +}; + +// +// apply_diff: apply a unified diff via git apply +// + +struct server_tool_apply_diff : server_tool { + server_tool_apply_diff() { + name = "apply_diff"; + displayName = "Apply diff"; + permission_write = true; + } - json to_json() override { + json get_definition() override { return { {"type", "function"}, {"function", { {"name", name}, - {"description", "Apply a unified diff to edit one or more files using git apply."}, + {"description", "Apply a unified diff to edit one or more files using git apply. 
Use this instead of edit_file when the changes are complex."}, {"parameters", { {"type", "object"}, {"properties", { @@ -541,6 +752,7 @@ static std::vector> build_tools() { tools.push_back(std::make_unique()); tools.push_back(std::make_unique()); tools.push_back(std::make_unique()); + tools.push_back(std::make_unique()); return tools; } From 718bfb0777018927e9cbc3f4b8cdae6d27a4849b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 19 Mar 2026 23:08:34 +0100 Subject: [PATCH 3/8] displayName -> display_name --- tools/server/README-dev.md | 2 +- tools/server/server-tools.cpp | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index 326cb357b44..f9fae5a6cc8 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -103,7 +103,7 @@ This endpoint is intended to be used internally by the Web UI and subject to cha Get a list of tools, each tool has these fields: - `tool` (string): the ID name of the tool, to be used in POST call. Example: `read_file` -- `displayName` (string): the name to be displayed on UI. Example: `Read file` +- `display_name` (string): the name to be displayed on UI. Example: `Read file` - `type` (string): always be `"builtin"` for now - `permissions` (object): a mapping string --> boolean that indicates the permission required by this tool. This is useful for the UI to ask the user before calling the tool. 
For now, the only permission supported is `"write"` - `definition` (object): the OAI-compat definition of this tool diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index d5c7921efd2..7e371bfb81e 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -139,7 +139,7 @@ static bool glob_match(const std::string & pattern, const std::string & str) { struct server_tool { std::string name; - std::string displayName; + std::string display_name; json definition; bool permission_write = false; @@ -149,7 +149,7 @@ struct server_tool { json to_json() { return { - {"displayName", displayName}, + {"display_name", display_name}, {"tool", name}, {"type", "builtin"}, {"permissions", json{ @@ -169,7 +169,7 @@ static constexpr size_t SERVER_TOOL_READ_FILE_MAX_SIZE = 16 * 1024; // 16 KB struct server_tool_read_file : server_tool { server_tool_read_file() { name = "read_file"; - displayName = "Read file"; + display_name = "Read file"; permission_write = false; } @@ -252,7 +252,7 @@ static constexpr size_t SERVER_TOOL_FILE_SEARCH_MAX_RESULTS = 100; struct server_tool_file_glob_search : server_tool { server_tool_file_glob_search() { name = "file_glob_search"; - displayName = "File search"; + display_name = "File search"; permission_write = false; } @@ -316,7 +316,7 @@ static constexpr size_t SERVER_TOOL_GREP_SEARCH_MAX_RESULTS = 100; struct server_tool_grep_search : server_tool { server_tool_grep_search() { name = "grep_search"; - displayName = "Grep search"; + display_name = "Grep search"; permission_write = false; } @@ -414,7 +414,7 @@ static constexpr int SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT = 60; struct server_tool_exec_shell_command : server_tool { server_tool_exec_shell_command() { name = "exec_shell_command"; - displayName = "Execute shell command"; + display_name = "Execute shell command"; permission_write = true; } @@ -470,7 +470,7 @@ struct server_tool_exec_shell_command : server_tool { struct server_tool_write_file : 
server_tool { server_tool_write_file() { name = "write_file"; - displayName = "Write file"; + display_name = "Write file"; permission_write = true; } @@ -525,7 +525,7 @@ struct server_tool_write_file : server_tool { struct server_tool_edit_file : server_tool { server_tool_edit_file() { name = "edit_file"; - displayName = "Edit file"; + display_name = "Edit file"; permission_write = true; } @@ -691,7 +691,7 @@ struct server_tool_edit_file : server_tool { struct server_tool_apply_diff : server_tool { server_tool_apply_diff() { name = "apply_diff"; - displayName = "Apply diff"; + display_name = "Apply diff"; permission_write = true; } From 6aba54e7d710f228e8c652c1eb548c54eb1450c5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 19 Mar 2026 23:16:10 +0100 Subject: [PATCH 4/8] snake_case everywhere --- tools/server/server-tools.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index 7e371bfb81e..7f7d7a6d568 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -539,8 +539,8 @@ struct server_tool_edit_file : server_tool { "Each change targets a 1-based inclusive line range and has a mode: " "\"replace\" (replace lines with content), " "\"delete\" (remove lines, content must be empty string), " - "\"append\" (insert content after lineEnd). " - "Set lineStart to -1 to target the end of file (lineEnd is ignored in that case). " + "\"append\" (insert content after line_end). " + "Set line_start to -1 to target the end of file (line_end is ignored in that case). " "Changes must not overlap. 
They are applied in reverse line order automatically."}, {"parameters", { {"type", "object"}, @@ -552,12 +552,12 @@ struct server_tool_edit_file : server_tool { {"items", { {"type", "object"}, {"properties", { - {"mode", {{"type", "string"}, {"description", "\"replace\", \"delete\", or \"append\""}}}, - {"lineStart", {{"type", "integer"}, {"description", "First line of the range (1-based); use -1 for end of file"}}}, - {"lineEnd", {{"type", "integer"}, {"description", "Last line of the range (1-based, inclusive); ignored when lineStart is -1"}}}, - {"content", {{"type", "string"}, {"description", "Content to insert; must be empty string for delete mode"}}}, + {"mode", {{"type", "string"}, {"description", "\"replace\", \"delete\", or \"append\""}}}, + {"line_start", {{"type", "integer"}, {"description", "First line of the range (1-based); use -1 for end of file"}}}, + {"line_end", {{"type", "integer"}, {"description", "Last line of the range (1-based, inclusive); ignored when line_start is -1"}}}, + {"content", {{"type", "string"}, {"description", "Content to insert; must be empty string for delete mode"}}}, }}, - {"required", json::array({"mode", "lineStart", "lineEnd", "content"})}, + {"required", json::array({"mode", "line_start", "line_end", "content"})}, }}, }}, }}, @@ -589,7 +589,7 @@ struct server_tool_edit_file : server_tool { } fin.close(); - // validate and collect changes, then sort descending by lineStart + // validate and collect changes, then sort descending by line_start struct change_entry { std::string mode; int line_start; // 1-based @@ -602,8 +602,8 @@ struct server_tool_edit_file : server_tool { for (const auto & ch : changes) { change_entry e; e.mode = ch.at("mode").get(); - e.line_start = ch.at("lineStart").get(); - e.line_end = ch.at("lineEnd").get(); + e.line_start = ch.at("line_start").get(); + e.line_end = ch.at("line_end").get(); e.content = ch.at("content").get(); if (e.mode != "replace" && e.mode != "delete" && e.mode != "append") { @@ 
-614,7 +614,7 @@ struct server_tool_edit_file : server_tool { } int n = (int) lines.size(); if (e.line_start == -1) { - // -1 means end of file; lineEnd is ignored — normalize to point past last line + // -1 means end of file; line_end is ignored — normalize to point past last line e.line_start = n + 1; e.line_end = n + 1; } else { @@ -622,7 +622,7 @@ struct server_tool_edit_file : server_tool { return {{"error", string_format("invalid line range [%d, %d]", e.line_start, e.line_end)}}; } if (e.line_end > n) { - return {{"error", string_format("lineEnd %d exceeds file length %d", e.line_end, n)}}; + return {{"error", string_format("line_end %d exceeds file length %d", e.line_end, n)}}; } } entries.push_back(std::move(e)); @@ -657,7 +657,7 @@ struct server_tool_edit_file : server_tool { } else if (e.mode == "delete") { lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1); } else { // append - // idx_end + 1 may equal lines.size() when lineStart == -1 (end of file) + // idx_end + 1 may equal lines.size() when line_start == -1 (end of file) lines.insert(lines.begin() + idx_end + 1, new_lines.begin(), new_lines.end()); } } From c33fd6f10c399bb5da7fc7cc1842bcb09a68166f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 19 Mar 2026 23:25:22 +0100 Subject: [PATCH 5/8] rm redundant field --- tools/server/server-tools.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index 7f7d7a6d568..d7c116abd81 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -140,8 +140,7 @@ static bool glob_match(const std::string & pattern, const std::string & str) { struct server_tool { std::string name; std::string display_name; - json definition; - bool permission_write = false; + bool permission_write = false; virtual ~server_tool() = default; virtual json get_definition() = 0; @@ -592,8 +591,8 @@ struct server_tool_edit_file : server_tool { // validate and 
collect changes, then sort descending by line_start struct change_entry { std::string mode; - int line_start; // 1-based - int line_end; // 1-based inclusive + int line_start; // 1-based + int line_end; // 1-based inclusive std::string content; }; std::vector entries; From b0a1b31477d8c68ea7d23bce4b3748de27e88441 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 21 Mar 2026 01:11:57 +0100 Subject: [PATCH 6/8] change arg to --tools all --- common/arg.cpp | 11 +-- common/common.h | 2 +- tools/server/server-tools.cpp | 129 ++++++++++++++++------------------ tools/server/server-tools.h | 23 +++++- tools/server/server.cpp | 8 ++- 5 files changed, 94 insertions(+), 79 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ba2afc77a70..98070d43e25 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2849,11 +2849,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); add_opt(common_arg( - {"--tools"}, - {"--no-tools"}, - string_format("experimental: whether to enable tools for AI agents - do not enable in untrusted environments (default: %s)", params.server_tools ? 
"enabled" : "disabled"), - [](common_params & params, bool value) { - params.server_tools = value; + {"--tools"}, "TOOL1,TOOL2,...", + "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n" + "specify \"all\" to enable all tools\n" + "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff", + [](common_params & params, const std::string & value) { + params.server_tools = parse_csv_row(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index 9fd1b4dbbe1..fde5ba996ed 100644 --- a/common/common.h +++ b/common/common.h @@ -614,7 +614,7 @@ struct common_params { bool endpoint_metrics = false; // enable built-in tools - bool server_tools = false; + std::vector server_tools; // router server configs std::string models_dir = ""; // directory containing models for the router server diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index d7c116abd81..5e89a5668b7 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -29,8 +29,8 @@ static std::vector to_cstr_vec(const std::vector & v) { struct run_proc_result { std::string output; - int exit_code = -1; - bool timed_out = false; + int exit_code = -1; + bool timed_out = false; }; static run_proc_result run_process( @@ -133,31 +133,17 @@ static bool glob_match(const std::string & pattern, const std::string & str) { return glob_match(pattern.c_str(), str.c_str()); } -// -// base struct -// - -struct server_tool { - std::string name; - std::string display_name; - bool permission_write = false; - - virtual ~server_tool() = default; - virtual json get_definition() = 0; - virtual json invoke(json params) = 0; - - json to_json() { - return { - {"display_name", display_name}, - {"tool", name}, - {"type", "builtin"}, - {"permissions", json{ - {"write", 
permission_write} - }}, - {"definition", get_definition()}, - }; - } -}; +json server_tool::to_json() { + return { + {"display_name", display_name}, + {"tool", name}, + {"type", "builtin"}, + {"permissions", json{ + {"write", permission_write} + }}, + {"definition", get_definition()}, + }; +} // // read_file: read a file with optional line range and line-number prefix @@ -533,7 +519,7 @@ struct server_tool_edit_file : server_tool { {"type", "function"}, {"function", { {"name", name}, - {"description", + {"description", "Edit a file by applying a list of line-based changes. " "Each change targets a 1-based inclusive line range and has a mode: " "\"replace\" (replace lines with content), " @@ -755,17 +741,56 @@ static std::vector> build_tools() { return tools; } -static json server_tools_list() { - auto tools = build_tools(); - json result = json::array(); - for (const auto & t : tools) { - result.push_back(t->to_json()); +void server_tools::setup(const std::vector & enabled_tools) { + if (!enabled_tools.empty()) { + std::unordered_set enabled_set(enabled_tools.begin(), enabled_tools.end()); + auto all_tools = build_tools(); + + tools.clear(); + for (auto & t : all_tools) { + if (enabled_set.count(t->name) > 0 || enabled_set.count("all") > 0) { + tools.push_back(std::move(t)); + } + } } - return result; + + handle_get = [this](const server_http_req &) -> server_http_res_ptr { + auto res = std::make_unique(); + try { + json result = json::array(); + for (const auto & t : tools) { + result.push_back(t->to_json()); + } + res->data = safe_json_to_str(result); + } catch (const std::exception & e) { + SRV_ERR("got exception: %s\n", e.what()); + res->status = 500; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); + } + return res; + }; + + handle_post = [this](const server_http_req & req) -> server_http_res_ptr { + auto res = std::make_unique(); + try { + json body = json::parse(req.body); + std::string tool_name = body.at("tool").get(); + 
json params = body.value("params", json::object()); + json result = invoke(tool_name, params); + res->data = safe_json_to_str(result); + } catch (const json::exception & e) { + res->status = 400; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + } catch (const std::exception & e) { + SRV_ERR("got exception: %s\n", e.what()); + res->status = 500; + res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); + } + return res; + }; } -static json server_tool_call(const std::string & name, const json & params) { - auto tools = build_tools(); +json server_tools::invoke(const std::string & name, const json & params) { for (auto & t : tools) { if (t->name == name) { return t->invoke(params); @@ -773,35 +798,3 @@ static json server_tool_call(const std::string & name, const json & params) { } return {{"error", "unknown tool: " + name}}; } - -server_http_context::handler_t server_tools_get = [](const server_http_req &) -> server_http_res_ptr { - auto res = std::make_unique(); - try { - json tools = server_tools_list(); - res->data = safe_json_to_str(tools); - } catch (const std::exception & e) { - SRV_ERR("got exception: %s\n", e.what()); - res->status = 500; - res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); - } - return res; -}; - -server_http_context::handler_t server_tools_post = [](const server_http_req & req) -> server_http_res_ptr { - auto res = std::make_unique(); - try { - json body = json::parse(req.body); - std::string tool_name = body.at("tool").get(); - json params = body.value("params", json::object()); - json result = server_tool_call(tool_name, params); - res->data = safe_json_to_str(result); - } catch (const json::exception & e) { - res->status = 400; - res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - } catch (const std::exception & e) { - SRV_ERR("got exception: %s\n", e.what()); - res->status = 500; - res->data = 
safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER)); - } - return res; -}; diff --git a/tools/server/server-tools.h b/tools/server/server-tools.h index 141235d7993..444ef5f8098 100644 --- a/tools/server/server-tools.h +++ b/tools/server/server-tools.h @@ -3,5 +3,24 @@ #include "server-common.h" #include "server-http.h" -extern server_http_context::handler_t server_tools_get; -extern server_http_context::handler_t server_tools_post; \ No newline at end of file +struct server_tool { + std::string name; + std::string display_name; + bool permission_write = false; + + virtual ~server_tool() = default; + virtual json get_definition() = 0; + virtual json invoke(json params) = 0; + + json to_json(); +}; + +struct server_tools { + std::vector> tools; + + void setup(const std::vector & enabled_tools); + json invoke(const std::string & name, const json & params); + + server_http_context::handler_t handle_get; + server_http_context::handler_t handle_post; +}; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d1db1ed1ea0..2a0cf1bcf90 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -125,6 +125,7 @@ int main(int argc, char ** argv) { // register API routes server_routes routes(params, ctx_server); + server_tools tools; bool is_router_server = params.model.path.empty(); std::optional models_routes{}; @@ -213,13 +214,14 @@ int main(int argc, char ** argv) { ctx_http.post("/cors-proxy", ex_wrapper(proxy_handler_post)); } // EXPERIMENTAL built-in tools - if (params.server_tools) { + if (!params.server_tools.empty()) { + tools.setup(params.server_tools); SRV_WRN("%s", "-----------------\n"); SRV_WRN("%s", "Built-in tools are enabled, do not expose server to untrusted environments\n"); SRV_WRN("%s", "This feature is EXPERIMENTAL and may be changed in the future\n"); SRV_WRN("%s", "-----------------\n"); - ctx_http.get ("/tools", ex_wrapper(server_tools_get)); - ctx_http.post("/tools", ex_wrapper(server_tools_post)); + 
ctx_http.get ("/tools", ex_wrapper(tools.handle_get)); + ctx_http.post("/tools", ex_wrapper(tools.handle_post)); } // From b648215eb2353c46a5023666319d002227b9b4de Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 23 Mar 2026 12:32:33 +0100 Subject: [PATCH 7/8] add readme mention --- tools/server/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index 72f3c8e5342..bb4edf6e6da 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -290,6 +290,12 @@ It is currently available in the following endpoints: For more details, please refer to [multimodal documentation](../../docs/multimodal.md) +### Built-in tools support + +The server includes a set of built-in tools that enable the LLM to access the local file system directly from the Web UI. + +To use this feature, start the server with `--tools all`. You can also enable only specific tools by passing a comma-separated list: `--tools name1,name2,...`. Run `--help` for the full list of available tool names. + ## Build `llama-server` is built alongside everything else from the root of the project From e4cc43a809f1e00b3392cf39a7be28a9a0fa516b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 23 Mar 2026 12:35:19 +0100 Subject: [PATCH 8/8] llama-gen-docs --- tools/cli/README.md | 10 +++++++--- tools/completion/README.md | 7 +++++-- tools/server/README.md | 12 ++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tools/cli/README.md b/tools/cli/README.md index c344cab2a8d..840976a8848 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -134,7 +134,7 @@ | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -147,7 +147,8 @@ | -------- | ----------- | | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | @@ -172,9 +173,12 @@ | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | diff --git a/tools/completion/README.md b/tools/completion/README.md index b5eeba73349..25884ed92d0 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -217,7 +217,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -252,9 +252,12 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index bb4edf6e6da..cb53678416f 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -151,7 +151,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -164,7 +164,8 @@ For the full list of features, please refer to [server's changelog](https://gith | -------- | ----------- | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | @@ -192,6 +193,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | | `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff
(env: LLAMA_ARG_TOOLS) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | @@ -215,11 +218,12 @@ For the full list of features, please refer to [server's changelog](https://gith | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `-rea, --resoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | @@ -234,7 +238,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | | `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | | `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |