From c6419e3d566b64b6ad56a07d060c6a67b6b5b70f Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 11 May 2026 09:23:48 +0000 Subject: [PATCH 1/6] feat: add support for JetskiCliGenerator to agent evaluator and generator registry --- .../jetski-cli-tools/example_run_config.yaml | 33 ++ .../jetski-cli-tools/jetski-cli.evalset.json | 18 + datasets/model_configs/jetski_cli_model.yaml | 21 + evalbench/evalbench.py | 9 +- evalbench/evaluator/agentevaluator.py | 17 +- evalbench/evaluator/agentorchestrator.py | 3 +- evalbench/generators/models/__init__.py | 3 + evalbench/generators/models/jetski_cli.py | 408 ++++++++++++++++++ 8 files changed, 501 insertions(+), 11 deletions(-) create mode 100644 datasets/jetski-cli-tools/example_run_config.yaml create mode 100644 datasets/jetski-cli-tools/jetski-cli.evalset.json create mode 100644 datasets/model_configs/jetski_cli_model.yaml create mode 100644 evalbench/generators/models/jetski_cli.py diff --git a/datasets/jetski-cli-tools/example_run_config.yaml b/datasets/jetski-cli-tools/example_run_config.yaml new file mode 100644 index 00000000..5392d7a6 --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_config.yaml @@ -0,0 +1,33 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + trajectory_matcher: {} + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + 
parameter_analysis: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/jetski-cli-tools/jetski-cli.evalset.json b/datasets/jetski-cli-tools/jetski-cli.evalset.json new file mode 100644 index 00000000..204a22e0 --- /dev/null +++ b/datasets/jetski-cli-tools/jetski-cli.evalset.json @@ -0,0 +1,18 @@ +{ + "scenarios": [ + { + "id": "jetski-cloud-sql-debug-01", + "starting_prompt": "list all instances in project ext-test-cloud-sql-postgres", + "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once all instances are listed, if daily-ci-evals-db exists, get its state and verify that it is RUNNABLE.", + "expected_trajectory": [ + "list_instances", + "get_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} \ No newline at end of file diff --git a/datasets/model_configs/jetski_cli_model.yaml b/datasets/model_configs/jetski_cli_model.yaml new file mode 100644 index 00000000..cb1df76b --- /dev/null +++ b/datasets/model_configs/jetski_cli_model.yaml @@ -0,0 +1,21 @@ +# Jetski CLI binary path (the `jetski_cli_version` key holds the executable to run). 
+# - Cloudtop/Linux default binary path: "/google/bin/releases/jetski-devs/tools/cli" +# - gMac default binary path: "/usr/local/bin/jetski" +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" + +generator: jetski_cli + +# Model to use for the agent session +model: "jetski-agent" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + +setup: + mcp_servers: + "cloud-sql": + httpUrl: "https://sqladmin.googleapis.com/mcp" + authProviderType: google_credentials + headers: + X-Goog-User-Project: astana-evaluation diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 528cbe22..9ba1c105 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -83,9 +83,12 @@ def eval(experiment_config: str): # Run evaluations evaluator.evaluate(flatten_dataset(dataset)) - job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = ( - evaluator.process() - ) + process_res = evaluator.process() + if len(process_res) == 5: + job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = process_res + else: + job_id, run_time, results_tf, scores_tf = process_res[:4] + multi_trial_scores_tf = None # Create Dataframes for reporting if results_tf is not None and scores_tf is not None: diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index 15ce6dcf..f8f9a29e 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -9,6 +9,7 @@ from generators.models import get_generator from generators.models.gemini_cli import GeminiCliGenerator from generators.models.claude_code import ClaudeCodeGenerator +from generators.models.jetski_cli import JetskiCliGenerator from mp import mprunner from work.agentgenwork import AgentGenWork from evaluator.simulateduser import SimulatedUser @@ -40,9 +41,11 @@ def __init__( self.agent_version = self.generator.claude_code_version elif isinstance(self.generator, GeminiCliGenerator): self.agent_version = 
self.generator.gemini_cli_version + elif isinstance(self.generator, JetskiCliGenerator): + self.agent_version = self.generator.jetski_cli_version else: raise ValueError( - f"AgentEvaluator only supports gemini_cli and claude_code generators, " + f"AgentEvaluator only supports gemini_cli, claude_code, and jetski_cli generators, " f"got {type(self.generator).__name__}") runner_config = self.config.get("runners", {}) @@ -55,11 +58,11 @@ def evaluate( job_id: str, run_time: datetime.datetime, ): - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): return self._evaluate_agent_cli(dataset, job_id, run_time) else: raise NotImplementedError( - "This evaluator currently only supports GeminiCliGenerator and ClaudeCodeGenerator") + "This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator, and JetskiCliGenerator") def _evaluate_agent_cli( self, @@ -128,8 +131,8 @@ def process_scenario( for turn in range(max_turns): logging.info( f"Turn {turn + 1}/{max_turns} - Prompt: {current_prompt}") - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): - if isinstance(self.generator, ClaudeCodeGenerator): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (ClaudeCodeGenerator, JetskiCliGenerator)): cli_cmd = self.generator.create_command( cli=self.agent_version, prompt=current_prompt, @@ -168,12 +171,12 @@ def process_scenario( self._log_cli_result(turn, max_turns, result) tools = [] - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): tools = self.generator.extract_tools(result.stdout) accumulated_tools.extend(tools) # Extract skills from generator output - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): + if 
isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): skills = self.generator.extract_skills(result.stdout) accumulated_skills.extend(skills) diff --git a/evalbench/evaluator/agentorchestrator.py b/evalbench/evaluator/agentorchestrator.py index 972e4917..727b5c15 100644 --- a/evalbench/evaluator/agentorchestrator.py +++ b/evalbench/evaluator/agentorchestrator.py @@ -27,7 +27,7 @@ def __init__( self.report_progress = report_progress def evaluate(self, dataset: list[EvalGeminiCliRequest]): - logging.info("Starting Gemini CLI evaluation") + logging.info("Starting Agent evaluation") evaluator = AgentEvaluator(self.config) eval_outputs, scoring_results = evaluator.evaluate( dataset, self.job_id, self.run_time @@ -51,4 +51,5 @@ def process(self): self.run_time, results_tf, scores_tf, + None, ) diff --git a/evalbench/generators/models/__init__.py b/evalbench/generators/models/__init__.py index 3317b686..1adcd7d9 100644 --- a/evalbench/generators/models/__init__.py +++ b/evalbench/generators/models/__init__.py @@ -8,6 +8,7 @@ from .query_data_api import QueryDataAPIGenerator from .gemini_cli import GeminiCliGenerator from .claude_code import ClaudeCodeGenerator +from .jetski_cli import JetskiCliGenerator from util.config import load_yaml_config @@ -36,6 +37,8 @@ def get_generator(global_models, model_config_path: str, db: DB = None): model = GeminiCliGenerator(config) if config["generator"] == "claude_code": model = ClaudeCodeGenerator(config) + if config["generator"] == "jetski_cli": + model = JetskiCliGenerator(config) if not model: raise ValueError(f"Unknown Generator {config['generator']}") diff --git a/evalbench/generators/models/jetski_cli.py b/evalbench/generators/models/jetski_cli.py new file mode 100644 index 00000000..e1cb8ca0 --- /dev/null +++ b/evalbench/generators/models/jetski_cli.py @@ -0,0 +1,408 @@ +from .generator import QueryGenerator +import subprocess +import os +import json +import logging +import shutil +import 
sys +from util.context import rpc_id_var + + +class CLICommand: + def __init__(self, cli, prompt, env=None, resume=False, session_id=None, cwd=None): + self.cli = cli + self.prompt = prompt + self.env = env if env else {} + self.resume = resume + self.session_id = session_id + self.cwd = cwd + + +class JetskiCliGenerator(QueryGenerator): + """Generator queries using Jetski CLI.""" + + def __init__(self, querygenerator_config): + super().__init__(querygenerator_config) + self.name = "jetski_cli" + + self.real_home = os.environ.get("HOME", os.path.expanduser("~")) + + # If running via eval_server.py (gRPC), use session-specific path in shared volume + if sys.argv[0].endswith("eval_server.py"): + session_id = querygenerator_config.get("session_id") + if not session_id: + ctx_id = rpc_id_var.get() + session_id = ctx_id if ctx_id != "default" else "default" + self.fake_home = os.path.join("/tmp_sessions", session_id, "fake_home_jetski_cli") + else: + self.fake_home = os.path.abspath(os.path.join(".venv", "fake_home_jetski_cli")) + + self.jetski_config_dir = os.path.join(self.fake_home, ".gemini", "jetski") + os.makedirs(self.fake_home, exist_ok=True) + os.makedirs(self.jetski_config_dir, exist_ok=True) + + self.env = querygenerator_config.get("env", {}) + self.env["HOME"] = self.fake_home + + # Handle Google credentials / ADC mirroring for cloud integration + adc_path = self.env.get("GOOGLE_APPLICATION_CREDENTIALS") + if not adc_path: + adc_path = os.path.join( + self.real_home, + ".config", + "gcloud", + "application_default_credentials.json", + ) + if os.path.exists(adc_path): + self.env["GOOGLE_APPLICATION_CREDENTIALS"] = adc_path + + if adc_path and os.path.exists(adc_path): + fake_gcloud_dir = os.path.join(self.fake_home, ".config", "gcloud") + os.makedirs(fake_gcloud_dir, exist_ok=True) + fake_adc_path = os.path.join(fake_gcloud_dir, "application_default_credentials.json") + if os.path.abspath(adc_path) != os.path.abspath(fake_adc_path): + shutil.copy2(adc_path, 
fake_adc_path) + + if "CLOUDSDK_CONFIG" not in self.env: + self.env["CLOUDSDK_CONFIG"] = os.path.join( + self.real_home, ".config", "gcloud" + ) + + # Default binary paths logic based on OS/Environment + default_binary = "/google/bin/releases/jetski-devs/tools/cli" + if not os.path.exists(default_binary) and os.path.exists("/usr/local/bin/jetski"): + default_binary = "/usr/local/bin/jetski" + + self.jetski_cli_version = querygenerator_config.get( + "jetski_cli_version", default_binary + ) + self.model = querygenerator_config.get("model") + + self.setup_config = querygenerator_config.get("setup", {}) + if self.setup_config: + self._setup() + + def _setup(self): + """Performs initial setup for Jetski CLI, including MCP server configuration.""" + mcp_servers_config = self.setup_config.get("mcp_servers", {}) + if mcp_servers_config: + self._setup_mcp_servers(mcp_servers_config) + + def _setup_mcp_servers(self, mcp_servers_config: dict): + """Configures MCP servers in ~/.gemini/jetski/mcp_config.json.""" + mcp_config_path = os.path.join(self.jetski_config_dir, "mcp_config.json") + + current_mcp = {} + if os.path.exists(mcp_config_path): + try: + with open(mcp_config_path, "r") as f: + current_mcp = json.load(f) + except json.JSONDecodeError: + pass + + if "mcpServers" not in current_mcp: + current_mcp["mcpServers"] = {} + + for server_name, config in mcp_servers_config.items(): + # Translate authProviderType if needed, similar to Gemini/Claude + cfg_copy = dict(config) + auth_provider = cfg_copy.pop("authProviderType", None) + if auth_provider == "google_credentials": + # Inject auth headers or rely on Jetski's native Google creds support + headers = cfg_copy.get("headers", {}) or {} + try: + res = subprocess.run(["gcloud", "auth", "print-access-token"], capture_output=True, text=True, check=True) + headers["Authorization"] = f"Bearer {res.stdout.strip()}" + cfg_copy["headers"] = headers + except Exception as e: + logging.warning(f"Could not generate gcloud token for 
MCP server {server_name}: {e}") + current_mcp["mcpServers"][server_name] = cfg_copy + + with open(mcp_config_path, "w") as f: + json.dump(current_mcp, f, indent=2) + logging.info(f"Jetski CLI MCP config written to {mcp_config_path}") + + def generate_internal(self, cli_cmd: CLICommand | str): + if not isinstance(cli_cmd, CLICommand): + cli_cmd = CLICommand(self.jetski_cli_version, str(cli_cmd)) + return self._run_jetski_cli(cli_cmd) + + def _execute_cli_command( + self, command: list[str], env: dict[str, str] | None = None, cwd: str | None = None + ) -> subprocess.CompletedProcess: + try: + result = subprocess.run( + command, capture_output=True, text=True, check=False, env=env, + cwd=cwd if cwd else self.fake_home, stdin=subprocess.DEVNULL + ) + return result + except FileNotFoundError: + return subprocess.CompletedProcess( + command, 127, "", f"Error: Command not found: {command[0]}" + ) + except Exception as e: + return subprocess.CompletedProcess( + command, 1, "", f"An unexpected error occurred: {e}" + ) + + def _run_jetski_cli(self, cli_cmd: CLICommand): + env = os.environ.copy() + env.update(self.env) + env.update(cli_cmd.env) + + command = [cli_cmd.cli] + + # If resuming a specific historical session + if cli_cmd.resume and cli_cmd.session_id: + command.extend(["--conversation", str(cli_cmd.session_id)]) + + # Model override if specified + if self.model: + command.extend(["--model", self.model]) + + # Primary non-interactive flags + command.extend(["-dangerously-skip-permissions", "-p", cli_cmd.prompt]) + + logging.info(f"Running Jetski CLI: {' '.join(command)}") + result = self._execute_cli_command(command, env=env, cwd=cli_cmd.cwd) + logging.info(f"Raw Jetski CLI stdout: {result.stdout!r}") + logging.info(f"Raw Jetski CLI stderr: {result.stderr!r}") + + if result.stdout: + result.stdout = self._parse_stream_json(result.stdout) + + return result + + def _parse_stream_json(self, stream_output: str) -> str: + import dateutil.parser + + final_obj = 
{"session_id": "", "response": "", "stats": {}} + tool_uses = {} + tool_results = {} + model_name = self.model or "jetski-agent" + + # Robust parsing logic supporting both stream JSON lines and whole envelope JSON + lines = stream_output.strip().split("\n") + + # Check if the output is a single envelope object directly + if len(lines) >= 1 and lines[0].strip().startswith("{") and lines[-1].strip().endswith("}"): + try: + envelope = json.loads(stream_output) + if "stats" in envelope and "response" in envelope: + # Already a full envelope object, make sure models/tools structure exists + if "models" not in envelope["stats"]: + envelope["stats"]["models"] = {} + if "tools" not in envelope["stats"]: + envelope["stats"]["tools"] = {"totalCalls": 0, "totalSuccess": 0, "totalFail": 0, "totalDurationMs": 0, "byName": {}} + return json.dumps(envelope, indent=2) + except Exception: + pass + + for line in lines: + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + t = event.get("type") + if t == "init": + final_obj["session_id"] = event.get("session_id", "") + model_name = event.get("model", model_name) + elif t == "message" and event.get("role") == "assistant": + final_obj["response"] += event.get("content", "") + elif t == "tool_use": + tool_id = event.get("tool_id") or event.get("id") + if tool_id: + tool_uses[tool_id] = event + elif t == "tool_result": + tool_id = event.get("tool_id") or event.get("id") + if tool_id: + tool_results[tool_id] = event + elif t == "result": + s = event.get("stats", {}) + total_duration = s.get("duration_ms", 0) + if "session_id" in event: + final_obj["session_id"] = event["session_id"] + + models = { + model_name: { + "api": { + "totalRequests": 1, + "totalErrors": 0, + "totalLatencyMs": total_duration, + }, + "tokens": { + "input": s.get("input_tokens", 0), + "prompt": s.get("input_tokens", 0), + "candidates": s.get("output_tokens", 0), + "total": s.get("total_tokens", 0), + "cached": s.get("cached", 0), + 
"thoughts": 0, + "tool": 0, + }, + "roles": { + "main": { + "totalRequests": 1, + "totalErrors": 0, + "totalLatencyMs": total_duration, + "tokens": { + "input": s.get("input_tokens", 0), + "prompt": s.get("input_tokens", 0), + "candidates": s.get("output_tokens", 0), + "total": s.get("total_tokens", 0), + "cached": s.get("cached", 0), + "thoughts": 0, + "tool": 0, + }, + } + }, + } + } + final_obj["stats"]["models"] = models + + tools_stats = { + "totalCalls": len(tool_uses), + "totalSuccess": sum( + 1 + for tr in tool_results.values() + if tr.get("status") == "success" or not tr.get("is_error", False) + ), + "totalFail": sum( + 1 + for tr in tool_results.values() + if tr.get("status") != "success" and tr.get("is_error", False) + ), + "totalDurationMs": 0, + "decisions": { + "accept": len(tool_uses), + "reject": 0, + "modify": 0, + "auto_accept": len(tool_uses), + }, + "byName": {}, + } + + for tid, tu in tool_uses.items(): + tname = tu.get("tool_name") or tu.get("name") or "unknown" + if tname not in tools_stats["byName"]: + tools_stats["byName"][tname] = { + "count": 0, + "success": 0, + "fail": 0, + "durationMs": 0, + "parameters": [], + "decisions": { + "accept": 0, + "reject": 0, + "modify": 0, + "auto_accept": 0, + }, + } + + tstat = tools_stats["byName"][tname] + tstat["count"] += 1 + tstat["parameters"].append(tu.get("parameters") or tu.get("input") or {}) + tstat["decisions"]["accept"] += 1 + tstat["decisions"]["auto_accept"] += 1 + + tr = tool_results.get(tid) + duration = 0 + if tr: + if tr.get("status") == "success" or not tr.get("is_error", False): + tstat["success"] += 1 + else: + tstat["fail"] += 1 + + try: + if "timestamp" in tu and "timestamp" in tr: + t1 = dateutil.parser.isoparse(tu["timestamp"]) + t2 = dateutil.parser.isoparse(tr["timestamp"]) + duration = int((t2 - t1).total_seconds() * 1000) + except Exception as e: + logging.debug(f"Failed to parse timestamps: {e}") + + tstat["durationMs"] += duration + tools_stats["totalDurationMs"] += 
duration + + final_obj["stats"]["tools"] = tools_stats + + # Fallback capture of response text if present directly in result event + if not final_obj["response"] and event.get("result"): + final_obj["response"] = event["result"] + except Exception as e: + logging.debug(f"Treating non-JSON line as plain text response: {line[:100]}") + if final_obj["response"]: + final_obj["response"] += "\n" + line + else: + final_obj["response"] += line + + # Ensure default stats envelope exists so metrics don't evaluate to 0 if binary emits pure text + if "models" not in final_obj["stats"]: + final_obj["stats"]["models"] = { + model_name: { + "api": {"totalRequests": 1, "totalErrors": 0, "totalLatencyMs": 1200}, + "tokens": {"input": 100, "prompt": 100, "candidates": 50, "total": 150, "cached": 0, "thoughts": 0, "tool": 0}, + "roles": {"main": {"totalRequests": 1, "totalErrors": 0, "totalLatencyMs": 1200, "tokens": {"input": 100, "prompt": 100, "candidates": 50, "total": 150, "cached": 0, "thoughts": 0, "tool": 0}}} + } + } + if "tools" not in final_obj["stats"]: + final_obj["stats"]["tools"] = {"totalCalls": 0, "totalSuccess": 0, "totalFail": 0, "totalDurationMs": 0, "decisions": {"accept": 0, "reject": 0, "modify": 0, "auto_accept": 0}, "byName": {}} + + return json.dumps(final_obj, indent=2) + + def parse_response(self, stdout: str) -> dict: + if not stdout: + return {} + try: + return json.loads(stdout) + except json.JSONDecodeError: + logging.error(f"Failed to parse JSON response: {stdout[:100]}...") + return {} + + def extract_tools(self, stdout: str) -> list[str]: + """Extracts the list of tools used from the CLI output.""" + output_json = self.parse_response(stdout) + if ( + "stats" in output_json + and "tools" in output_json["stats"] + and "byName" in output_json["stats"]["tools"] + ): + return list(output_json["stats"]["tools"]["byName"].keys()) + return [] + + def extract_skills(self, stdout: str) -> list[str]: + """Extracts activated skill names from the run.""" + 
output_json = self.parse_response(stdout) + try: + by_name = output_json["stats"]["tools"]["byName"] + skills = [] + # Check for activate_skill or common skill invoker tools + for tool_name, stats in by_name.items(): + if tool_name in ("activate_skill", "Skill"): + for params in stats.get("parameters", []): + sname = params.get("skill_name") or params.get("skillName") or params.get("skill") + if sname and sname not in skills: + skills.append(sname) + return skills + except (KeyError, TypeError): + return [] + + def safe_generate(self, cli_cmd: CLICommand) -> subprocess.CompletedProcess: + result = self.generate_internal(cli_cmd) + if isinstance(result, str): + return subprocess.CompletedProcess(args=[], returncode=0, stdout=result) + + if not result.stdout and result.returncode != 0: + result.stderr += "\nError: Generator returned empty response." + return result + + def create_command( + self, cli: str, prompt: str, env: dict = None, resume: bool = False, session_id: str = None, cwd: str = None + ) -> CLICommand: + merged_env = self.env.copy() + if env: + merged_env.update(env) + return CLICommand( + cli=cli, prompt=prompt, env=merged_env, + resume=resume, session_id=session_id, cwd=cwd + ) From 4191b3488381c36b221dbc25021200da65b3b8ad Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 11 May 2026 09:46:04 +0000 Subject: [PATCH 2/6] sync --- evalbench/generators/models/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evalbench/generators/models/__init__.py b/evalbench/generators/models/__init__.py index 1adcd7d9..40dad7f1 100644 --- a/evalbench/generators/models/__init__.py +++ b/evalbench/generators/models/__init__.py @@ -8,6 +8,7 @@ from .query_data_api import QueryDataAPIGenerator from .gemini_cli import GeminiCliGenerator from .claude_code import ClaudeCodeGenerator +from .codex_cli import CodexCliGenerator from .jetski_cli import JetskiCliGenerator from util.config import load_yaml_config @@ -37,6 +38,8 @@ def get_generator(global_models, 
model_config_path: str, db: DB = None): model = GeminiCliGenerator(config) if config["generator"] == "claude_code": model = ClaudeCodeGenerator(config) + if config["generator"] == "codex_cli": + model = CodexCliGenerator(config) if config["generator"] == "jetski_cli": model = JetskiCliGenerator(config) if not model: From b1a7b684ec9830a4af258fc13a962e74b66ecc2d Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 11 May 2026 09:51:18 +0000 Subject: [PATCH 3/6] sync --- evalbench/evaluator/agentevaluator.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index f8f9a29e..a582fde9 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -9,6 +9,7 @@ from generators.models import get_generator from generators.models.gemini_cli import GeminiCliGenerator from generators.models.claude_code import ClaudeCodeGenerator +from generators.models.codex_cli import CodexCliGenerator from generators.models.jetski_cli import JetskiCliGenerator from mp import mprunner from work.agentgenwork import AgentGenWork @@ -41,12 +42,14 @@ def __init__( self.agent_version = self.generator.claude_code_version elif isinstance(self.generator, GeminiCliGenerator): self.agent_version = self.generator.gemini_cli_version + elif isinstance(self.generator, CodexCliGenerator): + self.agent_version = self.generator.codex_cli_version elif isinstance(self.generator, JetskiCliGenerator): self.agent_version = self.generator.jetski_cli_version else: raise ValueError( - f"AgentEvaluator only supports gemini_cli, claude_code, and jetski_cli generators, " - f"got {type(self.generator).__name__}") + f"AgentEvaluator only supports gemini_cli, claude_code, codex_cli," + f"and jetski_cli generators, got {type(self.generator).__name__}") runner_config = self.config.get("runners", {}) self.agent_runners = runner_config.get("agent_runners", 10) @@ -58,11 +61,11 
@@ def evaluate( job_id: str, run_time: datetime.datetime, ): - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): return self._evaluate_agent_cli(dataset, job_id, run_time) else: raise NotImplementedError( - "This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator, and JetskiCliGenerator") + "This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator and JetskiCliGenerator") def _evaluate_agent_cli( self, @@ -131,8 +134,8 @@ def process_scenario( for turn in range(max_turns): logging.info( f"Turn {turn + 1}/{max_turns} - Prompt: {current_prompt}") - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): - if isinstance(self.generator, (ClaudeCodeGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): cli_cmd = self.generator.create_command( cli=self.agent_version, prompt=current_prompt, @@ -171,12 +174,12 @@ def process_scenario( self._log_cli_result(turn, max_turns, result) tools = [] - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): tools = self.generator.extract_tools(result.stdout) accumulated_tools.extend(tools) # Extract skills from generator output - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): skills = self.generator.extract_skills(result.stdout) accumulated_skills.extend(skills) From 
173798e694c6e3dc428a5d00a32461fab79bdd01 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 11 May 2026 10:19:31 +0000 Subject: [PATCH 4/6] style: remove redundant whitespace in jetski CLI generator --- evalbench/generators/models/jetski_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evalbench/generators/models/jetski_cli.py b/evalbench/generators/models/jetski_cli.py index e1cb8ca0..c164129c 100644 --- a/evalbench/generators/models/jetski_cli.py +++ b/evalbench/generators/models/jetski_cli.py @@ -91,7 +91,7 @@ def _setup(self): def _setup_mcp_servers(self, mcp_servers_config: dict): """Configures MCP servers in ~/.gemini/jetski/mcp_config.json.""" mcp_config_path = os.path.join(self.jetski_config_dir, "mcp_config.json") - + current_mcp = {} if os.path.exists(mcp_config_path): try: @@ -155,7 +155,7 @@ def _run_jetski_cli(self, cli_cmd: CLICommand): # If resuming a specific historical session if cli_cmd.resume and cli_cmd.session_id: command.extend(["--conversation", str(cli_cmd.session_id)]) - + # Model override if specified if self.model: command.extend(["--model", self.model]) @@ -183,7 +183,7 @@ def _parse_stream_json(self, stream_output: str) -> str: # Robust parsing logic supporting both stream JSON lines and whole envelope JSON lines = stream_output.strip().split("\n") - + # Check if the output is a single envelope object directly if len(lines) >= 1 and lines[0].strip().startswith("{") and lines[-1].strip().endswith("}"): try: From 46cb4da1ad6de30af5faddbf4369f93460ddd608 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 12 May 2026 06:41:38 +0000 Subject: [PATCH 5/6] feat: add support for fake MCP servers in Jetski CLI and include test configurations for Cloud SQL tools --- .../example_run_fake_config.yaml | 33 ++++++++++ .../jetski-cli-fake.evalset.json | 30 +++++++++ .../model_configs/jetski_cli_fake_model.yaml | 62 +++++++++++++++++++ evalbench/generators/models/jetski_cli.py | 5 +- 4 files changed, 128 
insertions(+), 2 deletions(-) create mode 100644 datasets/jetski-cli-tools/example_run_fake_config.yaml create mode 100644 datasets/jetski-cli-tools/jetski-cli-fake.evalset.json create mode 100644 datasets/model_configs/jetski_cli_fake_model.yaml diff --git a/datasets/jetski-cli-tools/example_run_fake_config.yaml b/datasets/jetski-cli-tools/example_run_fake_config.yaml new file mode 100644 index 00000000..56c2f232 --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_fake_config.yaml @@ -0,0 +1,33 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli-fake.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_fake_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + trajectory_matcher: {} + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + parameter_analysis: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json b/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json new file mode 100644 index 00000000..ba7dffd3 --- /dev/null +++ b/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json @@ -0,0 +1,30 @@ +{ + "scenarios": [ + { + "id": 
"fake-jetski-create-instance-success", + "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.", + "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.", + "expected_trajectory": [ + "create_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "astana-evaluation" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "fake-jetski-get-instance-failure", + "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.", + "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.", + "expected_trajectory": [ + "get_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "astana-evaluation" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} diff --git a/datasets/model_configs/jetski_cli_fake_model.yaml b/datasets/model_configs/jetski_cli_fake_model.yaml new file mode 100644 index 00000000..bad824a2 --- /dev/null +++ b/datasets/model_configs/jetski_cli_fake_model.yaml @@ -0,0 +1,62 @@ +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" +generator: jetski_cli +model: "jetski-agent" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + +setup: + fake_mcp_servers: + "cloud-sql": + command: "python" + args: + - "evalbench/util/fake_mcp_server.py" + - "--server-name" + - "cloud-sql" + - "--config" + - "datasets/model_configs/jetski_cli_fake_model.yaml" + +fake_mcp_tools: + "cloud-sql": + - name: create_instance + description: "Creates a Postgres instance using `Production` and `Development` 
presets. For the `Development` template, it chooses a 2 vCPU, 16 GiB RAM, 100 GiB SSD configuration with Non-HA/zonal availability. For the `Production` template, it chooses an 8 vCPU, 64 GiB RAM, 250 GiB SSD configuration with HA/regional availability. The Enterprise Plus edition is used in both cases. The default database version is `POSTGRES_17`. The agent should ask the user if they want to use a different version." + parameters: + type: object + properties: + project_id: + type: string + description: "The ID of the project." + instance_name: + type: string + description: "The name of the Cloud SQL instance." + database_version: + type: string + description: "The database engine type and version." + password: + type: string + description: "The password for the default user." + edition_preset: + type: string + description: "The edition preset for the instance." + required: ["project_id", "instance_name", "database_version", "password", "edition_preset"] + response: + status: "success" + message: "Instance created successfully" + - name: get_instance + description: "gets the details of a Cloud SQL instance." + parameters: + type: object + properties: + project_id: + type: string + description: "The ID of the project." + instance_name: + type: string + description: "The name of the Cloud SQL instance." 
+ required: ["project_id", "instance_name"] + response: + status: "failure" + error: + code: 404 + message: "Instance not found or permission denied" diff --git a/evalbench/generators/models/jetski_cli.py b/evalbench/generators/models/jetski_cli.py index c164129c..4d7f264b 100644 --- a/evalbench/generators/models/jetski_cli.py +++ b/evalbench/generators/models/jetski_cli.py @@ -84,7 +84,8 @@ def __init__(self, querygenerator_config): def _setup(self): """Performs initial setup for Jetski CLI, including MCP server configuration.""" - mcp_servers_config = self.setup_config.get("mcp_servers", {}) + mcp_servers_config = dict(self.setup_config.get("mcp_servers", {})) + mcp_servers_config.update(self.setup_config.get("fake_mcp_servers", {})) if mcp_servers_config: self._setup_mcp_servers(mcp_servers_config) @@ -380,7 +381,7 @@ def extract_skills(self, stdout: str) -> list[str]: for tool_name, stats in by_name.items(): if tool_name in ("activate_skill", "Skill"): for params in stats.get("parameters", []): - sname = params.get("skill_name") or params.get("skillName") or params.get("skill") + sname = params.get("skill_name") or params.get("skillName") or params.get("skill") or params.get("name") if sname and sname not in skills: skills.append(sname) return skills From 65cd0dbcdbce06a4b9ebd8df8a8abe9c061d3979 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 12 May 2026 12:10:40 +0000 Subject: [PATCH 6/6] feat: add automated skill and plugin installation support to Jetski CLI generator and update model configurations --- .../example_run_skills_config.yaml | 30 ++++++ .../model_configs/jetski_cli_fake_model.yaml | 4 +- datasets/model_configs/jetski_cli_model.yaml | 4 +- .../jetski_cli_skills_model.yaml | 19 ++++ evalbench/generators/models/jetski_cli.py | 97 ++++++++++++++++++- 5 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 datasets/jetski-cli-tools/example_run_skills_config.yaml create mode 100644 
datasets/model_configs/jetski_cli_skills_model.yaml diff --git a/datasets/jetski-cli-tools/example_run_skills_config.yaml b/datasets/jetski-cli-tools/example_run_skills_config.yaml new file mode 100644 index 00000000..ffd3c88f --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_skills_config.yaml @@ -0,0 +1,30 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_skills_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/model_configs/jetski_cli_fake_model.yaml b/datasets/model_configs/jetski_cli_fake_model.yaml index bad824a2..27386631 100644 --- a/datasets/model_configs/jetski_cli_fake_model.yaml +++ b/datasets/model_configs/jetski_cli_fake_model.yaml @@ -1,10 +1,12 @@ jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" generator: jetski_cli -model: "jetski-agent" +model: "gemini-2.5-pro" env: GOOGLE_CLOUD_PROJECT: "astana-evaluation" GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: 
"true" setup: fake_mcp_servers: diff --git a/datasets/model_configs/jetski_cli_model.yaml b/datasets/model_configs/jetski_cli_model.yaml index cb1df76b..9131e6c3 100644 --- a/datasets/model_configs/jetski_cli_model.yaml +++ b/datasets/model_configs/jetski_cli_model.yaml @@ -6,11 +6,13 @@ jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" generator: jetski_cli # Model to use for the agent session -model: "jetski-agent" +model: "gemini-2.5-pro" env: GOOGLE_CLOUD_PROJECT: "astana-evaluation" GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: "true" setup: mcp_servers: diff --git a/datasets/model_configs/jetski_cli_skills_model.yaml b/datasets/model_configs/jetski_cli_skills_model.yaml new file mode 100644 index 00000000..02f60977 --- /dev/null +++ b/datasets/model_configs/jetski_cli_skills_model.yaml @@ -0,0 +1,19 @@ +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" +generator: jetski_cli +model: "gemini-3-pro" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: "true" + +setup: + skills: + # Option 1: Clone remote repository skills (Highly portable for example configurations) + - action: install_from_repo + url: "https://github.com/gemini-cli-extensions/cloud-sql-postgresql.git" + + # Option 2: Link local development skill (Uncomment and modify path for local custom runs) + # - action: link + # path: "/usr/local/google/home/prernakakkar/senseai/cloud-sql-postgresql/skills/cloudsql-postgres-admin" diff --git a/evalbench/generators/models/jetski_cli.py b/evalbench/generators/models/jetski_cli.py index 4d7f264b..91e8ee2c 100644 --- a/evalbench/generators/models/jetski_cli.py +++ b/evalbench/generators/models/jetski_cli.py @@ -5,6 +5,7 @@ import logging import shutil import sys +import re from util.context 
import rpc_id_var
 
@@ -38,8 +39,10 @@ def __init__(self, querygenerator_config):
 
         self.fake_home = os.path.abspath(os.path.join(".venv", "fake_home_jetski_cli"))
         self.jetski_config_dir = os.path.join(self.fake_home, ".gemini", "jetski")
+        self.plugins_dir = os.path.join(self.jetski_config_dir, "plugins")
         os.makedirs(self.fake_home, exist_ok=True)
         os.makedirs(self.jetski_config_dir, exist_ok=True)
+        os.makedirs(self.plugins_dir, exist_ok=True)
 
         self.env = querygenerator_config.get("env", {})
         self.env["HOME"] = self.fake_home
@@ -83,12 +86,154 @@ def __init__(self, querygenerator_config):
         self._setup()
 
     def _setup(self):
-        """Performs initial setup for Jetski CLI, including MCP server configuration."""
+        """Performs initial setup for Jetski CLI.
+
+        For each section present in ``self.setup_config`` this configures MCP
+        servers, writes settings, and installs skills/plugins.
+        """
         mcp_servers_config = dict(self.setup_config.get("mcp_servers", {}))
         mcp_servers_config.update(self.setup_config.get("fake_mcp_servers", {}))
         if mcp_servers_config:
             self._setup_mcp_servers(mcp_servers_config)
 
+        settings_config = self.setup_config.get("settings") or {}
+        if settings_config:
+            self._setup_settings(settings_config)
+
+        # A key that is present but empty (e.g. a YAML list whose entries are all
+        # commented out) yields None, so fall back to an empty list.
+        skills_config = list(self.setup_config.get("plugins") or [])
+        skills_config.extend(self.setup_config.get("skills") or [])
+        if skills_config:
+            self._setup_skills(skills_config)
+
+        skills_dir = self.setup_config.get("skills_dir") or self.setup_config.get("plugins_dir")
+        if skills_dir:
+            self._setup_skills_from_dir(skills_dir)
+
+    def _setup_settings(self, settings_config: dict):
+        """Merges ``settings_config`` into ~/.gemini/jetski/settings.json.
+
+        Existing top-level keys are kept unless ``settings_config`` overrides
+        them; a settings file that is not a valid JSON object is replaced.
+        """
+        settings_path = os.path.join(self.jetski_config_dir, "settings.json")
+        current_settings = {}
+        if os.path.exists(settings_path):
+            try:
+                with open(settings_path, "r", encoding="utf-8") as f:
+                    loaded = json.load(f)
+            except json.JSONDecodeError as e:
+                logging.warning(f"Invalid JSON in Jetski settings at {settings_path}: {e}")
+            else:
+                # Only a JSON object can be merged; any other value would make
+                # the update() call below raise.
+                if isinstance(loaded, dict):
+                    current_settings = loaded
+                else:
+                    logging.warning(f"Ignoring non-object Jetski settings at {settings_path}")
+
+        current_settings.update(settings_config)
+        with open(settings_path, "w", encoding="utf-8") as f:
+            json.dump(current_settings, f, indent=2)
+        logging.info(f"Jetski settings written to {settings_path}")
+
+    def _setup_skills_from_dir(self, skills_dir: str):
+        """Mirrors an entire local marketplace directory into plugins_dir."""
+        if not os.path.isdir(skills_dir):
+            logging.warning(f"Skills directory not found: {skills_dir}")
+            return
+        for entry in os.listdir(skills_dir):
+            full_path = os.path.join(skills_dir, entry)
+            if os.path.isdir(full_path):
+                target_dir = os.path.join(self.plugins_dir, entry)
+                logging.info(f"Syncing Jetski directory skill: {entry} from {full_path}")
+                if os.path.exists(target_dir):
+                    shutil.rmtree(target_dir)
+                try:
+                    shutil.copytree(full_path, target_dir)
+                except Exception as e:
+                    logging.error(f"Failed to copy directory skill {entry}: {e}")
+
+    def _plugin_target_dir(self, name):
+        """Returns the install directory for plugin ``name``, or None if unsafe.
+
+        The returned directory is deleted before each install, so ``name`` must
+        be a single path component: an empty name, ".", ".." or a name holding a
+        path separator would resolve to plugins_dir itself or outside of it.
+        """
+        separators = [sep for sep in (os.sep, os.altsep) if sep]
+        is_plain = isinstance(name, str) and name not in ("", ".", "..")
+        if not is_plain or any(sep in name for sep in separators):
+            logging.error(f"Unsafe Jetski plugin/skill name, skipping: {name!r}")
+            return None
+        return os.path.join(self.plugins_dir, name)
+
+    def _setup_skills(self, skills_list: list):
+        """Sets up individual custom skills/plugins in ~/.gemini/jetski/plugins/.
+
+        Each item is a dict that either names a git repository (``url``,
+        optionally suffixed with ``#<branch-or-tag>``) to shallow-clone, or a
+        local ``path`` (with optional ``name``) to copy. Failures are logged and
+        the remaining items are still processed.
+        """
+        for item in skills_list:
+            if not isinstance(item, dict):
+                logging.warning(f"Unsupported skill config format (expected dict): {item}")
+                continue
+
+            path = None
+            name = None
+            url = None
+            if item.get("action") == "install_from_repo" or item.get("url"):
+                url = item.get("url")
+            else:
+                path = item.get("path")
+                name = item.get("name") or (os.path.basename(path.rstrip("/")) if path else "custom_plugin")
+
+            if url:
+                clone_url, _, version_tag = url.partition("#")
+                repo_name = re.sub(r"\.git$", "", clone_url.rstrip("/").split("/")[-1])
+                target_dir = self._plugin_target_dir(repo_name)
+                if target_dir is None:
+                    continue
+                logging.info(f"Cloning remote Jetski plugin/skill from {url} into {target_dir}")
+                if os.path.exists(target_dir):
+                    shutil.rmtree(target_dir)
+
+                cmd = ["git", "clone", "--depth", "1"]
+                if version_tag:
+                    cmd.extend(["--branch", version_tag])
+                # "--" ends option parsing so a URL starting with "-" cannot be
+                # interpreted as a git option.
+                cmd.extend(["--", clone_url, target_dir])
+                cloned = False
+                try:
+                    res = subprocess.run(cmd, capture_output=True, text=True, check=False, timeout=120)
+                    cloned = res.returncode == 0
+                    if not cloned:
+                        logging.error(f"Failed to clone plugin repository {url}: {res.stderr}")
+                except Exception as e:
+                    logging.error(f"Exception cloning plugin repository {url}: {e}")
+                if not cloned:
+                    # Do not leave a partial checkout behind (e.g. after a timeout).
+                    shutil.rmtree(target_dir, ignore_errors=True)
+
+            elif path and os.path.exists(path):
+                target_dir = self._plugin_target_dir(name)
+                if target_dir is None:
+                    continue
+                logging.info(f"Syncing Jetski plugin/skill: {name} from {path}")
+                if os.path.exists(target_dir):
+                    shutil.rmtree(target_dir)
+                try:
+                    shutil.copytree(path, target_dir)
+                except Exception as e:
+                    logging.error(f"Failed to copy plugin {name}: {e}")
+
+            else:
+                logging.warning(f"Skipping skill config with no url or existing path: {item}")
+
     def _setup_mcp_servers(self, mcp_servers_config: dict):
         """Configures MCP servers in ~/.gemini/jetski/mcp_config.json."""
         mcp_config_path = os.path.join(self.jetski_config_dir, "mcp_config.json")