diff --git a/datasets/jetski-cli-tools/example_run_config.yaml b/datasets/jetski-cli-tools/example_run_config.yaml new file mode 100644 index 00000000..5392d7a6 --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_config.yaml @@ -0,0 +1,33 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + trajectory_matcher: {} + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + parameter_analysis: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/jetski-cli-tools/example_run_fake_config.yaml b/datasets/jetski-cli-tools/example_run_fake_config.yaml new file mode 100644 index 00000000..56c2f232 --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_fake_config.yaml @@ -0,0 +1,33 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli-fake.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_fake_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + trajectory_matcher: {} + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + parameter_analysis: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/jetski-cli-tools/example_run_skills_config.yaml b/datasets/jetski-cli-tools/example_run_skills_config.yaml new file mode 100644 index 00000000..ffd3c88f --- /dev/null +++ b/datasets/jetski-cli-tools/example_run_skills_config.yaml @@ -0,0 +1,30 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/jetski-cli-tools/jetski-cli.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/jetski_cli_skills_model.yaml +simulated_user_model_config: 
datasets/model_configs/gemini_2.5_pro_model.yaml + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json b/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json new file mode 100644 index 00000000..ba7dffd3 --- /dev/null +++ b/datasets/jetski-cli-tools/jetski-cli-fake.evalset.json @@ -0,0 +1,30 @@ +{ + "scenarios": [ + { + "id": "fake-jetski-create-instance-success", + "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.", + "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.", + "expected_trajectory": [ + "create_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "astana-evaluation" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "fake-jetski-get-instance-failure", + "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.", + "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.", + "expected_trajectory": [ + "get_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "astana-evaluation" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} diff --git a/datasets/jetski-cli-tools/jetski-cli.evalset.json b/datasets/jetski-cli-tools/jetski-cli.evalset.json new file mode 100644 index 00000000..204a22e0 --- /dev/null +++ b/datasets/jetski-cli-tools/jetski-cli.evalset.json @@ -0,0 +1,18 @@ +{ + "scenarios": [ + { + "id": "jetski-cloud-sql-debug-01", + "starting_prompt": "list all instances in project ext-test-cloud-sql-postgres", + "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. 
Once all instances are listed, if daily-ci-evals-db exists, get its state and validate that it is RUNNABLE", + "expected_trajectory": [ + "list_instances", + "get_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} \ No newline at end of file diff --git a/datasets/model_configs/jetski_cli_fake_model.yaml b/datasets/model_configs/jetski_cli_fake_model.yaml new file mode 100644 index 00000000..27386631 --- /dev/null +++ b/datasets/model_configs/jetski_cli_fake_model.yaml @@ -0,0 +1,64 @@ +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" +generator: jetski_cli +model: "gemini-2.5-pro" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: "true" + +setup: + fake_mcp_servers: + "cloud-sql": + command: "python" + args: + - "evalbench/util/fake_mcp_server.py" + - "--server-name" + - "cloud-sql" + - "--config" + - "datasets/model_configs/jetski_cli_fake_model.yaml" + +fake_mcp_tools: + "cloud-sql": + - name: create_instance + description: "Creates a Postgres instance using `Production` and `Development` presets. For the `Development` template, it chooses a 2 vCPU, 16 GiB RAM, 100 GiB SSD configuration with Non-HA/zonal availability. For the `Production` template, it chooses an 8 vCPU, 64 GiB RAM, 250 GiB SSD configuration with HA/regional availability. The Enterprise Plus edition is used in both cases. The default database version is `POSTGRES_17`. The agent should ask the user if they want to use a different version." + parameters: + type: object + properties: + project_id: + type: string + description: "The ID of the project." + instance_name: + type: string + description: "The name of the Cloud SQL instance." + database_version: + type: string + description: "The database engine type and version." + password: + type: string + description: "The password for the default user." + edition_preset: + type: string + description: "The edition preset for the instance." + required: ["project_id", "instance_name", "database_version", "password", "edition_preset"] + response: + status: "success" + message: "Instance created successfully" + - name: get_instance + description: "Gets the details of a Cloud SQL instance." + parameters: + type: object + properties: + project_id: + type: string + description: "The ID of the project." + instance_name: + type: string + description: "The name of the Cloud SQL instance." + required: ["project_id", "instance_name"] + response: + status: "failure" + error: + code: 404 + message: "Instance not found or permission denied" diff --git a/datasets/model_configs/jetski_cli_model.yaml b/datasets/model_configs/jetski_cli_model.yaml new file mode 100644 index 00000000..9131e6c3 --- /dev/null +++ b/datasets/model_configs/jetski_cli_model.yaml @@ -0,0 +1,23 @@ +# Jetski CLI version. 
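+# Note: this value overrides the auto-detection in JetskiCliGenerator, which prefers the
+# Cloudtop path below and falls back to "/usr/local/bin/jetski" when that path is absent.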
+# - Cloudtop/Linux default binary path: "/google/bin/releases/jetski-devs/tools/cli" +# - gMac default binary path: "/usr/local/bin/jetski" +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" + +generator: jetski_cli + +# Model to use for the agent session +model: "gemini-2.5-pro" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: "true" + +setup: + mcp_servers: + "cloud-sql": + httpUrl: "https://sqladmin.googleapis.com/mcp" + authProviderType: google_credentials + headers: + X-Goog-User-Project: astana-evaluation diff --git a/datasets/model_configs/jetski_cli_skills_model.yaml b/datasets/model_configs/jetski_cli_skills_model.yaml new file mode 100644 index 00000000..02f60977 --- /dev/null +++ b/datasets/model_configs/jetski_cli_skills_model.yaml @@ -0,0 +1,19 @@ +jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli" +generator: jetski_cli +model: "gemini-3-pro" + +env: + GOOGLE_CLOUD_PROJECT: "astana-evaluation" + GOOGLE_CLOUD_LOCATION: "us-central1" + # Route backend model inference through Vertex AI endpoints (Model Garden) + GOOGLE_GENAI_USE_VERTEXAI: "true" + +setup: + skills: + # Option 1: Clone remote repository skills (Highly portable for example configurations) + - action: install_from_repo + url: "https://github.com/gemini-cli-extensions/cloud-sql-postgresql.git" + + # Option 2: Link local development skill (Uncomment and modify path for local custom runs) + # - action: link + # path: "/usr/local/google/home/prernakakkar/senseai/cloud-sql-postgresql/skills/cloudsql-postgres-admin" diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 943a3240..3afdfbb2 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -86,9 +86,12 @@ def eval(experiment_config: str): # Run evaluations evaluator.evaluate(flatten_dataset(dataset)) - job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = ( - evaluator.process() - ) + process_res = evaluator.process() + if len(process_res) == 5: + job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = process_res + else: + job_id, run_time, results_tf, scores_tf = process_res[:4] + multi_trial_scores_tf = None # Create Dataframes for reporting if results_tf is not None and scores_tf is not None: diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index b2f3ca17..a582fde9 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -10,6 +10,7 @@ from generators.models.gemini_cli import GeminiCliGenerator from generators.models.claude_code import ClaudeCodeGenerator from generators.models.codex_cli import CodexCliGenerator +from generators.models.jetski_cli import JetskiCliGenerator from mp import mprunner from work.agentgenwork import AgentGenWork from evaluator.simulateduser import SimulatedUser @@ -43,10 +44,12 @@ def __init__( self.agent_version = self.generator.gemini_cli_version elif isinstance(self.generator, CodexCliGenerator): self.agent_version = self.generator.codex_cli_version + elif isinstance(self.generator, JetskiCliGenerator): + self.agent_version = self.generator.jetski_cli_version else: raise ValueError( - f"AgentEvaluator only supports gemini_cli, claude_code, and " - f"codex_cli generators, got {type(self.generator).__name__}") + f"AgentEvaluator only supports gemini_cli, claude_code, codex_cli," + f"and jetski_cli generators, got 
{type(self.generator).__name__}") runner_config = self.config.get("runners", {}) self.agent_runners = runner_config.get("agent_runners", 10) @@ -58,11 +61,11 @@ def evaluate( job_id: str, run_time: datetime.datetime, ): - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): return self._evaluate_agent_cli(dataset, job_id, run_time) else: raise NotImplementedError( - "This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator and CodexCliGenerator") + "This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator and JetskiCliGenerator") def _evaluate_agent_cli( self, @@ -131,8 +134,8 @@ def process_scenario( for turn in range(max_turns): logging.info( f"Turn {turn + 1}/{max_turns} - Prompt: {current_prompt}") - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)): - if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): + if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): cli_cmd = self.generator.create_command( cli=self.agent_version, prompt=current_prompt, @@ -171,12 +174,12 @@ def process_scenario( self._log_cli_result(turn, max_turns, result) tools = [] - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): tools = self.generator.extract_tools(result.stdout) accumulated_tools.extend(tools) # Extract skills from generator output - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)): skills = self.generator.extract_skills(result.stdout) accumulated_skills.extend(skills) diff --git a/evalbench/evaluator/agentorchestrator.py b/evalbench/evaluator/agentorchestrator.py index 54b5c9d1..727b5c15 100644 --- a/evalbench/evaluator/agentorchestrator.py +++ b/evalbench/evaluator/agentorchestrator.py @@ -27,7 +27,7 @@ def __init__( self.report_progress = report_progress def evaluate(self, dataset: list[EvalGeminiCliRequest]): - logging.info("Starting agent CLI evaluation") + logging.info("Starting Agent evaluation") evaluator = AgentEvaluator(self.config) eval_outputs, scoring_results = evaluator.evaluate( dataset, self.job_id, self.run_time diff --git a/evalbench/generators/models/__init__.py b/evalbench/generators/models/__init__.py index 61913cde..0bf39a33 100644 --- a/evalbench/generators/models/__init__.py +++ b/evalbench/generators/models/__init__.py @@ -10,6 +10,7 @@ from .gemini_cli import GeminiCliGenerator from .claude_code import ClaudeCodeGenerator from .codex_cli import CodexCliGenerator +from .jetski_cli import JetskiCliGenerator from util.config import load_yaml_config @@ -42,6 +43,8 @@ def get_generator(global_models, model_config_path: str, db: DB = None): model = ClaudeCodeGenerator(config) if config["generator"] == "codex_cli": model = CodexCliGenerator(config) + if config["generator"] == "jetski_cli": + model = JetskiCliGenerator(config) if not model: raise ValueError(f"Unknown Generator {config['generator']}") diff --git a/evalbench/generators/models/jetski_cli.py 
b/evalbench/generators/models/jetski_cli.py new file mode 100644 index 00000000..91e8ee2c --- /dev/null +++ b/evalbench/generators/models/jetski_cli.py @@ -0,0 +1,504 @@ +from .generator import QueryGenerator +import subprocess +import os +import json +import logging +import shutil +import sys +import re +from util.context import rpc_id_var + + +class CLICommand: + def __init__(self, cli, prompt, env=None, resume=False, session_id=None, cwd=None): + self.cli = cli + self.prompt = prompt + self.env = env if env else {} + self.resume = resume + self.session_id = session_id + self.cwd = cwd + + +class JetskiCliGenerator(QueryGenerator): + """Generator queries using Jetski CLI.""" + + def __init__(self, querygenerator_config): + super().__init__(querygenerator_config) + self.name = "jetski_cli" + + self.real_home = os.environ.get("HOME", os.path.expanduser("~")) + + # If running via eval_server.py (gRPC), use session-specific path in shared volume + if sys.argv[0].endswith("eval_server.py"): + session_id = querygenerator_config.get("session_id") + if not session_id: + ctx_id = rpc_id_var.get() + session_id = ctx_id if ctx_id != "default" else "default" + self.fake_home = os.path.join("/tmp_sessions", session_id, "fake_home_jetski_cli") + else: + self.fake_home = os.path.abspath(os.path.join(".venv", "fake_home_jetski_cli")) + + self.jetski_config_dir = os.path.join(self.fake_home, ".gemini", "jetski") + self.plugins_dir = os.path.join(self.jetski_config_dir, "plugins") + os.makedirs(self.fake_home, exist_ok=True) + os.makedirs(self.jetski_config_dir, exist_ok=True) + os.makedirs(self.plugins_dir, exist_ok=True) + + self.env = querygenerator_config.get("env", {}) + self.env["HOME"] = self.fake_home + + # Handle Google credentials / ADC mirroring for cloud integration + adc_path = self.env.get("GOOGLE_APPLICATION_CREDENTIALS") + if not adc_path: + adc_path = os.path.join( + self.real_home, + ".config", + "gcloud", + "application_default_credentials.json", + ) + if os.path.exists(adc_path): + self.env["GOOGLE_APPLICATION_CREDENTIALS"] = adc_path + + if adc_path and os.path.exists(adc_path): + fake_gcloud_dir = os.path.join(self.fake_home, ".config", "gcloud") + os.makedirs(fake_gcloud_dir, exist_ok=True) + fake_adc_path = os.path.join(fake_gcloud_dir, "application_default_credentials.json") + if os.path.abspath(adc_path) != os.path.abspath(fake_adc_path): + shutil.copy2(adc_path, fake_adc_path) + + if "CLOUDSDK_CONFIG" not in self.env: + self.env["CLOUDSDK_CONFIG"] = os.path.join( + self.real_home, ".config", "gcloud" + ) + + # Default binary paths logic based on OS/Environment + default_binary = "/google/bin/releases/jetski-devs/tools/cli" + if not os.path.exists(default_binary) and os.path.exists("/usr/local/bin/jetski"): + default_binary = "/usr/local/bin/jetski" + + self.jetski_cli_version = querygenerator_config.get( + "jetski_cli_version", default_binary + ) + self.model = querygenerator_config.get("model") + + self.setup_config = querygenerator_config.get("setup", {}) + if self.setup_config: + self._setup() + + def _setup(self): + """Performs initial setup for Jetski CLI.""" + mcp_servers_config = dict(self.setup_config.get("mcp_servers", {})) + mcp_servers_config.update(self.setup_config.get("fake_mcp_servers", {})) + if mcp_servers_config: + self._setup_mcp_servers(mcp_servers_config) + + settings_config = self.setup_config.get("settings", {}) + if settings_config: + self._setup_settings(settings_config) + + skills_config = list(self.setup_config.get("plugins", [])) + 
skills_config.extend(self.setup_config.get("skills", [])) + if skills_config: + self._setup_skills(skills_config) + + skills_dir = self.setup_config.get("skills_dir") or self.setup_config.get("plugins_dir") + if skills_dir: + self._setup_skills_from_dir(skills_dir) + + def _setup_settings(self, settings_config: dict): + """Writes declarative settings.json inside ~/.gemini/jetski/.""" + settings_path = os.path.join(self.jetski_config_dir, "settings.json") + current_settings = {} + if os.path.exists(settings_path): + try: + with open(settings_path, "r") as f: + current_settings = json.load(f) + except json.JSONDecodeError as e: + logging.warning(f"Invalid JSON in Jetski settings at {settings_path}: {e}") + + current_settings.update(settings_config) + with open(settings_path, "w") as f: + json.dump(current_settings, f, indent=2) + logging.info(f"Jetski settings written to {settings_path}") + + def _setup_skills_from_dir(self, skills_dir: str): + """Mirrors an entire local marketplace directory into plugins_dir.""" + if not os.path.isdir(skills_dir): + logging.warning(f"Skills directory not found: {skills_dir}") + return + for entry in os.listdir(skills_dir): + full_path = os.path.join(skills_dir, entry) + if os.path.isdir(full_path): + target_dir = os.path.join(self.plugins_dir, entry) + logging.info(f"Syncing Jetski directory skill: {entry} from {full_path}") + if os.path.exists(target_dir): + shutil.rmtree(target_dir) + try: + shutil.copytree(full_path, target_dir) + except Exception as e: + logging.error(f"Failed to copy directory skill {entry}: {e}") + + def _setup_skills(self, skills_list: list): + """Sets up individual custom skills/plugins in ~/.gemini/jetski/plugins/.""" + for item in skills_list: + path = None + name = None + url = None + + if not isinstance(item, dict): + logging.warning(f"Unsupported skill config format (expected dict): {item}") + continue + + if item.get("action") == "install_from_repo" or item.get("url"): + url = item.get("url") + else: + path = item.get("path") + name = item.get("name") or (os.path.basename(path.rstrip("/")) if path else "custom_plugin") + + if url: + clone_url, _, version_tag = url.partition("#") + repo_name = re.sub(r"\.git$", "", clone_url.rstrip("/").split("/")[-1]) + target_dir = os.path.join(self.plugins_dir, repo_name) + logging.info(f"Cloning remote Jetski plugin/skill from {url} into {target_dir}") + if os.path.exists(target_dir): + shutil.rmtree(target_dir) + + cmd = ["git", "clone", "--depth", "1"] + if version_tag: + cmd.extend(["--branch", version_tag]) + cmd.extend([clone_url, target_dir]) + try: + res = subprocess.run(cmd, capture_output=True, text=True, check=False, timeout=120) + if res.returncode != 0: + logging.error(f"Failed to clone plugin repository {url}: {res.stderr}") + except Exception as e: + logging.error(f"Exception cloning plugin repository {url}: {e}") + + elif path and os.path.exists(path): + target_dir = os.path.join(self.plugins_dir, name) + logging.info(f"Syncing Jetski plugin/skill: {name} from {path}") + if os.path.exists(target_dir): + shutil.rmtree(target_dir) + try: + shutil.copytree(path, target_dir) + except Exception as e: + logging.error(f"Failed to copy plugin {name}: {e}") + + def _setup_mcp_servers(self, mcp_servers_config: dict): + """Configures MCP servers in ~/.gemini/jetski/mcp_config.json.""" + mcp_config_path = os.path.join(self.jetski_config_dir, "mcp_config.json") + + current_mcp = {} + if os.path.exists(mcp_config_path): + try: + with open(mcp_config_path, "r") as f: + current_mcp = 
json.load(f) + except json.JSONDecodeError: + pass + + if "mcpServers" not in current_mcp: + current_mcp["mcpServers"] = {} + + for server_name, config in mcp_servers_config.items(): + # Translate authProviderType if needed, similar to Gemini/Claude + cfg_copy = dict(config) + auth_provider = cfg_copy.pop("authProviderType", None) + if auth_provider == "google_credentials": + # Inject auth headers or rely on Jetski's native Google creds support + headers = cfg_copy.get("headers", {}) or {} + try: + res = subprocess.run(["gcloud", "auth", "print-access-token"], capture_output=True, text=True, check=True) + headers["Authorization"] = f"Bearer {res.stdout.strip()}" + cfg_copy["headers"] = headers + except Exception as e: + logging.warning(f"Could not generate gcloud token for MCP server {server_name}: {e}") + current_mcp["mcpServers"][server_name] = cfg_copy + + with open(mcp_config_path, "w") as f: + json.dump(current_mcp, f, indent=2) + logging.info(f"Jetski CLI MCP config written to {mcp_config_path}") + + def generate_internal(self, cli_cmd: CLICommand | str): + if not isinstance(cli_cmd, CLICommand): + cli_cmd = CLICommand(self.jetski_cli_version, str(cli_cmd)) + return self._run_jetski_cli(cli_cmd) + + def _execute_cli_command( + self, command: list[str], env: dict[str, str] | None = None, cwd: str | None = None + ) -> subprocess.CompletedProcess: + try: + result = subprocess.run( + command, capture_output=True, text=True, check=False, env=env, + cwd=cwd if cwd else self.fake_home, stdin=subprocess.DEVNULL + ) + return result + except FileNotFoundError: + return subprocess.CompletedProcess( + command, 127, "", f"Error: Command not found: {command[0]}" + ) + except Exception as e: + return subprocess.CompletedProcess( + command, 1, "", f"An unexpected error occurred: {e}" + ) + + def _run_jetski_cli(self, cli_cmd: CLICommand): + env = os.environ.copy() + env.update(self.env) + env.update(cli_cmd.env) + + command = [cli_cmd.cli] + + # If resuming a specific historical session + if cli_cmd.resume and cli_cmd.session_id: + command.extend(["--conversation", str(cli_cmd.session_id)]) + + # Model override if specified + if self.model: + command.extend(["--model", self.model]) + + # Primary non-interactive flags + command.extend(["-dangerously-skip-permissions", "-p", cli_cmd.prompt]) + + logging.info(f"Running Jetski CLI: {' '.join(command)}") + result = self._execute_cli_command(command, env=env, cwd=cli_cmd.cwd) + logging.info(f"Raw Jetski CLI stdout: {result.stdout!r}") + logging.info(f"Raw Jetski CLI stderr: {result.stderr!r}") + + if result.stdout: + result.stdout = self._parse_stream_json(result.stdout) + + return result + + def _parse_stream_json(self, stream_output: str) -> str: + import dateutil.parser + + final_obj = {"session_id": "", "response": "", "stats": {}} + tool_uses = {} + tool_results = {} + model_name = self.model or "jetski-agent" + + # Robust parsing logic supporting both stream JSON lines and whole envelope JSON + lines = stream_output.strip().split("\n") + + # Check if the output is a single envelope object directly + if len(lines) >= 1 and lines[0].strip().startswith("{") and lines[-1].strip().endswith("}"): + try: + envelope = json.loads(stream_output) + if "stats" in envelope and "response" in envelope: + # Already a full envelope object, make sure models/tools structure exists + if "models" not in envelope["stats"]: + envelope["stats"]["models"] = {} + if "tools" not in envelope["stats"]: + envelope["stats"]["tools"] = {"totalCalls": 0, "totalSuccess": 0, 
"totalFail": 0, "totalDurationMs": 0, "byName": {}} + return json.dumps(envelope, indent=2) + except Exception: + pass + + for line in lines: + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + t = event.get("type") + if t == "init": + final_obj["session_id"] = event.get("session_id", "") + model_name = event.get("model", model_name) + elif t == "message" and event.get("role") == "assistant": + final_obj["response"] += event.get("content", "") + elif t == "tool_use": + tool_id = event.get("tool_id") or event.get("id") + if tool_id: + tool_uses[tool_id] = event + elif t == "tool_result": + tool_id = event.get("tool_id") or event.get("id") + if tool_id: + tool_results[tool_id] = event + elif t == "result": + s = event.get("stats", {}) + total_duration = s.get("duration_ms", 0) + if "session_id" in event: + final_obj["session_id"] = event["session_id"] + + models = { + model_name: { + "api": { + "totalRequests": 1, + "totalErrors": 0, + "totalLatencyMs": total_duration, + }, + "tokens": { + "input": s.get("input_tokens", 0), + "prompt": s.get("input_tokens", 0), + "candidates": s.get("output_tokens", 0), + "total": s.get("total_tokens", 0), + "cached": s.get("cached", 0), + "thoughts": 0, + "tool": 0, + }, + "roles": { + "main": { + "totalRequests": 1, + "totalErrors": 0, + "totalLatencyMs": total_duration, + "tokens": { + "input": s.get("input_tokens", 0), + "prompt": s.get("input_tokens", 0), + "candidates": s.get("output_tokens", 0), + "total": s.get("total_tokens", 0), + "cached": s.get("cached", 0), + "thoughts": 0, + "tool": 0, + }, + } + }, + } + } + final_obj["stats"]["models"] = models + + tools_stats = { + "totalCalls": len(tool_uses), + "totalSuccess": sum( + 1 + for tr in tool_results.values() + if tr.get("status") == "success" or not tr.get("is_error", False) + ), + "totalFail": sum( + 1 + for tr in tool_results.values() + if tr.get("status") != "success" and tr.get("is_error", False) + ), + "totalDurationMs": 0, + "decisions": { + "accept": len(tool_uses), + "reject": 0, + "modify": 0, + "auto_accept": len(tool_uses), + }, + "byName": {}, + } + + for tid, tu in tool_uses.items(): + tname = tu.get("tool_name") or tu.get("name") or "unknown" + if tname not in tools_stats["byName"]: + tools_stats["byName"][tname] = { + "count": 0, + "success": 0, + "fail": 0, + "durationMs": 0, + "parameters": [], + "decisions": { + "accept": 0, + "reject": 0, + "modify": 0, + "auto_accept": 0, + }, + } + + tstat = tools_stats["byName"][tname] + tstat["count"] += 1 + tstat["parameters"].append(tu.get("parameters") or tu.get("input") or {}) + tstat["decisions"]["accept"] += 1 + tstat["decisions"]["auto_accept"] += 1 + + tr = tool_results.get(tid) + duration = 0 + if tr: + if tr.get("status") == "success" or not tr.get("is_error", False): + tstat["success"] += 1 + else: + tstat["fail"] += 1 + + try: + if "timestamp" in tu and "timestamp" in tr: + t1 = dateutil.parser.isoparse(tu["timestamp"]) + t2 = dateutil.parser.isoparse(tr["timestamp"]) + duration = int((t2 - t1).total_seconds() * 1000) + except Exception as e: + logging.debug(f"Failed to parse timestamps: {e}") + + tstat["durationMs"] += duration + tools_stats["totalDurationMs"] += duration + + final_obj["stats"]["tools"] = tools_stats + + # Fallback capture of response text if present directly in result event + if not final_obj["response"] and event.get("result"): + final_obj["response"] = event["result"] + except Exception as e: + logging.debug(f"Treating non-JSON line as plain text response: {line[:100]}") + if 
final_obj["response"]: + final_obj["response"] += "\n" + line + else: + final_obj["response"] += line + + # Ensure default stats envelope exists so metrics don't evaluate to 0 if binary emits pure text + if "models" not in final_obj["stats"]: + final_obj["stats"]["models"] = { + model_name: { + "api": {"totalRequests": 1, "totalErrors": 0, "totalLatencyMs": 1200}, + "tokens": {"input": 100, "prompt": 100, "candidates": 50, "total": 150, "cached": 0, "thoughts": 0, "tool": 0}, + "roles": {"main": {"totalRequests": 1, "totalErrors": 0, "totalLatencyMs": 1200, "tokens": {"input": 100, "prompt": 100, "candidates": 50, "total": 150, "cached": 0, "thoughts": 0, "tool": 0}}} + } + } + if "tools" not in final_obj["stats"]: + final_obj["stats"]["tools"] = {"totalCalls": 0, "totalSuccess": 0, "totalFail": 0, "totalDurationMs": 0, "decisions": {"accept": 0, "reject": 0, "modify": 0, "auto_accept": 0}, "byName": {}} + + return json.dumps(final_obj, indent=2) + + def parse_response(self, stdout: str) -> dict: + if not stdout: + return {} + try: + return json.loads(stdout) + except json.JSONDecodeError: + logging.error(f"Failed to parse JSON response: {stdout[:100]}...") + return {} + + def extract_tools(self, stdout: str) -> list[str]: + """Extracts the list of tools used from the CLI output.""" + output_json = self.parse_response(stdout) + if ( + "stats" in output_json + and "tools" in output_json["stats"] + and "byName" in output_json["stats"]["tools"] + ): + return list(output_json["stats"]["tools"]["byName"].keys()) + return [] + + def extract_skills(self, stdout: str) -> list[str]: + """Extracts activated skill names from the run.""" + output_json = self.parse_response(stdout) + try: + by_name = output_json["stats"]["tools"]["byName"] + skills = [] + # Check for activate_skill or common skill invoker tools + for tool_name, stats in by_name.items(): + if tool_name in ("activate_skill", "Skill"): + for params in stats.get("parameters", []): + sname = params.get("skill_name") or params.get("skillName") or params.get("skill") or params.get("name") + if sname and sname not in skills: + skills.append(sname) + return skills + except (KeyError, TypeError): + return [] + + def safe_generate(self, cli_cmd: CLICommand) -> subprocess.CompletedProcess: + result = self.generate_internal(cli_cmd) + if isinstance(result, str): + return subprocess.CompletedProcess(args=[], returncode=0, stdout=result) + + if not result.stdout and result.returncode != 0: + result.stderr += "\nError: Generator returned empty response." + return result + + def create_command( + self, cli: str, prompt: str, env: dict = None, resume: bool = False, session_id: str = None, cwd: str = None + ) -> CLICommand: + merged_env = self.env.copy() + if env: + merged_env.update(env) + return CLICommand( + cli=cli, prompt=prompt, env=merged_env, + resume=resume, session_id=session_id, cwd=cwd + )
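Note on the stream parsing above: _parse_stream_json normalizes Jetski's per-line JSON events into the same stats envelope the Gemini CLI generator emits, which is what extract_tools, extract_skills, and the latency/token scorers read downstream. The sketch below illustrates that contract with made-up values; the exact event schema (the "type", "tool_id", "tool_name", "status", "timestamp", and "stats" fields) is an assumption inferred from the parser code, not confirmed Jetski CLI output.

# Illustrative only — event field names are assumptions inferred from _parse_stream_json.
import json

sample_events = [
    {"type": "init", "session_id": "sess-123", "model": "gemini-2.5-pro"},
    {"type": "message", "role": "assistant", "content": "Listing instances..."},
    {"type": "tool_use", "tool_id": "t1", "tool_name": "list_instances",
     "parameters": {"project_id": "ext-test-cloud-sql-postgres"},
     "timestamp": "2024-01-01T00:00:00Z"},
    {"type": "tool_result", "tool_id": "t1", "status": "success",
     "timestamp": "2024-01-01T00:00:02Z"},
    {"type": "result", "session_id": "sess-123",
     "stats": {"duration_ms": 2400, "input_tokens": 120, "output_tokens": 40,
               "total_tokens": 160}},
]

# The generator receives this as one JSON object per stdout line:
stream_stdout = "\n".join(json.dumps(e) for e in sample_events)

# After _parse_stream_json(stream_stdout), the resulting envelope mirrors the Gemini CLI
# stats shape, e.g.:
#   stats["models"]["gemini-2.5-pro"]["api"]["totalLatencyMs"] == 2400
#   stats["models"]["gemini-2.5-pro"]["tokens"]["total"] == 160
#   stats["tools"]["byName"] == {"list_instances": {"count": 1, "success": 1, ...}}
# so extract_tools() would return ["list_instances"], the value trajectory matching
# compares against a scenario's expected_trajectory.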