Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions datasets/jetski-cli-tools/example_run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
############################################################
### Dataset / Eval Items
############################################################
# Eval set holding the scenarios to run, and the format it is written in.
dataset_config: datasets/jetski-cli-tools/jetski-cli.evalset.json
dataset_format: agent-format

# Orchestrator Configuration
orchestrator: agent
# Model config of the agent under evaluation (Jetski CLI).
model_config: datasets/model_configs/jetski_cli_model.yaml
# Model config used for the simulated user.
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml

############################################################
### Scorer Related Configs
############################################################
# Each key enables one scorer; `{}` means no scorer-specific options are set.
scorers:
  trajectory_matcher: {}
  goal_completion:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  behavioral_metrics:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  parameter_analysis:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  turn_count: {}
  end_to_end_latency: {}
  tool_call_latency: {}
  token_consumption: {}

############################################################
### Reporting Related Configs
############################################################
reporting:
  csv:
    output_directory: 'results'
33 changes: 33 additions & 0 deletions datasets/jetski-cli-tools/example_run_fake_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
############################################################
### Dataset / Eval Items
############################################################
# Eval set holding the fake-server scenarios, and the format it is written in.
dataset_config: datasets/jetski-cli-tools/jetski-cli-fake.evalset.json
dataset_format: agent-format

# Orchestrator Configuration
orchestrator: agent
# Model config of the agent under evaluation (Jetski CLI with fake MCP tools).
model_config: datasets/model_configs/jetski_cli_fake_model.yaml
# Model config used for the simulated user.
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml

############################################################
### Scorer Related Configs
############################################################
# Each key enables one scorer; `{}` means no scorer-specific options are set.
scorers:
  trajectory_matcher: {}
  goal_completion:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  behavioral_metrics:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  parameter_analysis:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  turn_count: {}
  end_to_end_latency: {}
  tool_call_latency: {}
  token_consumption: {}

############################################################
### Reporting Related Configs
############################################################
reporting:
  csv:
    output_directory: 'results'
30 changes: 30 additions & 0 deletions datasets/jetski-cli-tools/jetski-cli-fake.evalset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"scenarios": [
{
"id": "fake-jetski-create-instance-success",
"starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.",
"conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.",
"expected_trajectory": [
"create_instance"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "fake-jetski-get-instance-failure",
"starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.",
"conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.",
"expected_trajectory": [
"get_instance"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
},
"kind": "tools",
"max_turns": 3
}
]
}
18 changes: 18 additions & 0 deletions datasets/jetski-cli-tools/jetski-cli.evalset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"scenarios": [
{
"id": "jetski-cloud-sql-debug-01",
"starting_prompt": "list all instances in project ext-test-cloud-sql-postgres",
"conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once all instances are listed, if daily-ci-evals-db exists, get its state and validate that it is RUNNABLE.",
"expected_trajectory": [
"list_instances",
"get_instance"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
},
"kind": "tools",
"max_turns": 3
}
]
}
62 changes: 62 additions & 0 deletions datasets/model_configs/jetski_cli_fake_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Model config for evaluating the Jetski CLI agent against a fake "cloud-sql"
# MCP server whose tool definitions and responses are fixed in this file.
# Path to the Jetski CLI binary.
jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli"
# Generator key; "jetski_cli" is the value get_generator maps to JetskiCliGenerator.
generator: jetski_cli
model: "jetski-agent"

env:
GOOGLE_CLOUD_PROJECT: "astana-evaluation"
GOOGLE_CLOUD_LOCATION: "us-central1"

setup:
# Fake MCP servers, keyed by server name; each entry is the command line used to start it.
fake_mcp_servers:
"cloud-sql":
command: "python"
args:
- "evalbench/util/fake_mcp_server.py"
- "--server-name"
- "cloud-sql"
- "--config"
# Points the fake server back at this same file (presumably to read
# fake_mcp_tools for the named server; confirm in fake_mcp_server.py).
- "datasets/model_configs/jetski_cli_fake_model.yaml"

# Tool definitions for the fake servers, keyed by server name.
fake_mcp_tools:
"cloud-sql":
- name: create_instance
description: "Creates a Postgres instance using `Production` and `Development` presets. For the `Development` template, it chooses a 2 vCPU, 16 GiB RAM, 100 GiB SSD configuration with Non-HA/zonal availability. For the `Production` template, it chooses an 8 vCPU, 64 GiB RAM, 250 GiB SSD configuration with HA/regional availability. The Enterprise Plus edition is used in both cases. The default database version is `POSTGRES_17`. The agent should ask the user if they want to use a different version."
parameters:
type: object
properties:
project_id:
type: string
description: "The ID of the project."
instance_name:
type: string
description: "The name of the Cloud SQL instance."
database_version:
type: string
description: "The database engine type and version."
password:
type: string
description: "The password for the default user."
edition_preset:
type: string
description: "The edition preset for the instance."
required: ["project_id", "instance_name", "database_version", "password", "edition_preset"]
# Fixed success response configured for create_instance.
response:
status: "success"
message: "Instance created successfully"
- name: get_instance
description: "gets the details of a Cloud SQL instance."
parameters:
type: object
properties:
project_id:
type: string
description: "The ID of the project."
instance_name:
type: string
description: "The name of the Cloud SQL instance."
required: ["project_id", "instance_name"]
# Fixed failure response; the "fake-jetski-get-instance-failure" scenario in
# jetski-cli-fake.evalset.json expects this error.
response:
status: "failure"
error:
code: 404
message: "Instance not found or permission denied"
21 changes: 21 additions & 0 deletions datasets/model_configs/jetski_cli_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Jetski CLI version.
# - Cloudtop/Linux default binary path: "/google/bin/releases/jetski-devs/tools/cli"
# - gMac default binary path: "/usr/local/bin/jetski"
jetski_cli_version: "/google/bin/releases/jetski-devs/tools/cli"

# Generator key; "jetski_cli" is the value get_generator maps to JetskiCliGenerator.
generator: jetski_cli

# Model to use for the agent session
model: "jetski-agent"

env:
  GOOGLE_CLOUD_PROJECT: "astana-evaluation"
  GOOGLE_CLOUD_LOCATION: "us-central1"

setup:
  # MCP servers made available to the agent, keyed by server name.
  mcp_servers:
    "cloud-sql":
      httpUrl: "https://sqladmin.googleapis.com/mcp"
      authProviderType: google_credentials
      headers:
        X-Goog-User-Project: astana-evaluation
9 changes: 6 additions & 3 deletions evalbench/evalbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,12 @@ def eval(experiment_config: str):

# Run evaluations
evaluator.evaluate(flatten_dataset(dataset))
job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = (
evaluator.process()
)
process_res = evaluator.process()
if len(process_res) == 5:
job_id, run_time, results_tf, scores_tf, multi_trial_scores_tf = process_res
else:
job_id, run_time, results_tf, scores_tf = process_res[:4]
multi_trial_scores_tf = None

# Create Dataframes for reporting
if results_tf is not None and scores_tf is not None:
Expand Down
19 changes: 11 additions & 8 deletions evalbench/evaluator/agentevaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from generators.models.gemini_cli import GeminiCliGenerator
from generators.models.claude_code import ClaudeCodeGenerator
from generators.models.codex_cli import CodexCliGenerator
from generators.models.jetski_cli import JetskiCliGenerator
from mp import mprunner
from work.agentgenwork import AgentGenWork
from evaluator.simulateduser import SimulatedUser
Expand Down Expand Up @@ -43,10 +44,12 @@ def __init__(
self.agent_version = self.generator.gemini_cli_version
elif isinstance(self.generator, CodexCliGenerator):
self.agent_version = self.generator.codex_cli_version
elif isinstance(self.generator, JetskiCliGenerator):
self.agent_version = self.generator.jetski_cli_version
else:
# No known CLI generator matched: fail fast with the list of supported ones.
raise ValueError(
    # The trailing space matters: adjacent literals are concatenated as-is,
    # so without it the message reads "codex_cli,and jetski_cli".
    f"AgentEvaluator only supports gemini_cli, claude_code, codex_cli, "
    f"and jetski_cli generators, got {type(self.generator).__name__}")

runner_config = self.config.get("runners", {})
self.agent_runners = runner_config.get("agent_runners", 10)
Expand All @@ -58,11 +61,11 @@ def evaluate(
job_id: str,
run_time: datetime.datetime,
):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)):
return self._evaluate_agent_cli(dataset, job_id, run_time)
else:
raise NotImplementedError(
"This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator and CodexCliGenerator")
"This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator and JetskiCliGenerator")

def _evaluate_agent_cli(
self,
Expand Down Expand Up @@ -131,8 +134,8 @@ def process_scenario(
for turn in range(max_turns):
logging.info(
f"Turn {turn + 1}/{max_turns} - Prompt: {current_prompt}")
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator)):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)):
if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)):
cli_cmd = self.generator.create_command(
cli=self.agent_version,
prompt=current_prompt,
Expand Down Expand Up @@ -171,12 +174,12 @@ def process_scenario(
self._log_cli_result(turn, max_turns, result)

tools = []
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)):
tools = self.generator.extract_tools(result.stdout)
accumulated_tools.extend(tools)

# Extract skills from generator output
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator, JetskiCliGenerator)):
skills = self.generator.extract_skills(result.stdout)
accumulated_skills.extend(skills)

Expand Down
2 changes: 1 addition & 1 deletion evalbench/evaluator/agentorchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(
self.report_progress = report_progress

def evaluate(self, dataset: list[EvalGeminiCliRequest]):
logging.info("Starting agent CLI evaluation")
logging.info("Starting Agent evaluation")
evaluator = AgentEvaluator(self.config)
eval_outputs, scoring_results = evaluator.evaluate(
dataset, self.job_id, self.run_time
Expand Down
3 changes: 3 additions & 0 deletions evalbench/generators/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .gemini_cli import GeminiCliGenerator
from .claude_code import ClaudeCodeGenerator
from .codex_cli import CodexCliGenerator
from .jetski_cli import JetskiCliGenerator
from util.config import load_yaml_config


Expand Down Expand Up @@ -42,6 +43,8 @@ def get_generator(global_models, model_config_path: str, db: DB = None):
model = ClaudeCodeGenerator(config)
if config["generator"] == "codex_cli":
model = CodexCliGenerator(config)
if config["generator"] == "jetski_cli":
model = JetskiCliGenerator(config)
if not model:
raise ValueError(f"Unknown Generator {config['generator']}")

Expand Down
Loading
Loading