From 860de6b19b6b26d0dd3b5c804f443a41bfe424d3 Mon Sep 17 00:00:00 2001
From: Milansuman
Date: Tue, 7 Apr 2026 15:50:54 +0530
Subject: [PATCH] feat: create references for evaluations and simulations

---
 skills/netra-best-practices/SKILL.md          |   2 +
 .../references/multi-turn-simulations.md      | 155 ++++++++++++++
 .../references/single-turn-eval.md            | 201 ++++++++++++++++++
 3 files changed, 358 insertions(+)
 create mode 100644 skills/netra-best-practices/references/multi-turn-simulations.md
 create mode 100644 skills/netra-best-practices/references/single-turn-eval.md

diff --git a/skills/netra-best-practices/SKILL.md b/skills/netra-best-practices/SKILL.md
index c323614..f351c0b 100644
--- a/skills/netra-best-practices/SKILL.md
+++ b/skills/netra-best-practices/SKILL.md
@@ -27,6 +27,8 @@ npm install netra-sdk
 - Instrumenting an LLM application: references/instrumentation.md
 - custom metrics for the agent (counters, histograms, gauges): references/custom-metrics.md
+- Setting up single-turn evaluations: references/single-turn-eval.md
+- Setting up multi-turn simulations: references/multi-turn-simulations.md
 
 ## Feedback
 
diff --git a/skills/netra-best-practices/references/multi-turn-simulations.md b/skills/netra-best-practices/references/multi-turn-simulations.md
new file mode 100644
index 0000000..e88aea0
--- /dev/null
+++ b/skills/netra-best-practices/references/multi-turn-simulations.md
@@ -0,0 +1,155 @@
+# Multi-Turn Simulations (Netra SDK)
+
+Use this reference when the user asks to test an AI agent with goal-oriented, multi-turn conversations.
+
+## Outcome
+
+Set up simulation runs where:
+- A multi-turn dataset defines realistic scenarios.
+- A task wrapper calls the user's agent each turn.
+- Netra evaluates whole conversations with simulation evaluators.
+
+## Prerequisites
+
+1. Netra SDK installed and initialized.
+2. Multi-turn dataset created in the Netra dashboard.
+3. Dataset includes scenario goal, max turns, persona, user data, and fact checker data.
+4. Simulation evaluators selected (start with Goal Fulfillment + Factual Accuracy).
+
+## Recommended workflow for the agent
+
+1. Confirm the target agent entry point (how to send one message and get one reply).
+2. Verify Netra init happens before simulation execution.
+3. Implement a `BaseTask` subclass that wraps the user's agent.
+4. Ensure session continuity is preserved (`session_id` / `sessionId`).
+5. Run the simulation with conservative concurrency.
+6. Report completed/failed counts and where to inspect results.
+
+## Python template
+
+```python
+from netra import Netra
+from netra.simulation.task import BaseTask
+from netra.simulation.models import TaskResult
+from uuid import uuid4
+
+Netra.init(
+    app_name="my-app",
+    headers="x-api-key=YOUR_NETRA_API_KEY",
+)
+
+class MyAgentTask(BaseTask):
+    def __init__(self, agent):
+        super().__init__()
+        self.agent = agent
+
+    def run(self, message: str, session_id: str | None = None) -> TaskResult:
+        # Netra can call the first turn with session_id=None.
+        # Generate a fresh per-conversation session id in that case.
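+        # On later turns, Netra passes back the session id returned in
+        # TaskResult, so one conversation keeps a single id throughout.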
+        sid = session_id or str(uuid4())
+        # Replace with the user's real agent call
+        reply = self.agent.chat(message, session_id=sid)
+        return TaskResult(
+            message=reply,
+            session_id=sid,
+        )
+
+result = Netra.simulation.run_simulation(
+    name="Customer Support Simulation",
+    dataset_id="your-multi-turn-dataset-id",
+    task=MyAgentTask(agent=...),
+    context={"environment": "staging"},
+    max_concurrency=3,
+)
+
+print("completed:", len(result["completed"]))
+print("failed:", len(result["failed"]))
+```
+
+## TypeScript template
+
+```typescript
+import { Netra } from "netra-sdk";
+import { BaseTask, TaskResult } from "netra-sdk/simulation";
+
+await Netra.init({
+  appName: "my-app",
+  headers: `x-api-key=${process.env.NETRA_API_KEY}`,
+});
+
+class MyAgentTask extends BaseTask {
+  constructor(private agent: any) {
+    super();
+  }
+
+  async run(message: string, sessionId?: string | null): Promise<TaskResult> {
+    // Netra can call the first turn with sessionId=null/undefined.
+    // Generate a fresh per-conversation session id in that case.
+    const sid = sessionId ?? crypto.randomUUID();
+    // Replace with the user's real agent call
+    const reply = await this.agent.chat(message, { sessionId: sid });
+    return {
+      message: String(reply?.text ?? reply ?? ""),
+      sessionId: sid,
+    };
+  }
+}
+
+const result = await Netra.simulation.runSimulation({
+  name: "Customer Support Simulation",
+  datasetId: "your-multi-turn-dataset-id",
+  task: new MyAgentTask(/* agent */),
+  context: { environment: "staging" },
+  maxConcurrency: 3,
+});
+
+console.log("completed:", result?.completed?.length ?? 0);
+console.log("failed:", result?.failed?.length ?? 0);
+```
+
+## Dataset design guidance
+
+When instructing users to create datasets in the dashboard, include:
+1. A clear scenario goal (what success looks like).
+2. A realistic max-turn limit (4-6 is a good default for support flows).
+3. Persona fit (neutral/friendly/frustrated/confused/custom).
+4. Simulated user data (context the simulator can reference).
+5. Fact checker values (critical facts that must be correct).
+
+## Evaluator guidance
+
+Simulation evaluators are session-level: they score the whole conversation, not individual turns.
+
+Start with:
+- `Goal Fulfillment`
+- `Factual Accuracy`
+
+Then add as needed:
+- `Conversation Completeness`
+- `Guideline Adherence`
+- `Conversational Flow`
+- `Conversation Memory`
+- `Profile Utilization`
+- `Information Elicitation`
+
+## What to check after setup
+
+1. The simulation run starts and returns totals.
+2. Most conversations land in `completed`.
+3. Failures include an actionable `error` and a `turn_id`/`turnId`.
+4. Evaluation scores appear under Test Runs for each scenario.
+5. Per-turn traces are available for debugging.
+
+## Troubleshooting guidance
+
+- Repeated session resets: verify you propagate the returned session id every turn (see the sketch below).
+- High failure count: reduce concurrency and inspect the first failing turn's trace.
+- Unrealistic simulations: improve the scenario goal, persona, and user-data quality.
+- Weak signal: raise evaluator pass thresholds once baseline quality is stable.
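+
+To rule out the session-reset failure mode before a full run, you can smoke-test the wrapper locally. A minimal sketch, meant to run alongside the Python template above; it assumes `TaskResult` exposes its fields as attributes, and the `EchoAgent` stub is a hypothetical stand-in for the real agent:
+
+```python
+class EchoAgent:
+    """Hypothetical stand-in agent used only to exercise the wrapper."""
+
+    def chat(self, message: str, session_id: str) -> str:
+        return f"echo[{session_id}]: {message}"
+
+task = MyAgentTask(agent=EchoAgent())
+
+# Turn 1: the simulator may pass no session id; the wrapper must mint one.
+first = task.run("Hi, I need help with a refund.", session_id=None)
+assert first.session_id, "wrapper should return a session id on turn 1"
+
+# Turn 2: pass the returned id back, as the simulator does on later turns.
+second = task.run("It was placed last week.", session_id=first.session_id)
+assert second.session_id == first.session_id, "session id should stay stable"
+```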
+
+## References
+
+- https://docs.getnetra.ai/quick-start/QuickStart_Simulation
+- https://docs.getnetra.ai/Simulation/Datasets
+- https://docs.getnetra.ai/Simulation/Evaluators
+- https://docs.getnetra.ai/sdk-reference/simulation/python
+- https://docs.getnetra.ai/sdk-reference/simulation/typescript
diff --git a/skills/netra-best-practices/references/single-turn-eval.md b/skills/netra-best-practices/references/single-turn-eval.md
new file mode 100644
index 0000000..b8779b0
--- /dev/null
+++ b/skills/netra-best-practices/references/single-turn-eval.md
@@ -0,0 +1,201 @@
+# Single-Turn Evaluations (Netra SDK)
+
+Use this reference when the user asks to set up or automate single-turn evaluations (input -> output) with Netra.
+
+## Outcome
+
+Set up a repeatable evaluation loop where:
+- Test cases live in a Netra dataset.
+- A task function runs the user's app logic for each dataset item.
+- Netra executes a test run and scores outputs with evaluators.
+
+## Prerequisites
+
+1. Netra SDK installed (the package is `netra-sdk` for both Python and TypeScript).
+2. Netra initialized with an API key header.
+3. At least one dataset configured in the dashboard, or created programmatically.
+4. Evaluators attached in the dashboard (recommended), or passed in code.
+
+## Recommended workflow for the agent
+
+1. Confirm language/runtime (Python or TypeScript).
+2. Ensure `Netra.init(...)` / `await Netra.init(...)` is called once at startup.
+3. Fetch the dataset via the SDK.
+4. Define a `task(input)` function that returns the system's output string/value.
+5. Run the test suite with a clear run name and safe concurrency.
+6. Return the run id and a quick status summary to the user.
+7. Direct the user to Evaluation -> Test Runs for detailed scores.
+
+## Python template
+
+```python
+from netra import Netra
+
+Netra.init(
+    app_name="my-app",
+    headers="x-api-key=YOUR_NETRA_API_KEY",
+)
+
+def my_task(input_data):
+    # Call the user's app/agent here and return the generated output
+    return f"response for: {input_data}"
+
+dataset = Netra.evaluation.get_dataset(dataset_id="your-dataset-id")
+
+result = Netra.evaluation.run_test_suite(
+    name="My Single-Turn Eval",
+    data=dataset,
+    task=my_task,
+    evaluators=["correctness", "relevance"],  # optional
+    max_concurrency=5,
+)
+
+print(result["runId"])
+```
+
+## TypeScript template
+
+```typescript
+import { Netra } from "netra-sdk";
+
+await Netra.init({
+  appName: "my-app",
+  headers: `x-api-key=${process.env.NETRA_API_KEY}`,
+});
+
+async function myTask(inputData: any): Promise<string> {
+  // Call the user's app/agent here and return the generated output
+  return `response for: ${String(inputData)}`;
+}
+
+const dataset = await Netra.evaluation.getDataset("your-dataset-id");
+
+const result = await Netra.evaluation.runTestSuite(
+  "My Single-Turn Eval",
+  dataset,
+  myTask,
+  ["correctness", "relevance"], // optional
+  5
+);
+
+console.log(result?.runId);
+```
+
+## Programmatic dataset management (optional)
+
+Use these SDK APIs when the user wants the setup fully in code:
+- Python: `create_dataset`, `add_dataset_item`, `get_dataset`, `run_test_suite`
+- TypeScript: `createDataset`, `addDatasetItem`, `getDataset`, `runTestSuite`
+
+Minimal pattern:
+1. Create a dataset.
+2. Add dataset items with `input` and `expected_output`/`expectedOutput`.
+3. Fetch the dataset and execute the test suite.
+
+### Python example (fully programmatic)
+
+```python
+from netra import Netra
+
+Netra.init(
+    app_name="my-app",
+    headers="x-api-key=YOUR_NETRA_API_KEY",
+)
+
+created = Netra.evaluation.create_dataset(name="Support QA Dataset")
+dataset_id = created["datasetId"]
+
+Netra.evaluation.add_dataset_item(
+    dataset_id=dataset_id,
+    item={
+        "input": "What is your refund window?",
+        "expected_output": "You can request a refund within 30 days of purchase.",
+    },
+)
+
+Netra.evaluation.add_dataset_item(
+    dataset_id=dataset_id,
+    item={
+        "input": "Do you support overnight shipping?",
+        "expected_output": "Yes, overnight shipping is available in select regions.",
+    },
+)
+
+def task(input_data):
+    # Replace with your real app/agent call.
+    return f"response for: {input_data}"
+
+dataset = Netra.evaluation.get_dataset(dataset_id=dataset_id)
+
+result = Netra.evaluation.run_test_suite(
+    name="Support QA Programmatic Eval",
+    data=dataset,
+    task=task,
+    max_concurrency=3,
+)
+
+print(result["runId"])
+```
+
+### TypeScript example (fully programmatic)
+
+```typescript
+import { Netra } from "netra-sdk";
+
+await Netra.init({
+  appName: "my-app",
+  headers: `x-api-key=${process.env.NETRA_API_KEY}`,
+});
+
+const created = await Netra.evaluation.createDataset("Support QA Dataset");
+const datasetId = created?.datasetId;
+
+if (!datasetId) {
+  throw new Error("Dataset creation failed: missing datasetId");
+}
+
+await Netra.evaluation.addDatasetItem(datasetId, {
+  input: "What is your refund window?",
+  expectedOutput: "You can request a refund within 30 days of purchase.",
+});
+
+await Netra.evaluation.addDatasetItem(datasetId, {
+  input: "Do you support overnight shipping?",
+  expectedOutput: "Yes, overnight shipping is available in select regions.",
+});
+
+const dataset = await Netra.evaluation.getDataset(datasetId);
+
+const result = await Netra.evaluation.runTestSuite(
+  "Support QA Programmatic Eval",
+  dataset,
+  async (inputData: any) => {
+    // Replace with your real app/agent call.
+    return `response for: ${String(inputData)}`;
+  },
+  undefined, // no evaluators in code; use those attached in the dashboard
+  3
+);
+
+console.log(result?.runId);
+```
+
+## What to check after setup
+
+1. A `runId` is returned.
+2. Items are mostly `completed` (not `failed`).
+3. Traces are linked for each test item.
+4. Evaluator scores appear in Test Runs.
+
+## Troubleshooting guidance
+
+- No test runs: confirm the dataset has evaluators and the correct dataset id is used.
+- Empty/invalid outputs: ensure the `task` function returns output for every item.
+- Too many failures/timeouts: lower concurrency first (`max_concurrency` / `maxConcurrency`).
+
+## References
+
+- https://docs.getnetra.ai/quick-start/QuickStart_Evals
+- https://docs.getnetra.ai/Evaluation/Datasets
+- https://docs.getnetra.ai/sdk-reference/evaluation/python
+- https://docs.getnetra.ai/sdk-reference/evaluation/typescript