From 564468735facc5a66da8f6606a585e1ce755890c Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 17 Mar 2026 12:00:51 -0400 Subject: [PATCH 1/2] feat: Add spec doc for classifiers in the SDK --- docs/telemetry/classifier.md | 238 +++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 docs/telemetry/classifier.md diff --git a/docs/telemetry/classifier.md b/docs/telemetry/classifier.md new file mode 100644 index 0000000..a444d6a --- /dev/null +++ b/docs/telemetry/classifier.md @@ -0,0 +1,238 @@ +# Classifiers + +## Overview + +Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional confidence and metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations. + +Classifications are stored as `Record` -- a dictionary keyed by classifier name, where each value is an array of items. This supports multiple classifiers producing independent label sets, a single classifier producing multiple labels, and multiple classifiers contributing to the same key. + +--- + +## Public API + +### Evaluator Interface + +An evaluator **MUST** include at least one of `scores` or `classifiers` (or both). SDKs **MUST** validate this at runtime and raise a clear error if neither is provided, even if the constraint is also enforced at the type level. + +```typescript +interface EvaluatorBase { + data: () => Dataset; + task: (input: Input, hooks: Hooks) => Output | Promise; +} + +type Evaluator = + | EvaluatorBase & { + scores: EvalScorer[]; + classifiers?: EvalClassifier[]; + } + | EvaluatorBase & { + scores?: EvalScorer[]; + classifiers: EvalClassifier[]; + }; +``` + +### EvalClassifier + +A classifier function accepts the same arguments as a scorer. It **MAY** return a single `Classification`, an array (multi-label), or `null`. It **MAY** be synchronous or asynchronous. 
+ +```typescript +type OneOrMoreClassifications = Classification | Classification[] | null; + +type EvalClassifier = ( + args: EvalScorerArgs, +) => OneOrMoreClassifications | Promise<OneOrMoreClassifications>; +``` + +### Classification + +Returned by classifier functions. The `name` field is used as the grouping key in the results dictionary and is omitted when converting to the storage format. + +```typescript +interface Classification { + name: string; + id: string; + label?: string; + confidence?: number | null; + metadata?: Record; +} +``` + +### EvalResult + +The `classifications` field **MUST** be omitted (not an empty object) when no classifiers are defined or all return `null`. + +```typescript +interface EvalResult { + input: unknown; + output: unknown; + expected?: unknown; + scores?: Record; + classifications?: Record<string, ClassificationItem[]>; + metadata?: Record; +} +``` + +--- + +## Behavior + +### Execution + +SDKs **MUST** run classifiers in parallel with scorers (e.g., `Promise.all`). + +Each classifier **MUST** run inside a traced span with `type: "classifier"` and `name` set to the resolved classifier name: + +```typescript +rootSpan.traced( + async (classifierSpan) => { + const result = await classifierFn({ input, output, expected, metadata }); + classifierSpan.log({ output: result }); + return result; + }, + { + name: resolvedClassifierName, + span_attributes: { + name: resolvedClassifierName, + type: "classifier", + }, + }, +); +``` + +### Name Resolution + +SDKs **MUST** resolve classifier name with this precedence: + +1. `name` field on the returned `Classification` object(s) +2. `.name` property of the classifier function +3. Fallback: `classifier_${index}` + +Items with the same resolved name **MUST** be appended to the same array. + +### Validation + +Each classification result **MUST** have: +- A `name` that is a non-empty string +- An `id` that is a non-empty string + +If validation fails, treat the classifier as failed. 
+ +Additional field rules: +- `confidence` is an unconstrained number (no 0-1 range enforced). SDKs **MUST NOT** reject values outside 0-1. +- `metadata` is an unconstrained `Record`. SDKs **MUST NOT** impose size limits. +- Duplicate `{name, id}` pairs are allowed. Multiple items with the same `id` under the same name key **MUST** all be stored. Deduplication, if needed, is handled at the display layer. +- Order is stable. Items **MUST** be stored in the order they are returned by the classifier. + +### Conversion to ClassificationItem + +When storing results, SDKs **MUST** convert `Classification` to `ClassificationItem`: + +1. Copy `id` as-is +2. Default `label` to `id` if not provided +3. Include `confidence` and `metadata` only if present +4. Omit `name` (it becomes the dictionary key) + +### Error Handling + +Classifier failures **MUST NOT** abort the evaluation or affect other classifiers/scorers. + +On failure: +1. Record the error under `classifier_errors` in eval metadata (maps classifier name to error message) +2. Log the error to the root span's metadata +3. **SHOULD** emit a debug warning + +This mirrors the `scorer_errors` pattern. + +--- + +## Wire Format + +### ClassificationItem + +The storage format for a single classification. Derived from `Classification` by dropping `name` and adding an optional `source`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | String | **REQUIRED** | Stable identifier for filtering and grouping. | +| `label` | String | **OPTIONAL** | Display label. Defaults to `id`. | +| `confidence` | Number \| null | **OPTIONAL** | Confidence score, typically 0-1. | +| `metadata` | Record\ | **OPTIONAL** | Arbitrary metadata. | +| `source` | SavedFunctionId \| null | **OPTIONAL** | Function that produced this classification. Set by the platform for online scoring; SDKs MAY omit. | + +### Classifications on Events + +Stored as a top-level `classifications` field on experiment and log events. 
**MUST** be `Record`. **MUST** be omitted when empty. + +```json +{ + "classifications": { + "category": [ + { "id": "greeting", "label": "Greeting", "confidence": 0.91 } + ], + "sentiment": [ + { "id": "positive", "label": "Positive" }, + { "id": "enthusiastic", "label": "Enthusiastic" } + ] + } +} +``` + +--- + +## Examples + +### Basic + +```javascript +Eval("my-project", { + data: () => [{ input: "Hello!", expected: "Hi there!" }], + task: async (input) => callMyModel(input), + scores: [ + ({ output, expected }) => ({ + name: "exact_match", + score: output === expected ? 1 : 0, + }), + ], + classifiers: [ + ({ output }) => ({ + name: "category", + id: "greeting", + label: "Greeting", + confidence: 0.95, + }), + ], +}); +``` + +### Classifiers Only (No Scores) + +```javascript +Eval("my-project", { + data: () => [{ input: "Hello!", expected: "Hi there!" }], + task: async (input) => callMyModel(input), + classifiers: [categoryClassifier, sentimentClassifier], +}); +``` + +### Multi-Label + +```javascript +const sentimentClassifier = ({ output }) => [ + { name: "sentiment", id: "positive", label: "Positive", confidence: 0.8 }, + { name: "sentiment", id: "enthusiastic", label: "Enthusiastic", confidence: 0.6 }, +]; +``` + +### Error Output + +When a classifier fails, the result includes: + +```json +{ + "metadata": { + "classifier_errors": { + "broken_classifier": "must return classifications with a non-empty string name" + } + } +} +``` From 18144db582e08e5e5da3e0b83167547dd9878a01 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 17 Mar 2026 15:49:38 -0400 Subject: [PATCH 2/2] clean up spec --- docs/telemetry/classifier.md | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/docs/telemetry/classifier.md b/docs/telemetry/classifier.md index a444d6a..9331af2 100644 --- a/docs/telemetry/classifier.md +++ b/docs/telemetry/classifier.md @@ -2,7 +2,7 @@ ## Overview -Classifiers categorize and label eval outputs. 
Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional confidence and metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations. +Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations. Classifications are stored as `Record` -- a dictionary keyed by classifier name, where each value is an array of items. This supports multiple classifiers producing independent label sets, a single classifier producing multiple labels, and multiple classifiers contributing to the same key. @@ -52,7 +52,6 @@ interface Classification { name: string; id: string; label?: string; - confidence?: number | null; metadata?: Record; } ``` @@ -80,7 +79,7 @@ interface EvalResult { SDKs **MUST** run classifiers in parallel with scorers (e.g., `Promise.all`). -Each classifier **MUST** run inside a traced span with `type: "classifier"` and `name` set to the resolved classifier name: +Each classifier **MUST** run inside a traced span with `type: "classifier"` and `purpose: "scorer"`. The traced span name is resolved from the classifier function name (or fallback) rather than from returned classification items: ```typescript rootSpan.traced( @@ -90,10 +89,10 @@ rootSpan.traced( return result; }, { - name: resolvedClassifierName, + name: resolvedClassifierSpanName, span_attributes: { - name: resolvedClassifierName, type: "classifier", + purpose: "scorer", }, }, ); @@ -101,13 +100,12 @@ rootSpan.traced( ### Name Resolution -SDKs **MUST** resolve classifier name with this precedence: +SDKs **MUST** resolve names in two places: -1. `name` field on the returned `Classification` object(s) -2. `.name` property of the classifier function -3. Fallback: `classifier_${index}` +1. 
Classifier span name: `.name` property of the classifier function, falling back to `classifier_${index}` +2. Classification result grouping key: `name` field on each returned `Classification` object -Items with the same resolved name **MUST** be appended to the same array. +Each returned classification item **MUST** include a non-empty string `name`; items with the same returned `name` **MUST** be appended to the same array. ### Validation @@ -118,7 +116,6 @@ Each classification result **MUST** have: If validation fails, treat the classifier as failed. Additional field rules: -- `confidence` is an unconstrained number (no 0-1 range enforced). SDKs **MUST NOT** reject values outside 0-1. - `metadata` is an unconstrained `Record`. SDKs **MUST NOT** impose size limits. - Duplicate `{name, id}` pairs are allowed. Multiple items with the same `id` under the same name key **MUST** all be stored. Deduplication, if needed, is handled at the display layer. - Order is stable. Items **MUST** be stored in the order they are returned by the classifier. @@ -129,7 +126,7 @@ When storing results, SDKs **MUST** convert `Classification` to `ClassificationI 1. Copy `id` as-is 2. Default `label` to `id` if not provided -3. Include `confidence` and `metadata` only if present +3. Include `metadata` only if present 4. Omit `name` (it becomes the dictionary key) ### Error Handling @@ -149,15 +146,13 @@ This mirrors the `scorer_errors` pattern. ### ClassificationItem -The storage format for a single classification. Derived from `Classification` by dropping `name` and adding an optional `source`. +The storage format for a single classification. Derived from `Classification` by dropping `name` and defaulting `label` to `id` when omitted. | Field | Type | Required | Description | |---|---|---|---| | `id` | String | **REQUIRED** | Stable identifier for filtering and grouping. | -| `label` | String | **OPTIONAL** | Display label. Defaults to `id`. 
| -| `confidence` | Number \| null | **OPTIONAL** | Confidence score, typically 0-1. | +| `label` | String | **REQUIRED** | Display label. Defaults to `id` during conversion. | | `metadata` | Record\ | **OPTIONAL** | Arbitrary metadata. | -| `source` | SavedFunctionId \| null | **OPTIONAL** | Function that produced this classification. Set by the platform for online scoring; SDKs MAY omit. | ### Classifications on Events @@ -167,7 +162,7 @@ Stored as a top-level `classifications` field on experiment and log events. **MU { "classifications": { "category": [ - { "id": "greeting", "label": "Greeting", "confidence": 0.91 } + { "id": "greeting", "label": "Greeting" } ], "sentiment": [ { "id": "positive", "label": "Positive" }, @@ -198,7 +193,6 @@ Eval("my-project", { name: "category", id: "greeting", label: "Greeting", - confidence: 0.95, }), ], }); @@ -218,8 +212,8 @@ Eval("my-project", { ```javascript const sentimentClassifier = ({ output }) => [ - { name: "sentiment", id: "positive", label: "Positive", confidence: 0.8 }, - { name: "sentiment", id: "enthusiastic", label: "Enthusiastic", confidence: 0.6 }, + { name: "sentiment", id: "positive", label: "Positive" }, + { name: "sentiment", id: "enthusiastic", label: "Enthusiastic" }, ]; ```