From c37340168524228f40da924aae2973f4465fbdde Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Fri, 13 Mar 2026 15:58:44 -0700 Subject: [PATCH 1/2] feat: Add text-based SMK prompts --- evals/prompts/CHANGELOG.md | 6 ++ .../subject-matter-knowledge/system.txt | 69 +++++++++++++++++++ .../prompts/subject-matter-knowledge/user.txt | 4 ++ 3 files changed, 79 insertions(+) create mode 100644 evals/prompts/subject-matter-knowledge/system.txt create mode 100644 evals/prompts/subject-matter-knowledge/user.txt diff --git a/evals/prompts/CHANGELOG.md b/evals/prompts/CHANGELOG.md index 5298060..b4af3e8 100644 --- a/evals/prompts/CHANGELOG.md +++ b/evals/prompts/CHANGELOG.md @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), --- +## [1.3.0] - 2026-03-18 + +### Added +- `subject-matter-knowledge/system.txt` — system prompt for the SMK evaluator +- `subject-matter-knowledge/user.txt` — user prompt for the SMK evaluator + ## [1.2.0] - 2026-02-19 ### Added diff --git a/evals/prompts/subject-matter-knowledge/system.txt b/evals/prompts/subject-matter-knowledge/system.txt new file mode 100644 index 0000000..b0a4f77 --- /dev/null +++ b/evals/prompts/subject-matter-knowledge/system.txt @@ -0,0 +1,69 @@ + +To perform the task of evaluating text complexity based on Subject Matter Knowledge (SMK), strictly adhere to the following instructions. +Role +You are an expert K-12 Literacy Pedagogue and Text Complexity Evaluator. Your specific focus is analyzing Subject Matter Knowledge (SMK) demands according to the Common Core Qualitative Text Complexity Rubric. +Objective +Analyze a provided text relative to a target grade_level. You must determine the extent of background knowledge required to comprehend the text. You must distinguish between Common/Standard knowledge (generally lower/moderate complexity) and Specialized/Theoretical knowledge (generally higher complexity). +Input Data +text: The passage to analyze. 
+grade_level: The target student grade (integer). +fk_score: Flesch-Kincaid Grade Level. Note: Use this only as a loose proxy for sentence structure. Do not let a high FK score artificially inflate the Subject Matter Knowledge score if the concepts remain simple. + +1. The Rubric: Subject Matter Knowledge (SMK) +1. Slightly Complex +Scope: Everyday, practical knowledge, and Introduction to Skills. +Concept Type: Concrete, directly observable, and familiar. +Key Indicator: "How-to" texts involving familiar objects (e.g., drawing a cupboard, playing a game, family life). Even if specific terms (like "scale" or "measure") are used, if the application is on a common object, it remains Slightly Complex. +2. Moderately Complex +Scope: Common Discipline-Specific Knowledge or Narrative History. +Definition: Topics widely introduced in K-8 curricula (Basic American History, Geography, Earth Science, Biology). +Key Characteristic: The text bridges concrete descriptions with abstract themes (e.g., using farming to discuss justice), OR narrates historical events via sensory details. +Spatial Reasoning: Texts requiring mental manipulation of maps/routes are generally Moderate, unless the object is a familiar household item (see Slightly Complex). +3. Very Complex +Scope: Specialized Discipline-Specific, Engineering Mechanics, or Political Theory. +Definition: Topics characteristic of High School (9-12) curricula requiring abstract mental models. +Key Characteristic: Requires understanding mechanisms (how physics works/propulsion), chemical composition, or undefined political stakes (specific treaties, alliances, or secularization without context). +4. Exceedingly Complex +Scope: Professional or Academic knowledge. + +2. The Expert Mental Model (Decision Logic) +Use these refined rules to categorize cases. +Rule A: The "Layers of Meaning" Check +Concrete -> Abstract (Moderate): The text describes concrete things (farming) to argue an abstract point (justice, rights). 
+Concrete -> Concrete (Slightly): The text describes concrete things (lines, paper) to achieve a concrete result (drawing a cupboard). Do not over-rank practical instructions. +Rule B: The Science & Engineering Boundary +Observational (Moderate): Habitats, Water Cycle, observable traits, simple definitions. +Mechanistic/Theoretical (Very): Engineering mechanics (how propulsion works via reaction), Instrumentation (using a spectroscope), or Chemical/Atomic theory. +Test: Does the text explain how a machine functions using physical principles? If yes, it is Very Complex. +Rule C: The History/Social Studies Boundary +General/Narrative (Moderate): +Sensory: Battle descriptions focusing on sights/sounds (flashes, smoke). +Standard Topics: Immigration, Slavery, Government, Geography. Lists of nationalities or religions are "Common Knowledge" for Grades 6-8. +Political/Contextual (Very): +Implicit Context: Texts assuming knowledge of specific political factions, treaties, or the causes of events without explanation (e.g., "The Allies," "The Front," "The secularization of the clergy"). +Test: If the reader must know why two groups are fighting or the specific political history of a revolution to understand the text, it is Very Complex. +Rule D: The "Technical vs. Practical" Trap +Scenario: A text teaches a technical skill (e.g., Technical Drawing/Technology) but applies it to a familiar object (a cupboard). +Decision: Slightly Complex. +Reasoning: Do not confuse "Technical Vocabulary" (scale, thick lines) with "Theoretical Complexity." If the underlying concept is familiar (furniture), the SMK load is low. + +3. Critical Calibration Examples +Text: "Make a rough sketch... How many shelves should the cupboard have?" (Grade 2) -> Slightly Complex. +Reasoning: (Rule D/Rule A) Although it mentions "scale" and "technology," the task is concrete and relies on everyday knowledge. +Text: "Hydraulic propulsion works by sucking water at the bow and forcing it sternward." 
(Grade 10) -> Very Complex. +Reasoning: (Rule B) Explains a mechanism using physics principles. +Text: "The Allies fight the enemy's cavalry; we remember the hospitality to priests during the Revolution." (Grade 6) -> Very Complex. +Reasoning: (Rule C) Assumes undefined knowledge of WWI alliances and the specific political history of the French Revolution. +Text: "Immigrants from Poland, Italy, and Russia arrived. Most were Catholic or Orthodox." (Grade 7) -> Moderately Complex. +Reasoning: (Rule C) Standard K-8 topic. Lists of nationalities are content vocabulary, not specialized theory. + +4. Output Format +Return your analysis in a valid JSON object. Do not include markdown formatting. +Keys: +- identified_topics: List[str] identifying the core subjects. +- curriculum_check: String explaining if the topics are "Standard/General" (typical for K-8) or "Specialized/High School" (typical for 9-12). +- assumptions_and_scaffolding: String analyzing what the author assumes the reader knows vs what is explained. +- friction_analysis: String discussing the gap between Concrete description and Abstract meaning. +- complexity_score: String (One of: slightly_complex, moderately_complex, very_complex, exceedingly_complex). +- reasoning: String synthesizing the decision. 
+ diff --git a/evals/prompts/subject-matter-knowledge/user.txt b/evals/prompts/subject-matter-knowledge/user.txt new file mode 100644 index 0000000..a4f53d2 --- /dev/null +++ b/evals/prompts/subject-matter-knowledge/user.txt @@ -0,0 +1,4 @@ +Analyze: +Text: {text} +Grade: {grade} +FK Score: {fk_score} \ No newline at end of file From baf4a8b446ecaa285521660d2557031c173eebe8 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Sun, 15 Mar 2026 21:24:07 -0700 Subject: [PATCH 2/2] feat: Add SMK evaluator to TypeScript SDK --- sdks/typescript/CHANGELOG.md | 10 + sdks/typescript/README.md | 108 +++++++-- sdks/typescript/src/evaluators/index.ts | 5 + sdks/typescript/src/evaluators/smk.ts | 229 ++++++++++++++++++ .../src/evaluators/text-complexity.ts | 30 ++- sdks/typescript/src/index.ts | 5 + .../prompts/subject-matter-knowledge/index.ts | 22 ++ sdks/typescript/src/schemas/smk.ts | 16 ++ .../tests/integration/smk.integration.test.ts | 175 +++++++++++++ .../tests/unit/evaluators/smk.test.ts | 127 ++++++++++ .../unit/evaluators/text-complexity.test.ts | 60 ++++- 11 files changed, 755 insertions(+), 32 deletions(-) create mode 100644 sdks/typescript/src/evaluators/smk.ts create mode 100644 sdks/typescript/src/prompts/subject-matter-knowledge/index.ts create mode 100644 sdks/typescript/src/schemas/smk.ts create mode 100644 sdks/typescript/tests/integration/smk.integration.test.ts create mode 100644 sdks/typescript/tests/unit/evaluators/smk.test.ts diff --git a/sdks/typescript/CHANGELOG.md b/sdks/typescript/CHANGELOG.md index 516fb03..143338f 100644 --- a/sdks/typescript/CHANGELOG.md +++ b/sdks/typescript/CHANGELOG.md @@ -2,6 +2,16 @@ All notable changes to the `@learning-commons/evaluators` TypeScript SDK will be documented in this file. +## [0.2.0] — 2026-03-18 + +### Added + +- **Subject Matter Knowledge (SMK) Evaluator** — evaluates background knowledge demands of educational texts relative to grades 3–12. 
+- **SMK added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, and SMK in parallel; result includes `subjectMatterKnowledge` key. +- **Prompt versioning** — prompts updated to v1.3.0 (`evals/prompts/subject-matter-knowledge/`). + +--- + ## [0.1.0] — Early Release Initial early release of the TypeScript SDK for Learning Commons educational evaluators. diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 51f8726..cf73044 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -117,19 +117,18 @@ await evaluator.evaluate(text: string, grade: string) --- -### 3. Text Complexity Evaluator +### 3. Subject Matter Knowledge (SMK) Evaluator -Composite evaluator that analyzes both vocabulary and sentence structure complexity in parallel. +Evaluates the background knowledge demands of educational texts relative to grade level. Determines how much prior subject knowledge a student needs to comprehend the text, based on the Common Core Qualitative Text Complexity Rubric. 
**Supported Grades:** 3-12 -**Uses:** Google Gemini 2.5 Pro + OpenAI GPT-4o (composite) +**Uses:** Google Gemini 3 Flash Preview **Constructor:** ```typescript -const evaluator = new TextComplexityEvaluator({ +const evaluator = new SmkEvaluator({ googleApiKey?: string; // Google API key (required by this evaluator) - openaiApiKey?: string; // OpenAI API key (required by this evaluator) maxRetries?: number; // Optional - Max retry attempts (default: 2) telemetry?: boolean | TelemetryOptions; // Optional (default: true) logger?: Logger; // Optional - Custom logger @@ -145,23 +144,103 @@ await evaluator.evaluate(text: string, grade: string) **Returns:** ```typescript { - score: { - overall: string; // Overall complexity (highest of the two) - vocabulary: string; // Vocabulary complexity score - sentenceStructure: string; // Sentence structure complexity score + score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex'; + reasoning: string; + metadata: { + model: string; + processingTimeMs: number; }; - reasoning: string; // Combined reasoning from both evaluators - metadata: EvaluationMetadata; _internal: { - vocabulary: EvaluationResult | { error: Error }; - sentenceStructure: EvaluationResult | { error: Error }; + identified_topics: string[]; + curriculum_check: string; + assumptions_and_scaffolding: string; + friction_analysis: string; + complexity_score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex'; + reasoning: string; }; } ``` +**Example:** +```typescript +import { SmkEvaluator } from '@learning-commons/evaluators'; + +const evaluator = new SmkEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, +}); + +const result = await evaluator.evaluate( + "Hydraulic propulsion works by sucking water at the bow and forcing it sternward.", + "10" +); +console.log(result.score); // "Very complex" +console.log(result.reasoning); +console.log(result._internal.identified_topics); // ["hydraulics", 
"propulsion", "physics"] +``` + +--- + +### 4. Text Complexity Evaluator + +Composite evaluator that analyzes vocabulary, sentence structure, and subject matter knowledge complexity in parallel. + +**Supported Grades:** 3-12 + +**Uses:** Google Gemini 2.5 Pro + Google Gemini 3 Flash Preview + OpenAI GPT-4o (composite) + +**Constructor:** +```typescript +const evaluator = new TextComplexityEvaluator({ + googleApiKey?: string; // Google API key (required by this evaluator) + openaiApiKey?: string; // OpenAI API key (required by this evaluator) + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string, grade: string) +``` + +**Returns:** +```typescript +{ + vocabulary: EvaluationResult | { error: Error }; + sentenceStructure: EvaluationResult | { error: Error }; + subjectMatterKnowledge: EvaluationResult | { error: Error }; +} +``` + +Each sub-evaluator result is either a full `EvaluationResult` or `{ error: Error }` if that evaluator failed. An error is only thrown if all three fail. + +**Example:** +```typescript +import { TextComplexityEvaluator } from '@learning-commons/evaluators'; + +const evaluator = new TextComplexityEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, + openaiApiKey: process.env.OPENAI_API_KEY, +}); + +const result = await evaluator.evaluate("Your text here", "6"); + +if (!('error' in result.vocabulary)) { + console.log('Vocabulary:', result.vocabulary.score); +} +if (!('error' in result.sentenceStructure)) { + console.log('Sentence structure:', result.sentenceStructure.score); +} +if (!('error' in result.subjectMatterKnowledge)) { + console.log('Subject matter knowledge:', result.subjectMatterKnowledge.score); +} +``` + --- -### 4. 
Grade Level Appropriateness Evaluator +### 5. Grade Level Appropriateness Evaluator Determines appropriate grade level for text. @@ -308,6 +387,7 @@ interface BaseEvaluatorConfig { **Note:** Which API keys are required depends on the evaluator. The SDK validates required keys at runtime based on the evaluator's metadata: - **Vocabulary**: Requires both `googleApiKey` and `openaiApiKey` - **Sentence Structure**: Requires `openaiApiKey` only +- **Subject Matter Knowledge**: Requires `googleApiKey` only - **Text Complexity**: Requires both `googleApiKey` and `openaiApiKey` - **Grade Level Appropriateness**: Requires `googleApiKey` only diff --git a/sdks/typescript/src/evaluators/index.ts b/sdks/typescript/src/evaluators/index.ts index 7451765..1997c2b 100644 --- a/sdks/typescript/src/evaluators/index.ts +++ b/sdks/typescript/src/evaluators/index.ts @@ -20,6 +20,11 @@ export { evaluateGradeLevelAppropriateness, } from './grade-level-appropriateness.js'; +export { + SmkEvaluator, + evaluateSmk, +} from './smk.js'; + export { TextComplexityEvaluator, evaluateTextComplexity, diff --git a/sdks/typescript/src/evaluators/smk.ts b/sdks/typescript/src/evaluators/smk.ts new file mode 100644 index 0000000..186026b --- /dev/null +++ b/sdks/typescript/src/evaluators/smk.ts @@ -0,0 +1,229 @@ +import type { LLMProvider } from '../providers/index.js'; +import { createProvider } from '../providers/index.js'; +import { SmkOutputSchema, type SmkInternal } from '../schemas/smk.js'; +import { calculateFleschKincaidGrade } from '../features/index.js'; +import { getSystemPrompt, getUserPrompt } from '../prompts/subject-matter-knowledge/index.js'; +import type { EvaluationResult, TextComplexityLevel } from '../schemas/index.js'; +import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; +import type { StageDetail } from '../telemetry/index.js'; +import { ValidationError, wrapProviderError } from '../errors.js'; + +/** + * Subject Matter Knowledge (SMK) Evaluator + * + * Evaluates 
the background knowledge demands of educational texts relative to grade level. + * Determines how much prior subject knowledge a student needs to comprehend the text. + * + * Based on the Common Core Qualitative Text Complexity Rubric with 4 levels: + * - Slightly complex + * - Moderately complex + * - Very complex + * - Exceedingly complex + * + * @example + * ```typescript + * const evaluator = new SmkEvaluator({ + * googleApiKey: process.env.GOOGLE_API_KEY + * }); + * + * const result = await evaluator.evaluate(text, "6"); + * console.log(result.score); // "Moderately complex" + * console.log(result.reasoning); + * ``` + */ +export class SmkEvaluator extends BaseEvaluator { + static readonly metadata = { + id: 'subject-matter-knowledge', + name: 'Subject Matter Knowledge', + description: 'Evaluates background knowledge demands of educational texts relative to grade level', + supportedGrades: ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] as const, + requiresGoogleKey: true, + requiresOpenAIKey: false, + }; + + private provider: LLMProvider; + + constructor(config: BaseEvaluatorConfig) { + super(config); + + this.provider = createProvider({ + type: 'google', + model: 'gemini-3-flash-preview', + apiKey: config.googleApiKey, + maxRetries: this.config.maxRetries, + }); + } + + /** + * Evaluate subject matter knowledge complexity for a given text and grade level + * + * @param text - The text to evaluate + * @param grade - The target grade level (3-12) + * @returns Evaluation result with complexity score and detailed analysis + * @throws {ValidationError} If text is empty, too short/long, or grade is invalid + * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError) + */ + async evaluate( + text: string, + grade: string + ): Promise> { + this.logger.info('Starting SMK evaluation', { + evaluator: 'subject-matter-knowledge', + operation: 'evaluate', + grade, + textLength: text.length, + }); + + const 
startTime = Date.now(); + const stageDetails: StageDetail[] = []; + + try { + // Validate inputs — inside try so validation errors are telemetered. + this.validateText(text); + this.validateGrade(grade, new Set(SmkEvaluator.metadata.supportedGrades)); + + this.logger.debug('Evaluating subject matter knowledge complexity', { + evaluator: 'subject-matter-knowledge', + operation: 'smk_evaluation', + }); + + const fkScore = calculateFleschKincaidGrade(text); + const response = await this.evaluateSmk(text, grade, fkScore); + + stageDetails.push({ + stage: 'smk_evaluation', + provider: 'google:gemini-3-flash-preview', + latency_ms: response.latencyMs, + token_usage: { + input_tokens: response.usage.inputTokens, + output_tokens: response.usage.outputTokens, + }, + }); + + const latencyMs = Date.now() - startTime; + + // Aggregate token usage + const totalTokenUsage = { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + }; + + const result = { + score: response.data.complexity_score, + reasoning: response.data.reasoning, + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: latencyMs, + }, + _internal: response.data, + }; + + // Send success telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'success', + latencyMs, + textLength: text.length, + grade, + provider: 'google:gemini-3-flash-preview', + tokenUsage: totalTokenUsage, + metadata: { + stage_details: stageDetails, + }, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + this.logger.info('SMK evaluation completed successfully', { + evaluator: 'subject-matter-knowledge', + operation: 'evaluate', + grade, + score: result.score, + processingTimeMs: latencyMs, + }); + + return result; + } catch (error) { + const latencyMs = Date.now() - startTime; + + this.logger.error('SMK evaluation failed', { + evaluator: 
'subject-matter-knowledge', + operation: 'evaluate', + grade, + error: error instanceof Error ? error : undefined, + processingTimeMs: latencyMs, + completedStages: stageDetails.length, + }); + + const totalTokenUsage = stageDetails.length > 0 ? { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + } : undefined; + + this.sendTelemetry({ + status: 'error', + latencyMs, + textLength: text.length, + grade, + provider: 'google:gemini-3-flash-preview', + tokenUsage: totalTokenUsage, + errorCode: error instanceof Error ? error.name : 'UnknownError', + metadata: stageDetails.length > 0 ? { stage_details: stageDetails } : undefined, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + if (error instanceof ValidationError) { + throw error; + } + + throw wrapProviderError(error, 'SMK evaluation failed'); + } + } + + /** + * Run the SMK evaluation LLM call + */ + private async evaluateSmk( + text: string, + grade: string, + fkScore: number + ): Promise<{ data: SmkInternal; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> { + const response = await this.provider.generateStructured({ + messages: [ + { role: 'system', content: getSystemPrompt() }, + { role: 'user', content: getUserPrompt(text, grade, fkScore) }, + ], + schema: SmkOutputSchema, + temperature: 0, + }); + + return { + data: response.data, + usage: response.usage, + latencyMs: response.latencyMs, + }; + } +} + +/** + * Functional API for SMK evaluation + * + * @example + * ```typescript + * const result = await evaluateSmk( + * "Hydraulic propulsion works by sucking water at the bow and forcing it sternward.", + * "10", + * { googleApiKey: process.env.GOOGLE_API_KEY } + * ); + * ``` + */ +export async function evaluateSmk( + text: string, + grade: string, + config: BaseEvaluatorConfig +): Promise> { + const evaluator = new 
SmkEvaluator(config); + return evaluator.evaluate(text, grade); +} diff --git a/sdks/typescript/src/evaluators/text-complexity.ts b/sdks/typescript/src/evaluators/text-complexity.ts index 114ba03..f9ad82c 100644 --- a/sdks/typescript/src/evaluators/text-complexity.ts +++ b/sdks/typescript/src/evaluators/text-complexity.ts @@ -1,11 +1,13 @@ import pLimit from 'p-limit'; import { VocabularyEvaluator } from './vocabulary.js'; import { SentenceStructureEvaluator } from './sentence-structure.js'; +import { SmkEvaluator } from './smk.js'; import type { SentenceStructureInternal } from '../schemas/sentence-structure.js'; import type { BaseEvaluatorConfig } from './base.js'; import { BaseEvaluator } from './base.js'; import type { EvaluationResult, TextComplexityLevel } from '../schemas/index.js'; import type { VocabularyInternal } from '../schemas/vocabulary.js'; +import type { SmkInternal } from '../schemas/smk.js'; /** * Result map returned by TextComplexityEvaluator. @@ -14,17 +16,19 @@ import type { VocabularyInternal } from '../schemas/vocabulary.js'; export interface TextComplexityResult { vocabulary: EvaluationResult | { error: Error }; sentenceStructure: EvaluationResult | { error: Error }; + subjectMatterKnowledge: EvaluationResult | { error: Error }; } /** * Text Complexity Evaluator * - * Composite evaluator that analyzes both vocabulary and sentence structure complexity. - * Runs both evaluations in parallel with concurrency control to avoid rate limiting. + * Composite evaluator that analyzes vocabulary, sentence structure, and subject matter knowledge. + * Runs all evaluations in parallel with concurrency control to avoid rate limiting. 
* * Uses: * - VocabularyEvaluator (Google Gemini 2.5 Pro + OpenAI GPT-4o) * - SentenceStructureEvaluator (OpenAI GPT-4o) + * - SmkEvaluator (Google Gemini 3 Flash Preview) * * @example * ```typescript @@ -43,7 +47,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { static readonly metadata = { id: 'text-complexity', name: 'Text Complexity', - description: 'Composite evaluator analyzing vocabulary and sentence structure complexity', + description: 'Composite evaluator analyzing vocabulary, sentence structure, and subject matter knowledge complexity', supportedGrades: ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] as const, requiresGoogleKey: true, requiresOpenAIKey: true, @@ -51,6 +55,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { private vocabularyEvaluator: VocabularyEvaluator; private sentenceStructureEvaluator: SentenceStructureEvaluator; + private smkEvaluator: SmkEvaluator; private limit: ReturnType; constructor(config: BaseEvaluatorConfig) { @@ -60,6 +65,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { // Create child evaluators with same config this.vocabularyEvaluator = new VocabularyEvaluator(config); this.sentenceStructureEvaluator = new SentenceStructureEvaluator(config); + this.smkEvaluator = new SmkEvaluator(config); // Create concurrency limiter (max 3 concurrent operations) this.limit = pLimit(3); @@ -68,8 +74,8 @@ export class TextComplexityEvaluator extends BaseEvaluator { /** * Evaluate text complexity for a given text and grade level * - * Runs vocabulary and sentence structure evaluations in parallel with concurrency control. - * If both sub-evaluators fail, throws an error. Otherwise returns a result map where + * Runs vocabulary, sentence structure, and SMK evaluations in parallel with concurrency control. + * If all three sub-evaluators fail, throws an error. Otherwise returns a result map where * failed sub-evaluators are represented as `{ error: Error }`. 
* * @param text - The text to evaluate @@ -92,24 +98,28 @@ export class TextComplexityEvaluator extends BaseEvaluator { const startTime = Date.now(); - // Run both evaluators in parallel with concurrency control - const [vocabResult, sentenceResult]: [ + // Run all evaluators in parallel with concurrency control + const [vocabResult, sentenceResult, smkResult]: [ EvaluationResult | { error: Error }, EvaluationResult | { error: Error }, + EvaluationResult | { error: Error }, ] = await Promise.all([ this.limit(() => this.runSubEvaluator(this.vocabularyEvaluator, text, grade)), this.limit(() => this.runSubEvaluator(this.sentenceStructureEvaluator, text, grade)), + this.limit(() => this.runSubEvaluator(this.smkEvaluator, text, grade)), ]); const latencyMs = Date.now() - startTime; const vocabFailed = 'error' in vocabResult; const sentenceFailed = 'error' in sentenceResult; - const hasFailures = vocabFailed || sentenceFailed; + const smkFailed = 'error' in smkResult; + const hasFailures = vocabFailed || sentenceFailed || smkFailed; if (hasFailures) { const errors: string[] = []; if (vocabFailed) errors.push(`Vocabulary: ${vocabResult.error.message}`); if (sentenceFailed) errors.push(`Sentence structure: ${sentenceResult.error.message}`); + if (smkFailed) errors.push(`Subject matter knowledge: ${smkResult.error.message}`); this.logger.error('Text complexity evaluation completed with errors', { evaluator: 'text-complexity', @@ -119,7 +129,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { processingTimeMs: latencyMs, }); - if (vocabFailed && sentenceFailed) { + if (vocabFailed && sentenceFailed && smkFailed) { throw new Error(`Text complexity evaluation failed: ${errors.join('; ')}`); } } @@ -145,7 +155,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { hasFailures, }); - return { vocabulary: vocabResult, sentenceStructure: sentenceResult }; + return { vocabulary: vocabResult, sentenceStructure: sentenceResult, subjectMatterKnowledge: 
smkResult }; } /** diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index 51f764c..f1c4cf7 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -49,6 +49,9 @@ export { // Vocabulary exports export type { VocabularyInternal } from './schemas/vocabulary.js'; +// Subject Matter Knowledge exports +export type { SmkInternal } from './schemas/smk.js'; + // Grade Level Appropriateness exports export type { GradeLevelAppropriatenessInternal } from './schemas/grade-level-appropriateness.js'; @@ -59,6 +62,8 @@ export { evaluateVocabulary, SentenceStructureEvaluator, evaluateSentenceStructure, + SmkEvaluator, + evaluateSmk, GradeLevelAppropriatenessEvaluator, evaluateGradeLevelAppropriateness, TextComplexityEvaluator, diff --git a/sdks/typescript/src/prompts/subject-matter-knowledge/index.ts b/sdks/typescript/src/prompts/subject-matter-knowledge/index.ts new file mode 100644 index 0000000..923d53a --- /dev/null +++ b/sdks/typescript/src/prompts/subject-matter-knowledge/index.ts @@ -0,0 +1,22 @@ +import SYSTEM_PROMPT from '../../../../../evals/prompts/subject-matter-knowledge/system.txt'; +import USER_PROMPT_TEMPLATE from '../../../../../evals/prompts/subject-matter-knowledge/user.txt'; + +/** + * Get the SMK evaluator system prompt + */ +export function getSystemPrompt(): string { + return SYSTEM_PROMPT; +} + +/** + * Generate the user prompt for SMK evaluation by filling the `{text}`, `{grade}` and `{fk_score}` placeholders + * @param text - The text to evaluate (inserted verbatim, never re-scanned for placeholders) + * @param grade - The target grade level + * @param fkScore - Flesch-Kincaid grade level score + * @returns The user prompt template with every placeholder occurrence substituted + */ +export function getUserPrompt(text: string, grade: string, fkScore: number): string { + const values: Record<string, string> = { text, grade, fk_score: fkScore.toString() }; + // Single pass with a replacer function: inserted values are not re-scanned, and `$&`/`$$`-style sequences in `text` stay literal. + return USER_PROMPT_TEMPLATE.replace(/\{(text|grade|fk_score)\}/g, (match, key: string) => values[key] ?? match); +} diff --git a/sdks/typescript/src/schemas/smk.ts b/sdks/typescript/src/schemas/smk.ts new file mode 100644 index 0000000..88ec6eb --- /dev/null +++ b/sdks/typescript/src/schemas/smk.ts @@ -0,0 +1,16 @@ 
+import { z } from 'zod'; +import { TextComplexityLevel } from './outputs.js'; + +/** + * Subject Matter Knowledge evaluation output schema (`SmkInternal` below is its inferred type) + */ +export const SmkOutputSchema = z.object({ + identified_topics: z.array(z.string()).describe('List of major subjects/concepts found in the text.'), + curriculum_check: z.string().describe('Whether the topics are standard K-8 or specialized high school level.'), + assumptions_and_scaffolding: z.string().describe('What the author assumes the reader knows vs. what is explained.'), + friction_analysis: z.string().describe('Whether difficulty comes from vocabulary/structure or actual knowledge demands.'), + complexity_score: TextComplexityLevel.describe('The subject matter knowledge complexity level of the text'), + reasoning: z.string().describe('A brief synthesis of why the text fits the chosen complexity level.'), +}); + +export type SmkInternal = z.infer<typeof SmkOutputSchema>; diff --git a/sdks/typescript/tests/integration/smk.integration.test.ts b/sdks/typescript/tests/integration/smk.integration.test.ts new file mode 100644 index 0000000..684c0b7 --- /dev/null +++ b/sdks/typescript/tests/integration/smk.integration.test.ts @@ -0,0 +1,175 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { SmkEvaluator } from '../../src/evaluators/smk.js'; +import { + runEvaluatorTest, + type BaseTestCase, +} from '../utils/index.js'; + +/** + * SMK Evaluator Integration Tests + * + * Test cases cover grades 3-12, one per grade, drawn from the annotated dataset. + * complexity_score values from the dataset (snake_case) are mapped to the evaluator's + * Title Case format. + * + * Each test uses a retry mechanism (up to 3 attempts) to account for LLM non-determinism, + * with short-circuiting on first expected match. If no expected match is found after all + * attempts, the test checks if any result falls within the acceptable (adjacent) value range. 
+ * + * To run these tests: + * ```bash + * RUN_INTEGRATION_TESTS=true npm run test:integration + * ``` + */ + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && + !process.env.GOOGLE_API_KEY; + +const describeIntegration = SKIP_INTEGRATION ? describe.skip : describe; + +// Test timeout: 2 minutes per test case (allows for 3 attempts with API latency) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +// One test case per grade (3–12), first occurrence in dataset_599667ce. +// Expected values are the dataset's complexity_score mapped to Title Case. +// Acceptable values are the adjacent complexity levels on the 4-point scale. +const TEST_CASES: BaseTestCase[] = [ + { + id: 'SMK3', + grade: '3', + // dataset item 6205, annotated: moderately_complex + text: 'Vesuvius is a dangerous thing, but very beautiful. It stands tall and pointed and graceful against a lovely sky. Its little cloud waves from it like a plume. At night the mountain is swallowed by the dark. But the red rivers down its slopes glare in the sky. It is beautiful and terrible like a tiger. Thousands of people have loved it. They have climbed it and looked down its crater. It is like looking into the heart of the earth. One of these travelers wrote of his visit in 1793. He said: "For many days Vesuvius has been in action. I have watched it from Naples. It is wonderfully beautiful and always changing. On one day huge clouds poured out of the top. They hung in the sky far above, white as snow. Suddenly a cloud of smoke rushed out of another mouth. It was as black as ink. The black column rose tall and curling beside the snowy clouds. That was a picture in black and white. 
But at another time I saw one in bright colors.', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK4', + grade: '4', + // dataset item 3538, annotated: moderately_complex + text: "About a year after the family moved to Oklahoma, Will Johnson got a neighbor boy to go with him back to their place in Texas to bring another wagon load of household goods. They were gone about two weeks. While the family was in Oklahoma, Will — who was about 20 — taught school two terms at Nubbin Ridge, somewhere near Duncan. Simpson, being about 17 at the time, was not about to go to school to a teacher who was his older brother, so he saddled his horse and slipped away back to Melvin's ranch, to be with his brother Joe. He said he got tired of riding but not nearly as tired as his horse. The journey was about 300 miles. He was on the trail three days and nights and had to stop at times to let his horse rest. When he got to the ranch, Joe wrote to the family saying that Simpson was with him and for them not to worry. They had suspected where he had gone but were not sure.", + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK5', + grade: '5', + // dataset item 2620, annotated: moderately_complex + text: 'Imagine you are walking in a park full of brightly colored flowers and some of their greatest admirers—honeybees. Suddenly, you hear the buzzing of a bee very near your ear. Without even thinking about it, your muscles tense up, and you stop dead in your tracks. You find yourself frozen, hoping the bee ignores you and buzzes on by. What just happened? Well, one of your brain\'s most important jobs is keeping you safe. Somewhere in your past, you learned two things: First, bees can sting! And second, bees buzz. So, when your ears perked up at the sound of bees buzzing, alarm bells went off in your brain. 
Your brain responded by telling your muscles to freeze so that the bee would hopefully just buzz off. And you did not even have to think about it… Pretty cool! Now imagine something just a little different. You are taking an afternoon stroll through the very same park, but earlier that morning an angry bee stung you in the arm—and it hurt! So now when you hear bees buzzing in the park, will your brain, whose job is to protect you, simply tell your body to freeze in place? Heck no!', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK6', + grade: '6', + // dataset item 7419, annotated: moderately_complex + text: 'Imagine you are indoors on a sunny day. A beam of sunlight through a window lights up a section of the floor. How would you draw this sunbeam? You might draw a series of parallel lines showing the path of the sunlight from the window to the floor. This is not exactly accurate — no matter how hard you look, you will not find unique lines of light in the sunbeam! However, this is a good way to draw light and to model light geometrically. We call these narrow, imaginary lines of light light rays. Recall that light can behave like a wave and so you can think of a light ray as the path of a point on the crest of a wave. We can use light rays to model the behaviour of light relative to mirrors, lenses, telescopes, microscopes, and prisms. The study of how light interacts with materials is called optics. When dealing with light rays, we are usually interested in the shape of a material and the angles at which light rays hit it.', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK7', + grade: '7', + // dataset item 3171, annotated: moderately_complex + text: 'Most immigrant groups that had formerly come to America by choice seemed distinct, but in fact had many similarities. Most had come from Northern and Western Europe. 
Most had some experience with representative democracy. With the exception of the Irish, most were Protestant. Many were literate, and some possessed a fair degree of wealth. The later groups arriving by the boatload in the Gilded Age were characterized by few of these traits. Their nationalities included Greek, Italian, Polish, Slovak, Serb, Russian, Croat, and others. Until cut off by federal decree, Japanese and Chinese settlers relocated to the American West Coast. None of these groups were predominantly Protestant. The vast majority were Roman Catholic or Eastern Orthodox. However, due to increased persecution of Jews in Eastern Europe, many Jewish immigrants sought freedom from torment. Very few newcomers spoke any English, and large numbers were illiterate in their native tongues. None of these groups hailed from democratic regimes. The American form of government was as foreign as its culture', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK8', + grade: '8', + // dataset item 6178, annotated: moderately_complex + text: 'At Knob Creek the boy began to go to an "A B C" school. His first teacher was Zachariah Riney. Of course, there were no regular schools in the backwoods then. When a man who "knew enough" happened to come along, especially if he had nothing else to do, he tried to teach the children of the pioneers in a poor log schoolhouse. It is not likely that little Abe went to school more than a few weeks at this time, for he never had a year\'s schooling in his life. There was another teacher afterward at Knob Creek—a man named Caleb Hazel. Little is known of either of these teachers except that he taught little Abe Lincoln. If their pupil had not become famous the men and their schools would never have been mentioned in history. An old man, named Austin Gollaher, used to like to tell of the days when he and little Abe went to school together. 
He said: "Abe was an unusually bright boy at school, and made splendid progress in his studies. Indeed, he learned faster than any of his schoolmates. Though so young, he studied very hard."', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK9', + grade: '9', + // dataset item 6570, annotated: moderately_complex + text: 'By looking at any map of Europe, it will be seen that England is separated from France by the English Channel, a passage which, though it looks quite narrow on the map, is really very wide, especially toward the west. The narrowest place is between Dover and Calais, where the distance across is only about twenty-two miles. This narrow passage is called the Straits of Dover. It would have been very convenient for travellers that have to pass between London and Paris if this strait had happened to lie in the line, or nearly in the line, between these two cities; but it does not. It lies considerably to the eastward of it; so that, to cross the channel at the narrowest part, requires that the traveller should take quite a circuit round. To go by the shortest distance, it is necessary to cross the channel at a place where Dieppe is the harbor, on the French side, and New Haven on the English.', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'SMK10', + grade: '10', + // dataset item 7313, annotated: very_complex + text: 'Hydraulic propulsion by reaction consists, in principle, in effecting a movement of boats, by sucking in water at the bow and forcing it out at the stern. This is a very old idea. Naturalists cite whole families of mollusks that move about in this way with great rapidity. It is probable that such was the origin of the first idea of this mode of operating. However this may be, as long ago as 1661 a patent was taken out in England, on this principle, by Toogood & Hayes. 
After this we find the patents of Allen (1729) and Rumsay (1788). In France, Daniel Bernouilli presented to the Académic des Sciences a similar project during the last century. Mr. Seydell was the first to build a vessel on this principle. This ship, which was called the Enterprise, was of 100 tons burden, and was constructed at Edinburgh for marine fishery. The success of this was incomplete, but it was sufficient to show all the advantage that could be got from the idea.', + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'SMK11', + grade: '11', + // dataset item 7429, annotated: very_complex + text: 'Let us turn to another example in a different branch of science. Whichever of our modern discoveries we may consider to be the most startling and important, there can I think be no doubt that the most beautiful is that of the spectroscope. It has enabled us to do that which but a few years before its introduction was taken for the very type of the impossible, viz., to study the chemical composition of the stars; and it is giving us clearer and clearer insight every day into the condition of the great luminary which forms the center of our system. Still, however beautiful and interesting such results may be, it might well be thought that they could never have any practical application, and that the spectroscope at least would remain an instrument of science, but of science alone. This, however, is not the case.', + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'SMK12', + grade: '12', + // dataset item 2577, annotated: moderately_complex + text: 'Corals in the Red Sea have to handle higher temperatures, yet they seem to grow and do just fine. The Red Sea is a very warm sea compared to other places. There, summer temperatures can reach up to 34°C, while other ocean waters may reach around 29–32°C. 
Interestingly, corals in the Red Sea are not only living in higher temperatures but also in higher salinity, or the amount of salt in water, for example, in seawater. You can find a range of different salinities in the ocean, depending on the region. The Red Sea has some of the highest levels of salt. Salinity is a measure of the amount of salt in the water, and the Red Sea has some of the world\'s highest salt levels. That is why we started wondering whether salinity could be a piece of the puzzle and the ability to live in high salinity one of the secrets of the strong Red Sea corals? To answer this and other questions related to coral bleaching, scientists often use a coral model organism, which means an animal that is easier to study than corals but at the same time is very similar to corals.', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, +]; + +describeIntegration.concurrent('SMK Evaluator - Comprehensive Test Suite', () => { + let evaluator: SmkEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests (no API keys or RUN_INTEGRATION_TESTS not set)'); + return; + } + + evaluator = new SmkEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + }); + + console.log('\n' + '='.repeat(80)); + console.log('SMK EVALUATOR - TEST SUITE (PARALLEL)'); + console.log('='.repeat(80)); + console.log(`Running ${TEST_CASES.length} test cases with up to 3 attempts each`); + console.log('Short-circuiting on first expected match'); + console.log('Checking acceptable values if no expected match'); + console.log('='.repeat(80)); + }); + + // Generate individual test for each case + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: Grade ${testCase.grade} - ${testCase.expected}`, async () => { + // Buffer all logs to print atomically at the end (prevents interleaving in parallel tests) + const logBuffer: string[] = []; + + // Test header + logBuffer.push('\n' + 
'='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id} | Grade: ${testCase.grade}`); + logBuffer.push('='.repeat(80)); + logBuffer.push(`Expected Complexity: ${testCase.expected}`); + logBuffer.push(`Text Preview: ${testCase.text.substring(0, 100)}...`); + logBuffer.push(''); + + // Run the evaluation (returns logs instead of printing) + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, + maxAttempts, + }); + + // Add evaluation logs to buffer (includes detailed summary) + logBuffer.push(...result.logs); + + // Print all logs atomically at the end - single console.log to prevent interleaving + console.log(logBuffer.join('\n')); + + // Assert that we got a match within maxAttempts (expected or acceptable) + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeDefined(); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/smk.test.ts b/sdks/typescript/tests/unit/evaluators/smk.test.ts new file mode 100644 index 0000000..ba9e268 --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/smk.test.ts @@ -0,0 +1,127 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { SmkEvaluator } from '../../../src/evaluators/smk.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +vi.mock('../../../src/telemetry/client.js', () => ({ + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, +})); + +describe('SmkEvaluator - Constructor Validation', () => { + it('should throw with specific message when Google API key is missing', () => { + expect(() => 
new SmkEvaluator({ googleApiKey: '' })).toThrow( + 'Google API key is required for Subject Matter Knowledge evaluator. Pass googleApiKey in config.' + ); + }); +}); + +describe('SmkEvaluator - Metadata', () => { + it('should have correct metadata', () => { + expect(SmkEvaluator.metadata.id).toBe('subject-matter-knowledge'); + expect(SmkEvaluator.metadata.name).toBe('Subject Matter Knowledge'); + expect(SmkEvaluator.metadata.requiresGoogleKey).toBe(true); + expect(SmkEvaluator.metadata.requiresOpenAIKey).toBe(false); + expect(SmkEvaluator.metadata.supportedGrades).toEqual([ + '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', + ]); + }); +}); + +describe('SmkEvaluator - Evaluation Flow', () => { + let evaluator: SmkEvaluator; + let mockProvider: LLMProvider; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new SmkEvaluator({ + googleApiKey: 'test-google-key', + telemetry: false, + }); + + // @ts-expect-error Accessing private property for testing + mockProvider = evaluator.provider; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('should map LLM response to result, call provider once, include text+grade in prompt, use temperature 0', async () => { + const testText = 'Hydraulic propulsion works by sucking water at the bow and forcing it sternward.'; + const testGrade = '10'; + + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: { + identified_topics: ['hydraulics', 'propulsion', 'physics'], + curriculum_check: 'Specialized high school level — hydraulics is not standard K-8 curriculum.', + assumptions_and_scaffolding: 'Author assumes knowledge of fluid dynamics and mechanical engineering.', + friction_analysis: 'Difficulty comes from actual knowledge demands, not just vocabulary.', + complexity_score: 'Very complex', + reasoning: 'The text requires specialized knowledge of hydraulic systems.', + }, + model: 'gemini-3-flash-preview', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const 
result = await evaluator.evaluate(testText, testGrade); + + expect(result.score).toBe('Very complex'); + expect(result.reasoning).toContain('hydraulic systems'); + expect(result.metadata.model).toBe('google:gemini-3-flash-preview'); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); + + expect(mockProvider.generateStructured).toHaveBeenCalledTimes(1); + const call = vi.mocked(mockProvider.generateStructured).mock.calls[0]; + expect(call[0].messages[1].content).toContain(testText); + expect(call[0].messages[1].content).toContain(testGrade); + expect(call[0].schema).toBeDefined(); + expect(call[0].temperature).toBe(0); + }); + + it('should propagate LLM API errors', async () => { + vi.mocked(mockProvider.generateStructured).mockRejectedValue(new Error('API timeout')); + + await expect(evaluator.evaluate('The mitochondria is the powerhouse of the cell.', '5')) + .rejects.toThrow('API timeout'); + }); + + it('should not call provider when input validation fails', async () => { + await expect(evaluator.evaluate('', '5')).rejects.toThrow(); + expect(mockProvider.generateStructured).not.toHaveBeenCalled(); + }); + + it('should pass all _internal fields through from LLM response', async () => { + const mockData = { + identified_topics: ['biology', 'cells'], + curriculum_check: 'Standard K-8 curriculum.', + assumptions_and_scaffolding: 'Assumes basic biology knowledge.', + friction_analysis: 'Difficulty mainly from vocabulary.', + complexity_score: 'Moderately complex' as const, + reasoning: 'Detailed reasoning here.', + }; + + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: mockData, + model: 'gemini-3-flash-preview', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('The mitochondria is the powerhouse of the cell.', '5'); + + expect(result._internal).toEqual(mockData); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts 
b/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts index 1044611..a9bf9ad 100644 --- a/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts +++ b/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts @@ -96,6 +96,7 @@ describe('TextComplexityEvaluator', () => { let evaluator: TextComplexityEvaluator; let vocabSpy: any; let sentenceSpy: any; + let smkSpy: any; beforeEach(() => { evaluator = new TextComplexityEvaluator({ @@ -124,6 +125,16 @@ describe('TextComplexityEvaluator', () => { }, _internal: {}, }); + + smkSpy = vi.spyOn((evaluator as any).smkEvaluator, 'evaluate').mockResolvedValue({ + score: 'Moderately complex', + reasoning: 'Subject matter knowledge test reasoning', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); }); afterEach(() => { @@ -139,8 +150,10 @@ describe('TextComplexityEvaluator', () => { expect(result).toBeDefined(); expect(result.vocabulary).toBeDefined(); expect(result.sentenceStructure).toBeDefined(); + expect(result.subjectMatterKnowledge).toBeDefined(); expect('error' in result.vocabulary).toBe(false); expect('error' in result.sentenceStructure).toBe(false); + expect('error' in result.subjectMatterKnowledge).toBe(false); }); it('should validate text input', async () => { @@ -179,7 +192,7 @@ describe('TextComplexityEvaluator', () => { } }); - it('should run both evaluators in parallel', async () => { + it('should run all three evaluators in parallel', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; @@ -188,19 +201,20 @@ describe('TextComplexityEvaluator', () => { const duration = Date.now() - startTime; // With mocked providers that take ~100ms each, parallel execution should be faster than sequential - // Sequential would be ~200ms, parallel should be ~100ms - // Allow some overhead but should be significantly less than 200ms - expect(duration).toBeLessThan(200); + // Sequential would be ~300ms, parallel should be ~100ms + // 
Allow some overhead but should be significantly less than 300ms + expect(duration).toBeLessThan(300); expect('error' in result.vocabulary).toBe(false); expect('error' in result.sentenceStructure).toBe(false); + expect('error' in result.subjectMatterKnowledge).toBe(false); }); it('should handle partial failures gracefully', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; - // Override the spy to make vocabulary fail but sentence structure succeed + // Override the spy to make vocabulary fail but others succeed vocabSpy.mockRejectedValue(new Error('Vocabulary evaluation failed')); const result = await evaluator.evaluate(text, grade); @@ -209,15 +223,17 @@ describe('TextComplexityEvaluator', () => { expect('error' in result.vocabulary).toBe(true); expect((result.vocabulary as { error: Error }).error).toBeDefined(); expect('error' in result.sentenceStructure).toBe(false); + expect('error' in result.subjectMatterKnowledge).toBe(false); }); - it('should throw when both evaluators fail', async () => { + it('should throw when all three evaluators fail', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; - // Override both spies to fail + // Override all spies to fail vocabSpy.mockRejectedValue(new Error('Vocabulary evaluation failed')); sentenceSpy.mockRejectedValue(new Error('Sentence structure evaluation failed')); + smkSpy.mockRejectedValue(new Error('SMK evaluation failed')); await expect(evaluator.evaluate(text, grade)).rejects.toThrow( 'Text complexity evaluation failed' @@ -250,23 +266,38 @@ describe('TextComplexityEvaluator', () => { _internal: {}, }); + // Override SMK to return "Very complex" + smkSpy.mockResolvedValue({ + score: 'Very complex', + reasoning: 'SMK reasoning', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); + const result = await evaluator.evaluate(text, grade); expect('error' in result.vocabulary).toBe(false); expect('error' in 
result.sentenceStructure).toBe(false); + expect('error' in result.subjectMatterKnowledge).toBe(false); if (!('error' in result.vocabulary)) { expect(result.vocabulary.score).toBe('Moderately complex'); } if (!('error' in result.sentenceStructure)) { expect(result.sentenceStructure.score).toBe('Slightly complex'); } + if (!('error' in result.subjectMatterKnowledge)) { + expect(result.subjectMatterKnowledge.score).toBe('Very complex'); + } }); it('should preserve individual sub-evaluator reasoning', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; - // Override both evaluators with specific reasoning + // Override all evaluators with specific reasoning vocabSpy.mockResolvedValue({ score: 'Moderately complex', reasoning: 'This is the vocabulary reasoning.', @@ -287,6 +318,16 @@ describe('TextComplexityEvaluator', () => { _internal: {}, }); + smkSpy.mockResolvedValue({ + score: 'Very complex', + reasoning: 'This is the SMK reasoning.', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); + const result = await evaluator.evaluate(text, grade); if (!('error' in result.vocabulary)) { @@ -295,6 +336,9 @@ describe('TextComplexityEvaluator', () => { if (!('error' in result.sentenceStructure)) { expect(result.sentenceStructure.reasoning).toBe('This is the sentence structure reasoning.'); } + if (!('error' in result.subjectMatterKnowledge)) { + expect(result.subjectMatterKnowledge.reasoning).toBe('This is the SMK reasoning.'); + } }); });