diff --git a/Classes/Command/GradePendingLogs.php b/Classes/Command/GradePendingLogs.php new file mode 100644 index 0000000..5876476 --- /dev/null +++ b/Classes/Command/GradePendingLogs.php @@ -0,0 +1,99 @@ +setHelp( + 'Grades AiM request log rows where grade_status="pending" and the row is at ' + . 'least --min-age seconds old. Intended as a safety-net behind GraderMiddleware\'s ' + . 'shutdown-function path. Run it from the TYPO3 scheduler every few minutes.' + ) + ->addOption( + 'limit', + null, + InputOption::VALUE_REQUIRED, + 'Maximum number of rows to grade in this run.', + '50', + ) + ->addOption( + 'min-age', + null, + InputOption::VALUE_REQUIRED, + 'Only pick rows older than this many seconds (avoid racing the live shutdown handler).', + '60', + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $limit = max(1, (int)$input->getOption('limit')); + $minAge = max(0, (int)$input->getOption('min-age')); + + $rows = $this->logRepository->findPendingGrades($minAge, $limit); + if ($rows === []) { + $output->writeln('No pending grades older than ' . $minAge . 's.'); + return Command::SUCCESS; + } + + $output->writeln(sprintf('Grading %d pending row(s).', count($rows))); + $graded = 0; + $failed = 0; + foreach ($rows as $row) { + $uid = (int)$row['uid']; + try { + $this->gradingService->grade($uid); + $output->writeln(' - graded uid ' . $uid); + $graded++; + } catch (\Throwable $e) { + $output->writeln(sprintf(' - uid %d failed: %s', $uid, $e->getMessage())); + $failed++; + } + } + + $output->writeln(sprintf('Done: %d graded, %d failed.', $graded, $failed)); + return $failed > 0 ? Command::FAILURE : Command::SUCCESS; + } +} diff --git a/Classes/Command/TestRequest.php b/Classes/Command/TestRequest.php new file mode 100644 index 0000000..01b5ab2 --- /dev/null +++ b/Classes/Command/TestRequest.php @@ -0,0 +1,336 @@ + AiM > + * Providers). 
With --site, it is resolved from a site's settings.yaml instead + * and dispatched through the pipeline directly. + */ +#[AsCommand( + name: 'aim:test', + description: 'Send a one-off AI request (text, conversation, translate, embed) and report the result.', +)] +final class TestRequest extends Command +{ + private const TABLE = 'tx_aim_request_log'; + + private const CAPABILITIES = ['text', 'conversation', 'translate', 'embed']; + + public function __construct( + private readonly Ai $ai, + private readonly ConnectionPool $connectionPool, + private readonly ProviderResolver $resolver, + private readonly AiMiddlewarePipeline $pipeline, + private readonly SiteFinder $siteFinder, + ) { + parent::__construct(); + } + + protected function configure(): void + { + $this + ->setHelp( + 'Examples:' . PHP_EOL + . ' vendor/bin/typo3 aim:test text --prompt "Write a haiku about TYPO3"' . PHP_EOL + . ' vendor/bin/typo3 aim:test conversation -p "anthropic:*" --prompt "Hi there"' . PHP_EOL + . ' vendor/bin/typo3 aim:test translate --prompt "Hello world" --from English --to German' . PHP_EOL + . ' vendor/bin/typo3 aim:test embed --prompt "vector me"' . PHP_EOL + . ' vendor/bin/typo3 aim:test text --site main --prompt "Resolve from site settings"' + ) + ->addArgument( + 'capability', + InputArgument::OPTIONAL, + 'One of: ' . implode(', ', self::CAPABILITIES) . '.', + 'text', + ) + ->addOption( + 'prompt', + null, + InputOption::VALUE_REQUIRED, + 'Prompt / text to send.', + 'Write one short sentence about TYPO3.', + ) + ->addOption( + 'provider', + 'p', + InputOption::VALUE_REQUIRED, + 'Provider notation (e.g. "openai:gpt-4o", "anthropic:*"). Defaults to the configured default.', + '', + ) + ->addOption( + 'site', + null, + InputOption::VALUE_REQUIRED, + 'Resolve the provider from this site\'s settings.yaml instead of the database. 
Takes precedence over --provider.', + '', + ) + ->addOption( + 'system-prompt', + null, + InputOption::VALUE_REQUIRED, + 'Optional system prompt (ignored for embed).', + '', + ) + ->addOption( + 'max-tokens', + null, + InputOption::VALUE_REQUIRED, + 'Max tokens to generate (ignored for embed).', + '300', + ) + ->addOption( + 'from', + null, + InputOption::VALUE_REQUIRED, + 'Source language (translate only).', + 'English', + ) + ->addOption( + 'to', + null, + InputOption::VALUE_REQUIRED, + 'Target language (translate only).', + 'German', + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + $capability = strtolower((string)$input->getArgument('capability')); + if (!in_array($capability, self::CAPABILITIES, true)) { + $io->error(sprintf('Unknown capability "%s". Use one of: %s.', $capability, implode(', ', self::CAPABILITIES))); + return Command::INVALID; + } + + $prompt = (string)$input->getOption('prompt'); + $provider = (string)$input->getOption('provider'); + $site = (string)$input->getOption('site'); + $systemPrompt = (string)$input->getOption('system-prompt'); + $maxTokens = (int)$input->getOption('max-tokens'); + + $io->title('AiM request test'); + $io->writeln(sprintf(' Capability: %s', $capability)); + if ($site !== '') { + $io->writeln(sprintf(' Source: site settings of "%s"', $site)); + } else { + $io->writeln(sprintf(' Provider: %s', $provider !== '' ? $provider : '(default)')); + } + $io->writeln(sprintf(' Prompt: %s', $prompt)); + if ($systemPrompt !== '' && $capability !== 'embed') { + $io->writeln(sprintf(' System: %s', $systemPrompt)); + } + $io->newLine(); + + $logCountBefore = $this->countLogRows(); + $start = hrtime(true); + + try { + $response = $site !== '' + ? 
$this->sendViaSite($capability, $site, $input, $prompt, $systemPrompt, $maxTokens) + : $this->sendRequest($capability, $input, $prompt, $systemPrompt, $maxTokens, $provider); + } catch (ProviderNotFoundException $e) { + if ($site !== '') { + // Site-settings path: the exception message is already specific + // (e.g. the configured provider's bridge is not installed). + $io->error('Request failed: ' . $e->getMessage()); + return Command::FAILURE; + } + $io->error('No AI provider is available for the "' . $capability . '" capability.'); + $io->writeln(' AiM needs at least one provider configuration before it can send requests.'); + $io->writeln(' Create one in the TYPO3 backend under Admin Tools > AiM > Providers:'); + $io->newLine(); + $io->listing([ + 'Pick a provider (auto-populated from installed Symfony AI bridges)', + 'Enter your API key — or an endpoint URL such as http://localhost:11434 for Ollama', + 'Select a model that supports the capability you want to test', + ]); + $io->writeln(' Or point --site at a site whose settings.yaml configures an AI provider.'); + return Command::FAILURE; + } catch (\Throwable $e) { + $io->error('Request failed: ' . $e->getMessage()); + return Command::FAILURE; + } + + $durationMs = (int)((hrtime(true) - $start) / 1_000_000); + + if ($response->errors !== []) { + $io->error('Response carries errors: ' . implode(' | ', $response->errors)); + return Command::FAILURE; + } + + $io->writeln('--- response ---'); + $io->writeln($response->content !== '' ? $response->content : '(empty content)'); + $io->writeln('--- end ---'); + $io->newLine(); + + $usage = $response->usage; + $io->writeln(sprintf('Model used: %s', $usage->modelUsed !== '' ? 
$usage->modelUsed : '(unknown)')); + $io->writeln(sprintf( + 'Usage: prompt=%d, completion=%d, total=%d', + $usage->promptTokens, + $usage->completionTokens, + $usage->getTotalTokens(), + )); + $io->writeln(sprintf('Cost: %.6f', $usage->cost)); + $io->writeln(sprintf('Wall time: %d ms', $durationMs)); + + $delta = $this->countLogRows() - $logCountBefore; + $io->writeln(sprintf('Log rows delta: %+d', $delta)); + + if ($delta === 0) { + $io->newLine(); + $io->warning('No request-log row was written — check the configuration\'s privacy level.'); + } + + $io->newLine(); + $io->success('Request completed.'); + return Command::SUCCESS; + } + + private function sendRequest( + string $capability, + InputInterface $input, + string $prompt, + string $systemPrompt, + int $maxTokens, + string $provider, + ): TextResponse { + return match ($capability) { + 'conversation' => $this->ai->conversation( + messages: [new UserMessage($prompt)], + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + extensionKey: 'aim', + provider: $provider, + ), + 'translate' => $this->ai->translate( + text: $prompt, + sourceLanguage: (string)$input->getOption('from'), + targetLanguage: (string)$input->getOption('to'), + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + extensionKey: 'aim', + provider: $provider, + ), + 'embed' => $this->ai->embed( + input: $prompt, + extensionKey: 'aim', + provider: $provider, + ), + default => $this->ai->text( + prompt: $prompt, + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + extensionKey: 'aim', + provider: $provider, + ), + }; + } + + /** + * Resolve the provider from a site's settings.yaml and dispatch directly + * through the pipeline. The Ai facade only resolves database-backed + * configurations, so site-settings configs need this explicit path. 
+ */ + private function sendViaSite( + string $capability, + string $siteIdentifier, + InputInterface $input, + string $prompt, + string $systemPrompt, + int $maxTokens, + ): TextResponse { + $site = $this->siteFinder->getSiteByIdentifier($siteIdentifier); + + $capabilityFqcn = match ($capability) { + 'conversation' => ConversationCapableInterface::class, + 'translate' => TranslationCapableInterface::class, + 'embed' => EmbeddingCapableInterface::class, + default => TextGenerationCapableInterface::class, + }; + + $resolved = $this->resolver->resolveFromSiteSettings($capabilityFqcn, $site); + $configuration = $resolved->configuration; + $metadata = ['extension' => 'aim']; + + $request = match ($capability) { + 'conversation' => new ConversationRequest( + configuration: $configuration, + messages: [new UserMessage($prompt)], + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + metadata: $metadata, + ), + 'translate' => new TranslationRequest( + configuration: $configuration, + text: $prompt, + sourceLanguage: (string)$input->getOption('from'), + targetLanguage: (string)$input->getOption('to'), + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + metadata: $metadata, + ), + 'embed' => new EmbeddingRequest( + configuration: $configuration, + input: [$prompt], + metadata: $metadata, + ), + default => new TextGenerationRequest( + configuration: $configuration, + prompt: $prompt, + systemPrompt: $systemPrompt, + maxTokens: $maxTokens, + metadata: $metadata, + ), + }; + + return $this->pipeline->dispatch($request, $resolved); + } + + private function countLogRows(): int + { + return $this->connectionPool + ->getConnectionForTable(self::TABLE) + ->count('*', self::TABLE, []); + } +} diff --git a/Classes/Controller/RequestLogController.php b/Classes/Controller/RequestLogController.php index 95b3912..f075fe6 100644 --- a/Classes/Controller/RequestLogController.php +++ b/Classes/Controller/RequestLogController.php @@ -84,6 +84,7 @@ public function 
logAction(ServerRequestInterface $request): ResponseInterface $pagination = new SimplePagination($paginator); $statistics = $this->logRepository->getStatistics(); + $statistics['pending_grades'] = $this->logRepository->countPendingGradesOlderThan(3600); // Build pagination base URL with demand filters (append &page=N in template) $paginationBaseParams = [ @@ -149,6 +150,12 @@ public function pollAction(ServerRequestInterface $request): ResponseInterface 'complexity_score' => (float)($entry['complexity_score'] ?? 0), 'reroute_type' => $entry['reroute_type'] ?? '', 'reroute_reason' => $entry['reroute_reason'] ?? '', + 'grade_status' => $entry['grade_status'] ?? 'none', + 'grade_score' => (float)($entry['grade_score'] ?? 0), + 'grade_label' => $entry['grade_label'] ?? '', + 'grade_reason' => $entry['grade_reason'] ?? '', + 'judge_model' => $entry['judge_model'] ?? '', + 'judge_cost' => number_format((float)($entry['judge_cost'] ?? 0), 6), ]; } diff --git a/Classes/Domain/Model/ProviderConfiguration.php b/Classes/Domain/Model/ProviderConfiguration.php index 6c92419..5e7fc5a 100644 --- a/Classes/Domain/Model/ProviderConfiguration.php +++ b/Classes/Domain/Model/ProviderConfiguration.php @@ -37,6 +37,9 @@ final class ProviderConfiguration public readonly string $privacyLevel; public readonly bool $reroutingAllowed; public readonly bool $autoModelSwitch; + public readonly bool $gradingEnabled; + public readonly int $judgeConfigurationUid; + public readonly string $gradingRubric; public function __construct( public readonly array $row, @@ -54,6 +57,9 @@ public function __construct( $this->privacyLevel = (string)($row['privacy_level'] ?? 'standard'); $this->reroutingAllowed = (bool)($row['rerouting_allowed'] ?? true); $this->autoModelSwitch = (bool)($row['auto_model_switch'] ?? true); + $this->gradingEnabled = (bool)($row['grading_enabled'] ?? false); + $this->judgeConfigurationUid = (int)($row['judge_configuration_uid'] ?? 
0); + $this->gradingRubric = (string)($row['grading_rubric'] ?? ''); } /** diff --git a/Classes/Domain/Repository/RequestLogDemand.php b/Classes/Domain/Repository/RequestLogDemand.php index 9d689dc..2e9b7b7 100644 --- a/Classes/Domain/Repository/RequestLogDemand.php +++ b/Classes/Domain/Repository/RequestLogDemand.php @@ -34,6 +34,8 @@ public function __construct( protected ?bool $success = null, protected int $dateFrom = 0, protected int $dateTo = 0, + protected string $gradeStatus = '', + protected string $gradeLabel = '', ) { if (!in_array($orderField, self::ORDER_FIELDS, true)) { $orderField = self::DEFAULT_ORDER_FIELD; @@ -65,9 +67,31 @@ public static function fromRequest(ServerRequestInterface $request): self isset($demand['success']) && $demand['success'] !== '' ? (bool)(int)$demand['success'] : null, (int)($demand['date_from'] ?? 0), (int)($demand['date_to'] ?? 0), + (string)($demand['grade_status'] ?? ''), + (string)($demand['grade_label'] ?? ''), ); } + public function getGradeStatus(): string + { + return $this->gradeStatus; + } + + public function hasGradeStatus(): bool + { + return $this->gradeStatus !== ''; + } + + public function getGradeLabel(): string + { + return $this->gradeLabel; + } + + public function hasGradeLabel(): bool + { + return $this->gradeLabel !== ''; + } + public function getOrderField(): string { return $this->orderField; @@ -166,7 +190,9 @@ public function hasConstraints(): bool || $this->hasModelUsed() || $this->hasSuccess() || $this->hasDateFrom() - || $this->hasDateTo(); + || $this->hasDateTo() + || $this->hasGradeStatus() + || $this->hasGradeLabel(); } public function getPage(): int @@ -208,6 +234,12 @@ public function getParameters(): array if ($this->hasDateTo()) { $parameters['date_to'] = $this->getDateTo(); } + if ($this->hasGradeStatus()) { + $parameters['grade_status'] = $this->getGradeStatus(); + } + if ($this->hasGradeLabel()) { + $parameters['grade_label'] = $this->getGradeLabel(); + } return $parameters; } } diff 
--git a/Classes/Domain/Repository/RequestLogRepository.php b/Classes/Domain/Repository/RequestLogRepository.php index b6e43b9..f1e9433 100644 --- a/Classes/Domain/Repository/RequestLogRepository.php +++ b/Classes/Domain/Repository/RequestLogRepository.php @@ -12,6 +12,8 @@ namespace B13\Aim\Domain\Repository; +use B13\Aim\Grading\GradeLabel; +use B13\Aim\Grading\GradeStatus; use TYPO3\CMS\Core\Database\Connection; use TYPO3\CMS\Core\Database\ConnectionPool; use TYPO3\CMS\Core\Database\Query\QueryBuilder; @@ -24,12 +26,113 @@ public function __construct( private readonly ConnectionPool $connectionPool, ) {} - public function log(array $data): void + public function log(array $data): int { $data['crdate'] = (int)($data['crdate'] ?? $GLOBALS['EXEC_TIME'] ?? time()); - $this->connectionPool - ->getConnectionForTable(self::TABLE) - ->insert(self::TABLE, $data); + $connection = $this->connectionPool->getConnectionForTable(self::TABLE); + $connection->insert(self::TABLE, $data); + return (int)$connection->lastInsertId(); + } + + public function findByUid(int $uid): ?array + { + $qb = $this->getQueryBuilder(); + $row = $qb + ->where($qb->expr()->eq('uid', $qb->createNamedParameter($uid, Connection::PARAM_INT))) + ->setMaxResults(1) + ->executeQuery() + ->fetchAssociative(); + return $row === false ? 
null : $row; + } + + public function markGradePending(int $uid): void + { + if ($uid <= 0) { + return; + } + $this->connectionPool->getConnectionForTable(self::TABLE)->update( + self::TABLE, + ['grade_status' => GradeStatus::Pending->value], + ['uid' => $uid], + ); + } + + public function updateGrade( + int $uid, + float $score, + GradeLabel $label, + string $reason, + string $judgeModel, + float $judgeCost, + int $durationMs, + ): void { + if ($uid <= 0) { + return; + } + $this->connectionPool->getConnectionForTable(self::TABLE)->update( + self::TABLE, + [ + 'grade_status' => GradeStatus::Done->value, + 'grade_score' => $score, + 'grade_label' => $label->value, + 'grade_reason' => $reason, + 'judge_model' => $judgeModel, + 'judge_cost' => $judgeCost, + 'grade_duration_ms' => $durationMs, + 'grade_error' => '', + ], + ['uid' => $uid], + ); + } + + public function markGradeFailed(int $uid, string $error): void + { + if ($uid <= 0) { + return; + } + $this->connectionPool->getConnectionForTable(self::TABLE)->update( + self::TABLE, + [ + 'grade_status' => GradeStatus::Failed->value, + 'grade_error' => mb_substr($error, 0, 500), + ], + ['uid' => $uid], + ); + } + + /** + * Find rows that are marked pending and have been waiting at least $minAgeSeconds. + * Used by the scheduler safety-net command to pick up rows the shutdown handler missed. + * + * @return list> + */ + public function findPendingGrades(int $minAgeSeconds, int $limit): array + { + $cutoff = ($GLOBALS['EXEC_TIME'] ?? time()) - $minAgeSeconds; + $qb = $this->getQueryBuilder(); + return $qb + ->where( + $qb->expr()->eq('grade_status', $qb->createNamedParameter(GradeStatus::Pending->value)), + $qb->expr()->lte('crdate', $qb->createNamedParameter($cutoff, Connection::PARAM_INT)), + ) + ->orderBy('crdate', 'ASC') + ->setMaxResults($limit) + ->executeQuery() + ->fetchAllAssociative(); + } + + public function countPendingGradesOlderThan(int $minAgeSeconds): int + { + $cutoff = ($GLOBALS['EXEC_TIME'] ?? 
time()) - $minAgeSeconds; + $qb = $this->getQueryBuilder(); + return (int)$qb + ->count('*') + ->where( + $qb->expr()->eq('grade_status', $qb->createNamedParameter(GradeStatus::Pending->value)), + $qb->expr()->lte('crdate', $qb->createNamedParameter($cutoff, Connection::PARAM_INT)), + ) + ->executeQuery() + ->fetchOne(); } /** @@ -126,10 +229,11 @@ public function getStatisticsByExtension(): array * Performance profile per model for a given request type. * Used by smart routing middleware. * - * @return list + * @return list */ public function getModelPerformanceProfile(string $requestType = ''): array { + $done = GradeStatus::Done->value; $qb = $this->getQueryBuilder(); $qb->addSelectLiteral( 'model_used', @@ -138,6 +242,8 @@ public function getModelPerformanceProfile(string $requestType = ''): array 'AVG(duration_ms) AS avg_duration_ms', 'SUM(success) AS successful_requests', 'AVG(total_tokens) AS avg_tokens', + sprintf("SUM(CASE WHEN grade_status = '%s' THEN grade_score ELSE 0 END) AS grade_score_sum", $done), + sprintf("SUM(CASE WHEN grade_status = '%s' THEN 1 ELSE 0 END) AS graded_count", $done), ); if ($requestType !== '') { $qb->where($qb->expr()->eq('request_type', $qb->createNamedParameter($requestType))); @@ -154,6 +260,7 @@ public function getModelPerformanceProfile(string $requestType = ''): array return array_map(static function (array $row): array { $count = (int)$row['request_count']; $successful = (int)$row['successful_requests']; + $gradedCount = (int)$row['graded_count']; return [ 'model_used' => $row['model_used'], 'request_count' => $count, @@ -161,6 +268,8 @@ public function getModelPerformanceProfile(string $requestType = ''): array 'avg_duration_ms' => (int)$row['avg_duration_ms'], 'success_rate' => $count > 0 ? round($successful / $count * 100, 1) : 0, 'avg_tokens' => (int)$row['avg_tokens'], + 'graded_count' => $gradedCount, + 'avg_grade_score' => $gradedCount > 0 ? 
round((float)$row['grade_score_sum'] / $gradedCount, 4) : 0.0, ]; }, $rows); } @@ -262,6 +371,18 @@ protected function getQueryBuilderForDemand(RequestLogDemand $demand): QueryBuil $qb->createNamedParameter($demand->getDateTo(), Connection::PARAM_INT) ); } + if ($demand->hasGradeStatus()) { + $constraints[] = $qb->expr()->eq( + 'grade_status', + $qb->createNamedParameter($demand->getGradeStatus()) + ); + } + if ($demand->hasGradeLabel()) { + $constraints[] = $qb->expr()->eq( + 'grade_label', + $qb->createNamedParameter($demand->getGradeLabel()) + ); + } if (!empty($constraints)) { $qb->where(...$constraints); diff --git a/Classes/Grading/GradeLabel.php b/Classes/Grading/GradeLabel.php new file mode 100644 index 0000000..4b5f799 --- /dev/null +++ b/Classes/Grading/GradeLabel.php @@ -0,0 +1,42 @@ += 0.85 => self::Excellent, + $score >= 0.65 => self::Good, + $score >= 0.40 => self::Fair, + default => self::Poor, + }; + } +} diff --git a/Classes/Grading/GradeStatus.php b/Classes/Grading/GradeStatus.php new file mode 100644 index 0000000..2775060 --- /dev/null +++ b/Classes/Grading/GradeStatus.php @@ -0,0 +1,29 @@ +logUid is populated. + * + * The middleware never grades inline. It marks the row as pending and + * defers the actual judge call via register_shutdown_function. The response + * is therefore returned to the caller before any grading happens, keeping + * live-request latency unchanged. A scheduler command (aim:grade-pending) + * picks up rows the shutdown handler missed. 
+ */ +#[AsAiMiddleware(priority: -600)] +final class GraderMiddleware implements AiMiddlewareInterface +{ + public function __construct( + private readonly GradingService $gradingService, + private readonly RequestLogRepository $logRepository, + private readonly LoggerInterface $logger, + ) {} + + public function process( + AiRequestInterface $request, + AiProviderInterface $provider, + ProviderConfiguration $configuration, + AiMiddlewareHandler $next, + ): TextResponse { + $response = $next->handle($request, $provider, $configuration); + + if (!$this->shouldGrade($request, $response, $configuration, $next->context)) { + return $response; + } + + $logUid = (int)$next->context->logUid; + try { + $this->logRepository->markGradePending($logUid); + } catch (\Throwable $e) { + $this->logger->warning('AiM: failed to mark grade pending for log ' . $logUid . ': ' . $e->getMessage()); + return $response; + } + + $gradingService = $this->gradingService; + register_shutdown_function(static function () use ($gradingService, $logUid): void { + if (function_exists('fastcgi_finish_request')) { + @fastcgi_finish_request(); + } + try { + $gradingService->grade($logUid); + } catch (\Throwable) { + } + }); + + return $response; + } + + private function shouldGrade( + AiRequestInterface $request, + TextResponse $response, + ProviderConfiguration $configuration, + RequestContext $context, + ): bool { + if (!$configuration->gradingEnabled) { + return false; + } + if ($configuration->judgeConfigurationUid <= 0 + || $configuration->judgeConfigurationUid === $configuration->uid + ) { + return false; + } + if (!$response->isSuccessful()) { + return false; + } + if (!($request instanceof ConversationRequest) && !($request instanceof TextGenerationRequest)) { + return false; + } + if (($request->metadata['_aim_grading'] ?? 
false) === true) { + return false; + } + if ($context->logUid === null || $context->logUid <= 0) { + return false; + } + if ($this->resolvePrivacyLevel($configuration, $request) !== PrivacyLevel::Standard) { + return false; + } + return true; + } + + /** + * Mirror of RequestLoggingMiddleware::resolvePrivacyLevel. Kept inline rather + * than extracted to avoid premature abstraction — there are only two callers. + */ + private function resolvePrivacyLevel(ProviderConfiguration $configuration, AiRequestInterface $request): PrivacyLevel + { + $level = PrivacyLevel::fromString($configuration->privacyLevel); + + $user = $this->getBackendUser(); + if ($user !== null && method_exists($user, 'getTSConfig')) { + $userLevel = PrivacyLevel::fromString( + (string)($user->getTSConfig()['aim.']['privacyLevel'] ?? 'standard') + ); + $level = $level->strictest($userLevel); + } + + $requestOverride = $request->getPrivacyLevelOverride(); + if ($requestOverride !== null) { + $level = $level->strictest($requestOverride); + } + + return $level; + } + + private function getBackendUser(): ?BackendUserAuthentication + { + return $GLOBALS['BE_USER'] ?? null; + } +} diff --git a/Classes/Middleware/RequestContext.php b/Classes/Middleware/RequestContext.php index 4d1d493..c0ef3b6 100644 --- a/Classes/Middleware/RequestContext.php +++ b/Classes/Middleware/RequestContext.php @@ -26,4 +26,10 @@ final class RequestContext /** @var array{from: string, to: string, reason: string}|null */ public ?array $fallbackInfo = null; + + /** + * Primary key of the row written by RequestLoggingMiddleware, + * surfaced so GraderMiddleware can attach a grade to it later. 
+ */ + public ?int $logUid = null; } diff --git a/Classes/Middleware/RequestLoggingMiddleware.php b/Classes/Middleware/RequestLoggingMiddleware.php index 67df8fc..9939d1e 100644 --- a/Classes/Middleware/RequestLoggingMiddleware.php +++ b/Classes/Middleware/RequestLoggingMiddleware.php @@ -156,7 +156,7 @@ private function logRequest( } try { - $this->repository->log($data); + $context->logUid = $this->repository->log($data); } catch (\Throwable $logError) { $this->logger->error('AiM request log insert failed: ' . $logError->getMessage(), [ 'data' => $data, diff --git a/Classes/Middleware/SmartRoutingMiddleware.php b/Classes/Middleware/SmartRoutingMiddleware.php index 27173a7..752d465 100644 --- a/Classes/Middleware/SmartRoutingMiddleware.php +++ b/Classes/Middleware/SmartRoutingMiddleware.php @@ -63,6 +63,19 @@ final class SmartRoutingMiddleware implements AiMiddlewareInterface */ private const MIN_SUCCESS_RATE = 90.0; + /** + * Minimum number of graded requests before the grade gate is trusted. + * Below this, grading is treated as "no signal" and the candidate is + * judged on cost and success rate alone. + */ + private const MIN_GRADED_REQUESTS = 10; + + /** + * Minimum average grade (0.0–1.0) for a cheaper model to remain a + * candidate. 0.65 is the "good" label boundary. + */ + private const MIN_GRADE_SCORE = 0.65; + public function __construct( private readonly RequestLogRepository $logRepository, private readonly ProviderResolver $providerResolver, @@ -101,12 +114,16 @@ public function process( if ($classification['label'] === 'simple') { $cheaperResult = $this->findCheaperModel($request, $configuration); if ($cheaperResult !== null) { + $gradeNote = $cheaperResult['graded_count'] > 0 + ? 
sprintf('avg grade: %.2f over %d graded', $cheaperResult['avg_grade_score'], $cheaperResult['graded_count']) + : 'ungraded'; $this->logger->info(sprintf( - 'Smart routing: downgrading from "%s" to cheaper model "%s" for simple prompt (score: %.2f, reason: %s)', + 'Smart routing: downgrading from "%s" to cheaper model "%s" for simple prompt (score: %.2f, reason: %s, %s)', $configuration->model, $cheaperResult['configuration']->model, $classification['score'], $classification['reason'], + $gradeNote, )); return $next->handle( @@ -257,7 +274,7 @@ private function classifyComplexity(string $prompt): array * Queries historical performance data from the request log to find * models with lower cost but high success rates for the same request type. * - * @return array{provider: AiProviderInterface, configuration: ProviderConfiguration}|null + * @return array{provider: AiProviderInterface, configuration: ProviderConfiguration, avg_grade_score: float, graded_count: int}|null */ private function findCheaperModel(AiRequestInterface $request, ProviderConfiguration $currentConfig): ?array { @@ -298,6 +315,13 @@ private function findCheaperModel(AiRequestInterface $request, ProviderConfigura if ($profile['success_rate'] < self::MIN_SUCCESS_RATE) { continue; } + // Quality gate: veto a cheap, reliable model only when we have enough + // graded samples to trust the signal. Too few grades = no signal, fall through. 
+ if ($profile['graded_count'] >= self::MIN_GRADED_REQUESTS + && $profile['avg_grade_score'] < self::MIN_GRADE_SCORE + ) { + continue; + } if ($profile['avg_cost'] >= $currentCost) { continue; } @@ -328,6 +352,8 @@ private function findCheaperModel(AiRequestInterface $request, ProviderConfigura return [ 'provider' => $resolved->manifest->getInstance(), 'configuration' => $resolved->configuration, + 'avg_grade_score' => $bestCandidate['avg_grade_score'], + 'graded_count' => $bestCandidate['graded_count'], ]; } } diff --git a/Classes/Service/GradingService.php b/Classes/Service/GradingService.php new file mode 100644 index 0000000..6b86348 --- /dev/null +++ b/Classes/Service/GradingService.php @@ -0,0 +1,242 @@ +, "label": "", "reason": ""}' + . "\nDo not wrap the JSON in markdown or prose."; + + public function __construct( + private readonly RequestLogRepository $logRepository, + private readonly ProviderConfigurationRepository $configurationRepository, + private readonly ProviderResolver $resolver, + private readonly LoggerInterface $logger, + ) {} + + /** + * Grade the request log row identified by $logUid. + * + * Idempotent in spirit: callers should ensure they only invoke this for + * rows in grade_status='pending'. The method itself does not double-check — + * the shutdown handler holds the freshly inserted uid and the scheduler + * filters by status, so a re-entry would only happen on a true race. + */ + public function grade(int $logUid): void + { + if ($logUid <= 0) { + return; + } + + $start = hrtime(true); + try { + $row = $this->logRepository->findByUid($logUid); + if ($row === null) { + return; + } + + $primaryConfig = $this->configurationRepository->findByUid((int)($row['configuration_uid'] ?? 0)); + if ($primaryConfig === null || !$primaryConfig->gradingEnabled) { + // Config gone or grading turned off between insert and grade — nothing to do. 
+ return; + } + + $judgeUid = $primaryConfig->judgeConfigurationUid; + if ($judgeUid <= 0 || $judgeUid === $primaryConfig->uid) { + $this->logRepository->markGradeFailed( + $logUid, + 'Judge configuration is missing or points at the same configuration.', + ); + return; + } + + $prompt = (string)($row['request_prompt'] ?? ''); + $response = (string)($row['response_content'] ?? ''); + if ($prompt === '' || $response === '') { + $this->logRepository->markGradeFailed( + $logUid, + 'Prompt or response content is empty (likely redacted) — cannot grade.', + ); + return; + } + + $judgeResolved = $this->resolver->resolveForCapability(ConversationCapableInterface::class, $judgeUid); + $judgeProvider = $judgeResolved->manifest->getInstance(); + if (!$judgeProvider instanceof ConversationCapableInterface) { + $this->logRepository->markGradeFailed( + $logUid, + 'Judge provider does not support conversation capability.', + ); + return; + } + + $judgeRequest = $this->buildJudgeRequest($primaryConfig, $judgeResolved->configuration, $prompt, $response, $logUid); + $judgeResponse = $judgeProvider->processConversationRequest($judgeRequest); + + // The judge call bypasses the middleware pipeline (would otherwise create + // a DI cycle and a duplicate request-log row), so CostTrackingMiddleware + // never sees it. Roll the spend into the judge configuration's total_cost + // here instead. This is done as soon as the call returns so a paid-for but + // unparseable response is still accounted for. + $this->configurationRepository->updateTotalCost($judgeUid, $judgeResponse->usage->cost); + + if (!$judgeResponse->isSuccessful()) { + $error = $judgeResponse->errors[0] ?? 'Judge returned an unsuccessful response.'; + $this->logRepository->markGradeFailed($logUid, $error); + return; + } + + $parsed = $this->parseJudgeOutput($judgeResponse->content); + if ($parsed === null) { + $this->logRepository->markGradeFailed( + $logUid, + 'Judge response could not be parsed as JSON: ' . 
mb_substr($judgeResponse->content, 0, 200), + ); + return; + } + + $durationMs = (int)((hrtime(true) - $start) / 1_000_000); + $this->logRepository->updateGrade( + $logUid, + $parsed['score'], + $parsed['label'], + $parsed['reason'], + $judgeResponse->usage->modelUsed ?: '', + $judgeResponse->usage->cost, + $durationMs, + ); + } catch (\Throwable $e) { + $this->logger->warning('AiM grading failed for log uid ' . $logUid . ': ' . $e->getMessage()); + $this->logRepository->markGradeFailed($logUid, $e->getMessage()); + } + } + + private function buildJudgeRequest( + ProviderConfiguration $primaryConfig, + ProviderConfiguration $judgeConfig, + string $prompt, + string $response, + int $logUid, + ): ConversationRequest { + $rubric = trim($primaryConfig->gradingRubric); + if ($rubric === '') { + $rubric = 'Evaluate the response for factual accuracy and relevance to the user prompt.'; + } + $systemPrompt = $rubric . self::JSON_INSTRUCTION; + + $userContent = "Prompt:\n" . $prompt . "\n\nResponse:\n" . $response; + + return new ConversationRequest( + configuration: $judgeConfig, + messages: [new UserMessage($userContent)], + systemPrompt: $systemPrompt, + responseFormat: ResponseFormat::json(), + maxTokens: 300, + temperature: 0.0, + metadata: [ + '_aim_grading' => true, + 'graded_log_uid' => $logUid, + 'extension' => 'aim', + ], + ); + } + + /** + * Parse and validate the judge's JSON output. 
+ * + * @return array{score: float, label: GradeLabel, reason: string}|null + */ + private function parseJudgeOutput(string $raw): ?array + { + $json = $this->extractJsonObject($raw); + if ($json === null) { + return null; + } + try { + $decoded = json_decode($json, true, 8, JSON_THROW_ON_ERROR); + } catch (\JsonException) { + return null; + } + if (!is_array($decoded) || !isset($decoded['score'], $decoded['label'])) { + return null; + } + + $score = (float)$decoded['score']; + if ($score < 0.0) { + $score = 0.0; + } + if ($score > 1.0) { + $score = 1.0; + } + + $label = GradeLabel::tryFrom(strtolower(trim((string)$decoded['label']))) + ?? GradeLabel::fromScore($score); + + $reason = trim((string)($decoded['reason'] ?? '')); + if (mb_strlen($reason) > 500) { + $reason = mb_substr($reason, 0, 497) . '...'; + } + + return ['score' => $score, 'label' => $label, 'reason' => $reason]; + } + + /** + * Strip code fences and return the span from the first "{" to the last "}" in the string (no brace balancing). + * Tolerates the common failure mode of LLMs wrapping JSON in markdown. + */ + private function extractJsonObject(string $raw): ?string + { + $raw = trim($raw); + if ($raw === '') { + return null; + } + // Strip ```json ... ``` fence if present + if (str_starts_with($raw, '```')) { + $raw = preg_replace('/^```(?:json)?\s*\n?|\n?```$/m', '', $raw) ?? 
$raw; + $raw = trim($raw); + } + $start = strpos($raw, '{'); + $end = strrpos($raw, '}'); + if ($start === false || $end === false || $end <= $start) { + return null; + } + return substr($raw, $start, $end - $start + 1); + } +} diff --git a/Configuration/TCA/tx_aim_configuration.php b/Configuration/TCA/tx_aim_configuration.php index 4a46e06..0fd2bed 100644 --- a/Configuration/TCA/tx_aim_configuration.php +++ b/Configuration/TCA/tx_aim_configuration.php @@ -32,7 +32,10 @@ --palette--;;cost, --div--;LLL:EXT:core/Resources/Private/Language/Form/locallang_tabs.xlf:access, --palette--;;access, - --palette--;;governance', + --palette--;;governance, + --div--;LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.tab.grading, + --palette--;;grading, + grading_rubric', ], ], 'palettes' => [ @@ -56,6 +59,10 @@ 'label' => 'LLL:EXT:frontend/Resources/Private/Language/locallang_tca.xlf:pages.palettes.access', 'showitem' => 'disabled', ], + 'grading' => [ + 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.palette.grading.label', + 'showitem' => 'grading_enabled, --linebreak--, judge_configuration_uid', + ], ], 'columns' => [ 'ai_provider' => [ @@ -209,5 +216,40 @@ ], ], ], + 'grading_enabled' => [ + 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_enabled.label', + 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_enabled.description', + 'config' => [ + 'type' => 'check', + 'renderType' => 'checkboxToggle', + 'default' => 0, + ], + ], + 'judge_configuration_uid' => [ + 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.judge_configuration_uid.label', + 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.judge_configuration_uid.description', + 'displayCond' => 'FIELD:grading_enabled:REQ:true', + 
'config' => [ + 'type' => 'select', + 'renderType' => 'selectSingle', + 'foreign_table' => 'tx_aim_configuration', + 'foreign_table_where' => 'AND tx_aim_configuration.uid != ###THIS_UID### AND tx_aim_configuration.disabled = 0 ORDER BY tx_aim_configuration.title', + 'items' => [ + ['label' => '', 'value' => 0], + ], + 'default' => 0, + ], + ], + 'grading_rubric' => [ + 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.label', + 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.description', + 'displayCond' => 'FIELD:grading_enabled:REQ:true', + 'config' => [ + 'type' => 'text', + 'rows' => 6, + 'cols' => 40, + 'placeholder' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.placeholder', + ], + ], ], ]; diff --git a/Documentation/Introduction.md b/Documentation/Introduction.md index 5d4036e..14c8ada 100644 --- a/Documentation/Introduction.md +++ b/Documentation/Introduction.md @@ -112,6 +112,8 @@ Every extension using AiM now has AI capabilities. No further configuration need AiM analyzes each prompt's complexity before sending it to an AI provider. A simple "What is PHP?" doesn't need GPT-4.1. A smaller, cheaper model handles it just fine. AiM learns from your request history which models work well for which types of questions and automatically routes to the most cost-effective option. +If you also enable [response quality grading](#response-quality-grading), routing gets smarter still: a cheaper model is only chosen if its past answers were actually graded as good. Not just "didn't error". A model that runs cheaply but produces weak responses is left out of the downgrade. Until enough graded requests exist for a model, routing falls back to cost and reliability alone, so nothing changes for setups that don't use grading. + This happens transparently. 
Your extensions don't need to change anything. ### Auto model switching @@ -185,6 +187,7 @@ Every AI request is tracked in the **AiM > Request Log** module: - **Which model answered**: requested model vs. actually used model - **How much it cost**: token counts (prompt, completion, cached, reasoning) and calculated cost - **How complex it was**: AiM's complexity classification (simple/moderate/complex) with the scoring reason +- **How good it was**: when grading is enabled, an LLM-as-a-judge quality score, label, and reason - **How long it took**: wall-clock duration in milliseconds - **Who asked**: the backend username is displayed for each request, so you can see which user triggered it. Automated/CLI requests show no user. - **Which extension**: the calling extension key is shown per request @@ -194,6 +197,16 @@ Filter by provider, extension, request type, or success/failure. Statistics dash ![Request Log](Images/request-log.png) +### Response quality grading + +How good are the AI responses your site produces? AiM can answer that automatically. Enable **LLM grading** on any provider configuration and AiM scores each response with a second AI model acting as an impartial judge ("LLM-as-a-judge"). + +You write the rubric ("evaluate factual accuracy and relevance", "check the tone is friendly and professional", ...) and pick which configuration acts as the judge, typically a cheaper model. After each response is delivered, AiM asks the judge to score it and records a grade (poor / fair / good / excellent), a 0–1 score, and a one-sentence reason on the request log row. + +Grading runs *after* the response reaches the user, so it never slows anything down. It applies to text and conversation requests, and only when full logging is active, since the judge needs to see the content it is scoring. Grading is delivered by a shutdown handler on the live request, with a scheduler task (`aim:grade-pending`) as a safety net for anything it misses. 
+ +This turns the request log into a quality dashboard: spot which models or prompts produce weak answers, compare providers on real output, and catch quality regressions before your editors do. + ### Provider verification Click the verify button next to any provider configuration to test the connection. See "connected" or "disconnected" with the exact error message. Results are persisted so you see the last check status on every page load. diff --git a/README.md b/README.md index f3066cc..bfd1875 100644 --- a/README.md +++ b/README.md @@ -40,15 +40,16 @@ A few lines to add AI to any TYPO3 extension. No API keys in your code, no provi - Budget limits and rate limiting per user (including admins as a safety net) - Privacy levels (standard / reduced / none) per provider - Provider group restrictions and capability permissions via native TYPO3 mechanisms +- LLM grading: score response quality with a second model acting as a judge **Under the hood:** - Zero provider dependencies. Install Symfony AI bridge packages as needed. - Auto-discovery of installed bridges (OpenAI, Anthropic, Gemini, Mistral, Ollama, etc.) 
- Capability-based routing with model-level awareness - Auto model switch: one config covers all capabilities -- Smart routing: routes simple prompts to cheaper models based on historical cost data +- Smart routing: routes simple prompts to cheaper models based on historical cost, reliability, and (with grading) quality data - Fallback chains: automatic retry with alternative providers on failure -- 8-layer middleware pipeline: retry, access control, smart routing, capability validation, logging, cost tracking, events, dispatch +- 9-layer middleware pipeline: retry, access control, smart routing, capability validation, grading, logging, cost tracking, events, dispatch ## Installation @@ -77,6 +78,37 @@ After installation, create a provider configuration in the backend (Admin Tools > **Local providers (Ollama, LM Studio):** The *API Key* field doubles as the endpoint URL. Enter `http://localhost:11434` (Ollama) or `http://localhost:1234` (LM Studio) instead of a key. The available models are then fetched live from that endpoint. +## Trying AiM from the command line + +Once a provider configuration exists, you can fire requests without writing an extension first. The `aim:test` command sends a one-off request through the full pipeline and reports the response, model used, token usage, cost, timing, and whether a request-log row was written: + +```bash +# Text generation (default capability) +vendor/bin/typo3 aim:test text --prompt "Write a haiku about TYPO3" + +# Conversation, against a specific provider +vendor/bin/typo3 aim:test conversation -p "anthropic:*" --prompt "Explain dependency injection" + +# Translation +vendor/bin/typo3 aim:test translate --prompt "Hello world" --from English --to German + +# Embeddings +vendor/bin/typo3 aim:test embed --prompt "TYPO3 is an open-source CMS" +``` + +The capability is a positional argument (`text`, `conversation`, `translate`, or `embed`; defaults to `text`). 
Options: + +| Option | Purpose | +|---|---| +| `--prompt` | The prompt / text to send | +| `--provider` / `-p` | Provider notation (`openai:gpt-4o`, `anthropic:*`); defaults to the configured default | +| `--site` | Resolve the provider from a site's `settings.yaml` instead of the database; takes precedence over `--provider` | +| `--system-prompt` | Optional system prompt | +| `--max-tokens` | Token limit for the response | +| `--from` / `--to` | Source / target language (translate only) | + +Because it runs through the real pipeline, every call also lands in the request log, which makes it a quick way to see logging, cost tracking, smart routing, and grading in action before integrating the API into your own code. + ## Usage ### Tier 1: Proxy (recommended) @@ -437,6 +469,14 @@ The `SmartRoutingMiddleware` classifies prompt complexity using language-agnosti Classification is logged per request (`complexity_score`, `complexity_label`, `complexity_reason`). When a cheaper model has proven reliable for simple prompts (based on historical request log data with minimum 10 requests and 90%+ success rate), the middleware automatically downgrades. +### Quality gate + +"Reliable" on its own only means *the API call didn't error*. A cheap model can succeed every time while producing weak answers. When [LLM grading](#llm-grading) is enabled, smart routing also consults the recorded `grade_score`: a cheaper model is only chosen if its graded responses for that request type average at least **0.65** (the "good" boundary) across at least **10 graded requests**. + +The gate is a one-way veto, not a tie-breaker. The cheapest cost-and-success-eligible model is still the one picked; a poor average grade simply removes a candidate. Crucially, **too few graded requests means "no signal", not "bad"**: a model with fewer than 10 graded samples is judged on cost and success rate exactly as before, so installs without grading enabled see no change in routing behavior. 
+ +The downgrade decision is logged with the candidate's graded quality, e.g. `... (avg grade: 0.82 over 14 graded)` or `... (ungraded)`. + ### Extending complexity signals Ship a `Configuration/SmartRouting/ComplexitySignals.php` in any extension: @@ -457,6 +497,49 @@ Or add signals at runtime: $GLOBALS['TYPO3_CONF_VARS']['EXTENSIONS']['aim']['complexitySignals']['de']['complex'][] = 'analysiere'; ``` +## LLM Grading + +AiM can score the quality of AI responses using a second model as a judge ("LLM-as-a-judge"). Grading is opt-in per provider configuration and runs *after* the response has been delivered to the caller, so it adds no latency to the live request. + +### Enabling grading + +On any provider configuration (Admin Tools > AiM > Providers), open the **LLM Grading** tab: + +| Field | Purpose | +|---|---| +| `grading_enabled` | Turns grading on for this configuration | +| `judge_configuration_uid` | A *different* AiM configuration used to score responses — typically a cheaper or specialized model that supports the conversation capability | +| `grading_rubric` | The judge's instructions: what to evaluate (factual accuracy, relevance, tone, ...). The required JSON output format is appended automatically. | + +Grading covers `ConversationRequest` and `TextGenerationRequest`. It only runs when the effective privacy level is `standard`; `reduced` and `none` skip it, since the judge needs the prompt and response content. + +### How it runs + +1. After a successful, gradeable response, `GraderMiddleware` marks the request log row `grade_status = pending` and registers a shutdown function. +2. The shutdown function runs *after* the response is flushed to the caller, then calls the judge model. +3. The judge returns a JSON `{score, label, reason}`, written back to the row (`grade_score`, `grade_label`, `grade_reason`). 
+ +If the shutdown path is missed (CLI crash, an unusual SAPI), a scheduler command picks up the stragglers: + +```bash +vendor/bin/typo3 aim:grade-pending +``` + +Run it from the TYPO3 scheduler every few minutes. It grades rows still marked `pending` that are older than `--min-age` seconds (default 60), so it never races the live shutdown handler. The request log module shows a warning when a pending backlog builds up. + +### Grades + +The judge assigns one of four labels. When it returns a score but no recognizable label, the label is derived from the score: + +| Label | Score range | +|---|---| +| `poor` | 0.00–0.39 | +| `fair` | 0.40–0.64 | +| `good` | 0.65–0.84 | +| `excellent` | 0.85–1.00 | + +The judge call deliberately bypasses the middleware pipeline (it would otherwise produce a duplicate request-log row), but its cost is still rolled into the judge configuration's `total_cost` and recorded on the graded row's `judge_cost` column. + ## Custom Middleware Add middleware to intercept all AI requests: @@ -544,6 +627,7 @@ The middleware pipeline is intentionally the only logging extension point: it gi | `AccessControlMiddleware` | 90 | Provider access, capability permissions, budgets, rate limits | | `SmartRoutingMiddleware` | 75 | Complexity classification, cost-based model downgrade | | `CapabilityValidationMiddleware` | 50 | Validates provider capability, auto-reroutes if needed | +| `GraderMiddleware` | -600 | Schedules LLM-as-a-judge grading after a successful response | | `RequestLoggingMiddleware` | -700 | Logs every request (respects privacy levels) | | `CostTrackingMiddleware` | -800 | Updates cumulative cost per configuration | | `EventDispatchMiddleware` | -900 | Fires `BeforeAiRequestEvent` / `AfterAiResponseEvent` | @@ -580,6 +664,7 @@ Monitor all AI requests: - **User tracking**: shows the backend username for each request (empty for CLI/automation) - **Full content**: prompt, system prompt, and response content per request (respects privacy 
levels) - **Complexity classification**: score, label, and reason for each request +- **Quality grades**: LLM-as-a-judge score, label, and reason per request when grading is enabled - **Token details**: prompt, completion, cached, and reasoning token breakdowns - **Rerouting info**: fallback and capability rerouting details @@ -604,7 +689,7 @@ All widgets are refreshable and grouped under "AiM" in the widget picker. The re | Table | Purpose | |---|---| | `tx_aim_configuration` | Provider configurations (TCA-managed). API keys, models, cost tracking, governance settings. | -| `tx_aim_request_log` | Per-request log (no TCA). Tokens, cost, duration, prompt/response content, complexity classification, rerouting details. | +| `tx_aim_request_log` | Per-request log (no TCA). Tokens, cost, duration, prompt/response content, complexity classification, rerouting details, LLM grading results. | | `tx_aim_usage_budget` | Per-user budget tracking. Rolling period counters for tokens, cost, and request count. | See `ext_tables.sql` for the full schema. diff --git a/Resources/Private/Language/locallang_module.xlf b/Resources/Private/Language/locallang_module.xlf index 7af95ce..bd56a1a 100644 --- a/Resources/Private/Language/locallang_module.xlf +++ b/Resources/Private/Language/locallang_module.xlf @@ -190,9 +190,24 @@ Duration + + Grade + Status + + Pending + + + Grade failed + + + Pending grades + + + %d request log row(s) have been waiting more than an hour to be graded. The shutdown handler appears to have missed them — run "vendor/bin/typo3 aim:grade-pending" or schedule it via the TYPO3 scheduler. 
+ Success diff --git a/Resources/Private/Language/locallang_tca.xlf b/Resources/Private/Language/locallang_tca.xlf index 703044f..14a2236 100644 --- a/Resources/Private/Language/locallang_tca.xlf +++ b/Resources/Private/Language/locallang_tca.xlf @@ -81,6 +81,33 @@ Output Token Cost (per 1M) + + LLM Grading + + + Response Quality Grading + + + Enable LLM grading + + + When enabled, successful responses from this configuration are scored by a second LLM (the judge) after the response has been delivered. The score, label, and reason are stored on the request log row. Grading only runs when the effective privacy level is "standard". Reduced or none disables it. + + + Judge configuration + + + The AiM configuration used to score responses. Should be a separate, cheaper or specialized configuration that supports the conversation capability. Cannot be this configuration itself. + + + Grading rubric (judge system prompt) + + + System prompt for the judge. Describe what to evaluate (e.g. factual accuracy, relevance, tone). The judge is instructed automatically to respond with JSON containing score (0.0–1.0), label (poor|fair|good|excellent), and reason. No need to mention the output format here. + + + Evaluate the response for factual accuracy and relevance to the user's prompt. 
+ AI Management diff --git a/Resources/Private/Partials/RequestLog/Row.html b/Resources/Private/Partials/RequestLog/Row.html index 5098435..72c8eec 100644 --- a/Resources/Private/Partials/RequestLog/Row.html +++ b/Resources/Private/Partials/RequestLog/Row.html @@ -39,6 +39,21 @@ {entry.cost -> f:format.number(decimals: 6)} {entry.duration_ms -> f:format.number(decimals: 0, thousandsSeparator: ',')} ms + + + + + excellent ({entry.grade_score -> f:format.number(decimals: 2)}) + good ({entry.grade_score -> f:format.number(decimals: 2)}) + fair ({entry.grade_score -> f:format.number(decimals: 2)}) + {entry.grade_label} ({entry.grade_score -> f:format.number(decimals: 2)}) + + + + + - + + diff --git a/Resources/Private/Partials/RequestLog/Table.html b/Resources/Private/Partials/RequestLog/Table.html index 4b4b44e..3e9e532 100644 --- a/Resources/Private/Partials/RequestLog/Table.html +++ b/Resources/Private/Partials/RequestLog/Table.html @@ -13,6 +13,7 @@ + diff --git a/Resources/Private/Templates/Aim/RequestLog.html b/Resources/Private/Templates/Aim/RequestLog.html index e082d8d..054290c 100644 --- a/Resources/Private/Templates/Aim/RequestLog.html +++ b/Resources/Private/Templates/Aim/RequestLog.html @@ -13,6 +13,11 @@ + + + + + diff --git a/Tests/Functional/Command/GradePendingLogsTest.php b/Tests/Functional/Command/GradePendingLogsTest.php new file mode 100644 index 0000000..aa35325 --- /dev/null +++ b/Tests/Functional/Command/GradePendingLogsTest.php @@ -0,0 +1,95 @@ +get(RequestLogRepository::class); + $now = time(); + + // Old pending row — should be graded + $oldUid = $logRepo->log([ + 'crdate' => $now - 120, + 'configuration_uid' => 1, + 'grade_status' => GradeStatus::Pending->value, + ]); + // Fresh pending row — should be skipped (within min-age window) + $freshUid = $logRepo->log([ + 'crdate' => $now - 5, + 'configuration_uid' => 1, + 'grade_status' => GradeStatus::Pending->value, + ]); + + $gradedUids = []; + $stubGrader = $this->stubGradingService(function 
(int $uid) use (&$gradedUids, $logRepo): void { + $gradedUids[] = $uid; + $logRepo->updateGrade($uid, 0.8, GradeLabel::Good, 'test', 'gpt-4o-mini', 0.0001, 12); + }); + + $command = new GradePendingLogs($logRepo, $stubGrader); + $tester = new CommandTester($command); + $tester->execute(['--min-age' => '60', '--limit' => '10']); + + self::assertSame([$oldUid], $gradedUids); + self::assertSame(0, $tester->getStatusCode()); + + $oldRow = $logRepo->findByUid($oldUid); + self::assertSame(GradeStatus::Done->value, $oldRow['grade_status']); + + $freshRow = $logRepo->findByUid($freshUid); + self::assertSame(GradeStatus::Pending->value, $freshRow['grade_status']); + } + + #[Test] + public function reportsNoPendingRowsWhenEmpty(): void + { + $logRepo = $this->get(RequestLogRepository::class); + $stubGrader = $this->stubGradingService(static function () {}); + + $command = new GradePendingLogs($logRepo, $stubGrader); + $tester = new CommandTester($command); + $tester->execute([]); + + self::assertSame(0, $tester->getStatusCode()); + self::assertStringContainsString('No pending grades', $tester->getDisplay()); + } + + private function stubGradingService(\Closure $onGrade): GradingService + { + return new class($onGrade) extends GradingService { + // @phpstan-ignore-next-line — overriding constructor on purpose + public function __construct(private readonly \Closure $onGrade) {} + + public function grade(int $logUid): void + { + ($this->onGrade)($logUid); + } + }; + } +} diff --git a/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php b/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php new file mode 100644 index 0000000..bb00780 --- /dev/null +++ b/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php @@ -0,0 +1,92 @@ +get(RequestLogRepository::class); + + // Three graded "done" rows for cheap-model: scores 0.6, 0.8, 1.0 → avg 0.8 + foreach ([0.6, 0.8, 1.0] as $score) { + $logRepo->log($this->row('cheap-model', 0.5, GradeStatus::Done, 
$score)); + } + // One failed and one ungraded row — must be excluded from the grade average + $logRepo->log($this->row('cheap-model', 0.4, GradeStatus::Failed, 0.0)); + $logRepo->log($this->row('cheap-model', 0.4, GradeStatus::None, 0.0)); + + $profiles = $logRepo->getModelPerformanceProfile('TextGenerationRequest'); + $cheap = $this->profileFor($profiles, 'cheap-model'); + + self::assertSame(5, $cheap['request_count']); + self::assertSame(3, $cheap['graded_count']); + self::assertEqualsWithDelta(0.8, $cheap['avg_grade_score'], 0.0001); + } + + #[Test] + public function modelPerformanceProfileReportsZeroGradesForUngradedModel(): void + { + $logRepo = $this->get(RequestLogRepository::class); + $logRepo->log($this->row('ungraded-model', 0.5, GradeStatus::None, 0.0)); + $logRepo->log($this->row('ungraded-model', 0.5, GradeStatus::None, 0.0)); + + $profiles = $logRepo->getModelPerformanceProfile('TextGenerationRequest'); + $model = $this->profileFor($profiles, 'ungraded-model'); + + self::assertSame(2, $model['request_count']); + self::assertSame(0, $model['graded_count']); + self::assertSame(0.0, $model['avg_grade_score']); + } + + private function row(string $model, float $cost, GradeStatus $status, float $gradeScore): array + { + return [ + 'crdate' => time(), + 'request_type' => 'TextGenerationRequest', + 'provider_identifier' => 'test', + 'model_used' => $model, + 'success' => 1, + 'cost' => $cost, + 'total_tokens' => 100, + 'grade_status' => $status->value, + 'grade_score' => $gradeScore, + 'grade_label' => $status === GradeStatus::Done ? GradeLabel::fromScore($gradeScore)->value : '', + ]; + } + + /** + * @param list> $profiles + * @return array + */ + private function profileFor(array $profiles, string $model): array + { + foreach ($profiles as $profile) { + if ($profile['model_used'] === $model) { + return $profile; + } + } + self::fail('No performance profile for model "' . $model . 
'".'); + } +} diff --git a/Tests/Functional/Middleware/GraderMiddlewareTest.php b/Tests/Functional/Middleware/GraderMiddlewareTest.php new file mode 100644 index 0000000..8729c6b --- /dev/null +++ b/Tests/Functional/Middleware/GraderMiddlewareTest.php @@ -0,0 +1,265 @@ +get(RequestLogRepository::class); + $logUid = $logRepo->log([ + 'crdate' => time(), + 'request_type' => 'ConversationRequest', + 'provider_identifier' => 'openai', + 'configuration_uid' => 1, + 'success' => 1, + 'response_content' => 'a response', + ]); + self::assertGreaterThan(0, $logUid); + + $context = new RequestContext(); + $context->logUid = $logUid; + + $stubGrader = $this->buildStubGradingService(); + $middleware = new GraderMiddleware($stubGrader, $logRepo, new NullLogger()); + + $config = $this->buildConfiguration([ + 'grading_enabled' => 1, + 'judge_configuration_uid' => 99, + 'privacy_level' => 'standard', + ]); + + $request = $this->buildConversationRequest($config); + $response = new TextResponse('hello', errors: []); + + $next = new AiMiddlewareHandler( + static fn() => $response, + $context, + ); + + $middleware->process($request, $this->stubProvider(), $config, $next); + + $row = $logRepo->findByUid($logUid); + self::assertSame(GradeStatus::Pending->value, $row['grade_status']); + } + + #[Test] + public function doesNotMarkPendingWhenGradingDisabled(): void + { + $logRepo = $this->get(RequestLogRepository::class); + $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]); + + $context = new RequestContext(); + $context->logUid = $logUid; + $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger()); + + $config = $this->buildConfiguration(['grading_enabled' => 0]); + $request = $this->buildConversationRequest($config); + $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context); + + $middleware->process($request, $this->stubProvider(), $config, $next); + + $row = $logRepo->findByUid($logUid); + 
self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * A request that carries the '_aim_grading' metadata flag (the "recursion flag"
+     * of the test name) must leave the log row's grade_status at GradeStatus::None,
+     * even though grading is enabled and a judge configuration is set.
+     */
+    #[Test]
+    public function doesNotMarkPendingWhenRecursionFlagIsSet(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+
+        $context = new RequestContext();
+        $context->logUid = $logUid;
+        $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+        $config = $this->buildConfiguration([
+            'grading_enabled' => 1,
+            'judge_configuration_uid' => 99,
+        ]);
+        $request = new ConversationRequest(
+            configuration: $config,
+            messages: [new UserMessage('hi')],
+            metadata: ['_aim_grading' => true],
+        );
+        $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+        $middleware->process($request, $this->stubProvider(), $config, $next);
+
+        $row = $logRepo->findByUid($logUid);
+        self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * With privacy_level 'reduced' on the configuration, processing a conversation
+     * request must leave grade_status at GradeStatus::None although grading is enabled.
+     */
+    #[Test]
+    public function doesNotMarkPendingWhenPrivacyIsReduced(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+
+        $context = new RequestContext();
+        $context->logUid = $logUid;
+        $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+        $config = $this->buildConfiguration([
+            'grading_enabled' => 1,
+            'judge_configuration_uid' => 99,
+            'privacy_level' => 'reduced',
+        ]);
+        $request = $this->buildConversationRequest($config);
+        $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+        $middleware->process($request, $this->stubProvider(), $config, $next);
+
+        $row = $logRepo->findByUid($logUid);
+        self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * An EmbeddingRequest must leave grade_status at GradeStatus::None even when
+     * grading is enabled and a judge configuration is set.
+     */
+    #[Test]
+    public function doesNotMarkPendingForEmbeddingRequests(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+
+        $context = new RequestContext();
+        $context->logUid = $logUid;
+        $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+        $config = $this->buildConfiguration([
+            'grading_enabled' => 1,
+            'judge_configuration_uid' => 99,
+        ]);
+        $request = new EmbeddingRequest(
+            configuration: $config,
+            input: ['some text'],
+        );
+        $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+        $middleware->process($request, $this->stubProvider(), $config, $next);
+
+        $row = $logRepo->findByUid($logUid);
+        self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * When the next handler returns a TextResponse that carries errors, the log
+     * row's grade_status must stay at GradeStatus::None.
+     */
+    #[Test]
+    public function doesNotMarkPendingWhenResponseFailed(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+
+        $context = new RequestContext();
+        $context->logUid = $logUid;
+        $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+        $config = $this->buildConfiguration([
+            'grading_enabled' => 1,
+            'judge_configuration_uid' => 99,
+        ]);
+        $request = $this->buildConversationRequest($config);
+        $failedResponse = new TextResponse('', errors: ['boom']);
+        $next = new AiMiddlewareHandler(static fn() => $failedResponse, $context);
+
+        $middleware->process($request, $this->stubProvider(), $config, $next);
+
+        $row = $logRepo->findByUid($logUid);
+        self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * When judge_configuration_uid equals the configuration's own uid (both 5 here),
+     * a successful response must still leave grade_status at GradeStatus::None.
+     */
+    #[Test]
+    public function doesNotMarkPendingWhenJudgeIsSameConfiguration(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 5]);
+
+        $context = new RequestContext();
+        $context->logUid = $logUid;
+        $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+        $config = $this->buildConfiguration([
+            'uid' => 5,
+            'grading_enabled' => 1,
+            'judge_configuration_uid' => 5,
+        ]);
+        $request = $this->buildConversationRequest($config);
+        $next = new AiMiddlewareHandler(static fn() => new TextResponse('ok'), $context);
+
+        $middleware->process($request, $this->stubProvider(), $config, $next);
+
+        $row = $logRepo->findByUid($logUid);
+        self::assertSame(GradeStatus::None->value, $row['grade_status']);
+    }
+
+    /**
+     * Builds a ProviderConfiguration from a fixed set of defaults (grading disabled,
+     * no judge, privacy level 'standard') with $overrides merged on top; keys in
+     * $overrides replace the defaults of the same name.
+     */
+    private function buildConfiguration(array $overrides): ProviderConfiguration
+    {
+        $base = [
+            'uid' => 1,
+            'ai_provider' => 'openai',
+            'title' => 'Test',
+            'api_key' => 'sk-test',
+            'model' => 'gpt-4o-mini',
+            'privacy_level' => 'standard',
+            'grading_enabled' => 0,
+            'judge_configuration_uid' => 0,
+            'grading_rubric' => '',
+        ];
+        return new ProviderConfiguration(array_merge($base, $overrides));
+    }
+
+    /**
+     * Builds a ConversationRequest for $config containing a single user message 'hi'
+     * and no metadata argument.
+     */
+    private function buildConversationRequest(ProviderConfiguration $config): ConversationRequest
+    {
+        return new ConversationRequest(
+            configuration: $config,
+            messages: [new UserMessage('hi')],
+        );
+    }
+
+    /**
+     * Returns an anonymous AiProviderInterface implementation with an empty body,
+     * used only as the provider argument of GraderMiddleware::process().
+     */
+    private function stubProvider(): AiProviderInterface
+    {
+        return new class implements AiProviderInterface {};
+    }
+
+    /**
+     * A no-op grading service so register_shutdown_function doesn't actually
+     * try to dispatch a judge request at end of test.
+     */
+    private function buildStubGradingService(): GradingService
+    {
+        return new class extends GradingService {
+            // @phpstan-ignore-next-line — overriding constructor on purpose
+            public function __construct() {}
+
+            public function grade(int $logUid): void {}
+        };
+    }
+}
diff --git a/Tests/Functional/Service/GradingServiceTest.php b/Tests/Functional/Service/GradingServiceTest.php
new file mode 100644
index 0000000..4ccaa46
--- /dev/null
+++ b/Tests/Functional/Service/GradingServiceTest.php
@@ -0,0 +1,153 @@
+get(RequestLogRepository::class);
+        // NOTE(review): this extract of the patch starts mid-statement here; the file
+        // header, class declaration and the signature of this first test are not visible.
+        $uid = $logRepo->log([
+            'crdate' => time(),
+            'configuration_uid' => 1,
+            'grade_status' => 'pending',
+        ]);
+
+        $logRepo->updateGrade(
+            $uid,
+            score: 0.83,
+            label: GradeLabel::Good,
+            reason: 'Mostly correct, minor omissions.',
+            judgeModel: 'gpt-4o-mini',
+            judgeCost: 0.000123,
+            durationMs: 280,
+        );
+
+        $row = $logRepo->findByUid($uid);
+        self::assertSame(GradeStatus::Done->value, $row['grade_status']);
+        self::assertEqualsWithDelta(0.83, (float)$row['grade_score'], 0.001);
+        self::assertSame('good', $row['grade_label']);
+        self::assertStringContainsString('omissions', $row['grade_reason']);
+        self::assertSame('gpt-4o-mini', $row['judge_model']);
+        self::assertEqualsWithDelta(0.000123, (float)$row['judge_cost'], 1e-6);
+        self::assertSame(280, (int)$row['grade_duration_ms']);
+        self::assertSame('', $row['grade_error']);
+    }
+
+    /**
+     * markGradeFailed() with an 800-character message must set grade_status to
+     * GradeStatus::Failed and store a grade_error of exactly 500 characters.
+     */
+    #[Test]
+    public function repositoryMarkGradeFailedClampsErrorLength(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $uid = $logRepo->log([
+            'crdate' => time(),
+            'configuration_uid' => 1,
+            'grade_status' => 'pending',
+        ]);
+
+        $logRepo->markGradeFailed($uid, str_repeat('x', 800));
+
+        $row = $logRepo->findByUid($uid);
+        self::assertSame(GradeStatus::Failed->value, $row['grade_status']);
+        self::assertSame(500, mb_strlen($row['grade_error']));
+    }
+
+    /**
+     * Seeds three rows (pending/600s old, pending/10s old, done/600s old) and expects
+     * findPendingGrades(60, 100) to return only the old pending row, and
+     * countPendingGradesOlderThan(60) to report 1.
+     */
+    #[Test]
+    public function findPendingGradesRespectsAgeFilter(): void
+    {
+        $logRepo = $this->get(RequestLogRepository::class);
+        $now = time();
+
+        $oldUid = $logRepo->log([
+            'crdate' => $now - 600,
+            'configuration_uid' => 1,
+            'grade_status' => 'pending',
+        ]);
+        $logRepo->log([
+            'crdate' => $now - 10,
+            'configuration_uid' => 1,
+            'grade_status' => 'pending',
+        ]);
+        $logRepo->log([
+            'crdate' => $now - 600,
+            'configuration_uid' => 1,
+            'grade_status' => 'done',
+        ]);
+
+        $found = $logRepo->findPendingGrades(60, 100);
+        self::assertCount(1, $found);
+        self::assertSame($oldUid, (int)$found[0]['uid']);
+
+        $pendingCount = $logRepo->countPendingGradesOlderThan(60);
+        self::assertSame(1, $pendingCount);
+    }
+
+    /**
+     * A plain JSON object with score, label and reason is parsed into the float
+     * score, the matching GradeLabel case and the reason string.
+     */
+    #[Test]
+    public function parseJudgeOutputAcceptsCleanJson(): void
+    {
+        $parsed = $this->invokeParser('{"score": 0.75, "label": "good", "reason": "ok"}');
+        self::assertSame(0.75, $parsed['score']);
+        self::assertSame(GradeLabel::Good, $parsed['label']);
+        self::assertSame('ok', $parsed['reason']);
+    }
+
+    /**
+     * JSON wrapped in a Markdown ```json code fence is parsed like unfenced JSON.
+     */
+    #[Test]
+    public function parseJudgeOutputStripsMarkdownFences(): void
+    {
+        $parsed = $this->invokeParser("```json\n{\"score\": 0.9, \"label\": \"excellent\", \"reason\": \"great\"}\n```");
+        self::assertSame(0.9, $parsed['score']);
+        self::assertSame(GradeLabel::Excellent, $parsed['label']);
+    }
+
+    /**
+     * Scores outside the expected range are clamped: 1.5 becomes 1.0 and -0.3 becomes 0.0.
+     */
+    #[Test]
+    public function parseJudgeOutputClampsOutOfRangeScores(): void
+    {
+        $tooHigh = $this->invokeParser('{"score": 1.5, "label": "excellent", "reason": "x"}');
+        self::assertSame(1.0, $tooHigh['score']);
+
+        $tooLow = $this->invokeParser('{"score": -0.3, "label": "poor", "reason": "x"}');
+        self::assertSame(0.0, $tooLow['score']);
+    }
+
+    /**
+     * An unrecognised label ("meh") together with score 0.7 must yield GradeLabel::Good,
+     * i.e. the label is taken from the score instead of the invalid text.
+     */
+    #[Test]
+    public function parseJudgeOutputBackfillsLabelFromScoreWhenInvalid(): void
+    {
+        $parsed = $this->invokeParser('{"score": 0.7, "label": "meh", "reason": "x"}');
+        self::assertSame(GradeLabel::Good, $parsed['label']);
+    }
+
+    /**
+     * Non-JSON text, and valid JSON that lacks a "score" key, must both produce null.
+     */
+    #[Test]
+    public function parseJudgeOutputReturnsNullOnMalformedJson(): void
+    {
+        self::assertNull($this->invokeParser('not json at all'));
+        self::assertNull($this->invokeParser('{"missing": "score"}'));
+    }
+
+    /**
+     * Calls GradingService::parseJudgeOutput() via reflection on the container-provided
+     * service and returns its result (an array, or null).
+     * NOTE(review): reflection suggests the method is not public — confirm in GradingService.
+     */
+    private function invokeParser(string $raw): ?array
+    {
+        $service = $this->get(GradingService::class);
+        $method = new \ReflectionMethod($service, 'parseJudgeOutput');
+        return $method->invoke($service, $raw);
+    }
+}
diff --git a/ext_tables.sql b/ext_tables.sql
index 7167b24..fc35ef9 100644
--- a/ext_tables.sql
+++ b/ext_tables.sql
@@ -14,7 +14,10 @@ CREATE TABLE tx_aim_configuration (
     be_groups varchar(255) DEFAULT '' NOT NULL,
     privacy_level varchar(20) DEFAULT 'standard' NOT NULL,
     rerouting_allowed tinyint(1) unsigned DEFAULT '1' NOT NULL,
-    auto_model_switch tinyint(1) unsigned DEFAULT '1' NOT NULL
+    auto_model_switch tinyint(1) unsigned DEFAULT '1' NOT NULL,
+    grading_enabled tinyint(1) unsigned DEFAULT '0' NOT NULL,
+    judge_configuration_uid int(11) unsigned DEFAULT '0' NOT NULL,
+    grading_rubric text
 );
 
 CREATE TABLE tx_aim_usage_budget (
@@ -61,6 +64,14 @@ CREATE TABLE tx_aim_request_log (
     rerouted tinyint(1) unsigned DEFAULT '0' NOT NULL,
     reroute_type varchar(20) DEFAULT '' NOT NULL,
     reroute_reason varchar(255) DEFAULT '' NOT NULL,
+    grade_status varchar(20) DEFAULT 'none' NOT NULL,
+    grade_score double(5,4) DEFAULT '0.0000' NOT NULL,
+    grade_label varchar(20) DEFAULT '' NOT NULL,
+    grade_reason text,
+    judge_model varchar(255) DEFAULT '' NOT NULL,
+    judge_cost double(10,6) DEFAULT '0.000000' NOT NULL,
+    grade_duration_ms int(11) unsigned DEFAULT '0' NOT NULL,
+    grade_error varchar(500) DEFAULT '' NOT NULL,
 
     PRIMARY KEY (uid),
     KEY crdate (crdate),
@@ -69,5 +80,6 @@ CREATE TABLE tx_aim_request_log (
     KEY configuration_uid (configuration_uid),
     KEY user_id (user_id),
     KEY model_used (model_used),
-    KEY request_type (request_type)
+    KEY request_type (request_type),
+    KEY grade_status (grade_status)
 );