1717import io .opentelemetry .api .common .AttributeKey ;
1818import io .opentelemetry .api .trace .Span ;
1919import io .opentelemetry .api .trace .SpanKind ;
20+ import io .opentelemetry .api .trace .StatusCode ;
21+ import io .opentelemetry .api .trace .Tracer ;
2022import io .opentelemetry .context .Context ;
2123import java .io .IOException ;
2224import java .io .InputStream ;
@@ -332,12 +334,12 @@ private void handleEval(HttpExchange exchange) throws IOException {
332334 }
333335
334336 @ SuppressWarnings ({"unchecked" , "rawtypes" })
335- private void handleStreamingEval (
337+ private < I , O > void handleStreamingEval (
336338 HttpExchange exchange ,
337339 RemoteEval eval ,
338340 EvalRequest request ,
339341 RequestContext context ,
340- List <Scorer <Object , Object >> remoteScorers )
342+ List <Scorer <I , O >> remoteScorers )
341343 throws Exception {
342344 // Set SSE headers
343345 exchange .getResponseHeaders ().set ("Content-Type" , "text/event-stream" );
@@ -383,7 +385,9 @@ private void handleStreamingEval(
383385 // concurrent dataset fetching and eval execution
384386 extractDataset (request , apiClient )
385387 .forEach (
386- datasetCase -> {
388+ rawDataset -> {
389+ final DatasetCase <I , O > datasetCase =
390+ (DatasetCase <I , O >) rawDataset ;
387391 var evalSpan =
388392 tracer .spanBuilder ("eval" )
389393 .setNoParent ()
@@ -400,15 +404,49 @@ private void handleStreamingEval(
400404 braintrustParent .id ());
401405 // Make the eval context (with span and baggage) current
402406 try (var rootScope = evalContext .makeCurrent ()) {
403- final dev . braintrust . eval . TaskResult taskResult ;
407+ final TaskResult < I , O > taskResult ;
404408 { // run task
405409 var taskSpan = tracer .spanBuilder ("task" ).startSpan ();
406410 try (var unused =
407411 Context .current ()
408412 .with (taskSpan )
409413 .makeCurrent ()) {
410414 var task = eval .getTask ();
411- taskResult = task .apply (datasetCase );
415+ try {
416+ taskResult = task .apply (datasetCase );
417+ } catch (Exception e ) {
418+ taskSpan .setStatus (
419+ StatusCode .ERROR , e .getMessage ());
420+ taskSpan .recordException (e );
421+ taskSpan .end ();
422+ evalSpan .setStatus (
423+ StatusCode .ERROR , e .getMessage ());
424+ evalSpan .setAttribute (
425+ "braintrust.output_json" ,
426+ toJson (
427+ Collections .singletonMap (
428+ "output" , null )));
429+ log .debug (
430+ "Task threw exception for input: "
431+ + datasetCase .input (),
432+ e );
433+ // run scoreForTaskException on each scorer
434+ List <Scorer <I , O >> allScorersForError =
435+ new ArrayList <>(eval .getScorers ());
436+ allScorersForError .addAll (remoteScorers );
437+ for (var scorer : allScorersForError ) {
438+ runScoreForTaskException (
439+ tracer ,
440+ evalSpan ,
441+ braintrustParent ,
442+ braintrustGeneration ,
443+ scorer ,
444+ e ,
445+ datasetCase ,
446+ scoresByName );
447+ }
448+ return ;
449+ }
412450 // Send progress event for task completion
413451 sendProgressEvent (
414452 os ,
@@ -437,37 +475,18 @@ private void handleStreamingEval(
437475 // run scorers - one score span per scorer
438476 // Combine local scorers from RemoteEval with remote scorers
439477 // from request
440- List <Scorer <?, ? >> allScorers =
478+ List <Scorer <I , O >> allScorers =
441479 new ArrayList <>(eval .getScorers ());
442480 allScorers .addAll (remoteScorers );
443481 for (var scorer : allScorers ) {
444- var scoreSpan = tracer .spanBuilder ("score" ).startSpan ();
445- try (var unused =
446- Context .current ()
447- .with (scoreSpan )
448- .makeCurrent ()) {
449- List <Score > scores = scorer .score (taskResult );
450-
451- Map <String , Double > scorerScores =
452- new LinkedHashMap <>();
453- for (Score score : scores ) {
454- scoresByName
455- .computeIfAbsent (
456- score .name (),
457- k -> new ArrayList <>())
458- .add (score .value ());
459- scorerScores .put (score .name (), score .value ());
460- }
461- // Set score span attributes before ending span
462- setScoreSpanAttributes (
463- scoreSpan ,
464- braintrustParent ,
465- braintrustGeneration ,
466- scorer .getName (),
467- scorerScores );
468- } finally {
469- scoreSpan .end ();
470- }
482+ runScorer (
483+ tracer ,
484+ evalSpan ,
485+ braintrustParent ,
486+ braintrustGeneration ,
487+ scorer ,
488+ taskResult ,
489+ scoresByName );
471490 }
472491 } catch (IOException e ) {
473492 throw new RuntimeException (
@@ -599,6 +618,91 @@ private void setScoreSpanAttributes(
599618 .setAttribute ("braintrust.scores" , scoresJson );
600619 }
601620
621+ /**
622+ * Runs a scorer against a successful task result. If the scorer throws, falls back to {@link
623+ * Scorer#scoreForScorerException}.
624+ */
625+ private <I , O > void runScorer (
626+ Tracer tracer ,
627+ Span evalSpan ,
628+ BraintrustUtils .Parent braintrustParent ,
629+ String braintrustGeneration ,
630+ Scorer <I , O > scorer ,
631+ TaskResult <I , O > taskResult ,
632+ Map <String , List <Double >> scoresByName ) {
633+ var scoreSpan = tracer .spanBuilder ("score" ).startSpan ();
634+ try (var unused = Context .current ().with (scoreSpan ).makeCurrent ()) {
635+ List <Score > scores ;
636+ try {
637+ scores = scorer .score (taskResult );
638+ } catch (Exception e ) {
639+ scoreSpan .setStatus (StatusCode .ERROR , e .getMessage ());
640+ scoreSpan .recordException (e );
641+ log .debug ("Scorer '{}' threw exception" , scorer .getName (), e );
642+ // fall back to scoreForScorerException — if this throws, eval aborts
643+ scores = scorer .scoreForScorerException (e , taskResult );
644+ }
645+ recordScores (
646+ scoreSpan ,
647+ braintrustParent ,
648+ braintrustGeneration ,
649+ scorer ,
650+ scores ,
651+ scoresByName );
652+ } finally {
653+ scoreSpan .end ();
654+ }
655+ }
656+
657+ /**
658+ * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the
659+ * eval aborts.
660+ */
661+ private <I , O > void runScoreForTaskException (
662+ Tracer tracer ,
663+ Span evalSpan ,
664+ BraintrustUtils .Parent braintrustParent ,
665+ String braintrustGeneration ,
666+ Scorer <I , O > scorer ,
667+ Exception taskException ,
668+ DatasetCase <I , O > datasetCase ,
669+ Map <String , List <Double >> scoresByName ) {
670+ var scoreSpan = tracer .spanBuilder ("score" ).startSpan ();
671+ try (var unused = Context .current ().with (scoreSpan ).makeCurrent ()) {
672+ // if this throws, it propagates and the eval aborts
673+ var scores = scorer .scoreForTaskException (taskException , datasetCase );
674+ recordScores (
675+ scoreSpan ,
676+ braintrustParent ,
677+ braintrustGeneration ,
678+ scorer ,
679+ scores ,
680+ scoresByName );
681+ } finally {
682+ scoreSpan .end ();
683+ }
684+ }
685+
686+ /** Records scores on the score span and accumulates them into scoresByName. */
687+ private void recordScores (
688+ Span scoreSpan ,
689+ BraintrustUtils .Parent braintrustParent ,
690+ String braintrustGeneration ,
691+ Scorer <?, ?> scorer ,
692+ List <Score > scores ,
693+ Map <String , List <Double >> scoresByName ) {
694+ if (scores == null || scores .isEmpty ()) {
695+ return ;
696+ }
697+ Map <String , Double > scorerScores = new LinkedHashMap <>();
698+ for (Score score : scores ) {
699+ scoresByName .computeIfAbsent (score .name (), k -> new ArrayList <>()).add (score .value ());
700+ scorerScores .put (score .name (), score .value ());
701+ }
702+ setScoreSpanAttributes (
703+ scoreSpan , braintrustParent , braintrustGeneration , scorer .getName (), scorerScores );
704+ }
705+
602706 private void sendSSEEvent (OutputStream os , String eventType , String data ) throws IOException {
603707 String event = "event: " + eventType + "\n " + "data: " + data + "\n \n " ;
604708 synchronized (this ) {
0 commit comments