Skip to content

Commit f6ec2fe

Browse files
authored
Merge pull request #41 from braintrustdata/ark/devserver-eval-errors
handle task+scorer errors in devserver
2 parents d6f94ca + 597c55c commit f6ec2fe

6 files changed

Lines changed: 540 additions & 38 deletions

File tree

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525
run: chmod +x gradlew
2626

2727
- name: Run all checks and tests
28-
run: ./gradlew compileTestJava check
28+
run: ./gradlew compileTestJava publishToMavenLocal check
2929

3030
- name: Upload test results
3131
uses: actions/upload-artifact@v4

build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,8 @@ task testJar(type: JavaExec) {
389389
compileJava.dependsOn validateJavaVersion
390390

391391
// Run jar test as part of check task
392+
check.dependsOn javadoc
393+
check.dependsOn delombok
392394
check.dependsOn testJar
393395

394396
// Task to install git hooks

src/main/java/dev/braintrust/devserver/Devserver.java

Lines changed: 137 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import io.opentelemetry.api.common.AttributeKey;
1818
import io.opentelemetry.api.trace.Span;
1919
import io.opentelemetry.api.trace.SpanKind;
20+
import io.opentelemetry.api.trace.StatusCode;
21+
import io.opentelemetry.api.trace.Tracer;
2022
import io.opentelemetry.context.Context;
2123
import java.io.IOException;
2224
import java.io.InputStream;
@@ -332,12 +334,12 @@ private void handleEval(HttpExchange exchange) throws IOException {
332334
}
333335

334336
@SuppressWarnings({"unchecked", "rawtypes"})
335-
private void handleStreamingEval(
337+
private <I, O> void handleStreamingEval(
336338
HttpExchange exchange,
337339
RemoteEval eval,
338340
EvalRequest request,
339341
RequestContext context,
340-
List<Scorer<Object, Object>> remoteScorers)
342+
List<Scorer<I, O>> remoteScorers)
341343
throws Exception {
342344
// Set SSE headers
343345
exchange.getResponseHeaders().set("Content-Type", "text/event-stream");
@@ -383,7 +385,9 @@ private void handleStreamingEval(
383385
// concurrent dataset fetching and eval execution
384386
extractDataset(request, apiClient)
385387
.forEach(
386-
datasetCase -> {
388+
rawDataset -> {
389+
final DatasetCase<I, O> datasetCase =
390+
(DatasetCase<I, O>) rawDataset;
387391
var evalSpan =
388392
tracer.spanBuilder("eval")
389393
.setNoParent()
@@ -400,15 +404,49 @@ private void handleStreamingEval(
400404
braintrustParent.id());
401405
// Make the eval context (with span and baggage) current
402406
try (var rootScope = evalContext.makeCurrent()) {
403-
final dev.braintrust.eval.TaskResult taskResult;
407+
final TaskResult<I, O> taskResult;
404408
{ // run task
405409
var taskSpan = tracer.spanBuilder("task").startSpan();
406410
try (var unused =
407411
Context.current()
408412
.with(taskSpan)
409413
.makeCurrent()) {
410414
var task = eval.getTask();
411-
taskResult = task.apply(datasetCase);
415+
try {
416+
taskResult = task.apply(datasetCase);
417+
} catch (Exception e) {
418+
taskSpan.setStatus(
419+
StatusCode.ERROR, e.getMessage());
420+
taskSpan.recordException(e);
421+
taskSpan.end();
422+
evalSpan.setStatus(
423+
StatusCode.ERROR, e.getMessage());
424+
evalSpan.setAttribute(
425+
"braintrust.output_json",
426+
toJson(
427+
Collections.singletonMap(
428+
"output", null)));
429+
log.debug(
430+
"Task threw exception for input: "
431+
+ datasetCase.input(),
432+
e);
433+
// run scoreForTaskException on each scorer
434+
List<Scorer<I, O>> allScorersForError =
435+
new ArrayList<>(eval.getScorers());
436+
allScorersForError.addAll(remoteScorers);
437+
for (var scorer : allScorersForError) {
438+
runScoreForTaskException(
439+
tracer,
440+
evalSpan,
441+
braintrustParent,
442+
braintrustGeneration,
443+
scorer,
444+
e,
445+
datasetCase,
446+
scoresByName);
447+
}
448+
return;
449+
}
412450
// Send progress event for task completion
413451
sendProgressEvent(
414452
os,
@@ -437,37 +475,18 @@ private void handleStreamingEval(
437475
// run scorers - one score span per scorer
438476
// Combine local scorers from RemoteEval with remote scorers
439477
// from request
440-
List<Scorer<?, ?>> allScorers =
478+
List<Scorer<I, O>> allScorers =
441479
new ArrayList<>(eval.getScorers());
442480
allScorers.addAll(remoteScorers);
443481
for (var scorer : allScorers) {
444-
var scoreSpan = tracer.spanBuilder("score").startSpan();
445-
try (var unused =
446-
Context.current()
447-
.with(scoreSpan)
448-
.makeCurrent()) {
449-
List<Score> scores = scorer.score(taskResult);
450-
451-
Map<String, Double> scorerScores =
452-
new LinkedHashMap<>();
453-
for (Score score : scores) {
454-
scoresByName
455-
.computeIfAbsent(
456-
score.name(),
457-
k -> new ArrayList<>())
458-
.add(score.value());
459-
scorerScores.put(score.name(), score.value());
460-
}
461-
// Set score span attributes before ending span
462-
setScoreSpanAttributes(
463-
scoreSpan,
464-
braintrustParent,
465-
braintrustGeneration,
466-
scorer.getName(),
467-
scorerScores);
468-
} finally {
469-
scoreSpan.end();
470-
}
482+
runScorer(
483+
tracer,
484+
evalSpan,
485+
braintrustParent,
486+
braintrustGeneration,
487+
scorer,
488+
taskResult,
489+
scoresByName);
471490
}
472491
} catch (IOException e) {
473492
throw new RuntimeException(
@@ -599,6 +618,91 @@ private void setScoreSpanAttributes(
599618
.setAttribute("braintrust.scores", scoresJson);
600619
}
601620

621+
/**
622+
* Runs a scorer against a successful task result. If the scorer throws, falls back to {@link
623+
* Scorer#scoreForScorerException}.
624+
*/
625+
private <I, O> void runScorer(
626+
Tracer tracer,
627+
Span evalSpan,
628+
BraintrustUtils.Parent braintrustParent,
629+
String braintrustGeneration,
630+
Scorer<I, O> scorer,
631+
TaskResult<I, O> taskResult,
632+
Map<String, List<Double>> scoresByName) {
633+
var scoreSpan = tracer.spanBuilder("score").startSpan();
634+
try (var unused = Context.current().with(scoreSpan).makeCurrent()) {
635+
List<Score> scores;
636+
try {
637+
scores = scorer.score(taskResult);
638+
} catch (Exception e) {
639+
scoreSpan.setStatus(StatusCode.ERROR, e.getMessage());
640+
scoreSpan.recordException(e);
641+
log.debug("Scorer '{}' threw exception", scorer.getName(), e);
642+
// fall back to scoreForScorerException — if this throws, eval aborts
643+
scores = scorer.scoreForScorerException(e, taskResult);
644+
}
645+
recordScores(
646+
scoreSpan,
647+
braintrustParent,
648+
braintrustGeneration,
649+
scorer,
650+
scores,
651+
scoresByName);
652+
} finally {
653+
scoreSpan.end();
654+
}
655+
}
656+
657+
/**
658+
* Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the
659+
* eval aborts.
660+
*/
661+
private <I, O> void runScoreForTaskException(
662+
Tracer tracer,
663+
Span evalSpan,
664+
BraintrustUtils.Parent braintrustParent,
665+
String braintrustGeneration,
666+
Scorer<I, O> scorer,
667+
Exception taskException,
668+
DatasetCase<I, O> datasetCase,
669+
Map<String, List<Double>> scoresByName) {
670+
var scoreSpan = tracer.spanBuilder("score").startSpan();
671+
try (var unused = Context.current().with(scoreSpan).makeCurrent()) {
672+
// if this throws, it propagates and the eval aborts
673+
var scores = scorer.scoreForTaskException(taskException, datasetCase);
674+
recordScores(
675+
scoreSpan,
676+
braintrustParent,
677+
braintrustGeneration,
678+
scorer,
679+
scores,
680+
scoresByName);
681+
} finally {
682+
scoreSpan.end();
683+
}
684+
}
685+
686+
/** Records scores on the score span and accumulates them into scoresByName. */
687+
private void recordScores(
688+
Span scoreSpan,
689+
BraintrustUtils.Parent braintrustParent,
690+
String braintrustGeneration,
691+
Scorer<?, ?> scorer,
692+
List<Score> scores,
693+
Map<String, List<Double>> scoresByName) {
694+
if (scores == null || scores.isEmpty()) {
695+
return;
696+
}
697+
Map<String, Double> scorerScores = new LinkedHashMap<>();
698+
for (Score score : scores) {
699+
scoresByName.computeIfAbsent(score.name(), k -> new ArrayList<>()).add(score.value());
700+
scorerScores.put(score.name(), score.value());
701+
}
702+
setScoreSpanAttributes(
703+
scoreSpan, braintrustParent, braintrustGeneration, scorer.getName(), scorerScores);
704+
}
705+
602706
private void sendSSEEvent(OutputStream os, String eventType, String data) throws IOException {
603707
String event = "event: " + eventType + "\n" + "data: " + data + "\n\n";
604708
synchronized (this) {

src/main/java/dev/braintrust/eval/Scorer.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public interface Scorer<INPUT, OUTPUT> {
2121
*
2222
* @param taskResult the task output and originating dataset case
2323
* @return one or more scores, each with a value between 0 and 1 inclusive
24-
* @throws Exception if scoring fails, the error will be recorded on the span and {@link
24+
* <p>If this method throws, the error will be recorded on the span and {@link
2525
* #scoreForScorerException} will be called as a fallback
2626
*/
2727
List<Score> score(TaskResult<INPUT, OUTPUT> taskResult);
@@ -33,7 +33,6 @@ public interface Scorer<INPUT, OUTPUT> {
3333
* @param taskException the exception thrown by the task
3434
* @param datasetCase the dataset case that was being evaluated
3535
* @return fallback scores, or an empty list to skip scoring for this case
36-
* @throws Exception if this method throws, the entire eval run is aborted
3736
*/
3837
default List<Score> scoreForTaskException(
3938
Exception taskException, DatasetCase<INPUT, OUTPUT> datasetCase) {
@@ -46,7 +45,6 @@ default List<Score> scoreForTaskException(
4645
* @param scorerException the exception thrown by {@link #score}
4746
* @param taskResult the task result that was being scored
4847
* @return fallback scores, or an empty list to skip scoring for this case
49-
* @throws Exception if this method throws, the entire eval run is aborted
5048
*/
5149
default List<Score> scoreForScorerException(
5250
Exception scorerException, TaskResult<INPUT, OUTPUT> taskResult) {

src/main/java/dev/braintrust/eval/Task.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@ public interface Task<INPUT, OUTPUT> {
1818
* @throws Exception if the task fails, the error will be recorded on the span and scoring will
1919
* fall back to {@link Scorer#scoreForTaskException}
2020
*/
21-
TaskResult<INPUT, OUTPUT> apply(DatasetCase<INPUT, OUTPUT> datasetCase);
21+
TaskResult<INPUT, OUTPUT> apply(DatasetCase<INPUT, OUTPUT> datasetCase) throws Exception;
2222
}

0 commit comments

Comments
 (0)