Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions packages/evals/framework/braintrust.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
let braintrustPromise: Promise<typeof import("braintrust")> | undefined;

/**
 * Reports whether a Braintrust API key is configured in the environment.
 *
 * @returns `true` when `process.env.BRAINTRUST_API_KEY` holds a non-empty
 * string, `false` when the variable is unset or empty.
 */
export function hasBraintrustApiKey(): boolean {
  const apiKey = process.env.BRAINTRUST_API_KEY;
  // Environment values are either strings or undefined, so "configured"
  // reduces to "present and not the empty string".
  return apiKey !== undefined && apiKey !== "";
}

export function loadBraintrust(): Promise<typeof import("braintrust")> {
braintrustPromise ??= import("braintrust");
return braintrustPromise;
Expand All @@ -9,6 +13,9 @@ export async function tracedSpan<T>(
fn: () => Promise<T>,
options: { name: string },
): Promise<T> {
if (!hasBraintrustApiKey()) {
return fn();
}
const { traced } = await loadBraintrust();
return traced(fn, options);
}
8 changes: 6 additions & 2 deletions packages/evals/framework/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import type { Testcase, EvalInput } from "../types/evals.js";
import { generateBenchTestcases } from "./benchPlanner.js";
import { DEFAULT_BENCH_HARNESS, type Harness } from "./benchTypes.js";
import { executeBenchTask } from "./benchRunner.js";
import { loadBraintrust, tracedSpan } from "./braintrust.js";
import { hasBraintrustApiKey, loadBraintrust, tracedSpan } from "./braintrust.js";
import { onceAsync, registerActiveRunCleanup } from "./activeRunCleanup.js";
import { loadTaskModuleFromPath } from "./taskLoader.js";

Expand Down Expand Up @@ -352,6 +352,7 @@ export async function runEvals(
: [exactMatch, errorMatch];

const { Eval, flush } = await loadBraintrust();
const sendLogs = hasBraintrustApiKey();

// Aggressive abort: when the caller flips signal.reason to "aggressive",
// close every active session so any in-flight task throws on its next
Expand Down Expand Up @@ -440,10 +441,13 @@ export async function runEvals(
{
progress: silentBraintrustProgress,
reporter: silentBraintrustReporter,
...(sendLogs ? {} : { noSendLogs: true }),
},
);

await flush();
if (sendLogs) {
await flush();
}

const summaryResults = evalResult.results.map((result) => {
const output =
Expand Down
20 changes: 11 additions & 9 deletions packages/evals/lib/AISdkClientWrapped.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ import {
toJsonSchema,
} from "@browserbasehq/stagehand";

let wrappedAiPromise:
| Promise<ReturnType<(typeof import("braintrust"))["wrapAISDK"]>>
| undefined;
type WrappedAI = ReturnType<(typeof import("braintrust"))["wrapAISDK"]>;

async function loadWrappedAISDK(): Promise<
ReturnType<(typeof import("braintrust"))["wrapAISDK"]>
> {
wrappedAiPromise ??= import("braintrust").then(({ wrapAISDK }) =>
wrapAISDK(ai),
);
let wrappedAiPromise: Promise<WrappedAI> | undefined;

/**
 * Resolves the AI SDK object used for model calls, computing it on the first
 * call and handing back the same cached promise on every later call.
 *
 * If `BRAINTRUST_API_KEY` is unset or empty when the first call happens, the
 * plain `ai` module is returned (cast to the wrapped type) and "braintrust"
 * is never imported. Otherwise `ai` is passed through Braintrust's
 * `wrapAISDK`. Because the promise is cached, the key is only consulted once.
 */
async function loadWrappedAISDK(): Promise<WrappedAI> {
  if (wrappedAiPromise === undefined) {
    const resolveSdk = async (): Promise<WrappedAI> => {
      const keyConfigured = Boolean(process.env.BRAINTRUST_API_KEY);
      if (keyConfigured) {
        const { wrapAISDK } = await import("braintrust");
        return wrapAISDK(ai);
      }
      // No key: hand back the unwrapped SDK under the wrapped type so callers
      // keep a single code path.
      return ai as unknown as WrappedAI;
    };
    wrappedAiPromise = resolveSdk();
  }
  return wrappedAiPromise;
}

Expand Down
65 changes: 65 additions & 0 deletions packages/evals/tests/framework/braintrust-optional.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import {
hasBraintrustApiKey,
tracedSpan,
} from "../../framework/braintrust.js";

/**
* Verify that braintrust.ts helpers and AISdkClientWrapped work without
* BRAINTRUST_API_KEY. The runner-level tests live in a separate file
* (braintrust-runner-nolog.test.ts) because they need to vi.mock the
* braintrust module entirely.
*/

// Exercises the helpers exported from framework/braintrust.ts with the API
// key removed from the environment before every test.
describe("braintrust.ts helpers without BRAINTRUST_API_KEY", () => {
  // Captured once when the suite is collected so whatever key the process
  // started with is put back after each test.
  const savedKey = process.env.BRAINTRUST_API_KEY;

  beforeEach(() => {
    delete process.env.BRAINTRUST_API_KEY;
  });

  afterEach(() => {
    if (savedKey === undefined) {
      delete process.env.BRAINTRUST_API_KEY;
      return;
    }
    process.env.BRAINTRUST_API_KEY = savedKey;
  });

  it("hasBraintrustApiKey returns false when BRAINTRUST_API_KEY is unset", () => {
    const keyPresent = hasBraintrustApiKey();
    expect(keyPresent).toBe(false);
  });

  it("hasBraintrustApiKey returns true when BRAINTRUST_API_KEY is set", () => {
    process.env.BRAINTRUST_API_KEY = "test-key-123";
    const keyPresent = hasBraintrustApiKey();
    expect(keyPresent).toBe(true);
  });

  it("tracedSpan calls fn directly without Braintrust when key is absent", async () => {
    const work = vi.fn(async () => 42);
    const spanOptions = { name: "test-span" };

    const value = await tracedSpan(work, spanOptions);

    // The callback ran exactly once and its result was passed through.
    expect(work).toHaveBeenCalledOnce();
    expect(value).toBe(42);
  });
});

// Smoke test: the wrapped AI SDK client module can be imported, and exposes
// its class, while no Braintrust API key is present in the environment.
describe("AISdkClientWrapped without BRAINTRUST_API_KEY", () => {
  // Captured once when the suite is collected so the starting value can be
  // restored after each test.
  const savedKey = process.env.BRAINTRUST_API_KEY;

  beforeEach(() => {
    delete process.env.BRAINTRUST_API_KEY;
  });

  afterEach(() => {
    if (savedKey === undefined) {
      delete process.env.BRAINTRUST_API_KEY;
      return;
    }
    process.env.BRAINTRUST_API_KEY = savedKey;
  });

  it("AISdkClientWrapped class is importable without BRAINTRUST_API_KEY", async () => {
    const { AISdkClientWrapped } = await import(
      "../../lib/AISdkClientWrapped.js"
    );
    expect(AISdkClientWrapped).toBeDefined();
  });
});
112 changes: 112 additions & 0 deletions packages/evals/tests/framework/braintrust-runner-nolog.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";

/**
* Verify that the runner passes noSendLogs to Braintrust Eval and skips
* flush() when BRAINTRUST_API_KEY is absent. This file lives separately
* because it needs to vi.mock the braintrust module at the top level.
*/

// Replace the "playwright" module with a minimal stub that only exposes an
// empty `chromium` object.
// NOTE(review): presumably this keeps importing runner.js from loading real
// browser bindings during the test — confirm against runner.js imports.
vi.mock("playwright", () => ({
chromium: {},
}));

// Stand-in for Braintrust's `Eval`. It always resolves to an empty result
// set. The rest parameter is declared (and ignored) so that vitest records
// the call arguments under a type that permits positional reads such as
// `mockEval.mock.calls[0][2]`; a zero-parameter implementation would type
// every recorded call as the empty tuple `[]`.
const mockEval = vi.fn(async (..._args: unknown[]) => ({
  results: [],
  summary: { experimentName: "test", scores: {} },
}));

// Stand-in for Braintrust's `flush`; tests only inspect whether it was called.
const mockFlush = vi.fn(async () => {});

// Value reported by the mocked `hasBraintrustApiKey`; individual tests flip it.
let mockHasKey = false;

// Replace the framework's Braintrust helper module with test doubles:
//  - hasBraintrustApiKey reads `mockHasKey` on every call, so a test can
//    toggle the reported key state before invoking the runner.
//  - loadBraintrust resolves to an object exposing only the mocked `Eval`
//    and `flush`.
//  - tracedSpan invokes the callback directly and ignores any span options.
vi.mock("../../framework/braintrust.js", () => ({
hasBraintrustApiKey: () => mockHasKey,
loadBraintrust: async () => ({
Eval: mockEval,
flush: mockFlush,
}),
tracedSpan: async <T>(fn: () => Promise<T>) => fn(),
}));

/**
 * Imports the runner and executes a single placeholder bench task for one
 * trial.
 *
 * Both tests below need exactly the same fixture and differ only in the
 * mocked key state, so callers set `mockHasKey` before invoking this helper.
 * The options object is kept inline in the `runEvals` call so it is checked
 * against the runner's parameter type directly.
 */
async function runSingleTaskEval(): Promise<void> {
  const { runEvals } = await import("../../framework/runner.js");

  const task = {
    name: "test-task",
    tier: "bench" as const,
    primaryCategory: "extract",
    categories: ["extract"],
    tags: [],
    filePath: "/fake.ts",
    isLegacy: false,
  };

  await runEvals({
    tasks: [task],
    registry: {
      tasks: [task],
      byName: new Map([[task.name, task]]),
      byTier: new Map([["bench", [task]]]),
      byCategory: new Map([["extract", [task]]]),
    },
    trials: 1,
  });
}

// Checks how the runner drives the mocked Braintrust `Eval` / `flush`
// depending on what the mocked `hasBraintrustApiKey` reports.
describe("runner.ts skips Braintrust logging when API key is absent", () => {
  // Captured once when the suite is collected so the starting value can be
  // restored after each test.
  const originalKey = process.env.BRAINTRUST_API_KEY;

  beforeEach(() => {
    delete process.env.BRAINTRUST_API_KEY;
    mockEval.mockClear();
    mockFlush.mockClear();
    mockHasKey = false;
  });

  afterEach(() => {
    if (originalKey !== undefined) {
      process.env.BRAINTRUST_API_KEY = originalKey;
    } else {
      delete process.env.BRAINTRUST_API_KEY;
    }
  });

  it("passes noSendLogs: true to Eval when BRAINTRUST_API_KEY is unset", async () => {
    mockHasKey = false;

    await runSingleTaskEval();

    expect(mockEval).toHaveBeenCalledTimes(1);
    // The third positional argument to Eval is the options object.
    const evalOptions = mockEval.mock.calls[0][2];
    expect(evalOptions).toHaveProperty("noSendLogs", true);

    // flush should NOT be called
    expect(mockFlush).not.toHaveBeenCalled();
  });

  it("does NOT pass noSendLogs when BRAINTRUST_API_KEY is set", async () => {
    mockHasKey = true;

    await runSingleTaskEval();

    expect(mockEval).toHaveBeenCalledTimes(1);
    // The third positional argument to Eval is the options object.
    const evalOptions = mockEval.mock.calls[0][2];
    expect(evalOptions).not.toHaveProperty("noSendLogs");

    // flush SHOULD be called
    expect(mockFlush).toHaveBeenCalledTimes(1);
  });
});
4 changes: 4 additions & 0 deletions packages/evals/tests/framework/core-runner.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ function makeRegistry(tasks: DiscoveredTask[]): TaskRegistry {
beforeEach(() => {
originalCi = process.env.CI;
delete process.env.CI;
// Set a dummy API key so the runner sends logs and calls flush(),
// which this test asserts.
process.env.BRAINTRUST_API_KEY = "test-key";
tracedNames.length = 0;
evalMock.mockReset();
flushMock.mockClear();
Expand All @@ -79,6 +82,7 @@ afterEach(() => {
} else {
process.env.CI = originalCi;
}
delete process.env.BRAINTRUST_API_KEY;

while (tempDirs.length > 0) {
const dir = tempDirs.pop();
Expand Down
Loading