// Patch overview: make Braintrust logging optional when BRAINTRUST_API_KEY is unset.
//
// packages/evals/framework/braintrust.ts
//   - Adds exported hasBraintrustApiKey(), which returns Boolean(process.env.BRAINTRUST_API_KEY).
//   - tracedSpan() now returns fn() directly when hasBraintrustApiKey() is false, before
//     loadBraintrust() is awaited; otherwise it still delegates to traced(fn, options).
//
// packages/evals/framework/runner.ts
//   - runEvals() computes sendLogs = hasBraintrustApiKey() right after awaiting loadBraintrust().
//   - The options object passed to Eval gains `noSendLogs: true` only when sendLogs is false.
//   - `await flush()` is now guarded by `if (sendLogs)`.
//
// packages/evals/lib/AISdkClientWrapped.ts
//   - Introduces the WrappedAI type alias (ReturnType of braintrust's wrapAISDK).
//   - loadWrappedAISDK() memoizes (via `??=`) an async IIFE that resolves to the plain `ai`
//     module cast to WrappedAI when process.env.BRAINTRUST_API_KEY is falsy, and to
//     wrapAISDK(ai) otherwise.
//
// packages/evals/tests/framework/braintrust-optional.test.ts (new)
//   - Covers hasBraintrustApiKey() with the key unset and set, tracedSpan() invoking fn once
//     and returning its value with the key unset, and that AISdkClientWrapped can be imported.
//
// packages/evals/tests/framework/braintrust-runner-nolog.test.ts (new)
//   - Mocks "playwright" and "../../framework/braintrust.js"; asserts the third Eval argument
//     carries noSendLogs: true and flush is not called when the mocked key check is false, and
//     that noSendLogs is absent and flush is called once when it is true.
//
// packages/evals/tests/framework/core-runner.test.ts
//   - beforeEach sets BRAINTRUST_API_KEY to "test-key"; afterEach deletes it.
//
// NOTE(review): loadWrappedAISDK() reads process.env.BRAINTRUST_API_KEY directly instead of
//   calling hasBraintrustApiKey(), so the two checks can drift. Because the result is cached in
//   wrappedAiPromise, the key is only sampled on the first call — confirm that is intended.
// NOTE(review): in the visible core-runner.test.ts hunks the key is deleted unconditionally in
//   afterEach, whereas both new test files save the original value and restore it. A key that
//   was present in the environment before this suite ran would be lost — consider the same
//   save/restore pattern.
// NOTE(review): the "tracedSpan calls fn directly" test asserts only the call count and return
//   value; it does not assert that the braintrust module was left unloaded.
// NOTE(review): the two tests in braintrust-runner-nolog.test.ts build identical task/registry
//   fixtures inline; a shared helper would remove the duplication.
// NOTE(review): in this copy of the patch the line breaks are collapsed and generic type
//   arguments look stripped (e.g. `Promise | undefined`) — presumably an extraction artifact;
//   verify against the original patch before applying.
diff --git a/packages/evals/framework/braintrust.ts b/packages/evals/framework/braintrust.ts index 803f30d53b..1476fc629a 100644 --- a/packages/evals/framework/braintrust.ts +++ b/packages/evals/framework/braintrust.ts @@ -1,5 +1,9 @@ let braintrustPromise: Promise | undefined; +export function hasBraintrustApiKey(): boolean { + return Boolean(process.env.BRAINTRUST_API_KEY); +} + export function loadBraintrust(): Promise { braintrustPromise ??= import("braintrust"); return braintrustPromise; @@ -9,6 +13,9 @@ export async function tracedSpan( fn: () => Promise, options: { name: string }, ): Promise { + if (!hasBraintrustApiKey()) { + return fn(); + } const { traced } = await loadBraintrust(); return traced(fn, options); } diff --git a/packages/evals/framework/runner.ts b/packages/evals/framework/runner.ts index 336db1c029..2c712ecc22 100644 --- a/packages/evals/framework/runner.ts +++ b/packages/evals/framework/runner.ts @@ -22,7 +22,7 @@ import type { Testcase, EvalInput } from "../types/evals.js"; import { generateBenchTestcases } from "./benchPlanner.js"; import { DEFAULT_BENCH_HARNESS, type Harness } from "./benchTypes.js"; import { executeBenchTask } from "./benchRunner.js"; -import { loadBraintrust, tracedSpan } from "./braintrust.js"; +import { hasBraintrustApiKey, loadBraintrust, tracedSpan } from "./braintrust.js"; import { onceAsync, registerActiveRunCleanup } from "./activeRunCleanup.js"; import { loadTaskModuleFromPath } from "./taskLoader.js"; @@ -352,6 +352,7 @@ export async function runEvals( : [exactMatch, errorMatch]; const { Eval, flush } = await loadBraintrust(); + const sendLogs = hasBraintrustApiKey(); // Aggressive abort: when the caller flips signal.reason to "aggressive", // close every active session so any in-flight task throws on its next @@ -440,10 +441,13 @@ export async function runEvals( { progress: silentBraintrustProgress, reporter: silentBraintrustReporter, + ...(sendLogs ? 
{} : { noSendLogs: true }), }, ); - await flush(); + if (sendLogs) { + await flush(); + } const summaryResults = evalResult.results.map((result) => { const output = diff --git a/packages/evals/lib/AISdkClientWrapped.ts b/packages/evals/lib/AISdkClientWrapped.ts index 213aaa0f34..1d4ab7df5f 100644 --- a/packages/evals/lib/AISdkClientWrapped.ts +++ b/packages/evals/lib/AISdkClientWrapped.ts @@ -21,16 +21,18 @@ import { toJsonSchema, } from "@browserbasehq/stagehand"; -let wrappedAiPromise: - | Promise> - | undefined; +type WrappedAI = ReturnType<(typeof import("braintrust"))["wrapAISDK"]>; -async function loadWrappedAISDK(): Promise< - ReturnType<(typeof import("braintrust"))["wrapAISDK"]> -> { - wrappedAiPromise ??= import("braintrust").then(({ wrapAISDK }) => - wrapAISDK(ai), - ); +let wrappedAiPromise: Promise | undefined; + +async function loadWrappedAISDK(): Promise { + wrappedAiPromise ??= (async () => { + if (!process.env.BRAINTRUST_API_KEY) { + return ai as unknown as WrappedAI; + } + const { wrapAISDK } = await import("braintrust"); + return wrapAISDK(ai); + })(); return wrappedAiPromise; } diff --git a/packages/evals/tests/framework/braintrust-optional.test.ts b/packages/evals/tests/framework/braintrust-optional.test.ts new file mode 100644 index 0000000000..97965fd9e8 --- /dev/null +++ b/packages/evals/tests/framework/braintrust-optional.test.ts @@ -0,0 +1,65 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { + hasBraintrustApiKey, + tracedSpan, +} from "../../framework/braintrust.js"; + +/** + * Verify that braintrust.ts helpers and AISdkClientWrapped work without + * BRAINTRUST_API_KEY. The runner-level tests live in a separate file + * (braintrust-runner-nolog.test.ts) because they need to vi.mock the + * braintrust module entirely. 
+ */ + +describe("braintrust.ts helpers without BRAINTRUST_API_KEY", () => { + const originalKey = process.env.BRAINTRUST_API_KEY; + + beforeEach(() => { + delete process.env.BRAINTRUST_API_KEY; + }); + + afterEach(() => { + if (originalKey !== undefined) { + process.env.BRAINTRUST_API_KEY = originalKey; + } else { + delete process.env.BRAINTRUST_API_KEY; + } + }); + + it("hasBraintrustApiKey returns false when BRAINTRUST_API_KEY is unset", () => { + expect(hasBraintrustApiKey()).toBe(false); + }); + + it("hasBraintrustApiKey returns true when BRAINTRUST_API_KEY is set", () => { + process.env.BRAINTRUST_API_KEY = "test-key-123"; + expect(hasBraintrustApiKey()).toBe(true); + }); + + it("tracedSpan calls fn directly without Braintrust when key is absent", async () => { + const fn = vi.fn(async () => 42); + const result = await tracedSpan(fn, { name: "test-span" }); + expect(fn).toHaveBeenCalledOnce(); + expect(result).toBe(42); + }); +}); + +describe("AISdkClientWrapped without BRAINTRUST_API_KEY", () => { + const originalKey = process.env.BRAINTRUST_API_KEY; + + beforeEach(() => { + delete process.env.BRAINTRUST_API_KEY; + }); + + afterEach(() => { + if (originalKey !== undefined) { + process.env.BRAINTRUST_API_KEY = originalKey; + } else { + delete process.env.BRAINTRUST_API_KEY; + } + }); + + it("AISdkClientWrapped class is importable without BRAINTRUST_API_KEY", async () => { + const mod = await import("../../lib/AISdkClientWrapped.js"); + expect(mod.AISdkClientWrapped).toBeDefined(); + }); +}); diff --git a/packages/evals/tests/framework/braintrust-runner-nolog.test.ts b/packages/evals/tests/framework/braintrust-runner-nolog.test.ts new file mode 100644 index 0000000000..2978808b14 --- /dev/null +++ b/packages/evals/tests/framework/braintrust-runner-nolog.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; + +/** + * Verify that the runner passes noSendLogs to Braintrust Eval and skips + * flush() when 
BRAINTRUST_API_KEY is absent. This file lives separately + * because it needs to vi.mock the braintrust module at the top level. + */ + +vi.mock("playwright", () => ({ + chromium: {}, +})); + +const mockEval = vi.fn(async () => ({ + results: [], + summary: { experimentName: "test", scores: {} }, +})); +const mockFlush = vi.fn(async () => {}); +let mockHasKey = false; + +vi.mock("../../framework/braintrust.js", () => ({ + hasBraintrustApiKey: () => mockHasKey, + loadBraintrust: async () => ({ + Eval: mockEval, + flush: mockFlush, + }), + tracedSpan: async (fn: () => Promise) => fn(), +})); + +describe("runner.ts skips Braintrust logging when API key is absent", () => { + const originalKey = process.env.BRAINTRUST_API_KEY; + + beforeEach(() => { + delete process.env.BRAINTRUST_API_KEY; + mockEval.mockClear(); + mockFlush.mockClear(); + mockHasKey = false; + }); + + afterEach(() => { + if (originalKey !== undefined) { + process.env.BRAINTRUST_API_KEY = originalKey; + } else { + delete process.env.BRAINTRUST_API_KEY; + } + }); + + it("passes noSendLogs: true to Eval when BRAINTRUST_API_KEY is unset", async () => { + mockHasKey = false; + const { runEvals } = await import("../../framework/runner.js"); + + const task = { + name: "test-task", + tier: "bench" as const, + primaryCategory: "extract", + categories: ["extract"], + tags: [], + filePath: "/fake.ts", + isLegacy: false, + }; + + await runEvals({ + tasks: [task], + registry: { + tasks: [task], + byName: new Map([[task.name, task]]), + byTier: new Map([["bench", [task]]]), + byCategory: new Map([["extract", [task]]]), + }, + trials: 1, + }); + + expect(mockEval).toHaveBeenCalledTimes(1); + const evalOptions = mockEval.mock.calls[0][2]; + expect(evalOptions).toHaveProperty("noSendLogs", true); + + // flush should NOT be called + expect(mockFlush).not.toHaveBeenCalled(); + }); + + it("does NOT pass noSendLogs when BRAINTRUST_API_KEY is set", async () => { + mockHasKey = true; + const { runEvals } = await 
import("../../framework/runner.js"); + + const task = { + name: "test-task", + tier: "bench" as const, + primaryCategory: "extract", + categories: ["extract"], + tags: [], + filePath: "/fake.ts", + isLegacy: false, + }; + + await runEvals({ + tasks: [task], + registry: { + tasks: [task], + byName: new Map([[task.name, task]]), + byTier: new Map([["bench", [task]]]), + byCategory: new Map([["extract", [task]]]), + }, + trials: 1, + }); + + expect(mockEval).toHaveBeenCalledTimes(1); + const evalOptions = mockEval.mock.calls[0][2]; + expect(evalOptions).not.toHaveProperty("noSendLogs"); + + // flush SHOULD be called + expect(mockFlush).toHaveBeenCalledTimes(1); + }); +}); diff --git a/packages/evals/tests/framework/core-runner.test.ts b/packages/evals/tests/framework/core-runner.test.ts index 6d2879fb4f..6aa154af47 100644 --- a/packages/evals/tests/framework/core-runner.test.ts +++ b/packages/evals/tests/framework/core-runner.test.ts @@ -65,6 +65,9 @@ function makeRegistry(tasks: DiscoveredTask[]): TaskRegistry { beforeEach(() => { originalCi = process.env.CI; delete process.env.CI; + // Set a dummy API key so the runner sends logs and calls flush(), + // which this test asserts. + process.env.BRAINTRUST_API_KEY = "test-key"; tracedNames.length = 0; evalMock.mockReset(); flushMock.mockClear(); @@ -79,6 +82,7 @@ afterEach(() => { } else { process.env.CI = originalCi; } + delete process.env.BRAINTRUST_API_KEY; while (tempDirs.length > 0) { const dir = tempDirs.pop();