Skip to content

Commit a9400b1

Browse files
committed
chore(mollifier): fix misleading rate-counter comment + symmetric evaluator fail-open
The TripDecision header comment claimed each webapp instance maintained its own rate counter — wrong. evaluateTrip writes to mollifier:rate:\${envId} with no per-instance prefix, so all replicas pointing at the same Redis share the key. The threshold is the fleet-wide ceiling. Also wrap d.evaluator() in evaluateGate in try/catch so a throwing evaluator falls back to no-divert. The default createRealTripEvaluator catches its own errors, but the contract should be symmetric with the already-wrapped resolveOrgFlag call so a future evaluator can't break the trigger hot path's fail-open contract.
1 parent 5b66401 commit a9400b1

2 files changed

Lines changed: 64 additions & 10 deletions

File tree

apps/webapp/app/v3/mollifier/mollifierGate.server.ts

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,16 @@ import {
1010
type DecisionReason,
1111
} from "./mollifierTelemetry.server";
1212

13-
// `count` is the *single-instance* fixed-window counter (INCR with a PEXPIRE
14-
// armed on the first tick of each window — see `mollifierEvaluateTrip` in
15-
// `packages/redis-worker/src/mollifier/buffer.ts`). It is not a fleet-wide
16-
// aggregate: each webapp instance maintains its own Redis key, so the fleet
17-
// effective ceiling is `instance_count * threshold`, and at a window boundary
18-
// the instance can briefly admit up to ~2x threshold before tripping. The
19-
// tripped marker is refreshed on every overage call, so a sustained burst
20-
// holds the divert state until the rate falls below threshold within a
21-
// window. Phase 2 consumers must not treat `count` as a global rate.
13+
// `count` is the fleet-wide fixed-window counter for the env (INCR with a
14+
// PEXPIRE armed on the first tick of each window — see
15+
// `mollifierEvaluateTrip` in `packages/redis-worker/src/mollifier/buffer.ts`).
16+
// All webapp replicas pointing at the same Redis share the key
17+
// `mollifier:rate:${envId}`, so the threshold is the fleet-wide ceiling
18+
// rather than a per-instance one. At a window boundary an env can briefly
19+
// admit up to ~2x threshold across the fleet before tripping (fixed-window
20+
// not sliding-window). The tripped marker is refreshed on every overage
21+
// call, so a sustained burst holds the divert state until the rate falls
22+
// below threshold within a window.
2223
export type TripDecision =
2324
| { divert: false }
2425
| {
@@ -165,7 +166,23 @@ export async function evaluateGate(
165166
return { action: "pass_through" };
166167
}
167168

168-
const decision = await d.evaluator(inputs);
169+
// Fail open on evaluator errors too. The default `createRealTripEvaluator`
170+
// catches its own errors and returns `{ divert: false }`, but injected or
171+
// future evaluators may not — keep the contract symmetric with the org
172+
// flag resolution above so the trigger hot path can never be broken by a
173+
// gate-internal failure.
174+
let decision: TripDecision;
175+
try {
176+
decision = await d.evaluator(inputs);
177+
} catch (error) {
178+
logger.warn("mollifier.evaluator_failed", {
179+
envId: inputs.envId,
180+
orgId: inputs.orgId,
181+
taskId: inputs.taskId,
182+
error: error instanceof Error ? error.message : String(error),
183+
});
184+
decision = { divert: false };
185+
}
169186
if (!decision.divert) {
170187
d.recordDecision("pass_through");
171188
return { action: "pass_through" };

apps/webapp/test/mollifierGate.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,43 @@ describe("resolveMollifierFlag — hot path", () => {
254254
});
255255
});
256256

257+
describe("evaluateGate — fail open on evaluator error", () => {
258+
it("treats a throwing evaluator as no-divert (pass_through), and never blocks the trigger", async () => {
259+
const spies: Spies = {
260+
evaluatorCalls: 0,
261+
logShadowCalls: [],
262+
logMollifiedCalls: [],
263+
recordDecisionCalls: [],
264+
};
265+
const deps: Partial<GateDependencies> = {
266+
isMollifierEnabled: () => true,
267+
isShadowModeOn: () => false,
268+
resolveOrgFlag: async () => true,
269+
evaluator: async () => {
270+
spies.evaluatorCalls += 1;
271+
throw new Error("simulated evaluator failure");
272+
},
273+
logShadow: (inputs, decision) => {
274+
spies.logShadowCalls.push({ inputs, decision });
275+
},
276+
logMollified: (inputs, decision) => {
277+
spies.logMollifiedCalls.push({ inputs, decision });
278+
},
279+
recordDecision: (outcome, reason) => {
280+
spies.recordDecisionCalls.push({ outcome, reason });
281+
},
282+
};
283+
284+
const outcome = await evaluateGate(inputs, deps);
285+
286+
expect(outcome.action).toBe("pass_through");
287+
expect(spies.evaluatorCalls).toBe(1);
288+
expect(spies.logMollifiedCalls).toHaveLength(0);
289+
expect(spies.logShadowCalls).toHaveLength(0);
290+
expect(spies.recordDecisionCalls).toEqual([{ outcome: "pass_through", reason: undefined }]);
291+
});
292+
});
293+
257294
describe("evaluateGate — fail open on resolveOrgFlag error", () => {
258295
it("treats org flag as false when resolveOrgFlag throws, and does not block triggers", async () => {
259296
const spies: Spies = {

0 commit comments

Comments
 (0)