chore(mollifier): fix misleading rate-counter comment + symmetric evaluator fail-open

d-cs · d-cs · commit a9400b178227 · 2026-05-14T15:07:41.000+01:00
The TripDecision header comment claimed each webapp instance maintained
its own rate counter — wrong. evaluateTrip writes to mollifier:rate:${envId}
with no per-instance prefix, so all replicas pointing at the same Redis
share the key. The threshold is the fleet-wide ceiling.

Also wrap d.evaluator() in evaluateGate in try/catch so a throwing
evaluator falls back to no-divert. The default createRealTripEvaluator
catches its own errors, but the contract should be symmetric with the
already-wrapped resolveOrgFlag call so a future evaluator can't break
the trigger hot path's fail-open contract.
diff --git a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts
@@ -10,15 +10,16 @@ import {
   type DecisionReason,
 } from "./mollifierTelemetry.server";
 
-// `count` is the *single-instance* fixed-window counter (INCR with a PEXPIRE
-// armed on the first tick of each window — see `mollifierEvaluateTrip` in
-// `packages/redis-worker/src/mollifier/buffer.ts`). It is not a fleet-wide
-// aggregate: each webapp instance maintains its own Redis key, so the fleet
-// effective ceiling is `instance_count * threshold`, and at a window boundary
-// the instance can briefly admit up to ~2x threshold before tripping. The
-// tripped marker is refreshed on every overage call, so a sustained burst
-// holds the divert state until the rate falls below threshold within a
-// window. Phase 2 consumers must not treat `count` as a global rate.
+// `count` is the fleet-wide fixed-window counter for the env (INCR with a
+// PEXPIRE armed on the first tick of each window — see
+// `mollifierEvaluateTrip` in `packages/redis-worker/src/mollifier/buffer.ts`).
+// All webapp replicas pointing at the same Redis share the key
+// `mollifier:rate:${envId}`, so the threshold is the fleet-wide ceiling
+// rather than a per-instance one. At a window boundary an env can briefly
+// admit up to ~2x threshold across the fleet before tripping (fixed-window,
+// not sliding-window). The tripped marker is refreshed on every overage
+// call, so a sustained burst holds the divert state until the rate falls
+// below threshold within a window.
 export type TripDecision =
   | { divert: false }
   | {
@@ -165,7 +166,23 @@ export async function evaluateGate(
     return { action: "pass_through" };
   }
 
-  const decision = await d.evaluator(inputs);
+  // Fail open on evaluator errors too. The default `createRealTripEvaluator`
+  // catches its own errors and returns `{ divert: false }`, but injected or
+  // future evaluators may not — keep the contract symmetric with the org
+  // flag resolution above so the trigger hot path can never be broken by a
+  // gate-internal failure.
+  let decision: TripDecision;
+  try {
+    decision = await d.evaluator(inputs);
+  } catch (error) {
+    logger.warn("mollifier.evaluator_failed", {
+      envId: inputs.envId,
+      orgId: inputs.orgId,
+      taskId: inputs.taskId,
+      error: error instanceof Error ? error.message : String(error),
+    });
+    decision = { divert: false };
+  }
   if (!decision.divert) {
     d.recordDecision("pass_through");
     return { action: "pass_through" };
diff --git a/apps/webapp/test/mollifierGate.test.ts b/apps/webapp/test/mollifierGate.test.ts
@@ -254,6 +254,43 @@ describe("resolveMollifierFlag — hot path", () => {
   });
 });
 
+describe("evaluateGate — fail open on evaluator error", () => {
+  it("treats a throwing evaluator as no-divert (pass_through), and never blocks the trigger", async () => {
+    const spies: Spies = {
+      evaluatorCalls: 0,
+      logShadowCalls: [],
+      logMollifiedCalls: [],
+      recordDecisionCalls: [],
+    };
+    const deps: Partial<GateDependencies> = {
+      isMollifierEnabled: () => true,
+      isShadowModeOn: () => false,
+      resolveOrgFlag: async () => true,
+      evaluator: async () => {
+        spies.evaluatorCalls += 1;
+        throw new Error("simulated evaluator failure");
+      },
+      logShadow: (inputs, decision) => {
+        spies.logShadowCalls.push({ inputs, decision });
+      },
+      logMollified: (inputs, decision) => {
+        spies.logMollifiedCalls.push({ inputs, decision });
+      },
+      recordDecision: (outcome, reason) => {
+        spies.recordDecisionCalls.push({ outcome, reason });
+      },
+    };
+
+    const outcome = await evaluateGate(inputs, deps);
+
+    expect(outcome.action).toBe("pass_through");
+    expect(spies.evaluatorCalls).toBe(1);
+    expect(spies.logMollifiedCalls).toHaveLength(0);
+    expect(spies.logShadowCalls).toHaveLength(0);
+    expect(spies.recordDecisionCalls).toEqual([{ outcome: "pass_through", reason: undefined }]);
+  });
+});
+
 describe("evaluateGate — fail open on resolveOrgFlag error", () => {
   it("treats org flag as false when resolveOrgFlag throws, and does not block triggers", async () => {
     const spies: Spies = {