databricks · jamesbroadhead · May 21, 2026 · May 22, 2026 · May 22, 2026
@@ -184,12 +184,12 @@ describe("useAnalyticsQuery", () => {
     expect(result.current.data).toBeNull();
   });
 
-  test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => {
-    // The SSE error broadcaster forwards an `errorCode` field for
-    // UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook
-    // surfaces both the human `error` text AND the structured
-    // `errorCode` so consumers can branch on the stable identifier
-    // instead of substring-matching the sanitized human message.
+  test("a server error event carrying a structured errorCode + requestId exposes both on the hook return value", async () => {
+    // The SSE error broadcaster forwards `errorCode` (for UI branching)
+    // and `requestId` (for support triage — same id appears in the
+    // server-side logger.error line). The hook surfaces both so
+    // consumers can render "Error ref: <requestId>" alongside the
+    // human message and branch behavior on the stable errorCode.
     const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
 
     const { result } = renderHook(() =>
@@ -202,6 +202,7 @@ describe("useAnalyticsQuery", () => {
         error: "Server is at capacity, please retry",
         code: "UPSTREAM_ERROR",
         errorCode: "INLINE_ARROW_STASH_EXHAUSTED",
+        requestId: "11111111-2222-3333-4444-555555555555",
       }),
     });
 
@@ -210,6 +211,9 @@ describe("useAnalyticsQuery", () => {
     });
     expect(result.current.loading).toBe(false);
     expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED");
+    expect(result.current.requestId).toBe(
+      "11111111-2222-3333-4444-555555555555",
+    );
 
     errorSpy.mockRestore();
   });

@@ -73,6 +73,15 @@ export interface UseAnalyticsQueryResult<T> {
    * matching `error`, which is a free-form sanitized message.
    */
   errorCode: string | null;
+  /**
+   * Server-side correlation id for support triage. When non-null, the
+   * server emitted this id on the error frame and the same id appears
+   * in the backend's `logger.error` line. Surface it in error UIs
+   * ("Error ref: abc-123 — quote when filing a support ticket") so
+   * staff can grep server logs directly instead of asking the user to
+   * reproduce.
+   */
+  requestId: string | null;
 }
 
 /**

@@ -122,6 +122,7 @@ export function useAnalyticsQuery<
   const [loading, setLoading] = useState(false);
   const [error, setError] = useState<string | null>(null);
   const [errorCode, setErrorCode] = useState<string | null>(null);
+  const [requestId, setRequestId] = useState<string | null>(null);
   const abortControllerRef = useRef<AbortController | null>(null);
 
   if (!queryKey || queryKey.trim().length === 0) {
@@ -170,6 +171,7 @@ export function useAnalyticsQuery<
     setLoading(true);
     setError(null);
     setErrorCode(null);
+    setRequestId(null);
     setData(null);
 
     const abortController = new AbortController();
@@ -247,10 +249,16 @@ export function useAnalyticsQuery<
             if (typeof parsed.errorCode === "string") {
               setErrorCode(parsed.errorCode);
             }
+            // requestId is the SSE event id of the error frame — same
+            // value the server logs in its `Stream execution failed`
+            // line, so users can quote it in support tickets.
+            if (typeof parsed.requestId === "string") {
+              setRequestId(parsed.requestId);
+            }
 
             if (parsed.code) {
               console.error(
-                `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}`,
+                `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}, requestId: ${parsed.requestId ?? "n/a"}`,
               );
             }
             return;
@@ -321,5 +329,5 @@ export function useAnalyticsQuery<
   // Enable HMR for query updates in dev mode
   useQueryHMR(queryKey, start);
 
-  return { data, loading, error, errorCode };
+  return { data, loading, error, errorCode, requestId };
 }
@@ -0,0 +1,179 @@
+/**
+ * Allowlist of Databricks SQL Statement Execution API error codes whose
+ * `error.message` is safe to forward to clients verbatim.
+ *
+ * Threat model — the recipient of these messages is an *authenticated
+ * workspace user*, not an anonymous internet visitor. They already know
+ * the workspace URL, their orgId, their session token, the warehouses
+ * they have grants on, and the catalog contents they can see. Names of
+ * resources within the workspace, internal correlation IDs, and class
+ * names from Databricks-internal packages are not new information to
+ * this audience — the classic CWE-209 "stack-trace-on-500-page-leaks-
+ * to-attacker" scenario does not apply.
+ *
+ * Given that, the bar for passthrough is: does the message help the
+ * user (or their support contact) understand and act on the failure?
+ * Defaults skew toward yes because:
+ *
+ * - The cost of denial is *certain* (every error becomes "Query
+ *   execution failed" with no signal; users open tickets, support has
+ *   to reproduce or grep server logs to find the actual error).
+ * - The cost of allowing is *speculative* — depends on whatever the
+ *   most recent wrapped exception happened to put in its `.toString()`.
+ *
+ * Server-side, the route handlers always `logger.error(rawMsg)` the
+ * full upstream text before sanitization — operators retain complete
+ * visibility regardless of what reaches the wire. SSE error frames
+ * also carry a `requestId` (the SSE event id) so users can quote it
+ * in support tickets and staff can grep logs against it directly.
+ *
+ * **Currently denied**: `UNKNOWN` only — by definition unclassified,
+ * so we can't reason about its contents and a default-deny is the
+ * minimum safe stance. Every other `ServiceErrorCode` variant is on
+ * the allowlist below with a documented rationale.
+ *
+ * Source-of-truth construction sites for the classifications below:
+ *
+ * - DBR data-plane (`BAD_REQUEST`, `NOT_FOUND`, `ALREADY_EXISTS`):
+ *   `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`
+ *   populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and
+ *   sets `withErrorReturnedFromDataPlane(true)`.
+ *
+ * - User-facing service messages (`TEMPORARILY_UNAVAILABLE`,
+ *   `WORKSPACE_TEMPORARILY_UNAVAILABLE`, `SERVICE_UNDER_MAINTENANCE`):
+ *   - `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala`
+ *     constructs `SchedulerTemporarilyUnavailableException` with the
+ *     literal "DBSQL temporarily unavailable. Please try again in a
+ *     few minutes."
+ *   - `sqlgateway/scheduler/api/src/main/scala/client/linkstore/SqlGatewayClerk.scala`
+ *     constructs with `internalMessage = "DBSQL temporarily unavailable..."`
+ *   - `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala`
+ *     constructs `WORKSPACE_TEMPORARILY_UNAVAILABLE` from
+ *     `NotServedByShardException.getMessage` — names a shard but no
+ *     credentials / PII / stack traces.
+ *
+ * - User-actionable quota messages (`RESOURCE_EXHAUSTED`):
+ *   `sqlgateway/scheduler/src/main/scala/SchedulerRpcValidatorHook.scala`
+ *   emits stable templates: "The maximum number of warehouses has been
+ *   reached. Please contact Databricks support.", "You've hit the limit
+ *   for warehouses for free usage. Stop or delete existing warehouses
+ *   to free up capacity.", etc.
+ *
+ * - Concurrency conflict (`ABORTED`): short reason strings like
+ *   "version mismatch" (Reyden warehouse monitor). No internal context.
+ *
+ * - `INTERNAL_ERROR` / `IO_ERROR`:
+ *   `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala`
+ *   interpolates `s"SQL ${conf.warehouse} cannot be created due to
+ *   repeated collisions on unique IDs."`;
+ *   `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)`
+ *   interpolates orgId. These identifiers are not sensitive to a
+ *   workspace user (they own / can list both). The wrapped
+ *   `ex.getMessage` content is unbounded but for the typical case is
+ *   either operational (RPC timeout, lock contention) or shape-of-error
+ *   information the user needs to act. Allowlisted with the requestId
+ *   plumbing as the safety net for triage. *Not* allowlisted if a
+ *   future review finds construction sites that interpolate user
+ *   credentials, OAuth tokens, or other secrets — open an issue + flip
+ *   the entry.
+ *
+ * - `UNKNOWN`: unclassified by definition. We have no way to reason
+ *   about its contents — could be anything from "RPC connect failed"
+ *   to a panic dump from an upstream library. Default-deny is the
+ *   only defensible stance until the SDK gives us a finer code.
+ *
+ * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level
+ *   codes with generic message templates.
+ *
+ * SDK type reference (`@databricks/sdk-experimental`,
+ * `apis/sql/model.d.ts`):
+ * ```
+ * type ServiceErrorCode =
+ *   | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED"
+ *   | "DEADLINE_EXCEEDED" | "INTERNAL_ERROR" | "IO_ERROR" | "NOT_FOUND"
+ *   | "RESOURCE_EXHAUSTED" | "SERVICE_UNDER_MAINTENANCE"
+ *   | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN"
+ *   | "WORKSPACE_TEMPORARILY_UNAVAILABLE";
+ * ```
+ *
+ * **Default-deny on unknown codes** — when the SDK ships a new variant
+ * the allowlist hasn't been reviewed against, the new code collapses to
+ * the generic clientMessage until someone audits the construction sites
+ * and updates the Set below.
+ */
+
+/** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */
+const SAFE_PASSTHROUGH_CODES: ReadonlySet<string> = new Set([
+  // DBR-authored: user's own SQL errors. Parse, semantic, type, missing
+  // table/column. Examples: "Table or view 'foo' not found", "Syntax
+  // error at or near ',' on line 3".
+  "BAD_REQUEST",
+  // DBR catalog miss — "Table 'foo' not found", "Schema 'bar' not found".
+  "NOT_FOUND",
+  // DBR catalog conflict — "Table 'foo' already exists".
+  "ALREADY_EXISTS",
+  // Stable template: "Query timed out after Ns". User-actionable.
+  "DEADLINE_EXCEEDED",
+  // "Statement was canceled". User/admin-initiated; no internal state.
+  "CANCELLED",
+  // SDK generic — "Permission denied", "Authentication required".
+  "UNAUTHENTICATED",
+  // Stable user-facing template: "DBSQL temporarily unavailable. Please
+  // try again in a few minutes." Hiding this would force users to
+  // distinguish "warehouse issue" from "user error" by guesswork.
+  "TEMPORARILY_UNAVAILABLE",
+  // Workspace-level service messages from shard-routing failures. The
+  // message may name a shard ID — that's internal topology metadata,
+  // low sensitivity, and operationally useful for the user to know
+  // the workspace is in motion.
+  "WORKSPACE_TEMPORARILY_UNAVAILABLE",
+  // Maintenance windows — generic template messages.
+  "SERVICE_UNDER_MAINTENANCE",
+  // Stable user-actionable templates from the warehouse quota path:
+  // "The maximum number of warehouses has been reached. Please contact
+  // Databricks support.", "Stop or delete existing warehouses to free
+  // up capacity.", etc. Telling users "Query execution failed" when
+  // they've actually hit a quota is unhelpful.
+  "RESOURCE_EXHAUSTED",
+  // Concurrency conflict — short reason strings ("version mismatch",
+  // "concurrent modification"). User-relevant for retry decisions.
+  "ABORTED",
+  // Wrapped internal exceptions. Construction sites interpolate
+  // workspace-internal identifiers (warehouse names, orgIds) and
+  // `ex.getMessage` from arbitrary causes. To an authenticated
+  // workspace user these identifiers are not sensitive — they can
+  // already enumerate them. The wrapped `ex.getMessage` content is
+  // unbounded but in practice is operational ("RPC timed out",
+  // "deadlock detected") or shape-of-failure detail the user needs to
+  // act on. The `requestId` we emit on the SSE error frame is the
+  // safety net for unhappy cases — users can quote it in tickets and
+  // staff can grep logs against it for the full unsanitized message.
+  "INTERNAL_ERROR",
+  // Storage / network failures. Same logic as INTERNAL_ERROR: path /
+  // bucket names visible to a workspace user with catalog access are
+  // not new information, and the wrapped error text usually tells
+  // them whether to retry, switch warehouses, or escalate.
+  "IO_ERROR",
+]);
+
+/**
+ * Explicitly NOT on the allowlist (documented for future-reviewer
+ * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`):
+ *
+ * - `UNKNOWN` — unclassified by definition; could be anything from a
+ *   library panic to an unhandled `case` in the gateway. Default-deny
+ *   is the only defensible stance until the SDK gives us a finer code.
+ */
+
+/**
+ * Returns true when the given upstream `error_code` belongs to the
+ * passthrough allowlist — i.e. the corresponding `error.message` was
+ * authored by DBR for the user's own SQL and is safe to forward.
+ *
+ * Returns false (default-deny) for any unrecognized code, including
+ * codes added to the SDK after this allowlist was last reviewed.
+ */
+export function isSqlErrorPassthrough(errorCode: string | undefined): boolean {
+  if (!errorCode) return false;
+  return SAFE_PASSTHROUGH_CODES.has(errorCode);
+}
@@ -1,4 +1,5 @@
 import { AppKitError } from "./base";
+import { isSqlErrorPassthrough } from "./dbsql-error-allowlist";
 
 /**
  * Error thrown when an operation execution fails.
@@ -50,16 +51,22 @@ export class ExecutionError extends AppKitError {
   /**
    * Create an execution error for statement failure.
    * @param errorMessage Human-readable error from the warehouse / SDK.
-   *   Goes into `.message` for server logs only — *never* echoed to the
-   *   client. Pass `clientMessage` explicitly if a sanitized text should
-   *   reach the UI.
-   * @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to
-   *   preserve through wrapping. Optional. Forwarded on SSE error
-   *   payloads so UI can branch on it instead of substring-matching
-   *   `error`.
-   * @param clientMessage Optional client-safe replacement for `.message`.
-   *   Defaults to "Query execution failed" via the `clientMessage`
-   *   getter. Set this only when the upstream text is known-safe.
+   *   When `errorCode` is on the DBSQL safe-passthrough allowlist (see
+   *   `dbsql-error-allowlist.ts`), this text is forwarded to clients
+   *   verbatim — it's authored by DBR for the user's own SQL and is
+   *   inherently user-facing. For codes NOT on the allowlist (control
+   *   plane errors carrying correlation IDs, internal paths, stack
+   *   traces), `.message` goes into server logs only and the client
+   *   sees a generic "Query execution failed". An explicit
+   *   `clientMessage` argument always wins over both paths.
+   * @param errorCode Structured code (e.g. "BAD_REQUEST",
+   *   "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional.
+   *   Forwarded on SSE error payloads so UI can branch on it instead of
+   *   substring-matching `error`. Also drives the passthrough decision.
+   * @param clientMessage Explicit client-safe message that always wins —
+   *   bypasses the allowlist check. Use when the upstream text is
+   *   known-safe regardless of code (e.g. constructed in-process from a
+   *   trusted template).
    */
   static statementFailed(
     errorMessage?: string,
@@ -69,7 +76,20 @@ export class ExecutionError extends AppKitError {
     const message = errorMessage
       ? `Statement failed: ${errorMessage}`
       : "Statement failed: Unknown error";
-    return new ExecutionError(message, { errorCode, clientMessage });
+
+    // Allowlist-driven passthrough: BAD_REQUEST / NOT_FOUND /
+    // ALREADY_EXISTS / etc. carry DBR-authored messages that *are* the
+    // user's own SQL error ("Table 'foo' not found", "Syntax error
+    // near ',' on line 3"). Hiding these forces users to debug in the
+    // dark; surfacing them is the entire reason an error message
+    // exists. Any code not on the allowlist defaults to generic.
+    const inferredClient =
+      clientMessage ?? (isSqlErrorPassthrough(errorCode) ? message : undefined);
+
+    return new ExecutionError(message, {
+      errorCode,
+      clientMessage: inferredClient,
+    });
   }
 
   /**