From 8d0c6c57fb34c81d0c91fa6b831c13cf4cf0d680 Mon Sep 17 00:00:00 2001
From: James Broadhead <jamesbroadhead@gmail.com>
Date: Thu, 21 May 2026 22:15:17 +0000
Subject: [PATCH 1/3] feat(appkit): allowlist DBSQL error codes whose messages
 are safe to forward
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to the error-sanitization work on #329. The clientMessage
default of "Query execution failed" was applied uniformly to every
ExecutionError.statementFailed call, which hid genuinely user-facing
errors — the user's own SQL "Table 'foo' not found", "Syntax error
near ',' on line 3", etc. — and forced users to debug in the dark.

The fix is *not* to remove sanitization (CWE-209 is real — control
plane messages carry correlation IDs, internal RPC paths, stack traces),
but to distinguish the two sources by `ServiceErrorCode`:

1. **DBR (data plane)** — populates `errorDisplayMessage` for
   user-authored SQL errors and surfaces them via the gateway untouched.
   Source: `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`,
   `ErrorInfo.withMessage(command.errorDisplayMessage).withErrorReturnedFromDataPlane(true)`.
   Codes: BAD_REQUEST, NOT_FOUND, ALREADY_EXISTS, DEADLINE_EXCEEDED,
   CANCELLED, UNAUTHENTICATED.

2. **Control plane** — wraps internal RPC failures, scheduler exceptions,
   capacity rejections. Source:
   `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala`,
   `exceptions.scala`. Codes: INTERNAL_ERROR, IO_ERROR, UNKNOWN,
   RESOURCE_EXHAUSTED, SERVICE_UNDER_MAINTENANCE,
   TEMPORARILY_UNAVAILABLE, WORKSPACE_TEMPORARILY_UNAVAILABLE, ABORTED.

`dbsql-error-allowlist.ts` enumerates source-1 codes. The Set is
default-deny: any code not on the list — including future SDK additions
the allowlist hasn't yet been reviewed against — collapses to the
generic clientMessage. An explicit `clientMessage` argument to
`statementFailed()` always wins.

Server-side `.message` always preserves the raw upstream text for
`logger.error` capture; sanitization is purely at the wire boundary.

Validation: 2,129 appkit tests pass (+31 new); tsc clean; biome clean.

Co-authored-by: Isaac
Signed-off-by: James Broadhead <jamesbroadhead@gmail.com>
---
 .../src/errors/dbsql-error-allowlist.ts       |  97 ++++++++++++
 packages/appkit/src/errors/execution.ts       |  42 +++--
 .../tests/dbsql-error-allowlist.test.ts       |  54 +++++++
 .../appkit/src/errors/tests/errors.test.ts    | 148 ++++++++++++++++++
 4 files changed, 330 insertions(+), 11 deletions(-)
 create mode 100644 packages/appkit/src/errors/dbsql-error-allowlist.ts
 create mode 100644 packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts

diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts
new file mode 100644
index 000000000..080cc781d
--- /dev/null
+++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts
@@ -0,0 +1,97 @@
+/**
+ * Allowlist of Databricks SQL Statement Execution API error codes whose
+ * `error.message` is safe to forward to clients verbatim.
+ *
+ * Background — the warehouse populates `error.message` from two distinct
+ * sources, distinguishable only by `error.error_code`:
+ *
+ * 1. **DBR (data plane)** sets `errorDisplayMessage` for user-authored
+ *    SQL errors (parse, semantic, type, missing-table) and surfaces them
+ *    via the gateway untouched. These messages are inherently user-facing
+ *    — they reference the user's own SQL — and were filtered through
+ *    DBR's Thrift layer with the express purpose of being shown back to
+ *    the user. They are *safe to passthrough*.
+ *
+ *    Source of truth: DBR -> SQL Gateway scheduler
+ *    `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`
+ *    populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and
+ *    sets `withErrorReturnedFromDataPlane(true)` for these.
+ *
+ * 2. **Control plane (proxy / scheduler / metaservice)** sets
+ *    `error.message` from internal RPC failures, scheduler exceptions,
+ *    capacity rejections, etc. These messages can include correlation
+ *    IDs, internal pod / service names, stack traces, and storage paths
+ *    — they are *not safe to passthrough* (CWE-209).
+ *
+ *    Source: `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala`
+ *    and `exceptions.scala` wrap internal failures with
+ *    `CommandExecutionFailed`.
+ *
+ * The mapping below classifies each `ServiceErrorCode` value from
+ * `@databricks/sdk-experimental` (`apis/sql/model.d.ts`) by which source
+ * dominates in practice. When the SDK adds a new code, default it to
+ * `false` here so it stays out of the passthrough until reviewed.
+ *
+ * SDK type reference:
+ * ```
+ * type ServiceErrorCode =
+ *   | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED"
+ *   | "DEADLINE_EXCEEDED" | "INTERNAL_ERROR" | "IO_ERROR" | "NOT_FOUND"
+ *   | "RESOURCE_EXHAUSTED" | "SERVICE_UNDER_MAINTENANCE"
+ *   | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN"
+ *   | "WORKSPACE_TEMPORARILY_UNAVAILABLE";
+ * ```
+ */
+
+/** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */
+const SAFE_PASSTHROUGH_CODES: ReadonlySet<string> = new Set([
+  // User-authored SQL errors flow through DBR's errorDisplayMessage.
+  // Examples: "Table or view 'foo' not found", "Syntax error at or near
+  // ',' on line 3", "Cannot resolve column 'col_x' given input columns".
+  "BAD_REQUEST",
+  // Catalog lookup misses — "Table 'foo' not found", "Schema 'bar' not
+  // found". DBR sets these via the catalog layer; messages reference
+  // the user's own SQL.
+  "NOT_FOUND",
+  // Catalog conflicts — "Table 'foo' already exists". Same provenance.
+  "ALREADY_EXISTS",
+  // Query timeout. Message is typically "Query timed out after Ns" —
+  // user-actionable, no internal state.
+  "DEADLINE_EXCEEDED",
+  // Client / admin cancellation. Message is "Statement was canceled".
+  "CANCELLED",
+  // Authentication failures don't carry internal state; the SDK surfaces
+  // a generic "Permission denied" or "Authentication required" string.
+  "UNAUTHENTICATED",
+]);
+
+/**
+ * Codes that explicitly carry internal control-plane wording and must
+ * NEVER pass through to clients. Listed for documentation; classification
+ * is by absence from `SAFE_PASSTHROUGH_CODES`.
+ *
+ * - `INTERNAL_ERROR` — control-plane stack traces, RPC failures.
+ * - `IO_ERROR` — storage / network paths (may leak bucket names, pod IPs).
+ * - `UNKNOWN` — by definition unclassified; default-deny.
+ * - `RESOURCE_EXHAUSTED` — mixed; control plane sometimes leaks
+ *   internal load data ("scheduler-pod-3 at 92% CPU"). Default-deny.
+ * - `SERVICE_UNDER_MAINTENANCE` / `TEMPORARILY_UNAVAILABLE` /
+ *   `WORKSPACE_TEMPORARILY_UNAVAILABLE` — typically generic, but the
+ *   gateway has been observed to include workspace identifiers and
+ *   scheduler pod names. Default-deny.
+ * - `ABORTED` — server-side abort; can carry transaction / lock context
+ *   identifying internal locking subsystems. Default-deny.
+ */
+
+/**
+ * Returns true when the given upstream `error_code` belongs to the
+ * passthrough allowlist — i.e. the corresponding `error.message` was
+ * authored by DBR for the user's own SQL and is safe to forward.
+ *
+ * Returns false (default-deny) for any unrecognized code, including
+ * codes added to the SDK after this allowlist was last reviewed.
+ */
+export function isSqlErrorPassthrough(errorCode: string | undefined): boolean {
+  if (!errorCode) return false;
+  return SAFE_PASSTHROUGH_CODES.has(errorCode);
+}
diff --git a/packages/appkit/src/errors/execution.ts b/packages/appkit/src/errors/execution.ts
index 3c9df7fe0..3dc2944f3 100644
--- a/packages/appkit/src/errors/execution.ts
+++ b/packages/appkit/src/errors/execution.ts
@@ -1,4 +1,5 @@
 import { AppKitError } from "./base";
+import { isSqlErrorPassthrough } from "./dbsql-error-allowlist";
 
 /**
  * Error thrown when an operation execution fails.
@@ -50,16 +51,22 @@ export class ExecutionError extends AppKitError {
   /**
    * Create an execution error for statement failure.
    * @param errorMessage Human-readable error from the warehouse / SDK.
-   *   Goes into `.message` for server logs only — *never* echoed to the
-   *   client. Pass `clientMessage` explicitly if a sanitized text should
-   *   reach the UI.
-   * @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to
-   *   preserve through wrapping. Optional. Forwarded on SSE error
-   *   payloads so UI can branch on it instead of substring-matching
-   *   `error`.
-   * @param clientMessage Optional client-safe replacement for `.message`.
-   *   Defaults to "Query execution failed" via the `clientMessage`
-   *   getter. Set this only when the upstream text is known-safe.
+   *   When `errorCode` is on the DBSQL safe-passthrough allowlist (see
+   *   `dbsql-error-allowlist.ts`), this text is forwarded to clients
+   *   verbatim — it's authored by DBR for the user's own SQL and is
+   *   inherently user-facing. For codes NOT on the allowlist (control
+   *   plane errors carrying correlation IDs, internal paths, stack
+   *   traces), `.message` goes into server logs only and the client
+   *   sees a generic "Query execution failed". An explicit
+   *   `clientMessage` argument always wins over both paths.
+   * @param errorCode Structured code (e.g. "BAD_REQUEST",
+   *   "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional.
+   *   Forwarded on SSE error payloads so UI can branch on it instead of
+   *   substring-matching `error`. Also drives the passthrough decision.
+   * @param clientMessage Explicit client-safe message that always wins —
+   *   bypasses the allowlist check. Use when the upstream text is
+   *   known-safe regardless of code (e.g. constructed in-process from a
+   *   trusted template).
    */
   static statementFailed(
     errorMessage?: string,
@@ -69,7 +76,20 @@ export class ExecutionError extends AppKitError {
     const message = errorMessage
       ? `Statement failed: ${errorMessage}`
       : "Statement failed: Unknown error";
-    return new ExecutionError(message, { errorCode, clientMessage });
+
+    // Allowlist-driven passthrough: BAD_REQUEST / NOT_FOUND /
+    // ALREADY_EXISTS / etc. carry DBR-authored messages that *are* the
+    // user's own SQL error ("Table 'foo' not found", "Syntax error
+    // near ',' on line 3"). Hiding these forces users to debug in the
+    // dark; surfacing them is the entire reason an error message
+    // exists. Any code not on the allowlist defaults to generic.
+    const inferredClient =
+      clientMessage ?? (isSqlErrorPassthrough(errorCode) ? message : undefined);
+
+    return new ExecutionError(message, {
+      errorCode,
+      clientMessage: inferredClient,
+    });
   }
 
   /**
diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
new file mode 100644
index 000000000..f65856dd2
--- /dev/null
+++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
@@ -0,0 +1,54 @@
+import { describe, expect, test } from "vitest";
+import { isSqlErrorPassthrough } from "../dbsql-error-allowlist";
+
+describe("isSqlErrorPassthrough", () => {
+  // Allowlist members — DBR-authored, user-facing messages.
+  test.each([
+    ["BAD_REQUEST"],
+    ["NOT_FOUND"],
+    ["ALREADY_EXISTS"],
+    ["DEADLINE_EXCEEDED"],
+    ["CANCELLED"],
+    ["UNAUTHENTICATED"],
+  ])("%s is allowlisted (DBR-authored user-facing message)", (code) => {
+    expect(isSqlErrorPassthrough(code)).toBe(true);
+  });
+
+  // Explicit denylist — control-plane sourced, internal state risk.
+  test.each([
+    ["INTERNAL_ERROR"],
+    ["IO_ERROR"],
+    ["UNKNOWN"],
+    ["RESOURCE_EXHAUSTED"],
+    ["SERVICE_UNDER_MAINTENANCE"],
+    ["TEMPORARILY_UNAVAILABLE"],
+    ["WORKSPACE_TEMPORARILY_UNAVAILABLE"],
+    ["ABORTED"],
+  ])(
+    "%s is denied (control-plane sourced, may carry internal state)",
+    (code) => {
+      expect(isSqlErrorPassthrough(code)).toBe(false);
+    },
+  );
+
+  test("undefined and empty string are denied (default-deny on missing source)", () => {
+    expect(isSqlErrorPassthrough(undefined)).toBe(false);
+    expect(isSqlErrorPassthrough("")).toBe(false);
+  });
+
+  test("any unrecognized code (e.g. new SDK additions) is denied", () => {
+    // When the SDK ships a new ServiceErrorCode variant, the allowlist
+    // must default-deny it until a human reviews the upstream message
+    // source. This test pins the default-deny behavior.
+    expect(isSqlErrorPassthrough("BRAND_NEW_CODE")).toBe(false);
+    expect(isSqlErrorPassthrough("PERMISSION_DENIED")).toBe(false);
+    expect(isSqlErrorPassthrough("FOOBAR")).toBe(false);
+  });
+
+  test("case-sensitive match — codes from the SDK are SHOUTY_CASE, lowercased input is rejected", () => {
+    // Defensive: if an upstream layer ever lowercases the code, we
+    // want the allowlist to fail closed rather than silently match.
+    expect(isSqlErrorPassthrough("bad_request")).toBe(false);
+    expect(isSqlErrorPassthrough("Bad_Request")).toBe(false);
+  });
+});
diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts
index 347ce1c0d..03cc80017 100644
--- a/packages/appkit/src/errors/tests/errors.test.ts
+++ b/packages/appkit/src/errors/tests/errors.test.ts
@@ -303,6 +303,154 @@ describe("ExecutionError", () => {
     expect(error.message).toBe("No chunks or schema found in response");
     expect(error.context?.dataType).toBe("chunks or schema");
   });
+
+  describe("statementFailed clientMessage passthrough", () => {
+    test("BAD_REQUEST passes through the warehouse-authored message — user's own SQL error", () => {
+      // DBR sets errorDisplayMessage from the SQL parser / semantic
+      // analyzer; "Table not found" is the user's own SQL artifact
+      // and the entire point of an error message is to show it back.
+      const error = ExecutionError.statementFailed(
+        "Table or view 'users' not found",
+        "BAD_REQUEST",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Table or view 'users' not found",
+      );
+      expect(error.errorCode).toBe("BAD_REQUEST");
+    });
+
+    test("NOT_FOUND passes through (catalog miss is user-authored)", () => {
+      const error = ExecutionError.statementFailed(
+        "Schema 'analytics' not found",
+        "NOT_FOUND",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Schema 'analytics' not found",
+      );
+    });
+
+    test("ALREADY_EXISTS passes through (catalog conflict is user-authored)", () => {
+      const error = ExecutionError.statementFailed(
+        "Table 'foo' already exists",
+        "ALREADY_EXISTS",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Table 'foo' already exists",
+      );
+    });
+
+    test("DEADLINE_EXCEEDED passes through (timeout text is user-actionable)", () => {
+      const error = ExecutionError.statementFailed(
+        "Query timed out after 30s",
+        "DEADLINE_EXCEEDED",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Query timed out after 30s",
+      );
+    });
+
+    test("CANCELLED passes through (cancellation is initiated by user/admin)", () => {
+      const error = ExecutionError.statementFailed(
+        "Statement was canceled",
+        "CANCELLED",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Statement was canceled",
+      );
+    });
+
+    test("UNAUTHENTICATED passes through (SDK auth messages are generic)", () => {
+      const error = ExecutionError.statementFailed(
+        "Permission denied",
+        "UNAUTHENTICATED",
+      );
+      expect(error.clientMessage).toBe("Statement failed: Permission denied");
+    });
+
+    test("INTERNAL_ERROR collapses to generic — control-plane text may carry stack traces, correlation IDs, internal paths (CWE-209)", () => {
+      const error = ExecutionError.statementFailed(
+        "RPC to warehouse-prod-us-east-1.svc failed: corrId=abc-123, stack: at scheduler.process()",
+        "INTERNAL_ERROR",
+      );
+      // .message keeps the raw text for server logs.
+      expect(error.message).toContain("corrId=abc-123");
+      // .clientMessage collapses to the generic default.
+      expect(error.clientMessage).toBe("Query execution failed");
+    });
+
+    test("IO_ERROR collapses to generic (may leak storage paths / bucket names)", () => {
+      const error = ExecutionError.statementFailed(
+        "Failed to read s3://internal-staging-bucket-3/path/to/shard",
+        "IO_ERROR",
+      );
+      expect(error.clientMessage).toBe("Query execution failed");
+      expect(error.message).toContain("internal-staging-bucket-3");
+    });
+
+    test("UNKNOWN collapses to generic (unclassified by definition)", () => {
+      const error = ExecutionError.statementFailed(
+        "Something went wrong: scheduler-pod-3 returned 500",
+        "UNKNOWN",
+      );
+      expect(error.clientMessage).toBe("Query execution failed");
+    });
+
+    test("RESOURCE_EXHAUSTED collapses to generic (mixed: may leak load data)", () => {
+      const error = ExecutionError.statementFailed(
+        "Cluster at 92% capacity, scheduler-pod-3",
+        "RESOURCE_EXHAUSTED",
+      );
+      expect(error.clientMessage).toBe("Query execution failed");
+    });
+
+    test("absent errorCode falls back to generic (default-deny on unknown source)", () => {
+      const error = ExecutionError.statementFailed("Some upstream text");
+      expect(error.clientMessage).toBe("Query execution failed");
+    });
+
+    test("unrecognized errorCode (post-SDK-update) defaults to generic", () => {
+      // When the SDK adds a new code we haven't reviewed yet, we must
+      // NOT passthrough by default — security relies on the allowlist
+      // being explicit.
+      const error = ExecutionError.statementFailed(
+        "scheduler internal: stack overflow at sun.misc.Unsafe.park()",
+        "SOME_NEW_CODE_NOT_IN_ALLOWLIST",
+      );
+      expect(error.clientMessage).toBe("Query execution failed");
+    });
+
+    test("explicit clientMessage argument wins over the allowlist (caller knows best)", () => {
+      // Even on an allowlisted code, an explicit clientMessage takes
+      // precedence — for cases where the caller has constructed a
+      // better-tuned message in-process.
+      const error = ExecutionError.statementFailed(
+        "Table or view 'users' not found",
+        "BAD_REQUEST",
+        "We could not find the requested table.",
+      );
+      expect(error.clientMessage).toBe(
+        "We could not find the requested table.",
+      );
+    });
+
+    test("server-side .message always preserves the raw upstream text for log debugging", () => {
+      // Regardless of passthrough decision, the raw text stays on
+      // `.message` for `logger.error(err)` calls to capture full detail.
+      const safe = ExecutionError.statementFailed(
+        "Table 'users' not found",
+        "BAD_REQUEST",
+      );
+      expect(safe.message).toBe("Statement failed: Table 'users' not found");
+
+      const unsafe = ExecutionError.statementFailed(
+        "Internal RPC failed: corrId=abc",
+        "INTERNAL_ERROR",
+      );
+      expect(unsafe.message).toBe(
+        "Statement failed: Internal RPC failed: corrId=abc",
+      );
+    });
+  });
 });
 
 describe("InitializationError", () => {

From a5c8e4a4377ac581b5a93ddc4702e8498c6260f3 Mon Sep 17 00:00:00 2001
From: James Broadhead <jamesbroadhead@gmail.com>
Date: Fri, 22 May 2026 10:58:05 +0000
Subject: [PATCH 2/3] fix(appkit): expand allowlist to 5xx-class user-facing
 service errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewer pushback on the original allowlist: TEMPORARILY_UNAVAILABLE,
RESOURCE_EXHAUSTED, SERVICE_UNDER_MAINTENANCE, WORKSPACE_TEMPORARILY_-
UNAVAILABLE, and ABORTED carry messages that ARE the entire point of
having an error string at the wire — telling users "Query execution
failed" when the actual cause is a quota / maintenance / shard-moving
condition forces them to guess. Allowlisting these.

Verified against universe sources before reclassifying:

- TEMPORARILY_UNAVAILABLE: stable template "DBSQL temporarily
  unavailable. Please try again in a few minutes." constructed at
  `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala`
  and `client/linkstore/SqlGatewayClerk.scala`.

- RESOURCE_EXHAUSTED: stable user-actionable templates from
  `SchedulerRpcValidatorHook.scala`: "The maximum number of warehouses
  has been reached. Please contact Databricks support.", "You've hit
  the limit for warehouses for free usage. Stop or delete existing
  warehouses to free up capacity."

- WORKSPACE_TEMPORARILY_UNAVAILABLE: workspace-level shard-routing
  messages at `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala`
  — names a shard ID, which is internal topology metadata but
  low-sensitivity and operationally useful for retry.

- ABORTED: short reason strings like "version mismatch" at
  `ReydenWarehouseMonitor`.

- SERVICE_UNDER_MAINTENANCE: maintenance-window template messages.

INTERNAL_ERROR, IO_ERROR, UNKNOWN stay denied — these wrap raw
`ex.getMessage` and interpolate orgIds, warehouse IDs, stack traces.
`EndpointModel.scala` emits "SQL ${conf.warehouse} cannot be created
due to repeated collisions on unique IDs." — exactly the kind of
internal-state leak the sanitization exists to prevent.

Reframed the module docs from "DBR vs control plane" to
"designed-for-user vs designed-for-debugging" — the same code can be
constructed at user-facing or debug-facing sites, but the contested
5xx-class codes are overwhelmingly the former in practice.

Co-authored-by: Isaac
Signed-off-by: James Broadhead <jamesbroadhead@gmail.com>
---
 .../src/errors/dbsql-error-allowlist.ts       | 147 ++++++++++++------
 .../tests/dbsql-error-allowlist.test.ts       |  35 +++--
 .../appkit/src/errors/tests/errors.test.ts    |  52 ++++++-
 3 files changed, 166 insertions(+), 68 deletions(-)

diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts
index 080cc781d..34cbefd56 100644
--- a/packages/appkit/src/errors/dbsql-error-allowlist.ts
+++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts
@@ -2,37 +2,66 @@
  * Allowlist of Databricks SQL Statement Execution API error codes whose
  * `error.message` is safe to forward to clients verbatim.
  *
- * Background — the warehouse populates `error.message` from two distinct
- * sources, distinguishable only by `error.error_code`:
+ * Framing: each `ServiceErrorCode` value falls into one of two camps,
+ * based on what wording is interpolated into `error.message` at the
+ * gateway construction site:
  *
- * 1. **DBR (data plane)** sets `errorDisplayMessage` for user-authored
- *    SQL errors (parse, semantic, type, missing-table) and surfaces them
- *    via the gateway untouched. These messages are inherently user-facing
- *    — they reference the user's own SQL — and were filtered through
- *    DBR's Thrift layer with the express purpose of being shown back to
- *    the user. They are *safe to passthrough*.
+ * 1. **Designed-for-user** — the gateway interpolates a stable template
+ *    string authored for end users (e.g. "DBSQL temporarily unavailable.
+ *    Please try again in a few minutes.") or a DBR-stamped
+ *    `errorDisplayMessage` covering the user's own SQL. These messages
+ *    are *the entire point* of having an error string — hiding them
+ *    forces users to debug in the dark. *Safe to passthrough.*
  *
- *    Source of truth: DBR -> SQL Gateway scheduler
- *    `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`
- *    populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and
- *    sets `withErrorReturnedFromDataPlane(true)` for these.
+ * 2. **Designed-for-debugging** — the gateway passes through a wrapped
+ *    `ex.getMessage` or interpolates internal identifiers (orgId, unique
+ *    warehouse IDs, scheduler pod names, internal column structure).
+ *    These are intended for operator log inspection, not for clients.
+ *    *Not safe to passthrough* (CWE-209).
  *
- * 2. **Control plane (proxy / scheduler / metaservice)** sets
- *    `error.message` from internal RPC failures, scheduler exceptions,
- *    capacity rejections, etc. These messages can include correlation
- *    IDs, internal pod / service names, stack traces, and storage paths
- *    — they are *not safe to passthrough* (CWE-209).
+ * Source-of-truth construction sites for the classifications below:
  *
- *    Source: `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala`
- *    and `exceptions.scala` wrap internal failures with
- *    `CommandExecutionFailed`.
+ * - DBR data-plane (`BAD_REQUEST`, `NOT_FOUND`, `ALREADY_EXISTS`):
+ *   `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`
+ *   populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and
+ *   sets `withErrorReturnedFromDataPlane(true)`.
  *
- * The mapping below classifies each `ServiceErrorCode` value from
- * `@databricks/sdk-experimental` (`apis/sql/model.d.ts`) by which source
- * dominates in practice. When the SDK adds a new code, default it to
- * `false` here so it stays out of the passthrough until reviewed.
+ * - User-facing service messages (`TEMPORARILY_UNAVAILABLE`,
+ *   `WORKSPACE_TEMPORARILY_UNAVAILABLE`, `SERVICE_UNDER_MAINTENANCE`):
+ *   - `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala`
+ *     constructs `SchedulerTemporarilyUnavailableException` with the
+ *     literal "DBSQL temporarily unavailable. Please try again in a
+ *     few minutes."
+ *   - `sqlgateway/scheduler/api/src/main/scala/client/linkstore/SqlGatewayClerk.scala`
+ *     constructs with `internalMessage = "DBSQL temporarily unavailable..."`
+ *   - `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala`
+ *     constructs `WORKSPACE_TEMPORARILY_UNAVAILABLE` from
+ *     `NotServedByShardException.getMessage` — names a shard but no
+ *     credentials / PII / stack traces.
  *
- * SDK type reference:
+ * - User-actionable quota messages (`RESOURCE_EXHAUSTED`):
+ *   `sqlgateway/scheduler/src/main/scala/SchedulerRpcValidatorHook.scala`
+ *   emits stable templates: "The maximum number of warehouses has been
+ *   reached. Please contact Databricks support.", "You've hit the limit
+ *   for warehouses for free usage. Stop or delete existing warehouses
+ *   to free up capacity.", etc.
+ *
+ * - Concurrency conflict (`ABORTED`): short reason strings like
+ *   "version mismatch" (Reyden warehouse monitor). No internal context.
+ *
+ * - Genuinely-internal-only (`INTERNAL_ERROR`, `IO_ERROR`, `UNKNOWN`):
+ *   `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala`
+ *   interpolates `s"SQL ${conf.warehouse} cannot be created due to
+ *   repeated collisions on unique IDs."` — exposes internal database
+ *   state. `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)`
+ *   interpolates orgId. Test fixtures show stack traces interpolated
+ *   into `internalMessage`. *Default-deny.*
+ *
+ * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level
+ *   codes with generic message templates.
+ *
+ * SDK type reference (`@databricks/sdk-experimental`,
+ * `apis/sql/model.d.ts`):
  * ```
  * type ServiceErrorCode =
  *   | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED"
@@ -41,46 +70,62 @@
  *   | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN"
  *   | "WORKSPACE_TEMPORARILY_UNAVAILABLE";
  * ```
+ *
+ * **Default-deny on unknown codes** — when the SDK ships a new variant
+ * the allowlist hasn't been reviewed against, the new code collapses to
+ * the generic clientMessage until someone audits the construction sites
+ * and updates the Set below.
  */
 
 /** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */
 const SAFE_PASSTHROUGH_CODES: ReadonlySet<string> = new Set([
-  // User-authored SQL errors flow through DBR's errorDisplayMessage.
-  // Examples: "Table or view 'foo' not found", "Syntax error at or near
-  // ',' on line 3", "Cannot resolve column 'col_x' given input columns".
+  // DBR-authored: user's own SQL errors. Parse, semantic, type, missing
+  // table/column. Examples: "Table or view 'foo' not found", "Syntax
+  // error at or near ',' on line 3".
   "BAD_REQUEST",
-  // Catalog lookup misses — "Table 'foo' not found", "Schema 'bar' not
-  // found". DBR sets these via the catalog layer; messages reference
-  // the user's own SQL.
+  // DBR catalog miss — "Table 'foo' not found", "Schema 'bar' not found".
   "NOT_FOUND",
-  // Catalog conflicts — "Table 'foo' already exists". Same provenance.
+  // DBR catalog conflict — "Table 'foo' already exists".
   "ALREADY_EXISTS",
-  // Query timeout. Message is typically "Query timed out after Ns" —
-  // user-actionable, no internal state.
+  // Stable template: "Query timed out after Ns". User-actionable.
   "DEADLINE_EXCEEDED",
-  // Client / admin cancellation. Message is "Statement was canceled".
+  // "Statement was canceled". User/admin-initiated; no internal state.
   "CANCELLED",
-  // Authentication failures don't carry internal state; the SDK surfaces
-  // a generic "Permission denied" or "Authentication required" string.
+  // SDK generic — "Permission denied", "Authentication required".
   "UNAUTHENTICATED",
+  // Stable user-facing template: "DBSQL temporarily unavailable. Please
+  // try again in a few minutes." Hiding this would force users to
+  // distinguish "warehouse issue" from "user error" by guesswork.
+  "TEMPORARILY_UNAVAILABLE",
+  // Workspace-level service messages from shard-routing failures. The
+  // message may name a shard ID — that's internal topology metadata,
+  // low sensitivity, and operationally useful for the user to know
+  // the workspace is in motion.
+  "WORKSPACE_TEMPORARILY_UNAVAILABLE",
+  // Maintenance windows — generic template messages.
+  "SERVICE_UNDER_MAINTENANCE",
+  // Stable user-actionable templates from the warehouse quota path:
+  // "The maximum number of warehouses has been reached. Please contact
+  // Databricks support.", "Stop or delete existing warehouses to free
+  // up capacity.", etc. Telling users "Query execution failed" when
+  // they've actually hit a quota is unhelpful.
+  "RESOURCE_EXHAUSTED",
+  // Concurrency conflict — short reason strings ("version mismatch",
+  // "concurrent modification"). User-relevant for retry decisions.
+  "ABORTED",
 ]);
 
 /**
- * Codes that explicitly carry internal control-plane wording and must
- * NEVER pass through to clients. Listed for documentation; classification
- * is by absence from `SAFE_PASSTHROUGH_CODES`.
+ * Explicitly NOT on the allowlist (documented for future-reviewer
+ * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`):
  *
- * - `INTERNAL_ERROR` — control-plane stack traces, RPC failures.
- * - `IO_ERROR` — storage / network paths (may leak bucket names, pod IPs).
- * - `UNKNOWN` — by definition unclassified; default-deny.
- * - `RESOURCE_EXHAUSTED` — mixed; control plane sometimes leaks
- *   internal load data ("scheduler-pod-3 at 92% CPU"). Default-deny.
- * - `SERVICE_UNDER_MAINTENANCE` / `TEMPORARILY_UNAVAILABLE` /
- *   `WORKSPACE_TEMPORARILY_UNAVAILABLE` — typically generic, but the
- *   gateway has been observed to include workspace identifiers and
- *   scheduler pod names. Default-deny.
- * - `ABORTED` — server-side abort; can carry transaction / lock context
- *   identifying internal locking subsystems. Default-deny.
+ * - `INTERNAL_ERROR` — construction sites interpolate unique warehouse
+ *   IDs, orgIds, and wrap `ex.getMessage` from arbitrary exceptions.
+ *   Example: `s"SQL ${conf.warehouse} cannot be created due to repeated
+ *   collisions on unique IDs."` (`EndpointModel.scala`). High leak risk.
+ * - `IO_ERROR` — storage / network failures; messages typically include
+ *   bucket names, paths, internal hostnames.
+ * - `UNKNOWN` — unclassified by definition; can be anything.
  */
 
 /**
diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
index f65856dd2..115f3e25b 100644
--- a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
+++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
@@ -2,30 +2,37 @@ import { describe, expect, test } from "vitest";
 import { isSqlErrorPassthrough } from "../dbsql-error-allowlist";
 
 describe("isSqlErrorPassthrough", () => {
-  // Allowlist members — DBR-authored, user-facing messages.
+  // Allowlist members — designed-for-user messages.
   test.each([
+    // DBR data plane — user's own SQL errors.
     ["BAD_REQUEST"],
     ["NOT_FOUND"],
     ["ALREADY_EXISTS"],
+    // SDK-level codes with generic templates.
     ["DEADLINE_EXCEEDED"],
     ["CANCELLED"],
     ["UNAUTHENTICATED"],
-  ])("%s is allowlisted (DBR-authored user-facing message)", (code) => {
-    expect(isSqlErrorPassthrough(code)).toBe(true);
-  });
-
-  // Explicit denylist — control-plane sourced, internal state risk.
-  test.each([
-    ["INTERNAL_ERROR"],
-    ["IO_ERROR"],
-    ["UNKNOWN"],
-    ["RESOURCE_EXHAUSTED"],
-    ["SERVICE_UNDER_MAINTENANCE"],
+    // 5xx-class service messages — stable user-facing templates
+    // ("DBSQL temporarily unavailable. Please try again in a few
+    // minutes."). Hiding these forces users to guess whether a failure
+    // is on their side or ours.
     ["TEMPORARILY_UNAVAILABLE"],
     ["WORKSPACE_TEMPORARILY_UNAVAILABLE"],
+    ["SERVICE_UNDER_MAINTENANCE"],
+    // Quota — stable user-actionable templates ("Stop or delete
+    // existing warehouses to free up capacity.").
+    ["RESOURCE_EXHAUSTED"],
+    // Concurrency conflict — short reason strings, user-relevant for
+    // retry decisions.
     ["ABORTED"],
-  ])(
-    "%s is denied (control-plane sourced, may carry internal state)",
+  ])("%s is allowlisted (designed-for-user message)", (code) => {
+    expect(isSqlErrorPassthrough(code)).toBe(true);
+  });
+
+  // Explicit denylist — designed-for-debugging, interpolates internal
+  // identifiers, stack traces, or storage paths.
+  test.each([["INTERNAL_ERROR"], ["IO_ERROR"], ["UNKNOWN"]])(
+    "%s is denied (designed-for-debugging, interpolates internal state)",
     (code) => {
       expect(isSqlErrorPassthrough(code)).toBe(false);
     },
diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts
index 03cc80017..296e0ebc5 100644
--- a/packages/appkit/src/errors/tests/errors.test.ts
+++ b/packages/appkit/src/errors/tests/errors.test.ts
@@ -395,12 +395,58 @@ describe("ExecutionError", () => {
       expect(error.clientMessage).toBe("Query execution failed");
     });
 
-    test("RESOURCE_EXHAUSTED collapses to generic (mixed: may leak load data)", () => {
+    test("RESOURCE_EXHAUSTED passes through (quota templates are user-actionable)", () => {
+      // SchedulerRpcValidatorHook.scala emits stable templates like
+      // "The maximum number of warehouses has been reached. Please
+      // contact Databricks support." Telling users "Query execution
+      // failed" when they've actually hit a quota is unhelpful.
       const error = ExecutionError.statementFailed(
-        "Cluster at 92% capacity, scheduler-pod-3",
+        "The maximum number of warehouses has been reached. Please contact Databricks support.",
         "RESOURCE_EXHAUSTED",
       );
-      expect(error.clientMessage).toBe("Query execution failed");
+      expect(error.clientMessage).toBe(
+        "Statement failed: The maximum number of warehouses has been reached. Please contact Databricks support.",
+      );
+    });
+
+    test("TEMPORARILY_UNAVAILABLE passes through (stable service-down template)", () => {
+      // Source: SchedulerRpcClient.scala — fixed string authored for
+      // end users.
+      const error = ExecutionError.statementFailed(
+        "DBSQL temporarily unavailable. Please try again in a few minutes.",
+        "TEMPORARILY_UNAVAILABLE",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: DBSQL temporarily unavailable. Please try again in a few minutes.",
+      );
+    });
+
+    test("SERVICE_UNDER_MAINTENANCE passes through (maintenance messages are user-facing by design)", () => {
+      const error = ExecutionError.statementFailed(
+        "DBSQL is undergoing scheduled maintenance.",
+        "SERVICE_UNDER_MAINTENANCE",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: DBSQL is undergoing scheduled maintenance.",
+      );
+    });
+
+    test("WORKSPACE_TEMPORARILY_UNAVAILABLE passes through (shard topology is low-sensitivity, retry-relevant)", () => {
+      const error = ExecutionError.statementFailed(
+        "Workspace temporarily unavailable: shard moving",
+        "WORKSPACE_TEMPORARILY_UNAVAILABLE",
+      );
+      expect(error.clientMessage).toBe(
+        "Statement failed: Workspace temporarily unavailable: shard moving",
+      );
+    });
+
+    test("ABORTED passes through (conflict reason strings are retry-relevant)", () => {
+      const error = ExecutionError.statementFailed(
+        "version mismatch",
+        "ABORTED",
+      );
+      expect(error.clientMessage).toBe("Statement failed: version mismatch");
     });
 
     test("absent errorCode falls back to generic (default-deny on unknown source)", () => {

From b6ba29c697d677fcfe32814dde20a27beb416353 Mon Sep 17 00:00:00 2001
From: James Broadhead <jamesbroadhead@gmail.com>
Date: Fri, 22 May 2026 11:20:51 +0000
Subject: [PATCH 3/3] fix(appkit): allowlist INTERNAL_ERROR + IO_ERROR; plumb
 requestId for triage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Threat model walkthrough — the recipient of these messages is an
authenticated workspace user, not an anonymous internet visitor. They
already know:
- Their workspace URL and orgId (visible in every SDK call)
- The warehouses they have grants on (listable via the SDK)
- Their catalog contents

Interpolated identifiers in INTERNAL_ERROR / IO_ERROR messages
(warehouse names, orgIds, internal class names) are not new information
to this audience — the classic CWE-209 leak-to-anonymous-attacker
scenario does not apply. The cost of denial (every internal error
becomes "Query execution failed" with zero signal) is certain; the
benefit (defense-in-depth against unbounded wrapped `ex.getMessage`
content) is speculative.

Allowlisting both, with a requestId on the SSE error frame as the
safety net:

- `SSEError` now carries `requestId` (the SSE event id, same value the
  server logs in its `Stream execution failed` line — quoting it in a
  support ticket lets staff grep logs directly).
- `SSEWriter.writeError` populates `requestId` from the event id.
- `stream-manager.ts` logs `requestId=...` alongside `rawMsg`,
  `errorCode`, `upstreamCode` so a user-supplied requestId pinpoints
  the exact log line.
- Analytics `/arrow-result` 410/404 paths generate their own requestId
  and emit it in both the JSON response and the server log line.
- `useAnalyticsQuery` exposes `requestId: string | null` on
  `UseAnalyticsQueryResult` so UI code can render "Error ref: <id>".

Only `UNKNOWN` remains denied — unclassified by definition, so the
contents are unbounded in both shape and source.

Validation: 2,133 appkit + 292 appkit-ui tests pass; tsc clean.

Co-authored-by: Isaac
Signed-off-by: James Broadhead <jamesbroadhead@gmail.com>
---
 .../__tests__/use-analytics-query.test.ts     | 16 ++--
 packages/appkit-ui/src/react/hooks/types.ts   |  9 ++
 .../src/react/hooks/use-analytics-query.ts    | 12 ++-
 .../src/errors/dbsql-error-allowlist.ts       | 93 +++++++++++++------
 .../tests/dbsql-error-allowlist.test.ts       | 30 +++---
 .../appkit/src/errors/tests/errors.test.ts    | 42 +++++----
 .../appkit/src/plugins/analytics/analytics.ts | 20 +++-
 packages/appkit/src/stream/sse-writer.ts      |  8 ++
 packages/appkit/src/stream/stream-manager.ts  |  7 +-
 .../appkit/src/stream/tests/stream.test.ts    | 11 ++-
 packages/appkit/src/stream/types.ts           |  8 ++
 11 files changed, 182 insertions(+), 74 deletions(-)

diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts
index ba4c962b0..5417ac39c 100644
--- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts
+++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts
@@ -184,12 +184,12 @@ describe("useAnalyticsQuery", () => {
     expect(result.current.data).toBeNull();
   });
 
-  test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => {
-    // The SSE error broadcaster forwards an `errorCode` field for
-    // UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook
-    // surfaces both the human `error` text AND the structured
-    // `errorCode` so consumers can branch on the stable identifier
-    // instead of substring-matching the sanitized human message.
+  test("a server error event carrying a structured errorCode + requestId exposes both on the hook return value", async () => {
+    // The SSE error broadcaster forwards `errorCode` (for UI branching)
+    // and `requestId` (for support triage — same id appears in the
+    // server-side logger.error line). The hook surfaces both so
+    // consumers can render "Error ref: <requestId>" alongside the
+    // human message and branch behavior on the stable errorCode.
     const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
 
     const { result } = renderHook(() =>
@@ -202,6 +202,7 @@ describe("useAnalyticsQuery", () => {
         error: "Server is at capacity, please retry",
         code: "UPSTREAM_ERROR",
         errorCode: "INLINE_ARROW_STASH_EXHAUSTED",
+        requestId: "11111111-2222-3333-4444-555555555555",
       }),
     });
 
@@ -210,6 +211,9 @@ describe("useAnalyticsQuery", () => {
     });
     expect(result.current.loading).toBe(false);
     expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED");
+    expect(result.current.requestId).toBe(
+      "11111111-2222-3333-4444-555555555555",
+    );
 
     errorSpy.mockRestore();
   });
diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts
index a603e10a4..feb1d8bf6 100644
--- a/packages/appkit-ui/src/react/hooks/types.ts
+++ b/packages/appkit-ui/src/react/hooks/types.ts
@@ -73,6 +73,15 @@ export interface UseAnalyticsQueryResult<T> {
    * matching `error`, which is a free-form sanitized message.
    */
   errorCode: string | null;
+  /**
+   * Server-side correlation id for support triage. When non-null, the
+   * server emitted this id on the error frame and the same id appears
+   * in the backend's `logger.error` line. Surface it in error UIs
+   * ("Error ref: abc-123 — quote when filing a support ticket") so
+   * staff can grep server logs directly instead of asking the user to
+   * reproduce.
+   */
+  requestId: string | null;
 }
 
 /**
diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
index a3d2cf306..ff2b18bab 100644
--- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
+++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
@@ -122,6 +122,7 @@ export function useAnalyticsQuery<
   const [loading, setLoading] = useState(false);
   const [error, setError] = useState<string | null>(null);
   const [errorCode, setErrorCode] = useState<string | null>(null);
+  const [requestId, setRequestId] = useState<string | null>(null);
   const abortControllerRef = useRef<AbortController | null>(null);
 
   if (!queryKey || queryKey.trim().length === 0) {
@@ -170,6 +171,7 @@ export function useAnalyticsQuery<
     setLoading(true);
     setError(null);
     setErrorCode(null);
+    setRequestId(null);
     setData(null);
 
     const abortController = new AbortController();
@@ -247,10 +249,16 @@ export function useAnalyticsQuery<
             if (typeof parsed.errorCode === "string") {
               setErrorCode(parsed.errorCode);
             }
+            // requestId is the SSE event id of the error frame — same
+            // value the server logs in its `Stream execution failed`
+            // line, so users can quote it in support tickets.
+            if (typeof parsed.requestId === "string") {
+              setRequestId(parsed.requestId);
+            }
 
             if (parsed.code) {
               console.error(
-                `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}`,
+                `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}, requestId: ${parsed.requestId ?? "n/a"}`,
               );
             }
             return;
@@ -321,5 +329,5 @@ export function useAnalyticsQuery<
   // Enable HMR for query updates in dev mode
   useQueryHMR(queryKey, start);
 
-  return { data, loading, error, errorCode };
+  return { data, loading, error, errorCode, requestId };
 }
diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts
index 34cbefd56..780797fff 100644
--- a/packages/appkit/src/errors/dbsql-error-allowlist.ts
+++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts
@@ -2,22 +2,35 @@
  * Allowlist of Databricks SQL Statement Execution API error codes whose
  * `error.message` is safe to forward to clients verbatim.
  *
- * Framing: each `ServiceErrorCode` value falls into one of two camps,
- * based on what wording is interpolated into `error.message` at the
- * gateway construction site:
- *
- * 1. **Designed-for-user** — the gateway interpolates a stable template
- *    string authored for end users (e.g. "DBSQL temporarily unavailable.
- *    Please try again in a few minutes.") or a DBR-stamped
- *    `errorDisplayMessage` covering the user's own SQL. These messages
- *    are *the entire point* of having an error string — hiding them
- *    forces users to debug in the dark. *Safe to passthrough.*
- *
- * 2. **Designed-for-debugging** — the gateway passes through a wrapped
- *    `ex.getMessage` or interpolates internal identifiers (orgId, unique
- *    warehouse IDs, scheduler pod names, internal column structure).
- *    These are intended for operator log inspection, not for clients.
- *    *Not safe to passthrough* (CWE-209).
+ * Threat model — the recipient of these messages is an *authenticated
+ * workspace user*, not an anonymous internet visitor. They already know
+ * the workspace URL, their orgId, their session token, the warehouses
+ * they have grants on, and the catalog contents they can see. Names of
+ * resources within the workspace, internal correlation IDs, and class
+ * names from Databricks-internal packages are not new information to
+ * this audience — the classic CWE-209 "stack-trace-on-500-page-leaks-
+ * to-attacker" scenario does not apply.
+ *
+ * Given that, the bar for passthrough is: does the message help the
+ * user (or their support contact) understand and act on the failure?
+ * Defaults skew toward yes because:
+ *
+ * - The cost of denial is *certain* (every error becomes "Query
+ *   execution failed" with no signal; users open tickets, support has
+ *   to reproduce or grep server logs to find the actual error).
+ * - The cost of allowing is *speculative* — depends on whatever the
+ *   most recent wrapped exception happened to put in its `.toString()`.
+ *
+ * Server-side, the route handlers always `logger.error(rawMsg)` the
+ * full upstream text before sanitization — operators retain complete
+ * visibility regardless of what reaches the wire. SSE error frames
+ * also carry a `requestId` (the SSE event id) so users can quote it
+ * in support tickets and staff can grep logs against it directly.
+ *
+ * **Currently denied**: `UNKNOWN` only — by definition unclassified,
+ * so we can't reason about its contents and a default-deny is the
+ * minimum safe stance. Every other `ServiceErrorCode` variant is on
+ * the allowlist below with a documented rationale.
  *
  * Source-of-truth construction sites for the classifications below:
  *
@@ -49,13 +62,25 @@
  * - Concurrency conflict (`ABORTED`): short reason strings like
  *   "version mismatch" (Reyden warehouse monitor). No internal context.
  *
- * - Genuinely-internal-only (`INTERNAL_ERROR`, `IO_ERROR`, `UNKNOWN`):
+ * - `INTERNAL_ERROR` / `IO_ERROR`:
  *   `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala`
  *   interpolates `s"SQL ${conf.warehouse} cannot be created due to
- *   repeated collisions on unique IDs."` — exposes internal database
- *   state. `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)`
- *   interpolates orgId. Test fixtures show stack traces interpolated
- *   into `internalMessage`. *Default-deny.*
+ *   repeated collisions on unique IDs."`;
+ *   `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)`
+ *   interpolates orgId. These identifiers are not sensitive to a
+ *   workspace user (they own / can list both). The wrapped
+ *   `ex.getMessage` content is unbounded but for the typical case is
+ *   either operational (RPC timeout, lock contention) or shape-of-error
+ *   information the user needs to act. Allowlisted with the requestId
+ *   plumbing as the safety net for triage. *Not* allowlisted if a
+ *   future review finds construction sites that interpolate user
+ *   credentials, OAuth tokens, or other secrets — open an issue + flip
+ *   the entry.
+ *
+ * - `UNKNOWN`: unclassified by definition. We have no way to reason
+ *   about its contents — could be anything from "RPC connect failed"
+ *   to a panic dump from an upstream library. Default-deny is the
+ *   only defensible stance until the SDK gives us a finer code.
  *
  * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level
  *   codes with generic message templates.
@@ -113,19 +138,31 @@ const SAFE_PASSTHROUGH_CODES: ReadonlySet<string> = new Set([
   // Concurrency conflict — short reason strings ("version mismatch",
   // "concurrent modification"). User-relevant for retry decisions.
   "ABORTED",
+  // Wrapped internal exceptions. Construction sites interpolate
+  // workspace-internal identifiers (warehouse names, orgIds) and
+  // `ex.getMessage` from arbitrary causes. To an authenticated
+  // workspace user these identifiers are not sensitive — they can
+  // already enumerate them. The wrapped `ex.getMessage` content is
+  // unbounded but in practice is operational ("RPC timed out",
+  // "deadlock detected") or shape-of-failure detail the user needs to
+  // act on. The `requestId` we emit on the SSE error frame is the
+  // safety net for unhappy cases — users can quote it in tickets and
+  // staff can grep logs against it for the full unsanitized message.
+  "INTERNAL_ERROR",
+  // Storage / network failures. Same logic as INTERNAL_ERROR: path /
+  // bucket names visible to a workspace user with catalog access are
+  // not new information, and the wrapped error text usually tells
+  // them whether to retry, switch warehouses, or escalate.
+  "IO_ERROR",
 ]);
 
 /**
  * Explicitly NOT on the allowlist (documented for future-reviewer
  * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`):
  *
- * - `INTERNAL_ERROR` — construction sites interpolate unique warehouse
- *   IDs, orgIds, and wrap `ex.getMessage` from arbitrary exceptions.
- *   Example: `s"SQL ${conf.warehouse} cannot be created due to repeated
- *   collisions on unique IDs."` (`EndpointModel.scala`). High leak risk.
- * - `IO_ERROR` — storage / network failures; messages typically include
- *   bucket names, paths, internal hostnames.
- * - `UNKNOWN` — unclassified by definition; can be anything.
+ * - `UNKNOWN` — unclassified by definition; could be anything from a
+ *   library panic to an unhandled `case` in the gateway. Default-deny
+ *   is the only defensible stance until the SDK gives us a finer code.
  */
 
 /**
diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
index 115f3e25b..638271440 100644
--- a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
+++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts
@@ -2,7 +2,10 @@ import { describe, expect, test } from "vitest";
 import { isSqlErrorPassthrough } from "../dbsql-error-allowlist";
 
 describe("isSqlErrorPassthrough", () => {
-  // Allowlist members — designed-for-user messages.
+  // Allowlist members — passthrough is net-positive given the
+  // workspace-user threat model (recipient already knows the workspace's
+  // resources). Server-side full-detail logging + the requestId on the
+  // SSE error frame backstop the unhappy cases.
   test.each([
     // DBR data plane — user's own SQL errors.
     ["BAD_REQUEST"],
@@ -22,21 +25,24 @@ describe("isSqlErrorPassthrough", () => {
     // Quota — stable user-actionable templates ("Stop or delete
     // existing warehouses to free up capacity.").
     ["RESOURCE_EXHAUSTED"],
-    // Concurrency conflict — short reason strings, user-relevant for
-    // retry decisions.
+    // Concurrency conflict — short reason strings, user-relevant.
     ["ABORTED"],
-  ])("%s is allowlisted (designed-for-user message)", (code) => {
+    // Wrapped internal exceptions — interpolated identifiers (orgId,
+    // warehouse name) are non-sensitive to an authenticated workspace
+    // user; the wrapped `ex.getMessage` content is operationally
+    // useful for triage. RequestId on the SSE error frame is the
+    // safety net for unhappy cases.
+    ["INTERNAL_ERROR"],
+    ["IO_ERROR"],
+  ])("%s is allowlisted", (code) => {
     expect(isSqlErrorPassthrough(code)).toBe(true);
   });
 
-  // Explicit denylist — designed-for-debugging, interpolates internal
-  // identifiers, stack traces, or storage paths.
-  test.each([["INTERNAL_ERROR"], ["IO_ERROR"], ["UNKNOWN"]])(
-    "%s is denied (designed-for-debugging, interpolates internal state)",
-    (code) => {
-      expect(isSqlErrorPassthrough(code)).toBe(false);
-    },
-  );
+  // Only UNKNOWN stays denied — unclassified by definition, so the
+  // wrapped content is unbounded in shape AND in source.
+  test("UNKNOWN is denied (unclassified — cannot reason about contents)", () => {
+    expect(isSqlErrorPassthrough("UNKNOWN")).toBe(false);
+  });
 
   test("undefined and empty string are denied (default-deny on missing source)", () => {
     expect(isSqlErrorPassthrough(undefined)).toBe(false);
diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts
index 296e0ebc5..5a13fb69d 100644
--- a/packages/appkit/src/errors/tests/errors.test.ts
+++ b/packages/appkit/src/errors/tests/errors.test.ts
@@ -367,27 +367,32 @@ describe("ExecutionError", () => {
       expect(error.clientMessage).toBe("Statement failed: Permission denied");
     });
 
-    test("INTERNAL_ERROR collapses to generic — control-plane text may carry stack traces, correlation IDs, internal paths (CWE-209)", () => {
+    test("INTERNAL_ERROR passes through (workspace identifiers are non-sensitive to authenticated user; requestId backstops triage)", () => {
+      // Interpolated warehouse names and orgIds are not new information
+      // to a workspace user — they can enumerate both via the SDK.
+      // Wrapped `ex.getMessage` is operationally useful for the user
+      // to act ("RPC timed out", "deadlock detected") and the
+      // requestId on the SSE error frame keys server logs for triage.
       const error = ExecutionError.statementFailed(
-        "RPC to warehouse-prod-us-east-1.svc failed: corrId=abc-123, stack: at scheduler.process()",
+        "SQL my-warehouse cannot be created due to repeated collisions on unique IDs.",
         "INTERNAL_ERROR",
       );
-      // .message keeps the raw text for server logs.
-      expect(error.message).toContain("corrId=abc-123");
-      // .clientMessage collapses to the generic default.
-      expect(error.clientMessage).toBe("Query execution failed");
+      expect(error.clientMessage).toBe(
+        "Statement failed: SQL my-warehouse cannot be created due to repeated collisions on unique IDs.",
+      );
     });
 
-    test("IO_ERROR collapses to generic (may leak storage paths / bucket names)", () => {
+    test("IO_ERROR passes through (storage path / network failure detail is actionable, not credential-bearing)", () => {
       const error = ExecutionError.statementFailed(
-        "Failed to read s3://internal-staging-bucket-3/path/to/shard",
+        "Failed to read manifest after 3 retries",
         "IO_ERROR",
       );
-      expect(error.clientMessage).toBe("Query execution failed");
-      expect(error.message).toContain("internal-staging-bucket-3");
+      expect(error.clientMessage).toBe(
+        "Statement failed: Failed to read manifest after 3 retries",
+      );
     });
 
-    test("UNKNOWN collapses to generic (unclassified by definition)", () => {
+    test("UNKNOWN collapses to generic (unclassified by definition — cannot reason about contents)", () => {
       const error = ExecutionError.statementFailed(
         "Something went wrong: scheduler-pod-3 returned 500",
         "UNKNOWN",
@@ -481,20 +486,23 @@ describe("ExecutionError", () => {
 
     test("server-side .message always preserves the raw upstream text for log debugging", () => {
       // Regardless of passthrough decision, the raw text stays on
-      // `.message` for `logger.error(err)` calls to capture full detail.
+      // `.message` for `logger.error(err)` calls to capture full
+      // detail. For UNKNOWN (the only denied code), clientMessage
+      // collapses but .message still has the full content.
       const safe = ExecutionError.statementFailed(
         "Table 'users' not found",
         "BAD_REQUEST",
       );
       expect(safe.message).toBe("Statement failed: Table 'users' not found");
 
-      const unsafe = ExecutionError.statementFailed(
-        "Internal RPC failed: corrId=abc",
-        "INTERNAL_ERROR",
+      const denied = ExecutionError.statementFailed(
+        "Unclassified failure with corrId=abc",
+        "UNKNOWN",
       );
-      expect(unsafe.message).toBe(
-        "Statement failed: Internal RPC failed: corrId=abc",
+      expect(denied.message).toBe(
+        "Statement failed: Unclassified failure with corrId=abc",
       );
+      expect(denied.clientMessage).toBe("Query execution failed");
     });
   });
 });
diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts
index 5ba121d1b..dca3ddd8f 100644
--- a/packages/appkit/src/plugins/analytics/analytics.ts
+++ b/packages/appkit/src/plugins/analytics/analytics.ts
@@ -1,3 +1,4 @@
+import { randomUUID } from "node:crypto";
 import type { WorkspaceClient } from "@databricks/sdk-experimental";
 import { tableFromIPC } from "apache-arrow";
 import type express from "express";
@@ -146,9 +147,15 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider {
         // Already drained, expired, or never belonged to this user. 410
         // distinguishes this from "warehouse statement id not found" (404)
         // so the client can surface a useful error.
-        logger.debug("Inline Arrow stash miss for jobId=%s", jobId);
+        const requestId = randomUUID();
+        logger.debug(
+          "Inline Arrow stash miss for jobId=%s requestId=%s",
+          jobId,
+          requestId,
+        );
         res.status(410).json({
           error: "Inline Arrow result expired or unknown",
+          requestId,
           plugin: this.name,
         });
         return;
@@ -183,15 +190,18 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider {
       );
       res.send(Buffer.from(result.data));
     } catch (error) {
-      logger.error("Arrow job error: %O", error);
-      // Do not echo upstream / SDK error text to the client — it can
-      // include statement fragments, internal object names, and
-      // correlation IDs. The full detail stays in the server log above.
+      const requestId = randomUUID();
+      logger.error("Arrow job error (requestId=%s): %O", requestId, error);
+      // Do not echo upstream / SDK error text to the client — the
+      // detail stays in the server log above, keyed by requestId so
+      // a user quoting it in a support ticket lets staff grep
+      // directly to this line.
       const errorCode =
         error instanceof ExecutionError ? error.errorCode : undefined;
       res.status(404).json({
         error: "Arrow result unavailable",
         errorCode,
+        requestId,
         plugin: this.name,
       });
     }
diff --git a/packages/appkit/src/stream/sse-writer.ts b/packages/appkit/src/stream/sse-writer.ts
index 2b49d1dee..bf3659937 100644
--- a/packages/appkit/src/stream/sse-writer.ts
+++ b/packages/appkit/src/stream/sse-writer.ts
@@ -43,9 +43,17 @@ export class SSEWriter {
   ): void {
     if (res.writableEnded) return;
 
+    // The SSE `id:` line and the in-payload `requestId` are
+    // intentionally the same value: the wire-level id is invisible to
+    // most React consumers (the event-source `data` is what reaches
+    // app code), so we duplicate it inside the JSON so the hook can
+    // surface it. Server-side logs use this same id in the
+    // `Stream execution failed` line — a user quoting it lets staff
+    // grep logs directly.
     const errorData: SSEError = {
       error,
       code,
+      requestId: eventId,
       ...(errorCode ? { errorCode } : {}),
     };
 
diff --git a/packages/appkit/src/stream/stream-manager.ts b/packages/appkit/src/stream/stream-manager.ts
index 2cacbcb9c..2ac0ea968 100644
--- a/packages/appkit/src/stream/stream-manager.ts
+++ b/packages/appkit/src/stream/stream-manager.ts
@@ -309,17 +309,22 @@ export class StreamManager {
         if (errorCode === SSEErrorCode.STREAM_ABORTED) {
           logger.info("Stream aborted by client (code=%s)", errorCode);
         } else {
+          // Log line includes the requestId we'll emit to the client —
+          // a user quoting their requestId in a support ticket lets
+          // staff grep this exact line for the full raw message.
           logger.error(
-            "Stream execution failed: %s (code=%s upstreamCode=%s)",
+            "Stream execution failed: %s (code=%s upstreamCode=%s requestId=%s)",
             rawMsg,
             errorCode,
             upstreamCode ?? "n/a",
+            errorEventId,
           );
         }
 
         const payload: Record<string, unknown> = {
           error: clientMsg,
           code: errorCode,
+          requestId: errorEventId,
         };
         if (upstreamCode) payload.errorCode = upstreamCode;
 
diff --git a/packages/appkit/src/stream/tests/stream.test.ts b/packages/appkit/src/stream/tests/stream.test.ts
index a99242038..243178eb3 100644
--- a/packages/appkit/src/stream/tests/stream.test.ts
+++ b/packages/appkit/src/stream/tests/stream.test.ts
@@ -271,9 +271,14 @@ describe("StreamManager", () => {
       await streamManager.stream(mockRes as any, generator);
 
       expect(events).toContain("event: error\n");
-      expect(events).toContain(
-        'data: {"error":"Internal server error","code":"INTERNAL_ERROR"}\n\n',
-      );
+      // Match the error payload shape rather than the exact frame string —
+      // the frame now also carries a per-error requestId (randomUUID) for
+      // support triage, so we can't pin the full data line literally.
+      const dataLines = events.filter((e: string) => e.startsWith("data: "));
+      const errorLine = dataLines.find((e: string) => e.includes('"error"'));
+      expect(errorLine).toContain('"error":"Internal server error"');
+      expect(errorLine).toContain('"code":"INTERNAL_ERROR"');
+      expect(errorLine).toMatch(/"requestId":"[0-9a-f-]+"/);
     });
 
     test("should not crash if client disconnects during error", async () => {
diff --git a/packages/appkit/src/stream/types.ts b/packages/appkit/src/stream/types.ts
index ba0d4915e..89a573bad 100644
--- a/packages/appkit/src/stream/types.ts
+++ b/packages/appkit/src/stream/types.ts
@@ -31,6 +31,14 @@ export interface SSEError {
    * the human-readable `error` string.
    */
   errorCode?: string;
+  /**
+   * Correlation id for support triage. Set to the SSE event id of the
+   * error frame — the same value that appears in the server-side
+   * `logger.error("Stream execution failed: ...")` line, so a user
+   * quoting this id lets staff grep logs directly. Stable per-error,
+   * unique across the process lifetime.
+   */
+  requestId?: string;
 }
 
 export interface BufferedEvent {