From 8d0c6c57fb34c81d0c91fa6b831c13cf4cf0d680 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Thu, 21 May 2026 22:15:17 +0000 Subject: [PATCH 1/3] feat(appkit): allowlist DBSQL error codes whose messages are safe to forward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the error-sanitization work on #329. The clientMessage default of "Query execution failed" was applied uniformly to every ExecutionError.statementFailed call, which hid genuinely user-facing errors — the user's own SQL "Table 'foo' not found", "Syntax error near ',' on line 3", etc. — and forced users to debug in the dark. The fix is *not* to remove sanitization (CWE-209 is real — control plane messages carry correlation IDs, internal RPC paths, stack traces), but to distinguish the two sources by `ServiceErrorCode`: 1. **DBR (data plane)** — populates `errorDisplayMessage` for user-authored SQL errors and surfaces them via the gateway untouched. Source: `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`, `ErrorInfo.withMessage(command.errorDisplayMessage).withErrorReturnedFromDataPlane(true)`. Codes: BAD_REQUEST, NOT_FOUND, ALREADY_EXISTS, DEADLINE_EXCEEDED, CANCELLED, UNAUTHENTICATED. 2. **Control plane** — wraps internal RPC failures, scheduler exceptions, capacity rejections. Source: `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala`, `exceptions.scala`. Codes: INTERNAL_ERROR, IO_ERROR, UNKNOWN, RESOURCE_EXHAUSTED, SERVICE_UNDER_MAINTENANCE, TEMPORARILY_UNAVAILABLE, WORKSPACE_TEMPORARILY_UNAVAILABLE, ABORTED. `dbsql-error-allowlist.ts` enumerates source-1 codes. The Set is default-deny: any code not on the list — including future SDK additions the allowlist hasn't yet been reviewed against — collapses to the generic clientMessage. An explicit `clientMessage` argument to `statementFailed()` always wins. Server-side `.message` always preserves the raw upstream text for `logger.error` capture; sanitization is purely at the wire boundary. Validation: 2,129 appkit tests pass (+31 new); tsc clean; biome clean. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../src/errors/dbsql-error-allowlist.ts | 97 ++++++++++++ packages/appkit/src/errors/execution.ts | 42 +++-- .../tests/dbsql-error-allowlist.test.ts | 54 +++++++ .../appkit/src/errors/tests/errors.test.ts | 148 ++++++++++++++++++ 4 files changed, 330 insertions(+), 11 deletions(-) create mode 100644 packages/appkit/src/errors/dbsql-error-allowlist.ts create mode 100644 packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts new file mode 100644 index 000000000..080cc781d --- /dev/null +++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts @@ -0,0 +1,97 @@ +/** + * Allowlist of Databricks SQL Statement Execution API error codes whose + * `error.message` is safe to forward to clients verbatim. + * + * Background — the warehouse populates `error.message` from two distinct + * sources, distinguishable only by `error.error_code`: + * + * 1. **DBR (data plane)** sets `errorDisplayMessage` for user-authored + * SQL errors (parse, semantic, type, missing-table) and surfaces them + * via the gateway untouched. These messages are inherently user-facing + * — they reference the user's own SQL — and were filtered through + * DBR's Thrift layer with the express purpose of being shown back to + * the user. They are *safe to passthrough*. + * + * Source of truth: DBR -> SQL Gateway scheduler + * `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208` + * populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and + * sets `withErrorReturnedFromDataPlane(true)` for these. + * + * 2. **Control plane (proxy / scheduler / metaservice)** sets + * `error.message` from internal RPC failures, scheduler exceptions, + * capacity rejections, etc. These messages can include correlation + * IDs, internal pod / service names, stack traces, and storage paths + * — they are *not safe to passthrough* (CWE-209). + * + * Source: `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala` + * and `exceptions.scala` wrap internal failures with + * `CommandExecutionFailed`. + * + * The mapping below classifies each `ServiceErrorCode` value from + * `@databricks/sdk-experimental` (`apis/sql/model.d.ts`) by which source + * dominates in practice. When the SDK adds a new code, default it to + * `false` here so it stays out of the passthrough until reviewed. + * + * SDK type reference: + * ``` + * type ServiceErrorCode = + * | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED" + * | "DEADLINE_EXCEEDED" | "INTERNAL_ERROR" | "IO_ERROR" | "NOT_FOUND" + * | "RESOURCE_EXHAUSTED" | "SERVICE_UNDER_MAINTENANCE" + * | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN" + * | "WORKSPACE_TEMPORARILY_UNAVAILABLE"; + * ``` + */ + +/** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */ +const SAFE_PASSTHROUGH_CODES: ReadonlySet = new Set([ + // User-authored SQL errors flow through DBR's errorDisplayMessage. + // Examples: "Table or view 'foo' not found", "Syntax error at or near + // ',' on line 3", "Cannot resolve column 'col_x' given input columns". + "BAD_REQUEST", + // Catalog lookup misses — "Table 'foo' not found", "Schema 'bar' not + // found". DBR sets these via the catalog layer; messages reference + // the user's own SQL. + "NOT_FOUND", + // Catalog conflicts — "Table 'foo' already exists". Same provenance. + "ALREADY_EXISTS", + // Query timeout. Message is typically "Query timed out after Ns" — + // user-actionable, no internal state. + "DEADLINE_EXCEEDED", + // Client / admin cancellation. Message is "Statement was canceled". + "CANCELLED", + // Authentication failures don't carry internal state; the SDK surfaces + // a generic "Permission denied" or "Authentication required" string. + "UNAUTHENTICATED", +]); + +/** + * Codes that explicitly carry internal control-plane wording and must + * NEVER pass through to clients. Listed for documentation; classification + * is by absence from `SAFE_PASSTHROUGH_CODES`. + * + * - `INTERNAL_ERROR` — control-plane stack traces, RPC failures. + * - `IO_ERROR` — storage / network paths (may leak bucket names, pod IPs). + * - `UNKNOWN` — by definition unclassified; default-deny. + * - `RESOURCE_EXHAUSTED` — mixed; control plane sometimes leaks + * internal load data ("scheduler-pod-3 at 92% CPU"). Default-deny. + * - `SERVICE_UNDER_MAINTENANCE` / `TEMPORARILY_UNAVAILABLE` / + * `WORKSPACE_TEMPORARILY_UNAVAILABLE` — typically generic, but the + * gateway has been observed to include workspace identifiers and + * scheduler pod names. Default-deny. + * - `ABORTED` — server-side abort; can carry transaction / lock context + * identifying internal locking subsystems. Default-deny. + */ + +/** + * Returns true when the given upstream `error_code` belongs to the + * passthrough allowlist — i.e. the corresponding `error.message` was + * authored by DBR for the user's own SQL and is safe to forward. + * + * Returns false (default-deny) for any unrecognized code, including + * codes added to the SDK after this allowlist was last reviewed. + */ +export function isSqlErrorPassthrough(errorCode: string | undefined): boolean { + if (!errorCode) return false; + return SAFE_PASSTHROUGH_CODES.has(errorCode); +} diff --git a/packages/appkit/src/errors/execution.ts b/packages/appkit/src/errors/execution.ts index 3c9df7fe0..3dc2944f3 100644 --- a/packages/appkit/src/errors/execution.ts +++ b/packages/appkit/src/errors/execution.ts @@ -1,4 +1,5 @@ import { AppKitError } from "./base"; +import { isSqlErrorPassthrough } from "./dbsql-error-allowlist"; /** * Error thrown when an operation execution fails. @@ -50,16 +51,22 @@ export class ExecutionError extends AppKitError { /** * Create an execution error for statement failure. * @param errorMessage Human-readable error from the warehouse / SDK. - * Goes into `.message` for server logs only — *never* echoed to the - * client. Pass `clientMessage` explicitly if a sanitized text should - * reach the UI. - * @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to - * preserve through wrapping. Optional. Forwarded on SSE error - * payloads so UI can branch on it instead of substring-matching - * `error`. - * @param clientMessage Optional client-safe replacement for `.message`. - * Defaults to "Query execution failed" via the `clientMessage` - * getter. Set this only when the upstream text is known-safe. + * When `errorCode` is on the DBSQL safe-passthrough allowlist (see + * `dbsql-error-allowlist.ts`), this text is forwarded to clients + * verbatim — it's authored by DBR for the user's own SQL and is + * inherently user-facing. For codes NOT on the allowlist (control + * plane errors carrying correlation IDs, internal paths, stack + * traces), `.message` goes into server logs only and the client + * sees a generic "Query execution failed". An explicit + * `clientMessage` argument always wins over both paths. + * @param errorCode Structured code (e.g. "BAD_REQUEST", + * "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional. + * Forwarded on SSE error payloads so UI can branch on it instead of + * substring-matching `error`. Also drives the passthrough decision. + * @param clientMessage Explicit client-safe message that always wins — + * bypasses the allowlist check. Use when the upstream text is + * known-safe regardless of code (e.g. constructed in-process from a + * trusted template). */ static statementFailed( errorMessage?: string, @@ -69,7 +76,20 @@ export class ExecutionError extends AppKitError { const message = errorMessage ? `Statement failed: ${errorMessage}` : "Statement failed: Unknown error"; - return new ExecutionError(message, { errorCode, clientMessage }); + + // Allowlist-driven passthrough: BAD_REQUEST / NOT_FOUND / + // ALREADY_EXISTS / etc. carry DBR-authored messages that *are* the + // user's own SQL error ("Table 'foo' not found", "Syntax error + // near ',' on line 3"). Hiding these forces users to debug in the + // dark; surfacing them is the entire reason an error message + // exists. Any code not on the allowlist defaults to generic. + const inferredClient = + clientMessage ?? (isSqlErrorPassthrough(errorCode) ? message : undefined); + + return new ExecutionError(message, { + errorCode, + clientMessage: inferredClient, + }); } /** diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts new file mode 100644 index 000000000..f65856dd2 --- /dev/null +++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, test } from "vitest"; +import { isSqlErrorPassthrough } from "../dbsql-error-allowlist"; + +describe("isSqlErrorPassthrough", () => { + // Allowlist members — DBR-authored, user-facing messages. + test.each([ + ["BAD_REQUEST"], + ["NOT_FOUND"], + ["ALREADY_EXISTS"], + ["DEADLINE_EXCEEDED"], + ["CANCELLED"], + ["UNAUTHENTICATED"], + ])("%s is allowlisted (DBR-authored user-facing message)", (code) => { + expect(isSqlErrorPassthrough(code)).toBe(true); + }); + + // Explicit denylist — control-plane sourced, internal state risk. + test.each([ + ["INTERNAL_ERROR"], + ["IO_ERROR"], + ["UNKNOWN"], + ["RESOURCE_EXHAUSTED"], + ["SERVICE_UNDER_MAINTENANCE"], + ["TEMPORARILY_UNAVAILABLE"], + ["WORKSPACE_TEMPORARILY_UNAVAILABLE"], + ["ABORTED"], + ])( + "%s is denied (control-plane sourced, may carry internal state)", + (code) => { + expect(isSqlErrorPassthrough(code)).toBe(false); + }, + ); + + test("undefined and empty string are denied (default-deny on missing source)", () => { + expect(isSqlErrorPassthrough(undefined)).toBe(false); + expect(isSqlErrorPassthrough("")).toBe(false); + }); + + test("any unrecognized code (e.g. new SDK additions) is denied", () => { + // When the SDK ships a new ServiceErrorCode variant, the allowlist + // must default-deny it until a human reviews the upstream message + // source. This test pins the default-deny behavior. + expect(isSqlErrorPassthrough("BRAND_NEW_CODE")).toBe(false); + expect(isSqlErrorPassthrough("PERMISSION_DENIED")).toBe(false); + expect(isSqlErrorPassthrough("FOOBAR")).toBe(false); + }); + + test("case-sensitive match — codes from the SDK are SHOUTY_CASE, lowercased input is rejected", () => { + // Defensive: if an upstream layer ever lowercases the code, we + // want the allowlist to fail closed rather than silently match. + expect(isSqlErrorPassthrough("bad_request")).toBe(false); + expect(isSqlErrorPassthrough("Bad_Request")).toBe(false); + }); +}); diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts index 347ce1c0d..03cc80017 100644 --- a/packages/appkit/src/errors/tests/errors.test.ts +++ b/packages/appkit/src/errors/tests/errors.test.ts @@ -303,6 +303,154 @@ describe("ExecutionError", () => { expect(error.message).toBe("No chunks or schema found in response"); expect(error.context?.dataType).toBe("chunks or schema"); }); + + describe("statementFailed clientMessage passthrough", () => { + test("BAD_REQUEST passes through the warehouse-authored message — user's own SQL error", () => { + // DBR sets errorDisplayMessage from the SQL parser / semantic + // analyzer; "Table not found" is the user's own SQL artifact + // and the entire point of an error message is to show it back. + const error = ExecutionError.statementFailed( + "Table or view 'users' not found", + "BAD_REQUEST", + ); + expect(error.clientMessage).toBe( + "Statement failed: Table or view 'users' not found", + ); + expect(error.errorCode).toBe("BAD_REQUEST"); + }); + + test("NOT_FOUND passes through (catalog miss is user-authored)", () => { + const error = ExecutionError.statementFailed( + "Schema 'analytics' not found", + "NOT_FOUND", + ); + expect(error.clientMessage).toBe( + "Statement failed: Schema 'analytics' not found", + ); + }); + + test("ALREADY_EXISTS passes through (catalog conflict is user-authored)", () => { + const error = ExecutionError.statementFailed( + "Table 'foo' already exists", + "ALREADY_EXISTS", + ); + expect(error.clientMessage).toBe( + "Statement failed: Table 'foo' already exists", + ); + }); + + test("DEADLINE_EXCEEDED passes through (timeout text is user-actionable)", () => { + const error = ExecutionError.statementFailed( + "Query timed out after 30s", + "DEADLINE_EXCEEDED", + ); + expect(error.clientMessage).toBe( + "Statement failed: Query timed out after 30s", + ); + }); + + test("CANCELLED passes through (cancellation is initiated by user/admin)", () => { + const error = ExecutionError.statementFailed( + "Statement was canceled", + "CANCELLED", + ); + expect(error.clientMessage).toBe( + "Statement failed: Statement was canceled", + ); + }); + + test("UNAUTHENTICATED passes through (SDK auth messages are generic)", () => { + const error = ExecutionError.statementFailed( + "Permission denied", + "UNAUTHENTICATED", + ); + expect(error.clientMessage).toBe("Statement failed: Permission denied"); + }); + + test("INTERNAL_ERROR collapses to generic — control-plane text may carry stack traces, correlation IDs, internal paths (CWE-209)", () => { + const error = ExecutionError.statementFailed( + "RPC to warehouse-prod-us-east-1.svc failed: corrId=abc-123, stack: at scheduler.process()", + "INTERNAL_ERROR", + ); + // .message keeps the raw text for server logs. + expect(error.message).toContain("corrId=abc-123"); + // .clientMessage collapses to the generic default. + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("IO_ERROR collapses to generic (may leak storage paths / bucket names)", () => { + const error = ExecutionError.statementFailed( + "Failed to read s3://internal-staging-bucket-3/path/to/shard", + "IO_ERROR", + ); + expect(error.clientMessage).toBe("Query execution failed"); + expect(error.message).toContain("internal-staging-bucket-3"); + }); + + test("UNKNOWN collapses to generic (unclassified by definition)", () => { + const error = ExecutionError.statementFailed( + "Something went wrong: scheduler-pod-3 returned 500", + "UNKNOWN", + ); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("RESOURCE_EXHAUSTED collapses to generic (mixed: may leak load data)", () => { + const error = ExecutionError.statementFailed( + "Cluster at 92% capacity, scheduler-pod-3", + "RESOURCE_EXHAUSTED", + ); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("absent errorCode falls back to generic (default-deny on unknown source)", () => { + const error = ExecutionError.statementFailed("Some upstream text"); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("unrecognized errorCode (post-SDK-update) defaults to generic", () => { + // When the SDK adds a new code we haven't reviewed yet, we must + // NOT passthrough by default — security relies on the allowlist + // being explicit. + const error = ExecutionError.statementFailed( + "scheduler internal: stack overflow at sun.misc.Unsafe.park()", + "SOME_NEW_CODE_NOT_IN_ALLOWLIST", + ); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("explicit clientMessage argument wins over the allowlist (caller knows best)", () => { + // Even on an allowlisted code, an explicit clientMessage takes + // precedence — for cases where the caller has constructed a + // better-tuned message in-process. + const error = ExecutionError.statementFailed( + "Table or view 'users' not found", + "BAD_REQUEST", + "We could not find the requested table.", + ); + expect(error.clientMessage).toBe( + "We could not find the requested table.", + ); + }); + + test("server-side .message always preserves the raw upstream text for log debugging", () => { + // Regardless of passthrough decision, the raw text stays on + // `.message` for `logger.error(err)` calls to capture full detail. + const safe = ExecutionError.statementFailed( + "Table 'users' not found", + "BAD_REQUEST", + ); + expect(safe.message).toBe("Statement failed: Table 'users' not found"); + + const unsafe = ExecutionError.statementFailed( + "Internal RPC failed: corrId=abc", + "INTERNAL_ERROR", + ); + expect(unsafe.message).toBe( + "Statement failed: Internal RPC failed: corrId=abc", + ); + }); + }); }); describe("InitializationError", () => { From a5c8e4a4377ac581b5a93ddc4702e8498c6260f3 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 22 May 2026 10:58:05 +0000 Subject: [PATCH 2/3] fix(appkit): expand allowlist to 5xx-class user-facing service errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer pushback on the original allowlist: TEMPORARILY_UNAVAILABLE, RESOURCE_EXHAUSTED, SERVICE_UNDER_MAINTENANCE, WORKSPACE_TEMPORARILY_- UNAVAILABLE, and ABORTED carry messages that ARE the entire point of having an error string at the wire — telling users "Query execution failed" when the actual cause is a quota / maintenance / shard-moving condition forces them to guess. Allowlisting these. Verified against universe sources before reclassifying: - TEMPORARILY_UNAVAILABLE: stable template "DBSQL temporarily unavailable. Please try again in a few minutes." constructed at `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala` and `client/linkstore/SqlGatewayClerk.scala`. - RESOURCE_EXHAUSTED: stable user-actionable templates from `SchedulerRpcValidatorHook.scala`: "The maximum number of warehouses has been reached. Please contact Databricks support.", "You've hit the limit for warehouses for free usage. Stop or delete existing warehouses to free up capacity." - WORKSPACE_TEMPORARILY_UNAVAILABLE: workspace-level shard-routing messages at `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala` — names a shard ID, which is internal topology metadata but low-sensitivity and operationally useful for retry. - ABORTED: short reason strings like "version mismatch" at `ReydenWarehouseMonitor`. - SERVICE_UNDER_MAINTENANCE: maintenance-window template messages. INTERNAL_ERROR, IO_ERROR, UNKNOWN stay denied — these wrap raw `ex.getMessage` and interpolate orgIds, warehouse IDs, stack traces. `EndpointModel.scala` emits "SQL ${conf.warehouse} cannot be created due to repeated collisions on unique IDs." — exactly the kind of internal-state leak the sanitization exists to prevent. Reframed the module docs from "DBR vs control plane" to "designed-for-user vs designed-for-debugging" — the same code can be constructed at user-facing or debug-facing sites, but the contested 5xx-class codes are overwhelmingly the former in practice. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../src/errors/dbsql-error-allowlist.ts | 147 ++++++++++++------ .../tests/dbsql-error-allowlist.test.ts | 35 +++-- .../appkit/src/errors/tests/errors.test.ts | 52 ++++++- 3 files changed, 166 insertions(+), 68 deletions(-) diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts index 080cc781d..34cbefd56 100644 --- a/packages/appkit/src/errors/dbsql-error-allowlist.ts +++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts @@ -2,37 +2,66 @@ * Allowlist of Databricks SQL Statement Execution API error codes whose * `error.message` is safe to forward to clients verbatim. * - * Background — the warehouse populates `error.message` from two distinct - * sources, distinguishable only by `error.error_code`: + * Framing: each `ServiceErrorCode` value falls into one of two camps, + * based on what wording is interpolated into `error.message` at the + * gateway construction site: * - * 1. **DBR (data plane)** sets `errorDisplayMessage` for user-authored - * SQL errors (parse, semantic, type, missing-table) and surfaces them - * via the gateway untouched. These messages are inherently user-facing - * — they reference the user's own SQL — and were filtered through - * DBR's Thrift layer with the express purpose of being shown back to - * the user. They are *safe to passthrough*. + * 1. **Designed-for-user** — the gateway interpolates a stable template + * string authored for end users (e.g. "DBSQL temporarily unavailable. + * Please try again in a few minutes.") or a DBR-stamped + * `errorDisplayMessage` covering the user's own SQL. These messages + * are *the entire point* of having an error string — hiding them + * forces users to debug in the dark. *Safe to passthrough.* * - * Source of truth: DBR -> SQL Gateway scheduler - * `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208` - * populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and - * sets `withErrorReturnedFromDataPlane(true)` for these. + * 2. **Designed-for-debugging** — the gateway passes through a wrapped + * `ex.getMessage` or interpolates internal identifiers (orgId, unique + * warehouse IDs, scheduler pod names, internal column structure). + * These are intended for operator log inspection, not for clients. + * *Not safe to passthrough* (CWE-209). * - * 2. **Control plane (proxy / scheduler / metaservice)** sets - * `error.message` from internal RPC failures, scheduler exceptions, - * capacity rejections, etc. These messages can include correlation - * IDs, internal pod / service names, stack traces, and storage paths - * — they are *not safe to passthrough* (CWE-209). + * Source-of-truth construction sites for the classifications below: * - * Source: `sqlgateway/scheduler/src/main/scala/utils/CommandUtils.scala` - * and `exceptions.scala` wrap internal failures with - * `CommandExecutionFailed`. + * - DBR data-plane (`BAD_REQUEST`, `NOT_FOUND`, `ALREADY_EXISTS`): + * `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208` + * populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and + * sets `withErrorReturnedFromDataPlane(true)`. * - * The mapping below classifies each `ServiceErrorCode` value from - * `@databricks/sdk-experimental` (`apis/sql/model.d.ts`) by which source - * dominates in practice. When the SDK adds a new code, default it to - * `false` here so it stays out of the passthrough until reviewed. + * - User-facing service messages (`TEMPORARILY_UNAVAILABLE`, + * `WORKSPACE_TEMPORARILY_UNAVAILABLE`, `SERVICE_UNDER_MAINTENANCE`): + * - `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala` + * constructs `SchedulerTemporarilyUnavailableException` with the + * literal "DBSQL temporarily unavailable. Please try again in a + * few minutes." + * - `sqlgateway/scheduler/api/src/main/scala/client/linkstore/SqlGatewayClerk.scala` + * constructs with `internalMessage = "DBSQL temporarily unavailable..."` + * - `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala` + * constructs `WORKSPACE_TEMPORARILY_UNAVAILABLE` from + * `NotServedByShardException.getMessage` — names a shard but no + * credentials / PII / stack traces. * - * SDK type reference: + * - User-actionable quota messages (`RESOURCE_EXHAUSTED`): + * `sqlgateway/scheduler/src/main/scala/SchedulerRpcValidatorHook.scala` + * emits stable templates: "The maximum number of warehouses has been + * reached. Please contact Databricks support.", "You've hit the limit + * for warehouses for free usage. Stop or delete existing warehouses + * to free up capacity.", etc. + * + * - Concurrency conflict (`ABORTED`): short reason strings like + * "version mismatch" (Reyden warehouse monitor). No internal context. + * + * - Genuinely-internal-only (`INTERNAL_ERROR`, `IO_ERROR`, `UNKNOWN`): + * `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala` + * interpolates `s"SQL ${conf.warehouse} cannot be created due to + * repeated collisions on unique IDs."` — exposes internal database + * state. `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)` + * interpolates orgId. Test fixtures show stack traces interpolated + * into `internalMessage`. *Default-deny.* + * + * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level + * codes with generic message templates. + * + * SDK type reference (`@databricks/sdk-experimental`, + * `apis/sql/model.d.ts`): * ``` * type ServiceErrorCode = * | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED" @@ -41,46 +70,62 @@ * | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN" * | "WORKSPACE_TEMPORARILY_UNAVAILABLE"; * ``` + * + * **Default-deny on unknown codes** — when the SDK ships a new variant + * the allowlist hasn't been reviewed against, the new code collapses to + * the generic clientMessage until someone audits the construction sites + * and updates the Set below. */ /** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */ const SAFE_PASSTHROUGH_CODES: ReadonlySet = new Set([ - // User-authored SQL errors flow through DBR's errorDisplayMessage. - // Examples: "Table or view 'foo' not found", "Syntax error at or near - // ',' on line 3", "Cannot resolve column 'col_x' given input columns". + // DBR-authored: user's own SQL errors. Parse, semantic, type, missing + // table/column. Examples: "Table or view 'foo' not found", "Syntax + // error at or near ',' on line 3". "BAD_REQUEST", - // Catalog lookup misses — "Table 'foo' not found", "Schema 'bar' not - // found". DBR sets these via the catalog layer; messages reference - // the user's own SQL. + // DBR catalog miss — "Table 'foo' not found", "Schema 'bar' not found". "NOT_FOUND", - // Catalog conflicts — "Table 'foo' already exists". Same provenance. + // DBR catalog conflict — "Table 'foo' already exists". "ALREADY_EXISTS", - // Query timeout. Message is typically "Query timed out after Ns" — - // user-actionable, no internal state. + // Stable template: "Query timed out after Ns". User-actionable. "DEADLINE_EXCEEDED", - // Client / admin cancellation. Message is "Statement was canceled". + // "Statement was canceled". User/admin-initiated; no internal state. "CANCELLED", - // Authentication failures don't carry internal state; the SDK surfaces - // a generic "Permission denied" or "Authentication required" string. + // SDK generic — "Permission denied", "Authentication required". "UNAUTHENTICATED", + // Stable user-facing template: "DBSQL temporarily unavailable. Please + // try again in a few minutes." Hiding this would force users to + // distinguish "warehouse issue" from "user error" by guesswork. + "TEMPORARILY_UNAVAILABLE", + // Workspace-level service messages from shard-routing failures. The + // message may name a shard ID — that's internal topology metadata, + // low sensitivity, and operationally useful for the user to know + // the workspace is in motion. + "WORKSPACE_TEMPORARILY_UNAVAILABLE", + // Maintenance windows — generic template messages. + "SERVICE_UNDER_MAINTENANCE", + // Stable user-actionable templates from the warehouse quota path: + // "The maximum number of warehouses has been reached. Please contact + // Databricks support.", "Stop or delete existing warehouses to free + // up capacity.", etc. Telling users "Query execution failed" when + // they've actually hit a quota is unhelpful. + "RESOURCE_EXHAUSTED", + // Concurrency conflict — short reason strings ("version mismatch", + // "concurrent modification"). User-relevant for retry decisions. + "ABORTED", ]); /** - * Codes that explicitly carry internal control-plane wording and must - * NEVER pass through to clients. Listed for documentation; classification - * is by absence from `SAFE_PASSTHROUGH_CODES`. + * Explicitly NOT on the allowlist (documented for future-reviewer + * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`): * - * - `INTERNAL_ERROR` — control-plane stack traces, RPC failures. - * - `IO_ERROR` — storage / network paths (may leak bucket names, pod IPs). - * - `UNKNOWN` — by definition unclassified; default-deny. - * - `RESOURCE_EXHAUSTED` — mixed; control plane sometimes leaks - * internal load data ("scheduler-pod-3 at 92% CPU"). Default-deny. - * - `SERVICE_UNDER_MAINTENANCE` / `TEMPORARILY_UNAVAILABLE` / - * `WORKSPACE_TEMPORARILY_UNAVAILABLE` — typically generic, but the - * gateway has been observed to include workspace identifiers and - * scheduler pod names. Default-deny. - * - `ABORTED` — server-side abort; can carry transaction / lock context - * identifying internal locking subsystems. Default-deny. + * - `INTERNAL_ERROR` — construction sites interpolate unique warehouse + * IDs, orgIds, and wrap `ex.getMessage` from arbitrary exceptions. + * Example: `s"SQL ${conf.warehouse} cannot be created due to repeated + * collisions on unique IDs."` (`EndpointModel.scala`). High leak risk. + * - `IO_ERROR` — storage / network failures; messages typically include + * bucket names, paths, internal hostnames. + * - `UNKNOWN` — unclassified by definition; can be anything. */ /** diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts index f65856dd2..115f3e25b 100644 --- a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts +++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts @@ -2,30 +2,37 @@ import { describe, expect, test } from "vitest"; import { isSqlErrorPassthrough } from "../dbsql-error-allowlist"; describe("isSqlErrorPassthrough", () => { - // Allowlist members — DBR-authored, user-facing messages. + // Allowlist members — designed-for-user messages. test.each([ + // DBR data plane — user's own SQL errors. ["BAD_REQUEST"], ["NOT_FOUND"], ["ALREADY_EXISTS"], + // SDK-level codes with generic templates. ["DEADLINE_EXCEEDED"], ["CANCELLED"], ["UNAUTHENTICATED"], - ])("%s is allowlisted (DBR-authored user-facing message)", (code) => { - expect(isSqlErrorPassthrough(code)).toBe(true); - }); - - // Explicit denylist — control-plane sourced, internal state risk. - test.each([ - ["INTERNAL_ERROR"], - ["IO_ERROR"], - ["UNKNOWN"], - ["RESOURCE_EXHAUSTED"], - ["SERVICE_UNDER_MAINTENANCE"], + // 5xx-class service messages — stable user-facing templates + // ("DBSQL temporarily unavailable. Please try again in a few + // minutes."). Hiding these forces users to guess whether a failure + // is on their side or ours. ["TEMPORARILY_UNAVAILABLE"], ["WORKSPACE_TEMPORARILY_UNAVAILABLE"], + ["SERVICE_UNDER_MAINTENANCE"], + // Quota — stable user-actionable templates ("Stop or delete + // existing warehouses to free up capacity."). + ["RESOURCE_EXHAUSTED"], + // Concurrency conflict — short reason strings, user-relevant for + // retry decisions. ["ABORTED"], - ])( - "%s is denied (control-plane sourced, may carry internal state)", + ])("%s is allowlisted (designed-for-user message)", (code) => { + expect(isSqlErrorPassthrough(code)).toBe(true); + }); + + // Explicit denylist — designed-for-debugging, interpolates internal + // identifiers, stack traces, or storage paths. + test.each([["INTERNAL_ERROR"], ["IO_ERROR"], ["UNKNOWN"]])( + "%s is denied (designed-for-debugging, interpolates internal state)", (code) => { expect(isSqlErrorPassthrough(code)).toBe(false); }, diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts index 03cc80017..296e0ebc5 100644 --- a/packages/appkit/src/errors/tests/errors.test.ts +++ b/packages/appkit/src/errors/tests/errors.test.ts @@ -395,12 +395,58 @@ describe("ExecutionError", () => { expect(error.clientMessage).toBe("Query execution failed"); }); - test("RESOURCE_EXHAUSTED collapses to generic (mixed: may leak load data)", () => { + test("RESOURCE_EXHAUSTED passes through (quota templates are user-actionable)", () => { + // SchedulerRpcValidatorHook.scala emits stable templates like + // "The maximum number of warehouses has been reached. Please + // contact Databricks support." Telling users "Query execution + // failed" when they've actually hit a quota is unhelpful. const error = ExecutionError.statementFailed( - "Cluster at 92% capacity, scheduler-pod-3", + "The maximum number of warehouses has been reached. Please contact Databricks support.", "RESOURCE_EXHAUSTED", ); - expect(error.clientMessage).toBe("Query execution failed"); + expect(error.clientMessage).toBe( + "Statement failed: The maximum number of warehouses has been reached. Please contact Databricks support.", + ); + }); + + test("TEMPORARILY_UNAVAILABLE passes through (stable service-down template)", () => { + // Source: SchedulerRpcClient.scala — fixed string authored for + // end users. + const error = ExecutionError.statementFailed( + "DBSQL temporarily unavailable. Please try again in a few minutes.", + "TEMPORARILY_UNAVAILABLE", + ); + expect(error.clientMessage).toBe( + "Statement failed: DBSQL temporarily unavailable. Please try again in a few minutes.", + ); + }); + + test("SERVICE_UNDER_MAINTENANCE passes through (maintenance messages are user-facing by design)", () => { + const error = ExecutionError.statementFailed( + "DBSQL is undergoing scheduled maintenance.", + "SERVICE_UNDER_MAINTENANCE", + ); + expect(error.clientMessage).toBe( + "Statement failed: DBSQL is undergoing scheduled maintenance.", + ); + }); + + test("WORKSPACE_TEMPORARILY_UNAVAILABLE passes through (shard topology is low-sensitivity, retry-relevant)", () => { + const error = ExecutionError.statementFailed( + "Workspace temporarily unavailable: shard moving", + "WORKSPACE_TEMPORARILY_UNAVAILABLE", + ); + expect(error.clientMessage).toBe( + "Statement failed: Workspace temporarily unavailable: shard moving", + ); + }); + + test("ABORTED passes through (conflict reason strings are retry-relevant)", () => { + const error = ExecutionError.statementFailed( + "version mismatch", + "ABORTED", + ); + expect(error.clientMessage).toBe("Statement failed: version mismatch"); }); test("absent errorCode falls back to generic (default-deny on unknown source)", () => { From b6ba29c697d677fcfe32814dde20a27beb416353 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 22 May 2026 11:20:51 +0000 Subject: [PATCH 3/3] fix(appkit): allowlist INTERNAL_ERROR + IO_ERROR; plumb requestId for triage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threat model walkthrough — the recipient of these messages is an authenticated workspace user, not an anonymous internet visitor. They already know: - Their workspace URL and orgId (visible in every SDK call) - The warehouses they have grants on (listable via the SDK) - Their catalog contents Interpolated identifiers in INTERNAL_ERROR / IO_ERROR messages (warehouse names, orgIds, internal class names) are not new information to this audience — the classic CWE-209 leak-to-anonymous-attacker scenario does not apply. The cost of denial (every internal error becomes "Query execution failed" with zero signal) is certain; the benefit (defense-in-depth against unbounded wrapped `ex.getMessage` content) is speculative. Allowlisting both, with a requestId on the SSE error frame as the safety net: - `SSEError` now carries `requestId` (the SSE event id, same value the server logs in its `Stream execution failed` line — quoting it in a support ticket lets staff grep logs directly). - `SSEWriter.writeError` populates `requestId` from the event id. - `stream-manager.ts` logs `requestId=...` alongside `rawMsg`, `errorCode`, `upstreamCode` so a user-supplied requestId pinpoints the exact log line. - Analytics `/arrow-result` 410/404 paths generate their own requestId and emit it in both the JSON response and the server log line. - `useAnalyticsQuery` exposes `requestId: string | null` on `UseAnalyticsQueryResult` so UI code can render "Error ref: ". Only `UNKNOWN` remains denied — unclassified by definition, so the contents are unbounded in both shape and source. Validation: 2,133 appkit + 292 appkit-ui tests pass; tsc clean. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../__tests__/use-analytics-query.test.ts | 16 ++-- packages/appkit-ui/src/react/hooks/types.ts | 9 ++ .../src/react/hooks/use-analytics-query.ts | 12 ++- .../src/errors/dbsql-error-allowlist.ts | 93 +++++++++++++------ .../tests/dbsql-error-allowlist.test.ts | 30 +++--- .../appkit/src/errors/tests/errors.test.ts | 42 +++++---- .../appkit/src/plugins/analytics/analytics.ts | 20 +++- packages/appkit/src/stream/sse-writer.ts | 8 ++ packages/appkit/src/stream/stream-manager.ts | 7 +- .../appkit/src/stream/tests/stream.test.ts | 11 ++- packages/appkit/src/stream/types.ts | 8 ++ 11 files changed, 182 insertions(+), 74 deletions(-) diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts index ba4c962b0..5417ac39c 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -184,12 +184,12 @@ describe("useAnalyticsQuery", () => { expect(result.current.data).toBeNull(); }); - test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => { - // The SSE error broadcaster forwards an `errorCode` field for - // UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook - // surfaces both the human `error` text AND the structured - // `errorCode` so consumers can branch on the stable identifier - // instead of substring-matching the sanitized human message. + test("a server error event carrying a structured errorCode + requestId exposes both on the hook return value", async () => { + // The SSE error broadcaster forwards `errorCode` (for UI branching) + // and `requestId` (for support triage — same id appears in the + // server-side logger.error line). The hook surfaces both so + // consumers can render "Error ref: " alongside the + // human message and branch behavior on the stable errorCode. const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); const { result } = renderHook(() => @@ -202,6 +202,7 @@ describe("useAnalyticsQuery", () => { error: "Server is at capacity, please retry", code: "UPSTREAM_ERROR", errorCode: "INLINE_ARROW_STASH_EXHAUSTED", + requestId: "11111111-2222-3333-4444-555555555555", }), }); @@ -210,6 +211,9 @@ describe("useAnalyticsQuery", () => { }); expect(result.current.loading).toBe(false); expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED"); + expect(result.current.requestId).toBe( + "11111111-2222-3333-4444-555555555555", + ); errorSpy.mockRestore(); }); diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index a603e10a4..feb1d8bf6 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -73,6 +73,15 @@ export interface UseAnalyticsQueryResult { * matching `error`, which is a free-form sanitized message. */ errorCode: string | null; + /** + * Server-side correlation id for support triage. When non-null, the + * server emitted this id on the error frame and the same id appears + * in the backend's `logger.error` line. Surface it in error UIs + * ("Error ref: abc-123 — quote when filing a support ticket") so + * staff can grep server logs directly instead of asking the user to + * reproduce. + */ + requestId: string | null; } /** diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index a3d2cf306..ff2b18bab 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -122,6 +122,7 @@ export function useAnalyticsQuery< const [loading, setLoading] = useState(false); const [error, setError] = useState(null); const [errorCode, setErrorCode] = useState(null); + const [requestId, setRequestId] = useState(null); const abortControllerRef = useRef(null); if (!queryKey || queryKey.trim().length === 0) { @@ -170,6 +171,7 @@ export function useAnalyticsQuery< setLoading(true); setError(null); setErrorCode(null); + setRequestId(null); setData(null); const abortController = new AbortController(); @@ -247,10 +249,16 @@ export function useAnalyticsQuery< if (typeof parsed.errorCode === "string") { setErrorCode(parsed.errorCode); } + // requestId is the SSE event id of the error frame — same + // value the server logs in its `Stream execution failed` + // line, so users can quote it in support tickets. + if (typeof parsed.requestId === "string") { + setRequestId(parsed.requestId); + } if (parsed.code) { console.error( - `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}`, + `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}, requestId: ${parsed.requestId ?? "n/a"}`, ); } return; @@ -321,5 +329,5 @@ export function useAnalyticsQuery< // Enable HMR for query updates in dev mode useQueryHMR(queryKey, start); - return { data, loading, error, errorCode }; + return { data, loading, error, errorCode, requestId }; } diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts index 34cbefd56..780797fff 100644 --- a/packages/appkit/src/errors/dbsql-error-allowlist.ts +++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts @@ -2,22 +2,35 @@ * Allowlist of Databricks SQL Statement Execution API error codes whose * `error.message` is safe to forward to clients verbatim. * - * Framing: each `ServiceErrorCode` value falls into one of two camps, - * based on what wording is interpolated into `error.message` at the - * gateway construction site: - * - * 1. **Designed-for-user** — the gateway interpolates a stable template - * string authored for end users (e.g. "DBSQL temporarily unavailable. - * Please try again in a few minutes.") or a DBR-stamped - * `errorDisplayMessage` covering the user's own SQL. These messages - * are *the entire point* of having an error string — hiding them - * forces users to debug in the dark. *Safe to passthrough.* - * - * 2. **Designed-for-debugging** — the gateway passes through a wrapped - * `ex.getMessage` or interpolates internal identifiers (orgId, unique - * warehouse IDs, scheduler pod names, internal column structure). - * These are intended for operator log inspection, not for clients. - * *Not safe to passthrough* (CWE-209). + * Threat model — the recipient of these messages is an *authenticated + * workspace user*, not an anonymous internet visitor. They already know + * the workspace URL, their orgId, their session token, the warehouses + * they have grants on, and the catalog contents they can see. Names of + * resources within the workspace, internal correlation IDs, and class + * names from Databricks-internal packages are not new information to + * this audience — the classic CWE-209 "stack-trace-on-500-page-leaks- + * to-attacker" scenario does not apply. + * + * Given that, the bar for passthrough is: does the message help the + * user (or their support contact) understand and act on the failure? + * Defaults skew toward yes because: + * + * - The cost of denial is *certain* (every error becomes "Query + * execution failed" with no signal; users open tickets, support has + * to reproduce or grep server logs to find the actual error). + * - The cost of allowing is *speculative* — depends on whatever the + * most recent wrapped exception happened to put in its `.toString()`. + * + * Server-side, the route handlers always `logger.error(rawMsg)` the + * full upstream text before sanitization — operators retain complete + * visibility regardless of what reaches the wire. SSE error frames + * also carry a `requestId` (the SSE event id) so users can quote it + * in support tickets and staff can grep logs against it directly. + * + * **Currently denied**: `UNKNOWN` only — by definition unclassified, + * so we can't reason about its contents and a default-deny is the + * minimum safe stance. Every other `ServiceErrorCode` variant is on + * the allowlist below with a documented rationale. * * Source-of-truth construction sites for the classifications below: * @@ -49,13 +62,25 @@ * - Concurrency conflict (`ABORTED`): short reason strings like * "version mismatch" (Reyden warehouse monitor). No internal context. * - * - Genuinely-internal-only (`INTERNAL_ERROR`, `IO_ERROR`, `UNKNOWN`): + * - `INTERNAL_ERROR` / `IO_ERROR`: * `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala` * interpolates `s"SQL ${conf.warehouse} cannot be created due to - * repeated collisions on unique IDs."` — exposes internal database - * state. `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)` - * interpolates orgId. Test fixtures show stack traces interpolated - * into `internalMessage`. *Default-deny.* + * repeated collisions on unique IDs."`; + * `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)` + * interpolates orgId. These identifiers are not sensitive to a + * workspace user (they own / can list both). The wrapped + * `ex.getMessage` content is unbounded but for the typical case is + * either operational (RPC timeout, lock contention) or shape-of-error + * information the user needs to act. Allowlisted with the requestId + * plumbing as the safety net for triage. *Not* allowlisted if a + * future review finds construction sites that interpolate user + * credentials, OAuth tokens, or other secrets — open an issue + flip + * the entry. + * + * - `UNKNOWN`: unclassified by definition. We have no way to reason + * about its contents — could be anything from "RPC connect failed" + * to a panic dump from an upstream library. Default-deny is the + * only defensible stance until the SDK gives us a finer code. * * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level * codes with generic message templates. @@ -113,19 +138,31 @@ const SAFE_PASSTHROUGH_CODES: ReadonlySet = new Set([ // Concurrency conflict — short reason strings ("version mismatch", // "concurrent modification"). User-relevant for retry decisions. "ABORTED", + // Wrapped internal exceptions. Construction sites interpolate + // workspace-internal identifiers (warehouse names, orgIds) and + // `ex.getMessage` from arbitrary causes. To an authenticated + // workspace user these identifiers are not sensitive — they can + // already enumerate them. The wrapped `ex.getMessage` content is + // unbounded but in practice is operational ("RPC timed out", + // "deadlock detected") or shape-of-failure detail the user needs to + // act on. The `requestId` we emit on the SSE error frame is the + // safety net for unhappy cases — users can quote it in tickets and + // staff can grep logs against it for the full unsanitized message. + "INTERNAL_ERROR", + // Storage / network failures. Same logic as INTERNAL_ERROR: path / + // bucket names visible to a workspace user with catalog access are + // not new information, and the wrapped error text usually tells + // them whether to retry, switch warehouses, or escalate. + "IO_ERROR", ]); /** * Explicitly NOT on the allowlist (documented for future-reviewer * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`): * - * - `INTERNAL_ERROR` — construction sites interpolate unique warehouse - * IDs, orgIds, and wrap `ex.getMessage` from arbitrary exceptions. - * Example: `s"SQL ${conf.warehouse} cannot be created due to repeated - * collisions on unique IDs."` (`EndpointModel.scala`). High leak risk. - * - `IO_ERROR` — storage / network failures; messages typically include - * bucket names, paths, internal hostnames. - * - `UNKNOWN` — unclassified by definition; can be anything. + * - `UNKNOWN` — unclassified by definition; could be anything from a + * library panic to an unhandled `case` in the gateway. Default-deny + * is the only defensible stance until the SDK gives us a finer code. */ /** diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts index 115f3e25b..638271440 100644 --- a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts +++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts @@ -2,7 +2,10 @@ import { describe, expect, test } from "vitest"; import { isSqlErrorPassthrough } from "../dbsql-error-allowlist"; describe("isSqlErrorPassthrough", () => { - // Allowlist members — designed-for-user messages. + // Allowlist members — passthrough is net-positive given the + // workspace-user threat model (recipient already knows the workspace's + // resources). Server-side full-detail logging + the requestId on the + // SSE error frame backstop the unhappy cases. test.each([ // DBR data plane — user's own SQL errors. ["BAD_REQUEST"], @@ -22,21 +25,24 @@ describe("isSqlErrorPassthrough", () => { // Quota — stable user-actionable templates ("Stop or delete // existing warehouses to free up capacity."). ["RESOURCE_EXHAUSTED"], - // Concurrency conflict — short reason strings, user-relevant for - // retry decisions. + // Concurrency conflict — short reason strings, user-relevant. ["ABORTED"], - ])("%s is allowlisted (designed-for-user message)", (code) => { + // Wrapped internal exceptions — interpolated identifiers (orgId, + // warehouse name) are non-sensitive to an authenticated workspace + // user; the wrapped `ex.getMessage` content is operationally + // useful for triage. RequestId on the SSE error frame is the + // safety net for unhappy cases. + ["INTERNAL_ERROR"], + ["IO_ERROR"], + ])("%s is allowlisted", (code) => { expect(isSqlErrorPassthrough(code)).toBe(true); }); - // Explicit denylist — designed-for-debugging, interpolates internal - // identifiers, stack traces, or storage paths. - test.each([["INTERNAL_ERROR"], ["IO_ERROR"], ["UNKNOWN"]])( - "%s is denied (designed-for-debugging, interpolates internal state)", - (code) => { - expect(isSqlErrorPassthrough(code)).toBe(false); - }, - ); + // Only UNKNOWN stays denied — unclassified by definition, so the + // wrapped content is unbounded in shape AND in source. + test("UNKNOWN is denied (unclassified — cannot reason about contents)", () => { + expect(isSqlErrorPassthrough("UNKNOWN")).toBe(false); + }); test("undefined and empty string are denied (default-deny on missing source)", () => { expect(isSqlErrorPassthrough(undefined)).toBe(false); diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts index 296e0ebc5..5a13fb69d 100644 --- a/packages/appkit/src/errors/tests/errors.test.ts +++ b/packages/appkit/src/errors/tests/errors.test.ts @@ -367,27 +367,32 @@ describe("ExecutionError", () => { expect(error.clientMessage).toBe("Statement failed: Permission denied"); }); - test("INTERNAL_ERROR collapses to generic — control-plane text may carry stack traces, correlation IDs, internal paths (CWE-209)", () => { + test("INTERNAL_ERROR passes through (workspace identifiers are non-sensitive to authenticated user; requestId backstops triage)", () => { + // Interpolated warehouse names and orgIds are not new information + // to a workspace user — they can enumerate both via the SDK. + // Wrapped `ex.getMessage` is operationally useful for the user + // to act ("RPC timed out", "deadlock detected") and the + // requestId on the SSE error frame keys server logs for triage. const error = ExecutionError.statementFailed( - "RPC to warehouse-prod-us-east-1.svc failed: corrId=abc-123, stack: at scheduler.process()", + "SQL my-warehouse cannot be created due to repeated collisions on unique IDs.", "INTERNAL_ERROR", ); - // .message keeps the raw text for server logs. - expect(error.message).toContain("corrId=abc-123"); - // .clientMessage collapses to the generic default. - expect(error.clientMessage).toBe("Query execution failed"); + expect(error.clientMessage).toBe( + "Statement failed: SQL my-warehouse cannot be created due to repeated collisions on unique IDs.", + ); }); - test("IO_ERROR collapses to generic (may leak storage paths / bucket names)", () => { + test("IO_ERROR passes through (storage path / network failure detail is actionable, not credential-bearing)", () => { const error = ExecutionError.statementFailed( - "Failed to read s3://internal-staging-bucket-3/path/to/shard", + "Failed to read manifest after 3 retries", "IO_ERROR", ); - expect(error.clientMessage).toBe("Query execution failed"); - expect(error.message).toContain("internal-staging-bucket-3"); + expect(error.clientMessage).toBe( + "Statement failed: Failed to read manifest after 3 retries", + ); }); - test("UNKNOWN collapses to generic (unclassified by definition)", () => { + test("UNKNOWN collapses to generic (unclassified by definition — cannot reason about contents)", () => { const error = ExecutionError.statementFailed( "Something went wrong: scheduler-pod-3 returned 500", "UNKNOWN", @@ -481,20 +486,23 @@ describe("ExecutionError", () => { test("server-side .message always preserves the raw upstream text for log debugging", () => { // Regardless of passthrough decision, the raw text stays on - // `.message` for `logger.error(err)` calls to capture full detail. + // `.message` for `logger.error(err)` calls to capture full + // detail. For UNKNOWN (the only denied code), clientMessage + // collapses but .message still has the full content. const safe = ExecutionError.statementFailed( "Table 'users' not found", "BAD_REQUEST", ); expect(safe.message).toBe("Statement failed: Table 'users' not found"); - const unsafe = ExecutionError.statementFailed( - "Internal RPC failed: corrId=abc", - "INTERNAL_ERROR", + const denied = ExecutionError.statementFailed( + "Unclassified failure with corrId=abc", + "UNKNOWN", ); - expect(unsafe.message).toBe( - "Statement failed: Internal RPC failed: corrId=abc", + expect(denied.message).toBe( + "Statement failed: Unclassified failure with corrId=abc", ); + expect(denied.clientMessage).toBe("Query execution failed"); }); }); }); diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 5ba121d1b..dca3ddd8f 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -1,3 +1,4 @@ +import { randomUUID } from "node:crypto"; import type { WorkspaceClient } from "@databricks/sdk-experimental"; import { tableFromIPC } from "apache-arrow"; import type express from "express"; @@ -146,9 +147,15 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { // Already drained, expired, or never belonged to this user. 410 // distinguishes this from "warehouse statement id not found" (404) // so the client can surface a useful error. - logger.debug("Inline Arrow stash miss for jobId=%s", jobId); + const requestId = randomUUID(); + logger.debug( + "Inline Arrow stash miss for jobId=%s requestId=%s", + jobId, + requestId, + ); res.status(410).json({ error: "Inline Arrow result expired or unknown", + requestId, plugin: this.name, }); return; @@ -183,15 +190,18 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { ); res.send(Buffer.from(result.data)); } catch (error) { - logger.error("Arrow job error: %O", error); - // Do not echo upstream / SDK error text to the client — it can - // include statement fragments, internal object names, and - // correlation IDs. The full detail stays in the server log above. + const requestId = randomUUID(); + logger.error("Arrow job error (requestId=%s): %O", requestId, error); + // Do not echo upstream / SDK error text to the client — the + // detail stays in the server log above, keyed by requestId so + // a user quoting it in a support ticket lets staff grep + // directly to this line. const errorCode = error instanceof ExecutionError ? error.errorCode : undefined; res.status(404).json({ error: "Arrow result unavailable", errorCode, + requestId, plugin: this.name, }); } diff --git a/packages/appkit/src/stream/sse-writer.ts b/packages/appkit/src/stream/sse-writer.ts index 2b49d1dee..bf3659937 100644 --- a/packages/appkit/src/stream/sse-writer.ts +++ b/packages/appkit/src/stream/sse-writer.ts @@ -43,9 +43,17 @@ export class SSEWriter { ): void { if (res.writableEnded) return; + // The SSE `id:` line and the in-payload `requestId` are + // intentionally the same value: the wire-level id is invisible to + // most React consumers (the event-source `data` is what reaches + // app code), so we duplicate it inside the JSON so the hook can + // surface it. Server-side logs use this same id in the + // `Stream execution failed` line — a user quoting it lets staff + // grep logs directly. const errorData: SSEError = { error, code, + requestId: eventId, ...(errorCode ? { errorCode } : {}), }; diff --git a/packages/appkit/src/stream/stream-manager.ts b/packages/appkit/src/stream/stream-manager.ts index 2cacbcb9c..2ac0ea968 100644 --- a/packages/appkit/src/stream/stream-manager.ts +++ b/packages/appkit/src/stream/stream-manager.ts @@ -309,17 +309,22 @@ export class StreamManager { if (errorCode === SSEErrorCode.STREAM_ABORTED) { logger.info("Stream aborted by client (code=%s)", errorCode); } else { + // Log line includes the requestId we'll emit to the client — + // a user quoting their requestId in a support ticket lets + // staff grep this exact line for the full raw message. logger.error( - "Stream execution failed: %s (code=%s upstreamCode=%s)", + "Stream execution failed: %s (code=%s upstreamCode=%s requestId=%s)", rawMsg, errorCode, upstreamCode ?? "n/a", + errorEventId, ); } const payload: Record = { error: clientMsg, code: errorCode, + requestId: errorEventId, }; if (upstreamCode) payload.errorCode = upstreamCode; diff --git a/packages/appkit/src/stream/tests/stream.test.ts b/packages/appkit/src/stream/tests/stream.test.ts index a99242038..243178eb3 100644 --- a/packages/appkit/src/stream/tests/stream.test.ts +++ b/packages/appkit/src/stream/tests/stream.test.ts @@ -271,9 +271,14 @@ describe("StreamManager", () => { await streamManager.stream(mockRes as any, generator); expect(events).toContain("event: error\n"); - expect(events).toContain( - 'data: {"error":"Internal server error","code":"INTERNAL_ERROR"}\n\n', - ); + // Match the error payload shape rather than the exact frame string — + // the frame now also carries a per-error requestId (randomUUID) for + // support triage, so we can't pin the full data line literally. + const dataLines = events.filter((e: string) => e.startsWith("data: ")); + const errorLine = dataLines.find((e: string) => e.includes('"error"')); + expect(errorLine).toContain('"error":"Internal server error"'); + expect(errorLine).toContain('"code":"INTERNAL_ERROR"'); + expect(errorLine).toMatch(/"requestId":"[0-9a-f-]+"/); }); test("should not crash if client disconnects during error", async () => { diff --git a/packages/appkit/src/stream/types.ts b/packages/appkit/src/stream/types.ts index ba0d4915e..89a573bad 100644 --- a/packages/appkit/src/stream/types.ts +++ b/packages/appkit/src/stream/types.ts @@ -31,6 +31,14 @@ export interface SSEError { * the human-readable `error` string. */ errorCode?: string; + /** + * Correlation id for support triage. Set to the SSE event id of the + * error frame — the same value that appears in the server-side + * `logger.error("Stream execution failed: ...")` line, so a user + * quoting this id lets staff grep logs directly. Stable per-error, + * unique across the process lifetime. + */ + requestId?: string; } export interface BufferedEvent {