diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts index ba4c962b0..5417ac39c 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -184,12 +184,12 @@ describe("useAnalyticsQuery", () => { expect(result.current.data).toBeNull(); }); - test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => { - // The SSE error broadcaster forwards an `errorCode` field for - // UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook - // surfaces both the human `error` text AND the structured - // `errorCode` so consumers can branch on the stable identifier - // instead of substring-matching the sanitized human message. + test("a server error event carrying a structured errorCode + requestId exposes both on the hook return value", async () => { + // The SSE error broadcaster forwards `errorCode` (for UI branching) + // and `requestId` (for support triage — same id appears in the + // server-side logger.error line). The hook surfaces both so + // consumers can render "Error ref: " alongside the + // human message and branch behavior on the stable errorCode. const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); const { result } = renderHook(() => @@ -202,6 +202,7 @@ describe("useAnalyticsQuery", () => { error: "Server is at capacity, please retry", code: "UPSTREAM_ERROR", errorCode: "INLINE_ARROW_STASH_EXHAUSTED", + requestId: "11111111-2222-3333-4444-555555555555", }), }); @@ -210,6 +211,9 @@ describe("useAnalyticsQuery", () => { }); expect(result.current.loading).toBe(false); expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED"); + expect(result.current.requestId).toBe( + "11111111-2222-3333-4444-555555555555", + ); errorSpy.mockRestore(); }); diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index a603e10a4..feb1d8bf6 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -73,6 +73,15 @@ export interface UseAnalyticsQueryResult { * matching `error`, which is a free-form sanitized message. */ errorCode: string | null; + /** + * Server-side correlation id for support triage. When non-null, the + * server emitted this id on the error frame and the same id appears + * in the backend's `logger.error` line. Surface it in error UIs + * ("Error ref: abc-123 — quote when filing a support ticket") so + * staff can grep server logs directly instead of asking the user to + * reproduce. + */ + requestId: string | null; } /** diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index a3d2cf306..ff2b18bab 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -122,6 +122,7 @@ export function useAnalyticsQuery< const [loading, setLoading] = useState(false); const [error, setError] = useState(null); const [errorCode, setErrorCode] = useState(null); + const [requestId, setRequestId] = useState(null); const abortControllerRef = useRef(null); if (!queryKey || queryKey.trim().length === 0) { @@ -170,6 +171,7 @@ export function useAnalyticsQuery< setLoading(true); setError(null); setErrorCode(null); + setRequestId(null); setData(null); const abortController = new AbortController(); @@ -247,10 +249,16 @@ export function useAnalyticsQuery< if (typeof parsed.errorCode === "string") { setErrorCode(parsed.errorCode); } + // requestId is the SSE event id of the error frame — same + // value the server logs in its `Stream execution failed` + // line, so users can quote it in support tickets. + if (typeof parsed.requestId === "string") { + setRequestId(parsed.requestId); + } if (parsed.code) { console.error( - `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}`, + `[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}, requestId: ${parsed.requestId ?? "n/a"}`, ); } return; @@ -321,5 +329,5 @@ export function useAnalyticsQuery< // Enable HMR for query updates in dev mode useQueryHMR(queryKey, start); - return { data, loading, error, errorCode }; + return { data, loading, error, errorCode, requestId }; } diff --git a/packages/appkit/src/errors/dbsql-error-allowlist.ts b/packages/appkit/src/errors/dbsql-error-allowlist.ts new file mode 100644 index 000000000..780797fff --- /dev/null +++ b/packages/appkit/src/errors/dbsql-error-allowlist.ts @@ -0,0 +1,179 @@ +/** + * Allowlist of Databricks SQL Statement Execution API error codes whose + * `error.message` is safe to forward to clients verbatim. + * + * Threat model — the recipient of these messages is an *authenticated + * workspace user*, not an anonymous internet visitor. They already know + * the workspace URL, their orgId, their session token, the warehouses + * they have grants on, and the catalog contents they can see. Names of + * resources within the workspace, internal correlation IDs, and class + * names from Databricks-internal packages are not new information to + * this audience — the classic CWE-209 "stack-trace-on-500-page-leaks- + * to-attacker" scenario does not apply. + * + * Given that, the bar for passthrough is: does the message help the + * user (or their support contact) understand and act on the failure? + * Defaults skew toward yes because: + * + * - The cost of denial is *certain* (every error becomes "Query + * execution failed" with no signal; users open tickets, support has + * to reproduce or grep server logs to find the actual error). + * - The cost of allowing is *speculative* — depends on whatever the + * most recent wrapped exception happened to put in its `.toString()`. + * + * Server-side, the route handlers always `logger.error(rawMsg)` the + * full upstream text before sanitization — operators retain complete + * visibility regardless of what reaches the wire. SSE error frames + * also carry a `requestId` (the SSE event id) so users can quote it + * in support tickets and staff can grep logs against it directly. + * + * **Currently denied**: `UNKNOWN` only — by definition unclassified, + * so we can't reason about its contents and a default-deny is the + * minimum safe stance. Every other `ServiceErrorCode` variant is on + * the allowlist below with a documented rationale. + * + * Source-of-truth construction sites for the classifications below: + * + * - DBR data-plane (`BAD_REQUEST`, `NOT_FOUND`, `ALREADY_EXISTS`): + * `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208` + * populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and + * sets `withErrorReturnedFromDataPlane(true)`. + * + * - User-facing service messages (`TEMPORARILY_UNAVAILABLE`, + * `WORKSPACE_TEMPORARILY_UNAVAILABLE`, `SERVICE_UNDER_MAINTENANCE`): + * - `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala` + * constructs `SchedulerTemporarilyUnavailableException` with the + * literal "DBSQL temporarily unavailable. Please try again in a + * few minutes." + * - `sqlgateway/scheduler/api/src/main/scala/client/linkstore/SqlGatewayClerk.scala` + * constructs with `internalMessage = "DBSQL temporarily unavailable..."` + * - `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala` + * constructs `WORKSPACE_TEMPORARILY_UNAVAILABLE` from + * `NotServedByShardException.getMessage` — names a shard but no + * credentials / PII / stack traces. + * + * - User-actionable quota messages (`RESOURCE_EXHAUSTED`): + * `sqlgateway/scheduler/src/main/scala/SchedulerRpcValidatorHook.scala` + * emits stable templates: "The maximum number of warehouses has been + * reached. Please contact Databricks support.", "You've hit the limit + * for warehouses for free usage. Stop or delete existing warehouses + * to free up capacity.", etc. + * + * - Concurrency conflict (`ABORTED`): short reason strings like + * "version mismatch" (Reyden warehouse monitor). No internal context. + * + * - `INTERNAL_ERROR` / `IO_ERROR`: + * `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala` + * interpolates `s"SQL ${conf.warehouse} cannot be created due to + * repeated collisions on unique IDs."`; + * `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)` + * interpolates orgId. These identifiers are not sensitive to a + * workspace user (they own / can list both). The wrapped + * `ex.getMessage` content is unbounded but for the typical case is + * either operational (RPC timeout, lock contention) or shape-of-error + * information the user needs to act. Allowlisted with the requestId + * plumbing as the safety net for triage. *Not* allowlisted if a + * future review finds construction sites that interpolate user + * credentials, OAuth tokens, or other secrets — open an issue + flip + * the entry. + * + * - `UNKNOWN`: unclassified by definition. We have no way to reason + * about its contents — could be anything from "RPC connect failed" + * to a panic dump from an upstream library. Default-deny is the + * only defensible stance until the SDK gives us a finer code. + * + * - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level + * codes with generic message templates. + * + * SDK type reference (`@databricks/sdk-experimental`, + * `apis/sql/model.d.ts`): + * ``` + * type ServiceErrorCode = + * | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED" + * | "DEADLINE_EXCEEDED" | "INTERNAL_ERROR" | "IO_ERROR" | "NOT_FOUND" + * | "RESOURCE_EXHAUSTED" | "SERVICE_UNDER_MAINTENANCE" + * | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN" + * | "WORKSPACE_TEMPORARILY_UNAVAILABLE"; + * ``` + * + * **Default-deny on unknown codes** — when the SDK ships a new variant + * the allowlist hasn't been reviewed against, the new code collapses to + * the generic clientMessage until someone audits the construction sites + * and updates the Set below. + */ + +/** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */ +const SAFE_PASSTHROUGH_CODES: ReadonlySet = new Set([ + // DBR-authored: user's own SQL errors. Parse, semantic, type, missing + // table/column. Examples: "Table or view 'foo' not found", "Syntax + // error at or near ',' on line 3". + "BAD_REQUEST", + // DBR catalog miss — "Table 'foo' not found", "Schema 'bar' not found". + "NOT_FOUND", + // DBR catalog conflict — "Table 'foo' already exists". + "ALREADY_EXISTS", + // Stable template: "Query timed out after Ns". User-actionable. + "DEADLINE_EXCEEDED", + // "Statement was canceled". User/admin-initiated; no internal state. + "CANCELLED", + // SDK generic — "Permission denied", "Authentication required". + "UNAUTHENTICATED", + // Stable user-facing template: "DBSQL temporarily unavailable. Please + // try again in a few minutes." Hiding this would force users to + // distinguish "warehouse issue" from "user error" by guesswork. + "TEMPORARILY_UNAVAILABLE", + // Workspace-level service messages from shard-routing failures. The + // message may name a shard ID — that's internal topology metadata, + // low sensitivity, and operationally useful for the user to know + // the workspace is in motion. + "WORKSPACE_TEMPORARILY_UNAVAILABLE", + // Maintenance windows — generic template messages. + "SERVICE_UNDER_MAINTENANCE", + // Stable user-actionable templates from the warehouse quota path: + // "The maximum number of warehouses has been reached. Please contact + // Databricks support.", "Stop or delete existing warehouses to free + // up capacity.", etc. Telling users "Query execution failed" when + // they've actually hit a quota is unhelpful. + "RESOURCE_EXHAUSTED", + // Concurrency conflict — short reason strings ("version mismatch", + // "concurrent modification"). User-relevant for retry decisions. + "ABORTED", + // Wrapped internal exceptions. Construction sites interpolate + // workspace-internal identifiers (warehouse names, orgIds) and + // `ex.getMessage` from arbitrary causes. To an authenticated + // workspace user these identifiers are not sensitive — they can + // already enumerate them. The wrapped `ex.getMessage` content is + // unbounded but in practice is operational ("RPC timed out", + // "deadlock detected") or shape-of-failure detail the user needs to + // act on. The `requestId` we emit on the SSE error frame is the + // safety net for unhappy cases — users can quote it in tickets and + // staff can grep logs against it for the full unsanitized message. + "INTERNAL_ERROR", + // Storage / network failures. Same logic as INTERNAL_ERROR: path / + // bucket names visible to a workspace user with catalog access are + // not new information, and the wrapped error text usually tells + // them whether to retry, switch warehouses, or escalate. + "IO_ERROR", +]); + +/** + * Explicitly NOT on the allowlist (documented for future-reviewer + * context; classification is by absence from `SAFE_PASSTHROUGH_CODES`): + * + * - `UNKNOWN` — unclassified by definition; could be anything from a + * library panic to an unhandled `case` in the gateway. Default-deny + * is the only defensible stance until the SDK gives us a finer code. + */ + +/** + * Returns true when the given upstream `error_code` belongs to the + * passthrough allowlist — i.e. the corresponding `error.message` was + * authored by DBR for the user's own SQL and is safe to forward. + * + * Returns false (default-deny) for any unrecognized code, including + * codes added to the SDK after this allowlist was last reviewed. + */ +export function isSqlErrorPassthrough(errorCode: string | undefined): boolean { + if (!errorCode) return false; + return SAFE_PASSTHROUGH_CODES.has(errorCode); +} diff --git a/packages/appkit/src/errors/execution.ts b/packages/appkit/src/errors/execution.ts index 3c9df7fe0..3dc2944f3 100644 --- a/packages/appkit/src/errors/execution.ts +++ b/packages/appkit/src/errors/execution.ts @@ -1,4 +1,5 @@ import { AppKitError } from "./base"; +import { isSqlErrorPassthrough } from "./dbsql-error-allowlist"; /** * Error thrown when an operation execution fails. @@ -50,16 +51,22 @@ export class ExecutionError extends AppKitError { /** * Create an execution error for statement failure. * @param errorMessage Human-readable error from the warehouse / SDK. - * Goes into `.message` for server logs only — *never* echoed to the - * client. Pass `clientMessage` explicitly if a sanitized text should - * reach the UI. - * @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to - * preserve through wrapping. Optional. Forwarded on SSE error - * payloads so UI can branch on it instead of substring-matching - * `error`. - * @param clientMessage Optional client-safe replacement for `.message`. - * Defaults to "Query execution failed" via the `clientMessage` - * getter. Set this only when the upstream text is known-safe. + * When `errorCode` is on the DBSQL safe-passthrough allowlist (see + * `dbsql-error-allowlist.ts`), this text is forwarded to clients + * verbatim — it's authored by DBR for the user's own SQL and is + * inherently user-facing. For codes NOT on the allowlist (control + * plane errors carrying correlation IDs, internal paths, stack + * traces), `.message` goes into server logs only and the client + * sees a generic "Query execution failed". An explicit + * `clientMessage` argument always wins over both paths. + * @param errorCode Structured code (e.g. "BAD_REQUEST", + * "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional. + * Forwarded on SSE error payloads so UI can branch on it instead of + * substring-matching `error`. Also drives the passthrough decision. + * @param clientMessage Explicit client-safe message that always wins — + * bypasses the allowlist check. Use when the upstream text is + * known-safe regardless of code (e.g. constructed in-process from a + * trusted template). */ static statementFailed( errorMessage?: string, @@ -69,7 +76,20 @@ export class ExecutionError extends AppKitError { const message = errorMessage ? `Statement failed: ${errorMessage}` : "Statement failed: Unknown error"; - return new ExecutionError(message, { errorCode, clientMessage }); + + // Allowlist-driven passthrough: BAD_REQUEST / NOT_FOUND / + // ALREADY_EXISTS / etc. carry DBR-authored messages that *are* the + // user's own SQL error ("Table 'foo' not found", "Syntax error + // near ',' on line 3"). Hiding these forces users to debug in the + // dark; surfacing them is the entire reason an error message + // exists. Any code not on the allowlist defaults to generic. + const inferredClient = + clientMessage ?? (isSqlErrorPassthrough(errorCode) ? message : undefined); + + return new ExecutionError(message, { + errorCode, + clientMessage: inferredClient, + }); } /** diff --git a/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts new file mode 100644 index 000000000..638271440 --- /dev/null +++ b/packages/appkit/src/errors/tests/dbsql-error-allowlist.test.ts @@ -0,0 +1,67 @@ +import { describe, expect, test } from "vitest"; +import { isSqlErrorPassthrough } from "../dbsql-error-allowlist"; + +describe("isSqlErrorPassthrough", () => { + // Allowlist members — passthrough is net-positive given the + // workspace-user threat model (recipient already knows the workspace's + // resources). Server-side full-detail logging + the requestId on the + // SSE error frame backstop the unhappy cases. + test.each([ + // DBR data plane — user's own SQL errors. + ["BAD_REQUEST"], + ["NOT_FOUND"], + ["ALREADY_EXISTS"], + // SDK-level codes with generic templates. + ["DEADLINE_EXCEEDED"], + ["CANCELLED"], + ["UNAUTHENTICATED"], + // 5xx-class service messages — stable user-facing templates + // ("DBSQL temporarily unavailable. Please try again in a few + // minutes."). Hiding these forces users to guess whether a failure + // is on their side or ours. + ["TEMPORARILY_UNAVAILABLE"], + ["WORKSPACE_TEMPORARILY_UNAVAILABLE"], + ["SERVICE_UNDER_MAINTENANCE"], + // Quota — stable user-actionable templates ("Stop or delete + // existing warehouses to free up capacity."). + ["RESOURCE_EXHAUSTED"], + // Concurrency conflict — short reason strings, user-relevant. + ["ABORTED"], + // Wrapped internal exceptions — interpolated identifiers (orgId, + // warehouse name) are non-sensitive to an authenticated workspace + // user; the wrapped `ex.getMessage` content is operationally + // useful for triage. RequestId on the SSE error frame is the + // safety net for unhappy cases. + ["INTERNAL_ERROR"], + ["IO_ERROR"], + ])("%s is allowlisted", (code) => { + expect(isSqlErrorPassthrough(code)).toBe(true); + }); + + // Only UNKNOWN stays denied — unclassified by definition, so the + // wrapped content is unbounded in shape AND in source. + test("UNKNOWN is denied (unclassified — cannot reason about contents)", () => { + expect(isSqlErrorPassthrough("UNKNOWN")).toBe(false); + }); + + test("undefined and empty string are denied (default-deny on missing source)", () => { + expect(isSqlErrorPassthrough(undefined)).toBe(false); + expect(isSqlErrorPassthrough("")).toBe(false); + }); + + test("any unrecognized code (e.g. new SDK additions) is denied", () => { + // When the SDK ships a new ServiceErrorCode variant, the allowlist + // must default-deny it until a human reviews the upstream message + // source. This test pins the default-deny behavior. + expect(isSqlErrorPassthrough("BRAND_NEW_CODE")).toBe(false); + expect(isSqlErrorPassthrough("PERMISSION_DENIED")).toBe(false); + expect(isSqlErrorPassthrough("FOOBAR")).toBe(false); + }); + + test("case-sensitive match — codes from the SDK are SHOUTY_CASE, lowercased input is rejected", () => { + // Defensive: if an upstream layer ever lowercases the code, we + // want the allowlist to fail closed rather than silently match. + expect(isSqlErrorPassthrough("bad_request")).toBe(false); + expect(isSqlErrorPassthrough("Bad_Request")).toBe(false); + }); +}); diff --git a/packages/appkit/src/errors/tests/errors.test.ts b/packages/appkit/src/errors/tests/errors.test.ts index 347ce1c0d..5a13fb69d 100644 --- a/packages/appkit/src/errors/tests/errors.test.ts +++ b/packages/appkit/src/errors/tests/errors.test.ts @@ -303,6 +303,208 @@ describe("ExecutionError", () => { expect(error.message).toBe("No chunks or schema found in response"); expect(error.context?.dataType).toBe("chunks or schema"); }); + + describe("statementFailed clientMessage passthrough", () => { + test("BAD_REQUEST passes through the warehouse-authored message — user's own SQL error", () => { + // DBR sets errorDisplayMessage from the SQL parser / semantic + // analyzer; "Table not found" is the user's own SQL artifact + // and the entire point of an error message is to show it back. + const error = ExecutionError.statementFailed( + "Table or view 'users' not found", + "BAD_REQUEST", + ); + expect(error.clientMessage).toBe( + "Statement failed: Table or view 'users' not found", + ); + expect(error.errorCode).toBe("BAD_REQUEST"); + }); + + test("NOT_FOUND passes through (catalog miss is user-authored)", () => { + const error = ExecutionError.statementFailed( + "Schema 'analytics' not found", + "NOT_FOUND", + ); + expect(error.clientMessage).toBe( + "Statement failed: Schema 'analytics' not found", + ); + }); + + test("ALREADY_EXISTS passes through (catalog conflict is user-authored)", () => { + const error = ExecutionError.statementFailed( + "Table 'foo' already exists", + "ALREADY_EXISTS", + ); + expect(error.clientMessage).toBe( + "Statement failed: Table 'foo' already exists", + ); + }); + + test("DEADLINE_EXCEEDED passes through (timeout text is user-actionable)", () => { + const error = ExecutionError.statementFailed( + "Query timed out after 30s", + "DEADLINE_EXCEEDED", + ); + expect(error.clientMessage).toBe( + "Statement failed: Query timed out after 30s", + ); + }); + + test("CANCELLED passes through (cancellation is initiated by user/admin)", () => { + const error = ExecutionError.statementFailed( + "Statement was canceled", + "CANCELLED", + ); + expect(error.clientMessage).toBe( + "Statement failed: Statement was canceled", + ); + }); + + test("UNAUTHENTICATED passes through (SDK auth messages are generic)", () => { + const error = ExecutionError.statementFailed( + "Permission denied", + "UNAUTHENTICATED", + ); + expect(error.clientMessage).toBe("Statement failed: Permission denied"); + }); + + test("INTERNAL_ERROR passes through (workspace identifiers are non-sensitive to authenticated user; requestId backstops triage)", () => { + // Interpolated warehouse names and orgIds are not new information + // to a workspace user — they can enumerate both via the SDK. + // Wrapped `ex.getMessage` is operationally useful for the user + // to act ("RPC timed out", "deadlock detected") and the + // requestId on the SSE error frame keys server logs for triage. + const error = ExecutionError.statementFailed( + "SQL my-warehouse cannot be created due to repeated collisions on unique IDs.", + "INTERNAL_ERROR", + ); + expect(error.clientMessage).toBe( + "Statement failed: SQL my-warehouse cannot be created due to repeated collisions on unique IDs.", + ); + }); + + test("IO_ERROR passes through (storage path / network failure detail is actionable, not credential-bearing)", () => { + const error = ExecutionError.statementFailed( + "Failed to read manifest after 3 retries", + "IO_ERROR", + ); + expect(error.clientMessage).toBe( + "Statement failed: Failed to read manifest after 3 retries", + ); + }); + + test("UNKNOWN collapses to generic (unclassified by definition — cannot reason about contents)", () => { + const error = ExecutionError.statementFailed( + "Something went wrong: scheduler-pod-3 returned 500", + "UNKNOWN", + ); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("RESOURCE_EXHAUSTED passes through (quota templates are user-actionable)", () => { + // SchedulerRpcValidatorHook.scala emits stable templates like + // "The maximum number of warehouses has been reached. Please + // contact Databricks support." Telling users "Query execution + // failed" when they've actually hit a quota is unhelpful. + const error = ExecutionError.statementFailed( + "The maximum number of warehouses has been reached. Please contact Databricks support.", + "RESOURCE_EXHAUSTED", + ); + expect(error.clientMessage).toBe( + "Statement failed: The maximum number of warehouses has been reached. Please contact Databricks support.", + ); + }); + + test("TEMPORARILY_UNAVAILABLE passes through (stable service-down template)", () => { + // Source: SchedulerRpcClient.scala — fixed string authored for + // end users. + const error = ExecutionError.statementFailed( + "DBSQL temporarily unavailable. Please try again in a few minutes.", + "TEMPORARILY_UNAVAILABLE", + ); + expect(error.clientMessage).toBe( + "Statement failed: DBSQL temporarily unavailable. Please try again in a few minutes.", + ); + }); + + test("SERVICE_UNDER_MAINTENANCE passes through (maintenance messages are user-facing by design)", () => { + const error = ExecutionError.statementFailed( + "DBSQL is undergoing scheduled maintenance.", + "SERVICE_UNDER_MAINTENANCE", + ); + expect(error.clientMessage).toBe( + "Statement failed: DBSQL is undergoing scheduled maintenance.", + ); + }); + + test("WORKSPACE_TEMPORARILY_UNAVAILABLE passes through (shard topology is low-sensitivity, retry-relevant)", () => { + const error = ExecutionError.statementFailed( + "Workspace temporarily unavailable: shard moving", + "WORKSPACE_TEMPORARILY_UNAVAILABLE", + ); + expect(error.clientMessage).toBe( + "Statement failed: Workspace temporarily unavailable: shard moving", + ); + }); + + test("ABORTED passes through (conflict reason strings are retry-relevant)", () => { + const error = ExecutionError.statementFailed( + "version mismatch", + "ABORTED", + ); + expect(error.clientMessage).toBe("Statement failed: version mismatch"); + }); + + test("absent errorCode falls back to generic (default-deny on unknown source)", () => { + const error = ExecutionError.statementFailed("Some upstream text"); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("unrecognized errorCode (post-SDK-update) defaults to generic", () => { + // When the SDK adds a new code we haven't reviewed yet, we must + // NOT passthrough by default — security relies on the allowlist + // being explicit. + const error = ExecutionError.statementFailed( + "scheduler internal: stack overflow at sun.misc.Unsafe.park()", + "SOME_NEW_CODE_NOT_IN_ALLOWLIST", + ); + expect(error.clientMessage).toBe("Query execution failed"); + }); + + test("explicit clientMessage argument wins over the allowlist (caller knows best)", () => { + // Even on an allowlisted code, an explicit clientMessage takes + // precedence — for cases where the caller has constructed a + // better-tuned message in-process. + const error = ExecutionError.statementFailed( + "Table or view 'users' not found", + "BAD_REQUEST", + "We could not find the requested table.", + ); + expect(error.clientMessage).toBe( + "We could not find the requested table.", + ); + }); + + test("server-side .message always preserves the raw upstream text for log debugging", () => { + // Regardless of passthrough decision, the raw text stays on + // `.message` for `logger.error(err)` calls to capture full + // detail. For UNKNOWN (the only denied code), clientMessage + // collapses but .message still has the full content. + const safe = ExecutionError.statementFailed( + "Table 'users' not found", + "BAD_REQUEST", + ); + expect(safe.message).toBe("Statement failed: Table 'users' not found"); + + const denied = ExecutionError.statementFailed( + "Unclassified failure with corrId=abc", + "UNKNOWN", + ); + expect(denied.message).toBe( + "Statement failed: Unclassified failure with corrId=abc", + ); + expect(denied.clientMessage).toBe("Query execution failed"); + }); + }); }); describe("InitializationError", () => { diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 5ba121d1b..dca3ddd8f 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -1,3 +1,4 @@ +import { randomUUID } from "node:crypto"; import type { WorkspaceClient } from "@databricks/sdk-experimental"; import { tableFromIPC } from "apache-arrow"; import type express from "express"; @@ -146,9 +147,15 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { // Already drained, expired, or never belonged to this user. 410 // distinguishes this from "warehouse statement id not found" (404) // so the client can surface a useful error. - logger.debug("Inline Arrow stash miss for jobId=%s", jobId); + const requestId = randomUUID(); + logger.debug( + "Inline Arrow stash miss for jobId=%s requestId=%s", + jobId, + requestId, + ); res.status(410).json({ error: "Inline Arrow result expired or unknown", + requestId, plugin: this.name, }); return; @@ -183,15 +190,18 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { ); res.send(Buffer.from(result.data)); } catch (error) { - logger.error("Arrow job error: %O", error); - // Do not echo upstream / SDK error text to the client — it can - // include statement fragments, internal object names, and - // correlation IDs. The full detail stays in the server log above. + const requestId = randomUUID(); + logger.error("Arrow job error (requestId=%s): %O", requestId, error); + // Do not echo upstream / SDK error text to the client — the + // detail stays in the server log above, keyed by requestId so + // a user quoting it in a support ticket lets staff grep + // directly to this line. const errorCode = error instanceof ExecutionError ? error.errorCode : undefined; res.status(404).json({ error: "Arrow result unavailable", errorCode, + requestId, plugin: this.name, }); } diff --git a/packages/appkit/src/stream/sse-writer.ts b/packages/appkit/src/stream/sse-writer.ts index 2b49d1dee..bf3659937 100644 --- a/packages/appkit/src/stream/sse-writer.ts +++ b/packages/appkit/src/stream/sse-writer.ts @@ -43,9 +43,17 @@ export class SSEWriter { ): void { if (res.writableEnded) return; + // The SSE `id:` line and the in-payload `requestId` are + // intentionally the same value: the wire-level id is invisible to + // most React consumers (the event-source `data` is what reaches + // app code), so we duplicate it inside the JSON so the hook can + // surface it. Server-side logs use this same id in the + // `Stream execution failed` line — a user quoting it lets staff + // grep logs directly. const errorData: SSEError = { error, code, + requestId: eventId, ...(errorCode ? { errorCode } : {}), }; diff --git a/packages/appkit/src/stream/stream-manager.ts b/packages/appkit/src/stream/stream-manager.ts index 2cacbcb9c..2ac0ea968 100644 --- a/packages/appkit/src/stream/stream-manager.ts +++ b/packages/appkit/src/stream/stream-manager.ts @@ -309,17 +309,22 @@ export class StreamManager { if (errorCode === SSEErrorCode.STREAM_ABORTED) { logger.info("Stream aborted by client (code=%s)", errorCode); } else { + // Log line includes the requestId we'll emit to the client — + // a user quoting their requestId in a support ticket lets + // staff grep this exact line for the full raw message. logger.error( - "Stream execution failed: %s (code=%s upstreamCode=%s)", + "Stream execution failed: %s (code=%s upstreamCode=%s requestId=%s)", rawMsg, errorCode, upstreamCode ?? "n/a", + errorEventId, ); } const payload: Record = { error: clientMsg, code: errorCode, + requestId: errorEventId, }; if (upstreamCode) payload.errorCode = upstreamCode; diff --git a/packages/appkit/src/stream/tests/stream.test.ts b/packages/appkit/src/stream/tests/stream.test.ts index a99242038..243178eb3 100644 --- a/packages/appkit/src/stream/tests/stream.test.ts +++ b/packages/appkit/src/stream/tests/stream.test.ts @@ -271,9 +271,14 @@ describe("StreamManager", () => { await streamManager.stream(mockRes as any, generator); expect(events).toContain("event: error\n"); - expect(events).toContain( - 'data: {"error":"Internal server error","code":"INTERNAL_ERROR"}\n\n', - ); + // Match the error payload shape rather than the exact frame string — + // the frame now also carries a per-error requestId (randomUUID) for + // support triage, so we can't pin the full data line literally. + const dataLines = events.filter((e: string) => e.startsWith("data: ")); + const errorLine = dataLines.find((e: string) => e.includes('"error"')); + expect(errorLine).toContain('"error":"Internal server error"'); + expect(errorLine).toContain('"code":"INTERNAL_ERROR"'); + expect(errorLine).toMatch(/"requestId":"[0-9a-f-]+"/); }); test("should not crash if client disconnects during error", async () => { diff --git a/packages/appkit/src/stream/types.ts b/packages/appkit/src/stream/types.ts index ba0d4915e..89a573bad 100644 --- a/packages/appkit/src/stream/types.ts +++ b/packages/appkit/src/stream/types.ts @@ -31,6 +31,14 @@ export interface SSEError { * the human-readable `error` string. */ errorCode?: string; + /** + * Correlation id for support triage. Set to the SSE event id of the + * error frame — the same value that appears in the server-side + * `logger.error("Stream execution failed: ...")` line, so a user + * quoting this id lets staff grep logs directly. Stable per-error, + * unique across the process lifetime. + */ + requestId?: string; } export interface BufferedEvent {