Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,12 @@ describe("useAnalyticsQuery", () => {
expect(result.current.data).toBeNull();
});

test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => {
// The SSE error broadcaster forwards an `errorCode` field for
// UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook
// surfaces both the human `error` text AND the structured
// `errorCode` so consumers can branch on the stable identifier
// instead of substring-matching the sanitized human message.
test("a server error event carrying a structured errorCode + requestId exposes both on the hook return value", async () => {
// The SSE error broadcaster forwards `errorCode` (for UI branching)
// and `requestId` (for support triage — same id appears in the
// server-side logger.error line). The hook surfaces both so
// consumers can render "Error ref: <requestId>" alongside the
// human message and branch behavior on the stable errorCode.
const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});

const { result } = renderHook(() =>
Expand All @@ -202,6 +202,7 @@ describe("useAnalyticsQuery", () => {
error: "Server is at capacity, please retry",
code: "UPSTREAM_ERROR",
errorCode: "INLINE_ARROW_STASH_EXHAUSTED",
requestId: "11111111-2222-3333-4444-555555555555",
}),
});

Expand All @@ -210,6 +211,9 @@ describe("useAnalyticsQuery", () => {
});
expect(result.current.loading).toBe(false);
expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED");
expect(result.current.requestId).toBe(
"11111111-2222-3333-4444-555555555555",
);

errorSpy.mockRestore();
});
Expand Down
9 changes: 9 additions & 0 deletions packages/appkit-ui/src/react/hooks/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ export interface UseAnalyticsQueryResult<T> {
* matching `error`, which is a free-form sanitized message.
*/
errorCode: string | null;
/**
* Server-side correlation id for support triage. When non-null, the
* server emitted this id on the error frame and the same id appears
* in the backend's `logger.error` line. Surface it in error UIs
* ("Error ref: abc-123 — quote when filing a support ticket") so
* staff can grep server logs directly instead of asking the user to
* reproduce.
*/
requestId: string | null;
}

/**
Expand Down
12 changes: 10 additions & 2 deletions packages/appkit-ui/src/react/hooks/use-analytics-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ export function useAnalyticsQuery<
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const [errorCode, setErrorCode] = useState<string | null>(null);
const [requestId, setRequestId] = useState<string | null>(null);
const abortControllerRef = useRef<AbortController | null>(null);

if (!queryKey || queryKey.trim().length === 0) {
Expand Down Expand Up @@ -170,6 +171,7 @@ export function useAnalyticsQuery<
setLoading(true);
setError(null);
setErrorCode(null);
setRequestId(null);
setData(null);

const abortController = new AbortController();
Expand Down Expand Up @@ -247,10 +249,16 @@ export function useAnalyticsQuery<
if (typeof parsed.errorCode === "string") {
setErrorCode(parsed.errorCode);
}
// requestId is the SSE event id of the error frame — same
// value the server logs in its `Stream execution failed`
// line, so users can quote it in support tickets.
if (typeof parsed.requestId === "string") {
setRequestId(parsed.requestId);
}

if (parsed.code) {
console.error(
`[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}`,
`[useAnalyticsQuery] Code: ${parsed.code}, Message: ${errorMsg}, requestId: ${parsed.requestId ?? "n/a"}`,
);
}
return;
Expand Down Expand Up @@ -321,5 +329,5 @@ export function useAnalyticsQuery<
// Enable HMR for query updates in dev mode
useQueryHMR(queryKey, start);

return { data, loading, error, errorCode };
return { data, loading, error, errorCode, requestId };
}
179 changes: 179 additions & 0 deletions packages/appkit/src/errors/dbsql-error-allowlist.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/**
* Allowlist of Databricks SQL Statement Execution API error codes whose
* `error.message` is safe to forward to clients verbatim.
*
* Threat model — the recipient of these messages is an *authenticated
* workspace user*, not an anonymous internet visitor. They already know
* the workspace URL, their orgId, their session token, the warehouses
* they have grants on, and the catalog contents they can see. Names of
* resources within the workspace, internal correlation IDs, and class
* names from Databricks-internal packages are not new information to
* this audience — the classic CWE-209 "stack-trace-on-500-page-leaks-
* to-attacker" scenario does not apply.
*
* Given that, the bar for passthrough is: does the message help the
* user (or their support contact) understand and act on the failure?
* Defaults skew toward yes because:
*
* - The cost of denial is *certain* (every error becomes "Query
* execution failed" with no signal; users open tickets, support has
* to reproduce or grep server logs to find the actual error).
* - The cost of allowing is *speculative* — depends on whatever the
* most recent wrapped exception happened to put in its `.toString()`.
*
* Server-side, the route handlers always `logger.error(rawMsg)` the
* full upstream text before sanitization — operators retain complete
* visibility regardless of what reaches the wire. SSE error frames
* also carry a `requestId` (the SSE event id) so users can quote it
* in support tickets and staff can grep logs against it directly.
*
* **Currently denied**: `UNKNOWN` only — by definition unclassified,
* so we can't reason about its contents and a default-deny is the
* minimum safe stance. Every other `ServiceErrorCode` variant is on
* the allowlist below with a documented rationale.
*
* Source-of-truth construction sites for the classifications below:
*
* - DBR data-plane (`BAD_REQUEST`, `NOT_FOUND`, `ALREADY_EXISTS`):
* `sqlgateway/scheduler/src/main/scala/DriverRequests.scala:181-208`
* populates `ErrorInfo.withMessage(command.errorDisplayMessage)` and
* sets `withErrorReturnedFromDataPlane(true)`.
*
* - User-facing service messages (`TEMPORARILY_UNAVAILABLE`,
* `WORKSPACE_TEMPORARILY_UNAVAILABLE`, `SERVICE_UNDER_MAINTENANCE`):
* - `sqlgateway/scheduler/api/src/main/scala/client/SchedulerRpcClient.scala`
* constructs `SchedulerTemporarilyUnavailableException` with the
* literal "DBSQL temporarily unavailable. Please try again in a
* few minutes."
* - `sqlgateway/scheduler/api/src/main/scala/client/linkstore/SqlGatewayClerk.scala`
* constructs with `internalMessage = "DBSQL temporarily unavailable..."`
* - `sqlgateway/proxy/src/main/scala/thrift/Exceptions.scala`
* constructs `WORKSPACE_TEMPORARILY_UNAVAILABLE` from
* `NotServedByShardException.getMessage` — names a shard but no
* credentials / PII / stack traces.
*
* - User-actionable quota messages (`RESOURCE_EXHAUSTED`):
* `sqlgateway/scheduler/src/main/scala/SchedulerRpcValidatorHook.scala`
* emits stable templates: "The maximum number of warehouses has been
* reached. Please contact Databricks support.", "You've hit the limit
* for warehouses for free usage. Stop or delete existing warehouses
* to free up capacity.", etc.
*
* - Concurrency conflict (`ABORTED`): short reason strings like
* "version mismatch" (Reyden warehouse monitor). No internal context.
*
* - `INTERNAL_ERROR` / `IO_ERROR`:
* `sqlgateway/database/src/main/scala/endpoints/EndpointModel.scala`
* interpolates `s"SQL ${conf.warehouse} cannot be created due to
* repeated collisions on unique IDs."`;
* `SchedulerRpcContextValidatorHook.workspaceNotOwned(orgId)`
* interpolates orgId. These identifiers are not sensitive to a
* workspace user (they own / can list both). The wrapped
* `ex.getMessage` content is unbounded but for the typical case is
* either operational (RPC timeout, lock contention) or shape-of-error
* information the user needs to act. Allowlisted with the requestId
* plumbing as the safety net for triage. *Not* allowlisted if a
* future review finds construction sites that interpolate user
* credentials, OAuth tokens, or other secrets — open an issue + flip
* the entry.
*
* - `UNKNOWN`: unclassified by definition. We have no way to reason
* about its contents — could be anything from "RPC connect failed"
* to a panic dump from an upstream library. Default-deny is the
* only defensible stance until the SDK gives us a finer code.
*
* - `UNAUTHENTICATED` / `CANCELLED` / `DEADLINE_EXCEEDED`: SDK-level
* codes with generic message templates.
*
* SDK type reference (`@databricks/sdk-experimental`,
* `apis/sql/model.d.ts`):
* ```
* type ServiceErrorCode =
* | "ABORTED" | "ALREADY_EXISTS" | "BAD_REQUEST" | "CANCELLED"
* | "DEADLINE_EXCEEDED" | "INTERNAL_ERROR" | "IO_ERROR" | "NOT_FOUND"
* | "RESOURCE_EXHAUSTED" | "SERVICE_UNDER_MAINTENANCE"
* | "TEMPORARILY_UNAVAILABLE" | "UNAUTHENTICATED" | "UNKNOWN"
* | "WORKSPACE_TEMPORARILY_UNAVAILABLE";
* ```
*
* **Default-deny on unknown codes** — when the SDK ships a new variant
* the allowlist hasn't been reviewed against, the new code collapses to
* the generic clientMessage until someone audits the construction sites
* and updates the Set below.
*/

/** ServiceErrorCode values for which `error.message` may be forwarded verbatim. */
const SAFE_PASSTHROUGH_CODES: ReadonlySet<string> = new Set([
// DBR-authored: user's own SQL errors. Parse, semantic, type, missing
// table/column. Examples: "Table or view 'foo' not found", "Syntax
// error at or near ',' on line 3".
"BAD_REQUEST",
// DBR catalog miss — "Table 'foo' not found", "Schema 'bar' not found".
"NOT_FOUND",
// DBR catalog conflict — "Table 'foo' already exists".
"ALREADY_EXISTS",
// Stable template: "Query timed out after Ns". User-actionable.
"DEADLINE_EXCEEDED",
// "Statement was canceled". User/admin-initiated; no internal state.
"CANCELLED",
// SDK generic — "Permission denied", "Authentication required".
"UNAUTHENTICATED",
// Stable user-facing template: "DBSQL temporarily unavailable. Please
// try again in a few minutes." Hiding this would force users to
// distinguish "warehouse issue" from "user error" by guesswork.
"TEMPORARILY_UNAVAILABLE",
// Workspace-level service messages from shard-routing failures. The
// message may name a shard ID — that's internal topology metadata,
// low sensitivity, and operationally useful for the user to know
// the workspace is in motion.
"WORKSPACE_TEMPORARILY_UNAVAILABLE",
// Maintenance windows — generic template messages.
"SERVICE_UNDER_MAINTENANCE",
// Stable user-actionable templates from the warehouse quota path:
// "The maximum number of warehouses has been reached. Please contact
// Databricks support.", "Stop or delete existing warehouses to free
// up capacity.", etc. Telling users "Query execution failed" when
// they've actually hit a quota is unhelpful.
"RESOURCE_EXHAUSTED",
// Concurrency conflict — short reason strings ("version mismatch",
// "concurrent modification"). User-relevant for retry decisions.
"ABORTED",
// Wrapped internal exceptions. Construction sites interpolate
// workspace-internal identifiers (warehouse names, orgIds) and
// `ex.getMessage` from arbitrary causes. To an authenticated
// workspace user these identifiers are not sensitive — they can
// already enumerate them. The wrapped `ex.getMessage` content is
// unbounded but in practice is operational ("RPC timed out",
// "deadlock detected") or shape-of-failure detail the user needs to
// act on. The `requestId` we emit on the SSE error frame is the
// safety net for unhappy cases — users can quote it in tickets and
// staff can grep logs against it for the full unsanitized message.
"INTERNAL_ERROR",
// Storage / network failures. Same logic as INTERNAL_ERROR: path /
// bucket names visible to a workspace user with catalog access are
// not new information, and the wrapped error text usually tells
// them whether to retry, switch warehouses, or escalate.
"IO_ERROR",
]);

/**
* Explicitly NOT on the allowlist (documented for future-reviewer
* context; classification is by absence from `SAFE_PASSTHROUGH_CODES`):
*
* - `UNKNOWN` — unclassified by definition; could be anything from a
* library panic to an unhandled `case` in the gateway. Default-deny
* is the only defensible stance until the SDK gives us a finer code.
*/

/**
* Returns true when the given upstream `error_code` belongs to the
* passthrough allowlist — i.e. the corresponding `error.message` was
* authored by DBR for the user's own SQL and is safe to forward.
*
* Returns false (default-deny) for any unrecognized code, including
* codes added to the SDK after this allowlist was last reviewed.
*/
export function isSqlErrorPassthrough(errorCode: string | undefined): boolean {
if (!errorCode) return false;
return SAFE_PASSTHROUGH_CODES.has(errorCode);
}
42 changes: 31 additions & 11 deletions packages/appkit/src/errors/execution.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { AppKitError } from "./base";
import { isSqlErrorPassthrough } from "./dbsql-error-allowlist";

/**
* Error thrown when an operation execution fails.
Expand Down Expand Up @@ -50,16 +51,22 @@ export class ExecutionError extends AppKitError {
/**
* Create an execution error for statement failure.
* @param errorMessage Human-readable error from the warehouse / SDK.
* Goes into `.message` for server logs only — *never* echoed to the
* client. Pass `clientMessage` explicitly if a sanitized text should
* reach the UI.
* @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to
* preserve through wrapping. Optional. Forwarded on SSE error
* payloads so UI can branch on it instead of substring-matching
* `error`.
* @param clientMessage Optional client-safe replacement for `.message`.
* Defaults to "Query execution failed" via the `clientMessage`
* getter. Set this only when the upstream text is known-safe.
* When `errorCode` is on the DBSQL safe-passthrough allowlist (see
* `dbsql-error-allowlist.ts`), this text is forwarded to clients
* verbatim — it's authored by DBR for the user's own SQL and is
* inherently user-facing. For codes NOT on the allowlist (control
* plane errors carrying correlation IDs, internal paths, stack
* traces), `.message` goes into server logs only and the client
* sees a generic "Query execution failed". An explicit
* `clientMessage` argument always wins over both paths.
* @param errorCode Structured code (e.g. "BAD_REQUEST",
* "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional.
* Forwarded on SSE error payloads so UI can branch on it instead of
* substring-matching `error`. Also drives the passthrough decision.
* @param clientMessage Explicit client-safe message that always wins —
* bypasses the allowlist check. Use when the upstream text is
* known-safe regardless of code (e.g. constructed in-process from a
* trusted template).
*/
static statementFailed(
errorMessage?: string,
Expand All @@ -69,7 +76,20 @@ export class ExecutionError extends AppKitError {
const message = errorMessage
? `Statement failed: ${errorMessage}`
: "Statement failed: Unknown error";
return new ExecutionError(message, { errorCode, clientMessage });

// Allowlist-driven passthrough: BAD_REQUEST / NOT_FOUND /
// ALREADY_EXISTS / etc. carry DBR-authored messages that *are* the
// user's own SQL error ("Table 'foo' not found", "Syntax error
// near ',' on line 3"). Hiding these forces users to debug in the
// dark; surfacing them is the entire reason an error message
// exists. Any code not on the allowlist defaults to generic.
const inferredClient =
clientMessage ?? (isSqlErrorPassthrough(errorCode) ? message : undefined);

return new ExecutionError(message, {
errorCode,
clientMessage: inferredClient,
});
}

/**
Expand Down
Loading