diff --git a/app/workflows/clearLifecycleRunIdIfOwned.ts b/app/workflows/clearLifecycleRunIdIfOwned.ts new file mode 100644 index 000000000..df6f55228 --- /dev/null +++ b/app/workflows/clearLifecycleRunIdIfOwned.ts @@ -0,0 +1,22 @@ +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; + +/** + * Workflow step that clears `lifecycle_run_id` only if it still + * matches the supplied `runId`. Used at the end of a workflow run to + * release the lease without clobbering one that's been reclaimed by + * a kick. + * + * @param sessionId - The session id. + * @param runId - The lease this workflow owned; only clear if it + * still matches. + */ +export async function clearLifecycleRunIdIfOwned(sessionId: string, runId: string): Promise { + "use step"; + + const rows = await selectSessions({ id: sessionId }); + const session = rows[0]; + if (!session || session.lifecycle_run_id !== runId) return; + + await updateSession(sessionId, { lifecycle_run_id: null }); +} diff --git a/app/workflows/computeLifecycleWakeDecision.ts b/app/workflows/computeLifecycleWakeDecision.ts new file mode 100644 index 000000000..9b084a770 --- /dev/null +++ b/app/workflows/computeLifecycleWakeDecision.ts @@ -0,0 +1,47 @@ +import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; +import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; + +interface LifecycleWakeDecision { + shouldContinue: boolean; + wakeAtMs?: number; + reason?: string; +} + +/** + * Workflow step run at the top of each `sandboxLifecycleWorkflow` + * iteration. Reads the session, decides whether to continue looping, + * and (when continuing) returns the next wake time. Bails when a + * concurrent kick has overwritten `lifecycle_run_id` with a different + * value — that newer run is now responsible for the session. + * + * @param sessionId - The session id the workflow is tracking. + * @param runId - The lease this workflow run owns. + * @returns A decision object with continuation flag, wake time, and + * skip reason (when terminating). + */ +export async function computeLifecycleWakeDecision( + sessionId: string, + runId: string, +): Promise { + "use step"; + + const rows = await selectSessions({ id: sessionId }); + const session = rows[0]; + if (!session) return { shouldContinue: false, reason: "session-not-found" }; + if (session.status === "archived" || session.lifecycle_state === "archived") { + return { shouldContinue: false, reason: "session-archived" }; + } + if ( + !hasRuntimeSandboxState(session.sandbox_state) || + (session.sandbox_state as { type?: unknown } | null)?.type !== "vercel" + ) { + return { shouldContinue: false, reason: "sandbox-not-operable" }; + } + + if (session.lifecycle_run_id !== null && session.lifecycle_run_id !== runId) { + return { shouldContinue: false, reason: "run-replaced" }; + } + + return { shouldContinue: true, wakeAtMs: getLifecycleDueAtMs(session) }; +} diff --git a/app/workflows/runLifecycleEvaluation.ts b/app/workflows/runLifecycleEvaluation.ts new file mode 100644 index 000000000..38687061e --- /dev/null +++ b/app/workflows/runLifecycleEvaluation.ts @@ -0,0 +1,23 @@ +import { evaluateSandboxLifecycle } from "@/lib/sandbox/evaluateSandboxLifecycle"; +import type { + SandboxLifecycleEvaluationResult, + SandboxLifecycleReason, +} from "@/lib/sandbox/sandboxLifecycleTypes"; + +/** + * Workflow step that runs a single lifecycle evaluation pass. Thin + * wrapper around `evaluateSandboxLifecycle` so the workflow + * orchestrator gets a step boundary (with `"use step"` durability + * semantics) on each evaluation. + * + * @param sessionId - The session whose sandbox to evaluate. + * @param reason - Why the workflow was triggered (for logging only). + * @returns The result of this single evaluation pass. + */ +export async function runLifecycleEvaluation( + sessionId: string, + reason: SandboxLifecycleReason, +): Promise { + "use step"; + return evaluateSandboxLifecycle(sessionId, reason); +} diff --git a/app/workflows/sandboxLifecycleWorkflow.ts b/app/workflows/sandboxLifecycleWorkflow.ts new file mode 100644 index 000000000..aa765dfdc --- /dev/null +++ b/app/workflows/sandboxLifecycleWorkflow.ts @@ -0,0 +1,45 @@ +import { sleep } from "workflow"; +import { clearLifecycleRunIdIfOwned } from "@/app/workflows/clearLifecycleRunIdIfOwned"; +import { computeLifecycleWakeDecision } from "@/app/workflows/computeLifecycleWakeDecision"; +import { runLifecycleEvaluation } from "@/app/workflows/runLifecycleEvaluation"; +import { SANDBOX_LIFECYCLE_MIN_SLEEP_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; +import type { SandboxLifecycleReason } from "@/lib/sandbox/sandboxLifecycleTypes"; + +/** + * Vercel Workflow that pauses idle sandboxes. Runs as a `while(true)` + * loop: compute next wake time → `sleep(date)` → evaluate → either + * loop (when not-due-yet or active-stream defers) or terminate + * (hibernated / failed / sandbox gone). Holds the + * `lifecycle_run_id` lease throughout so concurrent kicks can't + * spawn duplicate workflows. + */ +export async function sandboxLifecycleWorkflow( + sessionId: string, + reason: SandboxLifecycleReason, + runId: string, +) { + "use workflow"; + + while (true) { + const decision = await computeLifecycleWakeDecision(sessionId, runId); + if (!decision.shouldContinue || decision.wakeAtMs === undefined) { + await clearLifecycleRunIdIfOwned(sessionId, runId); + return { skipped: true, reason: decision.reason ?? "no-decision" }; + } + + const wakeAtMs = Math.max(decision.wakeAtMs, Date.now() + SANDBOX_LIFECYCLE_MIN_SLEEP_MS); + await sleep(new Date(wakeAtMs)); + + const evaluation = await runLifecycleEvaluation(sessionId, reason); + + if ( + evaluation.action === "skipped" && + (evaluation.reason === "not-due-yet" || evaluation.reason === "active-workflow") + ) { + continue; + } + + await clearLifecycleRunIdIfOwned(sessionId, runId); + return { skipped: false, evaluation }; + } +} diff --git a/lib/sandbox/__tests__/createSandboxHandler.test.ts b/lib/sandbox/__tests__/createSandboxHandler.test.ts index 13311f99e..2bf048a10 100644 --- a/lib/sandbox/__tests__/createSandboxHandler.test.ts +++ b/lib/sandbox/__tests__/createSandboxHandler.test.ts @@ -9,6 +9,7 @@ import { updateSession } from "@/lib/supabase/sessions/updateSession"; import { installSessionGlobalSkills } from "@/lib/sandbox/installSessionGlobalSkills"; import { findOrgSnapshot } from "@/lib/sandbox/findOrgSnapshot"; import { kickBuildOrgSnapshotWorkflow } from "@/lib/sandbox/kickBuildOrgSnapshotWorkflow"; +import { kickSandboxLifecycleWorkflow } from "@/lib/sandbox/kickSandboxLifecycleWorkflow"; vi.mock("@/lib/networking/getCorsHeaders", () => ({ getCorsHeaders: () => ({ "Access-Control-Allow-Origin": "*" }), @@ -37,6 +38,9 @@ vi.mock("@/lib/sandbox/findOrgSnapshot", () => ({ vi.mock("@/lib/sandbox/kickBuildOrgSnapshotWorkflow", () => ({ kickBuildOrgSnapshotWorkflow: vi.fn(), })); +vi.mock("@/lib/sandbox/kickSandboxLifecycleWorkflow", () => ({ + kickSandboxLifecycleWorkflow: vi.fn(), +})); const ACCOUNT_ID = "acc-1"; @@ -281,6 +285,29 @@ describe("createSandboxHandler", () => { expect(kickBuildOrgSnapshotWorkflow).not.toHaveBeenCalled(); }); + it("kicks the sandbox lifecycle workflow with reason='sandbox-created' when sessionId is provided", async () => { + await createSandboxHandler(makeReq()); + + expect(kickSandboxLifecycleWorkflow).toHaveBeenCalledWith( + expect.objectContaining({ + sessionId: "sess-1", + reason: "sandbox-created", + scheduleBackgroundWork: expect.any(Function), + }), + ); + }); + + it("does not kick the lifecycle workflow when no sessionId is provided", async () => { + vi.mocked(validateCreateSandboxBody).mockResolvedValueOnce({ + body: { repoUrl: "https://github.com/o/r" }, + auth: { accountId: ACCOUNT_ID, orgId: null, authToken: "k" }, + }); + + await createSandboxHandler(makeReq()); + + expect(kickSandboxLifecycleWorkflow).not.toHaveBeenCalled(); + }); + it("does not attempt skill installation when no sessionId is provided", async () => { vi.mocked(validateCreateSandboxBody).mockResolvedValueOnce({ body: { repoUrl: "https://github.com/o/r" }, diff --git a/lib/sandbox/__tests__/getSandboxStatusHandler.test.ts b/lib/sandbox/__tests__/getSandboxStatusHandler.test.ts index 2bf66ea01..c4b116cb3 100644 --- a/lib/sandbox/__tests__/getSandboxStatusHandler.test.ts +++ b/lib/sandbox/__tests__/getSandboxStatusHandler.test.ts @@ -14,6 +14,9 @@ vi.mock("@/lib/auth/validateAuthContext", () => ({ vi.mock("@/lib/supabase/sessions/selectSessions", () => ({ selectSessions: vi.fn(), })); +vi.mock("@/lib/sandbox/kickSandboxLifecycleWorkflow", () => ({ + kickSandboxLifecycleWorkflow: vi.fn(), +})); const ACCOUNT_ID = "acc-1"; const FAR_FUTURE = "2099-01-01T00:00:00.000Z"; diff --git a/lib/sandbox/buildActiveLifecycleUpdate.ts b/lib/sandbox/buildActiveLifecycleUpdate.ts new file mode 100644 index 000000000..0fb818f9a --- /dev/null +++ b/lib/sandbox/buildActiveLifecycleUpdate.ts @@ -0,0 +1,27 @@ +import { buildLifecycleActivityUpdate } from "@/lib/sandbox/buildLifecycleActivityUpdate"; +import { getSandboxExpiresAtDate } from "@/lib/sandbox/getSandboxExpiresAtDate"; +import type { TablesUpdate } from "@/types/database.types"; + +/** + * Builds the lifecycle-related fields to write when transitioning a + * session into the `active` state right after a sandbox has been + * provisioned or resumed. Combines `buildLifecycleActivityUpdate` + * with the sandbox's own `expiresAt` so the row's + * `sandbox_expires_at` matches the freshly-probed runtime expiry. + * + * @param sandboxState - The `sandbox_state` JSON value, typically + * from `sandbox.getState()`. + * @param options.activityAt - Optional override for "now". + * @param options.lifecycleState - Defaults to `"active"`; pass + * `"restoring"` for the snapshot-resume path. + * @returns A partial Supabase update object. + */ +export function buildActiveLifecycleUpdate( + sandboxState: unknown, + options?: { activityAt?: Date; lifecycleState?: "active" | "restoring" }, +): TablesUpdate<"sessions"> { + return { + ...buildLifecycleActivityUpdate(options?.activityAt, options?.lifecycleState ?? "active"), + sandbox_expires_at: getSandboxExpiresAtDate(sandboxState), + }; +} diff --git a/lib/sandbox/buildHibernatedLifecycleUpdate.ts b/lib/sandbox/buildHibernatedLifecycleUpdate.ts new file mode 100644 index 000000000..be1df3a6f --- /dev/null +++ b/lib/sandbox/buildHibernatedLifecycleUpdate.ts @@ -0,0 +1,20 @@ +import type { TablesUpdate } from "@/types/database.types"; + +/** + * Builds the lifecycle-related fields to write when the workflow + * pauses a sandbox. Clears `sandbox_expires_at`, `hibernate_after`, + * and the `lifecycle_run_id` lease so a future kick can claim it. + * Note the caller is responsible for separately clearing + * `sandbox_state` runtime metadata via `clearSandboxState`. + * + * @returns A partial Supabase update object. + */ +export function buildHibernatedLifecycleUpdate(): TablesUpdate<"sessions"> { + return { + lifecycle_state: "hibernated", + sandbox_expires_at: null, + hibernate_after: null, + lifecycle_run_id: null, + lifecycle_error: null, + }; +} diff --git a/lib/sandbox/buildLifecycleActivityUpdate.ts b/lib/sandbox/buildLifecycleActivityUpdate.ts new file mode 100644 index 000000000..671e400bf --- /dev/null +++ b/lib/sandbox/buildLifecycleActivityUpdate.ts @@ -0,0 +1,29 @@ +import { SANDBOX_INACTIVITY_TIMEOUT_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; +import type { TablesUpdate } from "@/types/database.types"; + +/** + * Builds the lifecycle-related fields to write when refreshing a + * sandbox's "last activity" — used by `/api/sandbox/activity` and + * by `buildActiveLifecycleUpdate`. Sets `last_activity_at` to now, + * pushes `hibernate_after` out by SANDBOX_INACTIVITY_TIMEOUT_MS, and + * clears any stale `lifecycle_error`. Defaults `lifecycle_state` to + * `"active"` but accepts `"restoring"` for the snapshot-resume path. + * + * @param activityAt - Optional override for "now"; defaults to current time. + * @param lifecycleState - Defaults to `"active"`. + * @returns A partial Supabase update object. + */ +export function buildLifecycleActivityUpdate( + activityAt: Date = new Date(), + lifecycleState: "active" | "restoring" = "active", +): Pick< + TablesUpdate<"sessions">, + "lifecycle_state" | "lifecycle_error" | "last_activity_at" | "hibernate_after" +> { + return { + lifecycle_state: lifecycleState, + lifecycle_error: null, + last_activity_at: activityAt.toISOString(), + hibernate_after: new Date(activityAt.getTime() + SANDBOX_INACTIVITY_TIMEOUT_MS).toISOString(), + }; +} diff --git a/lib/sandbox/clearSandboxState.ts b/lib/sandbox/clearSandboxState.ts new file mode 100644 index 000000000..5cecb2bc3 --- /dev/null +++ b/lib/sandbox/clearSandboxState.ts @@ -0,0 +1,23 @@ +import { getPersistentSandboxName } from "@/lib/sandbox/getPersistentSandboxName"; + +/** + * Strips runtime metadata (expiresAt, etc.) from a sandbox state while + * preserving the durable resume handle (sandboxName) so a future + * `connectSandbox` can pick it back up. Used by the lifecycle + * workflow when transitioning to `hibernated`. + * + * @param state - The current `sandbox_state` JSON value. + * @returns A trimmed state with only `type` + `sandboxName`, or null + * when the input is null. + */ +export function clearSandboxState(state: unknown): { type: string; sandboxName?: string } | null { + if (!state || typeof state !== "object") return null; + + const sandboxName = getPersistentSandboxName(state); + const type = (state as { type?: unknown }).type; + + return { + type: typeof type === "string" ? type : "vercel", + ...(sandboxName ? { sandboxName } : {}), + }; +} diff --git a/lib/sandbox/computeExpiryDueAtMs.ts b/lib/sandbox/computeExpiryDueAtMs.ts new file mode 100644 index 000000000..57320d51b --- /dev/null +++ b/lib/sandbox/computeExpiryDueAtMs.ts @@ -0,0 +1,20 @@ +import { isoToEpochMs } from "@/lib/sandbox/isoToEpochMs"; +import { SANDBOX_EXPIRES_BUFFER_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; +import type { Tables } from "@/types/database.types"; + +/** + * Computes when a session's sandbox should hibernate due to expiry — + * `sandbox_expires_at` minus a small buffer so we pause before + * Vercel's hard timeout. Returns null when the row has no expiry set + * (paused sandbox, type stub). + * + * @param row - The `sessions` row. + * @returns Epoch ms of the expiry due time, or null when not applicable. + */ +export function computeExpiryDueAtMs( + row: Pick, "sandbox_expires_at">, +): number | null { + const expiresAt = isoToEpochMs(row.sandbox_expires_at); + if (expiresAt === null) return null; + return expiresAt - SANDBOX_EXPIRES_BUFFER_MS; +} diff --git a/lib/sandbox/computeInactivityDueAtMs.ts b/lib/sandbox/computeInactivityDueAtMs.ts new file mode 100644 index 000000000..53df9285a --- /dev/null +++ b/lib/sandbox/computeInactivityDueAtMs.ts @@ -0,0 +1,21 @@ +import { isoToEpochMs } from "@/lib/sandbox/isoToEpochMs"; +import { SANDBOX_INACTIVITY_TIMEOUT_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; +import type { Tables } from "@/types/database.types"; + +/** + * Computes when a session's sandbox should hibernate due to inactivity. + * Prefers `hibernate_after` if set; otherwise computes from the most + * recent activity timestamp + SANDBOX_INACTIVITY_TIMEOUT_MS. + * + * @param row - The `sessions` row. + * @returns Epoch ms of the inactivity due time. + */ +export function computeInactivityDueAtMs( + row: Pick, "hibernate_after" | "last_activity_at" | "updated_at">, +): number { + const hibernateAfter = isoToEpochMs(row.hibernate_after); + if (hibernateAfter !== null) return hibernateAfter; + const lastActivity = isoToEpochMs(row.last_activity_at); + const fallback = lastActivity ?? isoToEpochMs(row.updated_at) ?? Date.now(); + return fallback + SANDBOX_INACTIVITY_TIMEOUT_MS; +} diff --git a/lib/sandbox/createLifecycleRunId.ts b/lib/sandbox/createLifecycleRunId.ts new file mode 100644 index 000000000..2b4ae02e8 --- /dev/null +++ b/lib/sandbox/createLifecycleRunId.ts @@ -0,0 +1,11 @@ +/** + * Generates a unique identifier for a sandbox-lifecycle workflow run. + * Format: `lifecycle::` — the timestamp gives + * humans a quick sanity check of when the run started, the UUID + * guarantees uniqueness across concurrent kicks. + * + * @returns A new run id string. + */ +export function createLifecycleRunId(): string { + return `lifecycle:${Date.now()}:${crypto.randomUUID()}`; +} diff --git a/lib/sandbox/createSandboxHandler.ts b/lib/sandbox/createSandboxHandler.ts index 7e6dd55d3..399126d17 100644 --- a/lib/sandbox/createSandboxHandler.ts +++ b/lib/sandbox/createSandboxHandler.ts @@ -1,5 +1,5 @@ import ms from "ms"; -import { NextRequest, NextResponse } from "next/server"; +import { NextRequest, NextResponse, after } from "next/server"; import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; import { validateCreateSandboxBody } from "@/lib/sandbox/validateCreateSandboxBody"; import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; @@ -8,6 +8,7 @@ import { findOrgSnapshot } from "@/lib/sandbox/findOrgSnapshot"; import { getSessionSandboxName } from "@/lib/sandbox/getSessionSandboxName"; import { installSessionGlobalSkills } from "@/lib/sandbox/installSessionGlobalSkills"; import { kickBuildOrgSnapshotWorkflow } from "@/lib/sandbox/kickBuildOrgSnapshotWorkflow"; +import { kickSandboxLifecycleWorkflow } from "@/lib/sandbox/kickSandboxLifecycleWorkflow"; import { extractOrgRepoName } from "@/lib/recoupable/extractOrgRepoName"; import { updateSession } from "@/lib/supabase/sessions/updateSession"; import { getServiceGithubToken } from "@/lib/github/getServiceGithubToken"; @@ -142,6 +143,19 @@ export async function createSandboxHandler(request: NextRequest): Promise after(() => task), + }); } return NextResponse.json( diff --git a/lib/sandbox/evaluateSandboxLifecycle.ts b/lib/sandbox/evaluateSandboxLifecycle.ts new file mode 100644 index 000000000..cc8e9cb5b --- /dev/null +++ b/lib/sandbox/evaluateSandboxLifecycle.ts @@ -0,0 +1,106 @@ +import { connectSandbox } from "@/lib/sandbox/factory"; +import { buildHibernatedLifecycleUpdate } from "@/lib/sandbox/buildHibernatedLifecycleUpdate"; +import { clearSandboxState } from "@/lib/sandbox/clearSandboxState"; +import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; +import { getPersistentSandboxName } from "@/lib/sandbox/getPersistentSandboxName"; +import { hasActiveStreamForSession } from "@/lib/sandbox/hasActiveStreamForSession"; +import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; +import { restoreActiveLifecycleState } from "@/lib/sandbox/restoreActiveLifecycleState"; +import { wasLifecycleTimingExtended } from "@/lib/sandbox/wasLifecycleTimingExtended"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; +import type { SandboxState } from "@/lib/sandbox/factory"; +import type { + SandboxLifecycleEvaluationResult, + SandboxLifecycleReason, +} from "@/lib/sandbox/sandboxLifecycleTypes"; +import type { Json } from "@/types/database.types"; + +/** + * One-shot lifecycle evaluator called by the sandbox-lifecycle + * Vercel Workflow on each wake. Decides whether the session's + * sandbox is past due for hibernation, and either pauses it or + * skips with a reason for the workflow loop to act on. + * + * Returns: + * - `skipped` with a reason — workflow re-evaluates on the next + * iteration (for `not-due-yet`) or terminates (for terminal + * reasons like `session-not-found`) + * - `hibernated` — sandbox was stopped, session row updated + * - `failed` — an error occurred mid-evaluation; row's + * `lifecycle_state` is set to `"failed"` for self-healing on + * the next status read + */ +export async function evaluateSandboxLifecycle( + sessionId: string, + reason: SandboxLifecycleReason, +): Promise { + const rows = await selectSessions({ id: sessionId }); + const session = rows[0]; + + if (!session) return { action: "skipped", reason: "session-not-found" }; + if (session.status === "archived" || session.lifecycle_state === "archived") { + return { action: "skipped", reason: "session-archived" }; + } + + const sandboxState = session.sandbox_state; + if (!hasRuntimeSandboxState(sandboxState)) { + return { action: "skipped", reason: "sandbox-not-operable" }; + } + if ((sandboxState as { type?: unknown }).type !== "vercel") { + return { action: "skipped", reason: "unsupported-sandbox-type" }; + } + + if (Date.now() < getLifecycleDueAtMs(session)) { + return { action: "skipped", reason: "not-due-yet" }; + } + + if (await hasActiveStreamForSession(sessionId)) { + return { action: "skipped", reason: "active-workflow" }; + } + + try { + await updateSession(sessionId, { lifecycle_state: "hibernating", lifecycle_error: null }); + + const sandbox = await connectSandbox(sandboxState as unknown as SandboxState); + + if (await hasActiveStreamForSession(sessionId)) { + await restoreActiveLifecycleState(sessionId, sandboxState); + return { action: "skipped", reason: "active-workflow" }; + } + + if (await wasLifecycleTimingExtended(sessionId, session)) { + const refreshed = (await selectSessions({ id: sessionId }))[0]; + if (refreshed?.sandbox_state) { + await restoreActiveLifecycleState(sessionId, refreshed.sandbox_state); + } + return { action: "skipped", reason: "not-due-yet" }; + } + + await sandbox.stop(); + + const cleared = clearSandboxState(sandboxState); + await updateSession(sessionId, { + sandbox_state: cleared as unknown as Json, + snapshot_url: null, + snapshot_created_at: null, + ...buildHibernatedLifecycleUpdate(), + }); + console.log( + `[evaluateSandboxLifecycle] hibernated session=${sessionId} reason=${reason} sandboxName=${getPersistentSandboxName(cleared) ?? "none"}`, + ); + return { action: "hibernated" }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + await updateSession(sessionId, { + lifecycle_state: "failed", + lifecycle_run_id: null, + lifecycle_error: message, + }); + console.error( + `[evaluateSandboxLifecycle] failed session=${sessionId} reason=${reason}:`, + error, + ); + return { action: "failed", reason: message }; + } +} diff --git a/lib/sandbox/getLifecycleDueAtMs.ts b/lib/sandbox/getLifecycleDueAtMs.ts new file mode 100644 index 000000000..c022193cc --- /dev/null +++ b/lib/sandbox/getLifecycleDueAtMs.ts @@ -0,0 +1,25 @@ +import { computeExpiryDueAtMs } from "@/lib/sandbox/computeExpiryDueAtMs"; +import { computeInactivityDueAtMs } from "@/lib/sandbox/computeInactivityDueAtMs"; +import type { Tables } from "@/types/database.types"; + +/** + * Computes when the lifecycle workflow should next wake up for a + * session. Returns the earlier of the inactivity-due time and the + * expiry-due time. Used by both the workflow (for + * `sleep(new Date(wakeAtMs))`) and the stale-run detector in the + * kick logic. + * + * @param row - The `sessions` row. + * @returns Epoch ms when the next lifecycle action is due. + */ +export function getLifecycleDueAtMs( + row: Pick< + Tables<"sessions">, + "hibernate_after" | "last_activity_at" | "sandbox_expires_at" | "updated_at" + >, +): number { + const inactivityDue = computeInactivityDueAtMs(row); + const expiryDue = computeExpiryDueAtMs(row); + if (expiryDue === null) return inactivityDue; + return Math.min(inactivityDue, expiryDue); +} diff --git a/lib/sandbox/getPersistentSandboxName.ts b/lib/sandbox/getPersistentSandboxName.ts new file mode 100644 index 000000000..5500773f0 --- /dev/null +++ b/lib/sandbox/getPersistentSandboxName.ts @@ -0,0 +1,20 @@ +/** + * Reads the durable `sandboxName` from a `sandbox_state` JSON value. + * Returns null when the state has no `sandboxName` (or it's empty). + * + * The `sandboxName` is the deterministic identifier under which a + * sandbox is registered with the Vercel sandbox runtime — assigned at + * creation time via `getSessionSandboxName(sessionId)` and preserved + * across resume / pause / restore cycles. It survives even when the + * runtime metadata (expiresAt, etc.) has been cleared. + * + * @param state - The persisted `sandbox_state` JSON value. + * @returns The persistent sandbox name, or null when absent. + */ +export function getPersistentSandboxName(state: unknown): string | null { + if (!state || typeof state !== "object") return null; + const candidate = state as { sandboxName?: unknown }; + return typeof candidate.sandboxName === "string" && candidate.sandboxName.length > 0 + ? candidate.sandboxName + : null; +} diff --git a/lib/sandbox/getResumableSandboxName.ts b/lib/sandbox/getResumableSandboxName.ts new file mode 100644 index 000000000..39717905c --- /dev/null +++ b/lib/sandbox/getResumableSandboxName.ts @@ -0,0 +1,20 @@ +import { getPersistentSandboxName } from "@/lib/sandbox/getPersistentSandboxName"; + +/** + * Returns the durable name used to resume a paused sandbox. Falls back + * to a legacy `sandboxId` field for sessions written before the + * persistent-name migration in open-agents. + * + * @param state - The persisted `sandbox_state` JSON value. + * @returns A name suitable for `connectSandbox` resume, or null. + */ +export function getResumableSandboxName(state: unknown): string | null { + const persistent = getPersistentSandboxName(state); + if (persistent) return persistent; + + if (!state || typeof state !== "object") return null; + const candidate = state as { sandboxId?: unknown }; + return typeof candidate.sandboxId === "string" && candidate.sandboxId.length > 0 + ? candidate.sandboxId + : null; +} diff --git a/lib/sandbox/getSandboxExpiresAtDate.ts b/lib/sandbox/getSandboxExpiresAtDate.ts new file mode 100644 index 000000000..fe26a6ef8 --- /dev/null +++ b/lib/sandbox/getSandboxExpiresAtDate.ts @@ -0,0 +1,16 @@ +/** + * Reads the runtime `expiresAt` field (epoch milliseconds) off a + * sandbox state and converts it to an ISO-8601 string suitable for + * persisting to `sessions.sandbox_expires_at`. Returns null when the + * state has no `expiresAt` (paused sandboxes or empty stubs). + * + * @param state - The `sandbox_state` JSON value, typically from + * `sandbox.getState()`. + * @returns ISO timestamp string, or null when no expiry is set. + */ +export function getSandboxExpiresAtDate(state: unknown): string | null { + if (!state || typeof state !== "object") return null; + const expiresAt = (state as { expiresAt?: unknown }).expiresAt; + if (typeof expiresAt !== "number") return null; + return new Date(expiresAt).toISOString(); +} diff --git a/lib/sandbox/getSandboxStatusHandler.ts b/lib/sandbox/getSandboxStatusHandler.ts index 8c9e1b988..e2b2f39f4 100644 --- a/lib/sandbox/getSandboxStatusHandler.ts +++ b/lib/sandbox/getSandboxStatusHandler.ts @@ -1,8 +1,10 @@ -import { NextRequest, NextResponse } from "next/server"; +import { NextRequest, NextResponse, after } from "next/server"; import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; import { validateAuthContext } from "@/lib/auth/validateAuthContext"; import { buildLifecycle } from "@/lib/sandbox/buildLifecycle"; +import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; import { isSandboxActive } from "@/lib/sandbox/isSandboxActive"; +import { kickSandboxLifecycleWorkflow } from "@/lib/sandbox/kickSandboxLifecycleWorkflow"; import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; /** @@ -12,6 +14,12 @@ import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; * non-expired `sandbox_state` (with real runtime metadata), otherwise * `"no_sandbox"`. `hasSnapshot` is true when the row records a saved * snapshot the UI can offer to resume. + * + * Side-effect: when the row reports `lifecycle_state: "active"` but + * the lifecycle is past due (workflow never woke or its lease died), + * fires a `status-check-overdue` lifecycle kick. The kick reclaims + * stale leases and starts a fresh workflow run — that's how the + * lifecycle FSM self-heals from crashed workflows. */ export async function getSandboxStatusHandler(request: NextRequest): Promise { const auth = await validateAuthContext(request); @@ -44,9 +52,19 @@ export async function getSandboxStatusHandler(request: NextRequest): Promise= getLifecycleDueAtMs(row)) { + kickSandboxLifecycleWorkflow({ + sessionId: row.id, + reason: "status-check-overdue", + scheduleBackgroundWork: task => after(() => task), + }); + } + return NextResponse.json( { - status: isSandboxActive(row) ? "active" : "no_sandbox", + status: active ? "active" : "no_sandbox", hasSnapshot: !!row.snapshot_url, lifecycleVersion: row.lifecycle_version, lifecycle: buildLifecycle(row), diff --git a/lib/sandbox/hasActiveStreamForSession.ts b/lib/sandbox/hasActiveStreamForSession.ts new file mode 100644 index 000000000..3529a5f3d --- /dev/null +++ b/lib/sandbox/hasActiveStreamForSession.ts @@ -0,0 +1,15 @@ +import { selectChats } from "@/lib/supabase/chats/selectChats"; + +/** + * True when any chat in the session has an `active_stream_id` set, + * indicating an in-flight assistant stream. The lifecycle workflow + * uses this to defer hibernation while a chat is actively being + * served — pausing the sandbox mid-stream would 500 the response. + * + * @param sessionId - The session to check. + * @returns true when at least one chat has an active stream id. + */ +export async function hasActiveStreamForSession(sessionId: string): Promise { + const chats = await selectChats({ sessionId }); + return chats.some(chat => chat.active_stream_id !== null); +} diff --git a/lib/sandbox/hasPausedSandboxState.ts b/lib/sandbox/hasPausedSandboxState.ts new file mode 100644 index 000000000..c78000ee8 --- /dev/null +++ b/lib/sandbox/hasPausedSandboxState.ts @@ -0,0 +1,14 @@ +import { getResumableSandboxName } from "@/lib/sandbox/getResumableSandboxName"; +import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; + +/** + * True when the state describes a sandbox that is paused but resumable + * — has a durable name yet no live runtime metadata. Used by the UI to + * decide whether to show "Resume" affordances. + * + * @param state - The persisted `sandbox_state` JSON value. + * @returns true when the sandbox is paused-but-resumable. + */ +export function hasPausedSandboxState(state: unknown): boolean { + return getResumableSandboxName(state) !== null && !hasRuntimeSandboxState(state); +} diff --git a/lib/sandbox/isLifecycleRunStale.ts b/lib/sandbox/isLifecycleRunStale.ts new file mode 100644 index 000000000..f44b9a493 --- /dev/null +++ b/lib/sandbox/isLifecycleRunStale.ts @@ -0,0 +1,19 @@ +import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; +import { SANDBOX_LIFECYCLE_STALE_RUN_GRACE_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; +import type { Tables } from "@/types/database.types"; + +/** + * True when a session's `lifecycle_run_id` lease is past the calculated + * due time + grace window. Indicates the workflow that owned the lease + * either crashed or never started — the kick can safely reclaim the + * lease and start a fresh run. + * + * @param session - The `sessions` row. + * @returns true when the lease is stale and should be reclaimed. + */ +export function isLifecycleRunStale(session: Tables<"sessions">): boolean { + if (!session.lifecycle_run_id) return false; + if (session.lifecycle_state !== "active") return false; + const overdueMs = Date.now() - getLifecycleDueAtMs(session); + return overdueMs > SANDBOX_LIFECYCLE_STALE_RUN_GRACE_MS; +} diff --git a/lib/sandbox/kickSandboxLifecycleWorkflow.ts b/lib/sandbox/kickSandboxLifecycleWorkflow.ts new file mode 100644 index 000000000..f1b29a334 --- /dev/null +++ b/lib/sandbox/kickSandboxLifecycleWorkflow.ts @@ -0,0 +1,41 @@ +import { runKick } from "@/lib/sandbox/runKick"; +import type { SandboxLifecycleReason } from "@/lib/sandbox/sandboxLifecycleTypes"; + +interface KickInput { + sessionId: string; + reason: SandboxLifecycleReason; + /** + * Optional scheduler for the kick chain. Callers in serverless + * contexts should pass `task => after(() => task)` (or + * `waitUntil(task)`) so the platform keeps the function alive + * until the chain completes — without it the chain dies when the + * request returns. Mirrors open-agents' `scheduleBackgroundWork` + * parameter. + */ + scheduleBackgroundWork?: (task: Promise) => void; +} + +/** + * Fire-and-forget kick of the sandbox-lifecycle workflow. Used by: + * + * - `POST /api/sandbox` (reason: `sandbox-created`) — register a + * freshly-provisioned sandbox so the workflow auto-pauses it after + * `SANDBOX_INACTIVITY_TIMEOUT_MS` of idle time + * - `GET /api/sandbox/status` (reason: `status-check-overdue`) — poke + * the workflow when a status read sees stale lifecycle state + * + * Wraps `runKick` in error handling and the optional background-work + * scheduler. `runKick` itself owns all the decision logic. + */ +export function kickSandboxLifecycleWorkflow(input: KickInput): void { + const task = runKick({ sessionId: input.sessionId, reason: input.reason }).catch(error => + console.error(`[kickSandboxLifecycleWorkflow] failed for session ${input.sessionId}:`, error), + ); + + if (input.scheduleBackgroundWork) { + input.scheduleBackgroundWork(task); + return; + } + + void task; +} diff --git a/lib/sandbox/reclaimStaleLease.ts b/lib/sandbox/reclaimStaleLease.ts new file mode 100644 index 000000000..857335a84 --- /dev/null +++ b/lib/sandbox/reclaimStaleLease.ts @@ -0,0 +1,18 @@ +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; +import type { Tables } from "@/types/database.types"; + +/** + * Clears a stale `lifecycle_run_id` lease and re-reads the row so the + * caller has the post-clear snapshot to work with. Used by the kick + * logic when `isLifecycleRunStale` returns true. + * + * @param sessionId - The session whose lease to reclaim. + * @returns The session row with `lifecycle_run_id: null`, or null if + * the row vanished between the clear and the re-read. + */ +export async function reclaimStaleLease(sessionId: string): Promise | null> { + await updateSession(sessionId, { lifecycle_run_id: null }); + const rows = await selectSessions({ id: sessionId }); + return rows[0] ?? null; +} diff --git a/lib/sandbox/restoreActiveLifecycleState.ts b/lib/sandbox/restoreActiveLifecycleState.ts new file mode 100644 index 000000000..fbdbacbfb --- /dev/null +++ b/lib/sandbox/restoreActiveLifecycleState.ts @@ -0,0 +1,23 @@ +import { getSandboxExpiresAtDate } from "@/lib/sandbox/getSandboxExpiresAtDate"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; + +/** + * Re-marks a session as `lifecycle_state: "active"` after a transient + * skip during evaluation (e.g. an active chat stream took precedence + * over hibernation, or the user extended the timing while the + * workflow was about to pause). Refreshes `sandbox_expires_at` from + * the live sandbox state so the row matches reality. + * + * @param sessionId - The session id. + * @param sandboxState - The live `sandbox_state` JSON value. + */ +export async function restoreActiveLifecycleState( + sessionId: string, + sandboxState: unknown, +): Promise { + await updateSession(sessionId, { + lifecycle_state: "active", + lifecycle_error: null, + sandbox_expires_at: getSandboxExpiresAtDate(sandboxState), + }); +} diff --git a/lib/sandbox/runKick.ts b/lib/sandbox/runKick.ts new file mode 100644 index 000000000..f5e1fa918 --- /dev/null +++ b/lib/sandbox/runKick.ts @@ -0,0 +1,58 @@ +import { start } from "workflow/api"; +import { sandboxLifecycleWorkflow } from "@/app/workflows/sandboxLifecycleWorkflow"; +import { createLifecycleRunId } from "@/lib/sandbox/createLifecycleRunId"; +import { isLifecycleRunStale } from "@/lib/sandbox/isLifecycleRunStale"; +import { reclaimStaleLease } from "@/lib/sandbox/reclaimStaleLease"; +import { shouldStartLifecycle } from "@/lib/sandbox/shouldStartLifecycle"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; +import type { SandboxLifecycleReason } from "@/lib/sandbox/sandboxLifecycleTypes"; + +interface RunKickInput { + sessionId: string; + reason: SandboxLifecycleReason; +} + +/** + * The async chain that the lifecycle kick runs in the background: + * + * 1. Read the session row + * 2. Reclaim the lease if it's stale (workflow crashed) + * 3. Skip if the session isn't in a shape where lifecycle makes sense + * (`shouldStartLifecycle` already filters out rows that already + * have a `lifecycle_run_id` — best-effort concurrency guard) + * 4. Generate a fresh run id and write it to the session row + * 5. `start()` the Vercel Workflow run + * 6. On `start()` failure, clear the lease so retry can succeed + * + * Errors are caught at the call site (`kickSandboxLifecycleWorkflow`) + * and never surface to the request — this is fire-and-forget by + * design. + */ +export async function runKick(input: RunKickInput): Promise { + const rows = await selectSessions({ id: input.sessionId }); + const session = rows[0]; + if (!session) return; + + const sessionForStart = isLifecycleRunStale(session) + ? await reclaimStaleLease(input.sessionId) + : session; + + if (!shouldStartLifecycle(sessionForStart)) return; + + const runId = createLifecycleRunId(); + await updateSession(input.sessionId, { lifecycle_run_id: runId }); + + try { + const run = await start(sandboxLifecycleWorkflow, [input.sessionId, input.reason, runId]); + console.log( + `[kickSandboxLifecycleWorkflow] started run ${run.runId} for session ${input.sessionId} (reason=${input.reason})`, + ); + } catch (error) { + console.error( + `[kickSandboxLifecycleWorkflow] failed to start workflow for session ${input.sessionId}; clearing lease:`, + error, + ); + await updateSession(input.sessionId, { lifecycle_run_id: null }); + } +} diff --git a/lib/sandbox/sandboxLifecycleConfig.ts b/lib/sandbox/sandboxLifecycleConfig.ts new file mode 100644 index 000000000..3f474a48c --- /dev/null +++ b/lib/sandbox/sandboxLifecycleConfig.ts @@ -0,0 +1,35 @@ +/** + * Lifecycle workflow tuning. All values copied from open-agents' + * `lib/sandbox/config.ts` and intentionally kept in sync — these are + * the timings the sandbox lifecycle FSM uses to decide when to wake + * up, hibernate, and detect stale runs. + */ + +/** + * How long a sandbox can be idle before the lifecycle workflow pauses + * it. 5 minutes — short enough to free compute aggressively, long + * enough that a user briefly switching tabs doesn't lose their + * session. + */ +export const SANDBOX_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000; + +/** + * Buffer subtracted from `sandbox_expires_at` when computing the + * lifecycle wake time. Pause a few seconds before Vercel hard-stops + * the sandbox so we have a chance to do graceful cleanup. + */ +export const SANDBOX_EXPIRES_BUFFER_MS = 10_000; + +/** + * Floor on the workflow's `sleep(date)` duration. Prevents tight loops + * when the calculated wake time is in the past or extremely soon. + */ +export const SANDBOX_LIFECYCLE_MIN_SLEEP_MS = 5_000; + +/** + * How far past the calculated due time a session's `lifecycle_run_id` + * may sit before the kick logic considers the run dead and reclaims + * the lease. Prevents a crashed workflow from blocking future kicks + * forever. + */ +export const SANDBOX_LIFECYCLE_STALE_RUN_GRACE_MS = 2 * 60 * 1000; diff --git a/lib/sandbox/sandboxLifecycleTypes.ts b/lib/sandbox/sandboxLifecycleTypes.ts new file mode 100644 index 000000000..44f77bf20 --- /dev/null +++ b/lib/sandbox/sandboxLifecycleTypes.ts @@ -0,0 +1,36 @@ +/** + * Lifecycle FSM states stored as `lifecycle_state` on the `sessions` + * row. The workflow transitions through these as it pauses, restores, + * and recovers from failure. + */ +export type SandboxLifecycleState = + | "provisioning" + | "active" + | "hibernating" + | "hibernated" + | "restoring" + | "archived" + | "failed"; + +/** + * Reason the lifecycle workflow was kicked. Each callsite passes a + * specific value so observability can trace which path triggered an + * evaluation. Mirrored from open-agents — extend / snapshot reasons + * are listed for parity even though those endpoints aren't ported yet. + */ +export type SandboxLifecycleReason = + | "sandbox-created" + | "timeout-extended" + | "snapshot-restored" + | "reconnect" + | "manual-stop" + | "status-check-overdue"; + +/** + * Result of a single lifecycle evaluation pass — what the workflow + * returns to the kick caller for logging. + */ +export interface SandboxLifecycleEvaluationResult { + action: "skipped" | "hibernated" | "failed"; + reason?: string; +} diff --git a/lib/sandbox/shouldStartLifecycle.ts b/lib/sandbox/shouldStartLifecycle.ts new file mode 100644 index 000000000..9294e37a8 --- /dev/null +++ b/lib/sandbox/shouldStartLifecycle.ts @@ -0,0 +1,24 @@ +import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; +import type { Tables } from "@/types/database.types"; + +/** + * Predicate for the kick logic: returns true when a session is in a + * shape where starting a lifecycle workflow makes sense. Filters out + * archived sessions, sessions without a runtime sandbox, non-vercel + * sandbox types, and sessions that already have a lifecycle run in + * flight. + * + * @param session - The `sessions` row, or null. + * @returns true when a lifecycle run should be started. + */ +export function shouldStartLifecycle( + session: Tables<"sessions"> | null, +): session is Tables<"sessions"> { + if (!session) return false; + if (session.status === "archived" || session.lifecycle_state === "archived") return false; + if (!session.sandbox_state) return false; + if (!hasRuntimeSandboxState(session.sandbox_state)) return false; + if ((session.sandbox_state as { type?: unknown }).type !== "vercel") return false; + if (session.lifecycle_run_id) return false; + return true; +} diff --git a/lib/sandbox/wasLifecycleTimingExtended.ts b/lib/sandbox/wasLifecycleTimingExtended.ts new file mode 100644 index 000000000..c3980a1df --- /dev/null +++ b/lib/sandbox/wasLifecycleTimingExtended.ts @@ -0,0 +1,30 @@ +import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; +import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import type { Tables } from "@/types/database.types"; + +/** + * True when, between the workflow's wake-up and the moment of + * evaluation, another caller (an `/extend` or `/activity` hit, or a + * snapshot resume) updated the session's lifecycle timing fields and + * the new due time is still in the future. Tells the evaluator to + * skip hibernation this pass and let the workflow loop sleep again. + * + * @param sessionId - The session id. + * @param prior - The session row read at the start of evaluation. + * @returns true when timing was extended; false otherwise. + */ +export async function wasLifecycleTimingExtended( + sessionId: string, + prior: Tables<"sessions">, +): Promise { + const refreshed = (await selectSessions({ id: sessionId }))[0]; + if (!refreshed?.sandbox_state || !hasRuntimeSandboxState(refreshed.sandbox_state)) return false; + + const timingChanged = + refreshed.last_activity_at !== prior.last_activity_at || + refreshed.hibernate_after !== prior.hibernate_after || + refreshed.sandbox_expires_at !== prior.sandbox_expires_at; + + return timingChanged && Date.now() < getLifecycleDueAtMs(refreshed); +} diff --git a/lib/supabase/chats/selectChats.ts b/lib/supabase/chats/selectChats.ts new file mode 100644 index 000000000..e36c0454c --- /dev/null +++ b/lib/supabase/chats/selectChats.ts @@ -0,0 +1,30 @@ +import supabase from "@/lib/supabase/serverClient"; +import type { Tables } from "@/types/database.types"; + +interface SelectChatsFilter { + /** Optional id filter — when set, returns at most one row. */ + id?: string; + /** Optional session filter — when set, returns every chat in the session. */ + sessionId?: string; +} + +/** + * General-purpose `chats` reader. Pass any combination of filters to + * narrow the result set; an unset filter is ignored. Mirrors the + * `selectSessions` pattern. Returns [] on Supabase error after logging. + * + * @param filter - Optional filters narrowing the query. + * @returns Matching chat rows, or [] on error / no match. + */ +export async function selectChats(filter: SelectChatsFilter = {}): Promise[]> { + let query = supabase.from("chats").select("*"); + if (filter.id) query = query.eq("id", filter.id); + if (filter.sessionId) query = query.eq("session_id", filter.sessionId); + + const { data, error } = await query; + if (error) { + console.error("[selectChats] error:", error); + return []; + } + return data ?? []; +}