-
Notifications
You must be signed in to change notification settings - Fork 9
feat(sandbox): port sandbox lifecycle workflow + kicks (Phase 2) #533
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4b70015
34b1041
30ef641
22095c2
c37821d
2bf8cf9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; | ||
| import { updateSession } from "@/lib/supabase/sessions/updateSession"; | ||
|
|
||
| /** | ||
| * Workflow step that clears `lifecycle_run_id` only if it still | ||
| * matches the supplied `runId`. Used at the end of a workflow run to | ||
| * release the lease without clobbering one that's been reclaimed by | ||
| * a kick. | ||
| * | ||
| * @param sessionId - The session id. | ||
| * @param runId - The lease this workflow owned; only clear if it | ||
| * still matches. | ||
| */ | ||
| export async function clearLifecycleRunIdIfOwned(sessionId: string, runId: string): Promise<void> { | ||
| "use step"; | ||
|
|
||
| const rows = await selectSessions({ id: sessionId }); | ||
| const session = rows[0]; | ||
| if (!session || session.lifecycle_run_id !== runId) return; | ||
|
|
||
| await updateSession(sessionId, { lifecycle_run_id: null }); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| import { getLifecycleDueAtMs } from "@/lib/sandbox/getLifecycleDueAtMs"; | ||
| import { hasRuntimeSandboxState } from "@/lib/sandbox/hasRuntimeSandboxState"; | ||
| import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; | ||
|
|
||
| interface LifecycleWakeDecision { | ||
| shouldContinue: boolean; | ||
| wakeAtMs?: number; | ||
| reason?: string; | ||
| } | ||
|
|
||
| /** | ||
| * Workflow step run at the top of each `sandboxLifecycleWorkflow` | ||
| * iteration. Reads the session, decides whether to continue looping, | ||
| * and (when continuing) returns the next wake time. Bails when a | ||
| * concurrent kick has overwritten `lifecycle_run_id` with a different | ||
| * value — that newer run is now responsible for the session. | ||
| * | ||
| * @param sessionId - The session id the workflow is tracking. | ||
| * @param runId - The lease this workflow run owns. | ||
| * @returns A decision object with continuation flag, wake time, and | ||
| * skip reason (when terminating). | ||
| */ | ||
| export async function computeLifecycleWakeDecision( | ||
| sessionId: string, | ||
| runId: string, | ||
| ): Promise<LifecycleWakeDecision> { | ||
| "use step"; | ||
|
|
||
| const rows = await selectSessions({ id: sessionId }); | ||
| const session = rows[0]; | ||
| if (!session) return { shouldContinue: false, reason: "session-not-found" }; | ||
| if (session.status === "archived" || session.lifecycle_state === "archived") { | ||
| return { shouldContinue: false, reason: "session-archived" }; | ||
| } | ||
| if ( | ||
| !hasRuntimeSandboxState(session.sandbox_state) || | ||
| (session.sandbox_state as { type?: unknown } | null)?.type !== "vercel" | ||
| ) { | ||
| return { shouldContinue: false, reason: "sandbox-not-operable" }; | ||
| } | ||
|
|
||
| if (session.lifecycle_run_id !== null && session.lifecycle_run_id !== runId) { | ||
| return { shouldContinue: false, reason: "run-replaced" }; | ||
| } | ||
|
|
||
| return { shouldContinue: true, wakeAtMs: getLifecycleDueAtMs(session) }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| import { evaluateSandboxLifecycle } from "@/lib/sandbox/evaluateSandboxLifecycle"; | ||
| import type { | ||
| SandboxLifecycleEvaluationResult, | ||
| SandboxLifecycleReason, | ||
| } from "@/lib/sandbox/sandboxLifecycleTypes"; | ||
|
|
||
| /** | ||
| * Workflow step that runs a single lifecycle evaluation pass. Thin | ||
| * wrapper around `evaluateSandboxLifecycle` so the workflow | ||
| * orchestrator gets a step boundary (with `"use step"` durability | ||
| * semantics) on each evaluation. | ||
| * | ||
| * @param sessionId - The session whose sandbox to evaluate. | ||
| * @param reason - Why the workflow was triggered (for logging only). | ||
| * @returns The result of this single evaluation pass. | ||
| */ | ||
| export async function runLifecycleEvaluation( | ||
| sessionId: string, | ||
| reason: SandboxLifecycleReason, | ||
| ): Promise<SandboxLifecycleEvaluationResult> { | ||
| "use step"; | ||
| return evaluateSandboxLifecycle(sessionId, reason); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| import { sleep } from "workflow"; | ||
| import { clearLifecycleRunIdIfOwned } from "@/app/workflows/clearLifecycleRunIdIfOwned"; | ||
| import { computeLifecycleWakeDecision } from "@/app/workflows/computeLifecycleWakeDecision"; | ||
| import { runLifecycleEvaluation } from "@/app/workflows/runLifecycleEvaluation"; | ||
| import { SANDBOX_LIFECYCLE_MIN_SLEEP_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; | ||
| import type { SandboxLifecycleReason } from "@/lib/sandbox/sandboxLifecycleTypes"; | ||
|
|
||
| /** | ||
| * Vercel Workflow that pauses idle sandboxes. Runs as a `while(true)` | ||
| * loop: compute next wake time → `sleep(date)` → evaluate → either | ||
| * loop (when not-due-yet or active-stream defers) or terminate | ||
| * (hibernated / failed / sandbox gone). Holds the | ||
| * `lifecycle_run_id` lease throughout so concurrent kicks can't | ||
| * spawn duplicate workflows. | ||
| */ | ||
| export async function sandboxLifecycleWorkflow( | ||
| sessionId: string, | ||
| reason: SandboxLifecycleReason, | ||
| runId: string, | ||
| ) { | ||
| "use workflow"; | ||
|
|
||
| while (true) { | ||
| const decision = await computeLifecycleWakeDecision(sessionId, runId); | ||
| if (!decision.shouldContinue || decision.wakeAtMs === undefined) { | ||
| await clearLifecycleRunIdIfOwned(sessionId, runId); | ||
| return { skipped: true, reason: decision.reason ?? "no-decision" }; | ||
| } | ||
|
|
||
| const wakeAtMs = Math.max(decision.wakeAtMs, Date.now() + SANDBOX_LIFECYCLE_MIN_SLEEP_MS); | ||
| await sleep(new Date(wakeAtMs)); | ||
|
|
||
| const evaluation = await runLifecycleEvaluation(sessionId, reason); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. P1: Revalidate lifecycle lease after Prompt for AI agents |
||
|
|
||
| if ( | ||
| evaluation.action === "skipped" && | ||
| (evaluation.reason === "not-due-yet" || evaluation.reason === "active-workflow") | ||
| ) { | ||
| continue; | ||
| } | ||
|
|
||
| await clearLifecycleRunIdIfOwned(sessionId, runId); | ||
| return { skipped: false, evaluation }; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| import { buildLifecycleActivityUpdate } from "@/lib/sandbox/buildLifecycleActivityUpdate"; | ||
| import { getSandboxExpiresAtDate } from "@/lib/sandbox/getSandboxExpiresAtDate"; | ||
| import type { TablesUpdate } from "@/types/database.types"; | ||
|
|
||
| /** | ||
| * Builds the lifecycle-related fields to write when transitioning a | ||
| * session into the `active` state right after a sandbox has been | ||
| * provisioned or resumed. Combines `buildLifecycleActivityUpdate` | ||
| * with the sandbox's own `expiresAt` so the row's | ||
| * `sandbox_expires_at` matches the freshly-probed runtime expiry. | ||
| * | ||
| * @param sandboxState - The `sandbox_state` JSON value, typically | ||
| * from `sandbox.getState()`. | ||
| * @param options.activityAt - Optional override for "now". | ||
| * @param options.lifecycleState - Defaults to `"active"`; pass | ||
| * `"restoring"` for the snapshot-resume path. | ||
| * @returns A partial Supabase update object. | ||
| */ | ||
| export function buildActiveLifecycleUpdate( | ||
| sandboxState: unknown, | ||
| options?: { activityAt?: Date; lifecycleState?: "active" | "restoring" }, | ||
| ): TablesUpdate<"sessions"> { | ||
| return { | ||
| ...buildLifecycleActivityUpdate(options?.activityAt, options?.lifecycleState ?? "active"), | ||
| sandbox_expires_at: getSandboxExpiresAtDate(sandboxState), | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| import type { TablesUpdate } from "@/types/database.types"; | ||
|
|
||
| /** | ||
| * Builds the lifecycle-related fields to write when the workflow | ||
| * pauses a sandbox. Clears `sandbox_expires_at`, `hibernate_after`, | ||
| * and the `lifecycle_run_id` lease so a future kick can claim it. | ||
| * Note the caller is responsible for separately clearing | ||
| * `sandbox_state` runtime metadata via `clearSandboxState`. | ||
| * | ||
| * @returns A partial Supabase update object. | ||
| */ | ||
| export function buildHibernatedLifecycleUpdate(): TablesUpdate<"sessions"> { | ||
| return { | ||
| lifecycle_state: "hibernated", | ||
| sandbox_expires_at: null, | ||
| hibernate_after: null, | ||
| lifecycle_run_id: null, | ||
| lifecycle_error: null, | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| import { SANDBOX_INACTIVITY_TIMEOUT_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; | ||
| import type { TablesUpdate } from "@/types/database.types"; | ||
|
|
||
| /** | ||
| * Builds the lifecycle-related fields to write when refreshing a | ||
| * sandbox's "last activity" — used by `/api/sandbox/activity` and | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. P3: Custom agent: Flag AI Slop and Fabricated Changes Doc comment references a non-existent Prompt for AI agents |
||
| * by `buildActiveLifecycleUpdate`. Sets `last_activity_at` to now, | ||
| * pushes `hibernate_after` out by SANDBOX_INACTIVITY_TIMEOUT_MS, and | ||
| * clears any stale `lifecycle_error`. Defaults `lifecycle_state` to | ||
| * `"active"` but accepts `"restoring"` for the snapshot-resume path. | ||
| * | ||
| * @param activityAt - Optional override for "now"; defaults to current time. | ||
| * @param lifecycleState - Defaults to `"active"`. | ||
| * @returns A partial Supabase update object. | ||
| */ | ||
| export function buildLifecycleActivityUpdate( | ||
| activityAt: Date = new Date(), | ||
| lifecycleState: "active" | "restoring" = "active", | ||
| ): Pick< | ||
| TablesUpdate<"sessions">, | ||
| "lifecycle_state" | "lifecycle_error" | "last_activity_at" | "hibernate_after" | ||
| > { | ||
| return { | ||
| lifecycle_state: lifecycleState, | ||
| lifecycle_error: null, | ||
| last_activity_at: activityAt.toISOString(), | ||
| hibernate_after: new Date(activityAt.getTime() + SANDBOX_INACTIVITY_TIMEOUT_MS).toISOString(), | ||
| }; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| import { getPersistentSandboxName } from "@/lib/sandbox/getPersistentSandboxName"; | ||
|
|
||
| /** | ||
| * Strips runtime metadata (expiresAt, etc.) from a sandbox state while | ||
| * preserving the durable resume handle (sandboxName) so a future | ||
| * `connectSandbox` can pick it back up. Used by the lifecycle | ||
| * workflow when transitioning to `hibernated`. | ||
| * | ||
| * @param state - The current `sandbox_state` JSON value. | ||
| * @returns A trimmed state with only `type` + `sandboxName`, or null | ||
| * when the input is null. | ||
| */ | ||
| export function clearSandboxState(state: unknown): { type: string; sandboxName?: string } | null { | ||
| if (!state || typeof state !== "object") return null; | ||
|
|
||
| const sandboxName = getPersistentSandboxName(state); | ||
| const type = (state as { type?: unknown }).type; | ||
|
|
||
| return { | ||
| type: typeof type === "string" ? type : "vercel", | ||
| ...(sandboxName ? { sandboxName } : {}), | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| import { isoToEpochMs } from "@/lib/sandbox/isoToEpochMs"; | ||
| import { SANDBOX_EXPIRES_BUFFER_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; | ||
| import type { Tables } from "@/types/database.types"; | ||
|
|
||
| /** | ||
| * Computes when a session's sandbox should hibernate due to expiry — | ||
| * `sandbox_expires_at` minus a small buffer so we pause before | ||
| * Vercel's hard timeout. Returns null when the row has no expiry set | ||
| * (paused sandbox, type stub). | ||
| * | ||
| * @param row - The `sessions` row. | ||
| * @returns Epoch ms of the expiry due time, or null when not applicable. | ||
| */ | ||
| export function computeExpiryDueAtMs( | ||
| row: Pick<Tables<"sessions">, "sandbox_expires_at">, | ||
| ): number | null { | ||
| const expiresAt = isoToEpochMs(row.sandbox_expires_at); | ||
| if (expiresAt === null) return null; | ||
| return expiresAt - SANDBOX_EXPIRES_BUFFER_MS; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| import { isoToEpochMs } from "@/lib/sandbox/isoToEpochMs"; | ||
| import { SANDBOX_INACTIVITY_TIMEOUT_MS } from "@/lib/sandbox/sandboxLifecycleConfig"; | ||
| import type { Tables } from "@/types/database.types"; | ||
|
|
||
| /** | ||
| * Computes when a session's sandbox should hibernate due to inactivity. | ||
| * Prefers `hibernate_after` if set; otherwise computes from the most | ||
| * recent activity timestamp + SANDBOX_INACTIVITY_TIMEOUT_MS. | ||
| * | ||
| * @param row - The `sessions` row. | ||
| * @returns Epoch ms of the inactivity due time. | ||
| */ | ||
| export function computeInactivityDueAtMs( | ||
| row: Pick<Tables<"sessions">, "hibernate_after" | "last_activity_at" | "updated_at">, | ||
| ): number { | ||
| const hibernateAfter = isoToEpochMs(row.hibernate_after); | ||
| if (hibernateAfter !== null) return hibernateAfter; | ||
| const lastActivity = isoToEpochMs(row.last_activity_at); | ||
| const fallback = lastActivity ?? isoToEpochMs(row.updated_at) ?? Date.now(); | ||
| return fallback + SANDBOX_INACTIVITY_TIMEOUT_MS; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| /** | ||
| * Generates a unique identifier for a sandbox-lifecycle workflow run. | ||
| * Format: `lifecycle:<timestamp-ms>:<uuid-v4>` — the timestamp gives | ||
| * humans a quick sanity check of when the run started, the UUID | ||
| * guarantees uniqueness across concurrent kicks. | ||
| * | ||
| * @returns A new run id string. | ||
| */ | ||
| export function createLifecycleRunId(): string { | ||
| return `lifecycle:${Date.now()}:${crypto.randomUUID()}`; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| import ms from "ms"; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. P1: Custom agent: Enforce Clear Code Style and Maintainability Practices File exceeds the rule’s 100-line limit and mixes several concerns, including the newly added background scheduling logic. Prompt for AI agents |
||
| import { NextRequest, NextResponse } from "next/server"; | ||
| import { NextRequest, NextResponse, after } from "next/server"; | ||
| import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; | ||
| import { validateCreateSandboxBody } from "@/lib/sandbox/validateCreateSandboxBody"; | ||
| import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; | ||
|
|
@@ -8,6 +8,7 @@ import { findOrgSnapshot } from "@/lib/sandbox/findOrgSnapshot"; | |
| import { getSessionSandboxName } from "@/lib/sandbox/getSessionSandboxName"; | ||
| import { installSessionGlobalSkills } from "@/lib/sandbox/installSessionGlobalSkills"; | ||
| import { kickBuildOrgSnapshotWorkflow } from "@/lib/sandbox/kickBuildOrgSnapshotWorkflow"; | ||
| import { kickSandboxLifecycleWorkflow } from "@/lib/sandbox/kickSandboxLifecycleWorkflow"; | ||
| import { extractOrgRepoName } from "@/lib/recoupable/extractOrgRepoName"; | ||
| import { updateSession } from "@/lib/supabase/sessions/updateSession"; | ||
| import { getServiceGithubToken } from "@/lib/github/getServiceGithubToken"; | ||
|
|
@@ -142,6 +143,19 @@ export async function createSandboxHandler(request: NextRequest): Promise<NextRe | |
| error, | ||
| ); | ||
| } | ||
|
|
||
| // Register the new sandbox with the lifecycle workflow so it gets | ||
| // auto-paused after SANDBOX_INACTIVITY_TIMEOUT_MS of idle. The | ||
| // kick chain (selectSessions → claim lease → start workflow) is | ||
| // registered with `after()` so the serverless platform keeps the | ||
| // function alive past the response until the chain completes — | ||
| // without that, the chain dies on function teardown and the | ||
| // workflow never starts. Failures are logged and never surfaced. | ||
| kickSandboxLifecycleWorkflow({ | ||
| sessionId: sessionRow.id, | ||
| reason: "sandbox-created", | ||
| scheduleBackgroundWork: task => after(() => task), | ||
| }); | ||
| } | ||
|
|
||
| return NextResponse.json( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use an atomic conditional update to avoid lease clobbering.
Lines 17–21 have a TOCTOU race: a new workflow can claim
lifecycle_run_idafter the read and before the write, then this step clears the new owner’s lease. This should be a single DB update withWHERE id = sessionId AND lifecycle_run_id = runId.🤖 Prompt for AI Agents