From 7cbd3aa7137fe3b9d165cb409036848af05b0a38 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:44:11 +0000 Subject: [PATCH 01/14] feat(supervisor): add ComputeWorkloadManager for compute gateway Add a third WorkloadManager implementation that creates sandboxes via the compute gateway HTTP API (POST /api/sandboxes). Uses native fetch with no new dependencies. Enabled by setting COMPUTE_GATEWAY_URL, which takes priority over Kubernetes and Docker providers. --- apps/supervisor/src/env.ts | 4 + apps/supervisor/src/index.ts | 13 +- .../supervisor/src/workloadManager/compute.ts | 116 ++++++++++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 apps/supervisor/src/workloadManager/compute.ts diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index faf34bcd025..77f34d04867 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -77,6 +77,10 @@ const Env = z.object({ */ DOCKER_RUNNER_NETWORKS: z.string().default("host"), + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), KUBERNETES_NAMESPACE: z.string().default("default"), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 0e274b30390..bd16f54dd2b 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -14,6 +14,7 @@ import { } from "./resourceMonitor.js"; import { KubernetesWorkloadManager } from "./workloadManager/kubernetes.js"; import { DockerWorkloadManager } from "./workloadManager/docker.js"; +import { ComputeWorkloadManager } from "./workloadManager/compute.js"; import { HttpServer, CheckpointClient, @@ -77,9 +78,15 @@ class ManagedSupervisor { : new DockerResourceMonitor(new Docker()) : new NoopResourceMonitor(); - this.workloadManager = this.isKubernetes - ? new KubernetesWorkloadManager(workloadManagerOptions) - : new DockerWorkloadManager(workloadManagerOptions); + this.workloadManager = env.COMPUTE_GATEWAY_URL + ? new ComputeWorkloadManager({ + ...workloadManagerOptions, + gatewayUrl: env.COMPUTE_GATEWAY_URL, + gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + }) + : this.isKubernetes + ? new KubernetesWorkloadManager(workloadManagerOptions) + : new DockerWorkloadManager(workloadManagerOptions); if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts new file mode 100644 index 00000000000..a984ca7794a --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -0,0 +1,116 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { env } from "../env.js"; +import { getRunnerId } from "../util.js"; +import { tryCatch } from "@trigger.dev/core"; + +type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { + gatewayUrl: string; + gatewayAuthToken?: string; +}; + +export class ComputeWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + + constructor(private opts: ComputeWorkloadManagerOptions) { + if (!opts.workloadApiDomain) { + this.logger.warn( + "⚠️ workloadApiDomain is unset — VMs need an explicit host IP to reach the supervisor" + ); + } + } + + async create(opts: WorkloadManagerCreateOptions) { + this.logger.log("create()", { opts }); + + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + const envVars: Record = { + OTEL_EXPORTER_OTLP_ENDPOINT: env.OTEL_EXPORTER_OTLP_ENDPOINT, + TRIGGER_DEQUEUED_AT_MS: String(opts.dequeuedAt.getTime()), + TRIGGER_POD_SCHEDULED_AT_MS: String(Date.now()), + TRIGGER_ENV_ID: opts.envId, + TRIGGER_DEPLOYMENT_ID: opts.deploymentFriendlyId, + TRIGGER_DEPLOYMENT_VERSION: opts.deploymentVersion, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, + TRIGGER_RUNNER_ID: runnerId, + TRIGGER_MACHINE_CPU: String(opts.machine.cpu), + TRIGGER_MACHINE_MEMORY: String(opts.machine.memory), + PRETTY_LOGS: String(env.RUNNER_PRETTY_LOGS), + }; + + if (this.opts.warmStartUrl) { + envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; + } + + if (this.opts.metadataUrl) { + envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = String(this.opts.heartbeatIntervalSeconds); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(this.opts.snapshotPollIntervalSeconds); + } + + if (this.opts.additionalEnvVars) { + Object.assign(envVars, this.opts.additionalEnvVars); + } + + const headers: Record = { + "Content-Type": "application/json", + }; + + if (this.opts.gatewayAuthToken) { + headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; + } + + const url = `${this.opts.gatewayUrl}/api/sandboxes`; + + const [fetchError, response] = await tryCatch( + fetch(url, { + method: "POST", + headers, + body: JSON.stringify({ + image: opts.image, + env: envVars, + }), + }) + ); + + if (fetchError) { + this.logger.error("Failed to create sandbox", { error: fetchError, url }); + return; + } + + if (!response.ok) { + const [bodyError, body] = await tryCatch(response.text()); + this.logger.error("Gateway returned error", { + status: response.status, + body: bodyError ? undefined : body, + url, + }); + return; + } + + const [parseError, data] = await tryCatch(response.json()); + + if (parseError) { + this.logger.error("Failed to parse gateway response", { error: parseError }); + return; + } + + this.logger.debug("create succeeded", { sandboxId: data.id, runnerId }); + } +} From 3175a10c73947cda95d7dfc5d37a28357194f5a2 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 11:49:13 +0000 Subject: [PATCH 02/14] fix(supervisor): strip image digest in ComputeWorkloadManager --- apps/supervisor/src/workloadManager/compute.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index a984ca7794a..ad01d7b6225 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -17,10 +17,10 @@ export class ComputeWorkloadManager implements WorkloadManager { private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); constructor(private opts: ComputeWorkloadManagerOptions) { - if (!opts.workloadApiDomain) { - this.logger.warn( - "⚠️ workloadApiDomain is unset — VMs need an explicit host IP to reach the supervisor" - ); + if (opts.workloadApiDomain) { + this.logger.warn("⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); } } @@ -76,6 +76,9 @@ export class ComputeWorkloadManager implements WorkloadManager { headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; } + // Strip image digest — resolve by tag, not digest + const imageRef = opts.image.split("@")[0]!; + const url = `${this.opts.gatewayUrl}/api/sandboxes`; const [fetchError, response] = await tryCatch( @@ -83,7 +86,7 @@ export class ComputeWorkloadManager implements WorkloadManager { method: "POST", headers, body: JSON.stringify({ - image: opts.image, + image: imageRef, env: envVars, }), }) From 56ef39f813ccff71bee51ff30957849751a84a21 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:57:56 +0000 Subject: [PATCH 03/14] fix: add fetch timeout and wide event logging to ComputeWorkloadManager The fetch() call had no timeout, causing infinite hangs when the gateway accepted requests but never returned responses. Adds AbortSignal.timeout (30s) and consolidates all logging into a single structured event per create() call with timing, status, and error context. --- .../supervisor/src/workloadManager/compute.ts | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index ad01d7b6225..a35cd951d6e 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -25,8 +25,6 @@ export class ComputeWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("create()", { opts }); - const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); const envVars: Record = { @@ -81,10 +79,20 @@ export class ComputeWorkloadManager implements WorkloadManager { const url = `${this.opts.gatewayUrl}/api/sandboxes`; + const event: Record = { + runId: opts.runFriendlyId, + runnerId, + image: imageRef, + url, + }; + + const startMs = performance.now(); + const [fetchError, response] = await tryCatch( fetch(url, { method: "POST", headers, + signal: AbortSignal.timeout(30_000), body: JSON.stringify({ image: imageRef, env: envVars, @@ -92,28 +100,41 @@ export class ComputeWorkloadManager implements WorkloadManager { }) ); + event.durationMs = Math.round(performance.now() - startMs); + if (fetchError) { - this.logger.error("Failed to create sandbox", { error: fetchError, url }); + event.ok = false; + event.error = fetchError instanceof Error ? fetchError.message : String(fetchError); + event.errorType = + fetchError instanceof DOMException && fetchError.name === "TimeoutError" + ? "timeout" + : "fetch"; + this.logger.error("create sandbox", event); return; } + event.status = response.status; + if (!response.ok) { const [bodyError, body] = await tryCatch(response.text()); - this.logger.error("Gateway returned error", { - status: response.status, - body: bodyError ? undefined : body, - url, - }); + event.ok = false; + event.responseBody = bodyError ? undefined : body; + this.logger.error("create sandbox", event); return; } const [parseError, data] = await tryCatch(response.json()); if (parseError) { - this.logger.error("Failed to parse gateway response", { error: parseError }); + event.ok = false; + event.error = parseError instanceof Error ? parseError.message : String(parseError); + event.errorType = "parse"; + this.logger.error("create sandbox", event); return; } - this.logger.debug("create succeeded", { sandboxId: data.id, runnerId }); + event.ok = true; + event.sandboxId = data.id; + this.logger.log("create sandbox", event); } } From 1bccd1eb8eeddf2e825c2f941aa58f3a45401b6d Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 22:00:22 +0000 Subject: [PATCH 04/14] feat: make gateway fetch timeout configurable --- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 1 + apps/supervisor/src/workloadManager/compute.ts | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 77f34d04867..a8750221a87 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -80,6 +80,7 @@ const Env = z.object({ // Compute settings COMPUTE_GATEWAY_URL: z.string().url().optional(), COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index bd16f54dd2b..bdf3332406a 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -83,6 +83,7 @@ class ManagedSupervisor { ...workloadManagerOptions, gatewayUrl: env.COMPUTE_GATEWAY_URL, gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, }) : this.isKubernetes ? new KubernetesWorkloadManager(workloadManagerOptions) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index a35cd951d6e..26fbc99caf6 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -11,6 +11,7 @@ import { tryCatch } from "@trigger.dev/core"; type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { gatewayUrl: string; gatewayAuthToken?: string; + gatewayTimeoutMs: number; }; export class ComputeWorkloadManager implements WorkloadManager { @@ -92,7 +93,7 @@ export class ComputeWorkloadManager implements WorkloadManager { fetch(url, { method: "POST", headers, - signal: AbortSignal.timeout(30_000), + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), body: JSON.stringify({ image: imageRef, env: envVars, From 74817d75211bd371a1567767b1f28095c680e756 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 23:54:34 +0000 Subject: [PATCH 05/14] refactor(supervisor): improve ComputeWorkloadManager wide event logging Emit a single canonical log line in a finally block instead of scattered log calls at each early return. Adds business context (envId, envType, orgId, projectId, deploymentVersion, machine) and instanceName to the event. Always emits at info level with ok=true/false for queryability. --- .../supervisor/src/workloadManager/compute.ts | 102 ++++++++++-------- 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 26fbc99caf6..85a43112f0d 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -80,62 +80,70 @@ export class ComputeWorkloadManager implements WorkloadManager { const url = `${this.opts.gatewayUrl}/api/sandboxes`; + // Wide event: single canonical log line emitted in finally const event: Record = { + // High-cardinality identifiers runId: opts.runFriendlyId, runnerId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + // Environment + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + // Request image: imageRef, url, }; const startMs = performance.now(); - const [fetchError, response] = await tryCatch( - fetch(url, { - method: "POST", - headers, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - image: imageRef, - env: envVars, - }), - }) - ); - - event.durationMs = Math.round(performance.now() - startMs); - - if (fetchError) { - event.ok = false; - event.error = fetchError instanceof Error ? fetchError.message : String(fetchError); - event.errorType = - fetchError instanceof DOMException && fetchError.name === "TimeoutError" - ? "timeout" - : "fetch"; - this.logger.error("create sandbox", event); - return; + try { + const [fetchError, response] = await tryCatch( + fetch(url, { + method: "POST", + headers, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + image: imageRef, + env: envVars, + }), + }) + ); + + if (fetchError) { + event.error = fetchError instanceof Error ? fetchError.message : String(fetchError); + event.errorType = + fetchError instanceof DOMException && fetchError.name === "TimeoutError" + ? "timeout" + : "fetch"; + return; + } + + event.status = response.status; + + if (!response.ok) { + const [bodyError, body] = await tryCatch(response.text()); + event.responseBody = bodyError ? undefined : body; + return; + } + + const [parseError, data] = await tryCatch(response.json()); + + if (parseError) { + event.error = parseError instanceof Error ? parseError.message : String(parseError); + event.errorType = "parse"; + return; + } + + event.sandboxId = data.id; + event.ok = true; + } finally { + event.durationMs = Math.round(performance.now() - startMs); + event.ok ??= false; + this.logger.info("create sandbox", event); } - - event.status = response.status; - - if (!response.ok) { - const [bodyError, body] = await tryCatch(response.text()); - event.ok = false; - event.responseBody = bodyError ? undefined : body; - this.logger.error("create sandbox", event); - return; - } - - const [parseError, data] = await tryCatch(response.json()); - - if (parseError) { - event.ok = false; - event.error = parseError instanceof Error ? parseError.message : String(parseError); - event.errorType = "parse"; - this.logger.error("create sandbox", event); - return; - } - - event.ok = true; - event.sandboxId = data.id; - this.logger.log("create sandbox", event); } } From a538735ac3c873b9cf0db1037b5d3fe6b6db0f53 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 13 Feb 2026 00:08:54 +0000 Subject: [PATCH 06/14] feat(supervisor): send metadata in compute sandbox creation requests Pass business context (runId, envId, orgId, projectId, machine, etc.) as metadata on CreateSandboxRequest instead of relying on env vars. This enables wide event logging in the compute stack without parsing env or leaking secrets. --- apps/supervisor/src/workloadManager/compute.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 85a43112f0d..2cdd84a5ef4 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -109,6 +109,15 @@ export class ComputeWorkloadManager implements WorkloadManager { body: JSON.stringify({ image: imageRef, env: envVars, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, }), }) ); From ac3dadfc5b9aedf53647a8bab002de1246a650dd Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 13 Feb 2026 00:19:52 +0000 Subject: [PATCH 07/14] feat(supervisor): send machine cpu/memory in compute sandbox requests Passes machine preset cpu and memory as top-level fields on the CreateSandboxRequest so the compute stack can use them for admission control and resource allocation. --- apps/supervisor/src/workloadManager/compute.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 2cdd84a5ef4..355f8ae1760 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -109,6 +109,8 @@ export class ComputeWorkloadManager implements WorkloadManager { body: JSON.stringify({ image: imageRef, env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, metadata: { runId: opts.runFriendlyId, envId: opts.envId, From 5a7b8ce550d6ca2f512ab6fbac455635f33427aa Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:26:28 +0000 Subject: [PATCH 08/14] feat(supervisor): add dequeue and warm start timing to wide event Thread timing context from queue consumer through to the compute workload manager's wide event: - dequeueResponseMs: platform dequeue HTTP round-trip - pollingIntervalMs: which polling interval was active (idle vs active) - warmStartCheckMs: warm start check duration All fields are optional to avoid breaking existing consumers. --- apps/supervisor/src/index.ts | 7 ++++++- apps/supervisor/src/workloadManager/compute.ts | 4 ++++ apps/supervisor/src/workloadManager/types.ts | 4 ++++ .../src/v3/runEngineWorker/supervisor/consumerPool.ts | 4 ++-- .../core/src/v3/runEngineWorker/supervisor/events.ts | 2 ++ .../v3/runEngineWorker/supervisor/queueConsumer.ts | 11 ++++++++--- .../core/src/v3/runEngineWorker/supervisor/session.ts | 4 +++- 7 files changed, 29 insertions(+), 7 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index bdf3332406a..ede628f82a4 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -195,7 +195,7 @@ class ManagedSupervisor { this.workloadServer.notifyRun({ run }); }); - this.workerSession.on("runQueueMessage", async ({ time, message }) => { + this.workerSession.on("runQueueMessage", async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { this.logger.log(`Received message with timestamp ${time.toLocaleString()}`, message); if (message.completedWaitpoints.length > 0) { @@ -244,7 +244,9 @@ class ManagedSupervisor { this.logger.log("Scheduling run", { runId: message.run.id }); + const warmStartStart = performance.now(); const didWarmStart = await this.tryWarmStart(message); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); if (didWarmStart) { this.logger.log("Warm start successful", { runId: message.run.id }); @@ -260,6 +262,9 @@ class ManagedSupervisor { await this.workloadManager.create({ dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, envId: message.environment.id, envType: message.environment.type, image: message.image, diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 355f8ae1760..3363236ff49 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -93,6 +93,10 @@ export class ComputeWorkloadManager implements WorkloadManager { machine: opts.machine.name, // Environment instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + // Supervisor timing + dequeueResponseMs: opts.dequeueResponseMs, + pollingIntervalMs: opts.pollingIntervalMs, + warmStartCheckMs: opts.warmStartCheckMs, // Request image: imageRef, url, diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index 90b61957795..82c7ea7b4c0 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -24,6 +24,10 @@ export interface WorkloadManagerCreateOptions { nextAttemptNumber?: number; dequeuedAt: Date; placementTags?: PlacementTag[]; + // Timing context (populated by supervisor handler, included in wide event) + dequeueResponseMs?: number; + pollingIntervalMs?: number; + warmStartCheckMs?: number; // identifiers envId: string; envType: EnvironmentType; diff --git a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts index 2dd3d1b898b..d72cef75c7c 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts @@ -351,12 +351,12 @@ export class RunQueueConsumerPool { const consumer = this.consumerFactory({ ...this.consumerOptions, - onDequeue: async (messages) => { + onDequeue: async (messages, timing) => { // Always update queue length, default to 0 for empty dequeues or missing value this.updateQueueLength(messages[0]?.workerQueueLength ?? 0); // Forward to the original handler - await this.consumerOptions.onDequeue(messages); + await this.consumerOptions.onDequeue(messages, timing); }, }); diff --git a/packages/core/src/v3/runEngineWorker/supervisor/events.ts b/packages/core/src/v3/runEngineWorker/supervisor/events.ts index a51c504a3e6..df4a93686a9 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/events.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/events.ts @@ -6,6 +6,8 @@ export type WorkerEvents = { { time: Date; message: DequeuedMessage; + dequeueResponseMs?: number; + pollingIntervalMs?: number; }, ]; requestRunAttemptStart: [ diff --git a/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts b/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts index 4379eb54f37..76faee40809 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts @@ -15,7 +15,7 @@ export type RunQueueConsumerOptions = { preDequeue?: PreDequeueFn; preSkip?: PreSkipFn; maxRunCount?: number; - onDequeue: (messages: WorkerApiDequeueResponseBody) => Promise; + onDequeue: (messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }) => Promise; }; export class RunQueueConsumer implements QueueConsumer { @@ -23,13 +23,14 @@ export class RunQueueConsumer implements QueueConsumer { private readonly preDequeue?: PreDequeueFn; private readonly preSkip?: PreSkipFn; private readonly maxRunCount?: number; - private readonly onDequeue: (messages: WorkerApiDequeueResponseBody) => Promise; + private readonly onDequeue: (messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }) => Promise; private readonly logger = new SimpleStructuredLogger("queue-consumer"); private intervalMs: number; private idleIntervalMs: number; private isEnabled: boolean; + private lastScheduledIntervalMs: number; constructor(opts: RunQueueConsumerOptions) { this.isEnabled = false; @@ -38,6 +39,7 @@ export class RunQueueConsumer implements QueueConsumer { this.preDequeue = opts.preDequeue; this.preSkip = opts.preSkip; this.maxRunCount = opts.maxRunCount; + this.lastScheduledIntervalMs = opts.idleIntervalMs; this.onDequeue = opts.onDequeue; this.client = opts.client; } @@ -111,16 +113,18 @@ export class RunQueueConsumer implements QueueConsumer { let nextIntervalMs = this.idleIntervalMs; try { + const dequeueStart = performance.now(); const response = await this.client.dequeue({ maxResources: preDequeueResult?.maxResources, maxRunCount: this.maxRunCount, }); + const dequeueResponseMs = Math.round(performance.now() - dequeueStart); if (!response.success) { this.logger.error("Failed to dequeue", { error: response.error }); } else { try { - await this.onDequeue(response.data); + await this.onDequeue(response.data, { dequeueResponseMs, pollingIntervalMs: this.lastScheduledIntervalMs }); if (response.data.length > 0) { nextIntervalMs = this.intervalMs; @@ -141,6 +145,7 @@ export class RunQueueConsumer implements QueueConsumer { this.logger.verbose("scheduled dequeue with idle interval", { delayMs }); } + this.lastScheduledIntervalMs = delayMs; setTimeout(this.dequeue.bind(this), delayMs); } } diff --git a/packages/core/src/v3/runEngineWorker/supervisor/session.ts b/packages/core/src/v3/runEngineWorker/supervisor/session.ts index e5a783b8d41..b2d344fb3dc 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/session.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/session.ts @@ -80,13 +80,15 @@ export class SupervisorSession extends EventEmitter { }); } - private async onDequeue(messages: WorkerApiDequeueResponseBody): Promise { + private async onDequeue(messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }): Promise { this.logger.verbose("Dequeued messages with contents", { count: messages.length, messages }); for (const message of messages) { this.emit("runQueueMessage", { time: new Date(), message, + dequeueResponseMs: timing?.dequeueResponseMs, + pollingIntervalMs: timing?.pollingIntervalMs, }); } } From 4d603adf675f4ef67db5d7b87c61996d8a389d60 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:46:45 +0000 Subject: [PATCH 09/14] feat(supervisor): add compute checkpoint/restore support - Fix instance creation URL from /api/sandboxes to /api/instances - Pass name: runnerId when creating compute instances - Add snapshot(), deleteInstance(), and restore() methods to ComputeWorkloadManager - Add /api/v1/compute/snapshot-complete callback endpoint to WorkloadServer - Handle suspend requests in compute mode via fire-and-forget snapshot with callback - Handle restore in compute mode by calling gateway restore API directly - Wire computeManager into WorkloadServer for compute mode suspend/restore --- apps/supervisor/src/index.ts | 41 ++++-- .../supervisor/src/workloadManager/compute.ts | 124 ++++++++++++++++- apps/supervisor/src/workloadServer/index.ts | 126 ++++++++++++++++-- 3 files changed, 269 insertions(+), 22 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index ede628f82a4..13c1ddb7f82 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -36,9 +36,11 @@ class ManagedSupervisor { private readonly metricsServer?: HttpServer; private readonly workloadServer: WorkloadServer; private readonly workloadManager: WorkloadManager; + private readonly computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("managed-supervisor"); private readonly resourceMonitor: ResourceMonitor; private readonly checkpointClient?: CheckpointClient; + private readonly isComputeMode: boolean; private readonly podCleaner?: PodCleaner; private readonly failedPodHandler?: FailedPodHandler; @@ -78,16 +80,22 @@ class ManagedSupervisor { : new DockerResourceMonitor(new Docker()) : new NoopResourceMonitor(); - this.workloadManager = env.COMPUTE_GATEWAY_URL - ? new ComputeWorkloadManager({ - ...workloadManagerOptions, - gatewayUrl: env.COMPUTE_GATEWAY_URL, - gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, - gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, - }) - : this.isKubernetes + this.isComputeMode = !!env.COMPUTE_GATEWAY_URL; + + if (env.COMPUTE_GATEWAY_URL) { + const computeManager = new ComputeWorkloadManager({ + ...workloadManagerOptions, + gatewayUrl: env.COMPUTE_GATEWAY_URL, + gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, + }); + this.computeManager = computeManager; + this.workloadManager = computeManager; + } else { + this.workloadManager = this.isKubernetes ? new KubernetesWorkloadManager(workloadManagerOptions) : new DockerWorkloadManager(workloadManagerOptions); + } if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { @@ -215,6 +223,22 @@ class ManagedSupervisor { if (checkpoint) { this.logger.log("Restoring run", { runId: message.run.id }); + if (this.isComputeMode && this.computeManager) { + try { + const didRestore = await this.computeManager.restore(checkpoint.location); + + if (didRestore) { + this.logger.log("Compute restore successful", { runId: message.run.id }); + } else { + this.logger.error("Compute restore failed", { runId: message.run.id }); + } + } catch (error) { + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + if (!this.checkpointClient) { this.logger.error("No checkpoint client", { runId: message.run.id }); return; @@ -309,6 +333,7 @@ class ManagedSupervisor { host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL, workerClient: this.workerSession.httpClient, checkpointClient: this.checkpointClient, + computeManager: this.computeManager, }); this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 3363236ff49..1e1a3462413 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -60,7 +60,9 @@ export class ComputeWorkloadManager implements WorkloadManager { } if (this.opts.snapshotPollIntervalSeconds) { - envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(this.opts.snapshotPollIntervalSeconds); + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String( + this.opts.snapshotPollIntervalSeconds + ); } if (this.opts.additionalEnvVars) { @@ -78,7 +80,7 @@ export class ComputeWorkloadManager implements WorkloadManager { // Strip image digest — resolve by tag, not digest const imageRef = opts.image.split("@")[0]!; - const url = `${this.opts.gatewayUrl}/api/sandboxes`; + const url = `${this.opts.gatewayUrl}/api/instances`; // Wide event: single canonical log line emitted in finally const event: Record = { @@ -111,6 +113,7 @@ export class ComputeWorkloadManager implements WorkloadManager { headers, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), body: JSON.stringify({ + name: runnerId, image: imageRef, env: envVars, cpu: opts.machine.cpu, @@ -153,12 +156,125 @@ export class ComputeWorkloadManager implements WorkloadManager { return; } - event.sandboxId = data.id; + event.instanceId = data.id; event.ok = true; } finally { event.durationMs = Math.round(performance.now() - startMs); event.ok ??= false; - this.logger.info("create sandbox", event); + this.logger.info("create instance", event); + } + } + + private get authHeaders(): Record { + const headers: Record = { + "Content-Type": "application/json", + }; + if (this.opts.gatewayAuthToken) { + headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; + } + return headers; + } + + async snapshot(opts: { + runnerId: string; + callbackUrl: string; + metadata: Record; + }): Promise { + const url = `${this.opts.gatewayUrl}/api/instances/${opts.runnerId}/snapshot`; + + const [error, response] = await tryCatch( + fetch(url, { + method: "POST", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + callback: { + url: opts.callbackUrl, + metadata: opts.metadata, + }, + }), + }) + ); + + if (error) { + this.logger.error("snapshot request failed", { + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; } + + if (response.status !== 202) { + this.logger.error("snapshot request rejected", { + runnerId: opts.runnerId, + status: response.status, + }); + return false; + } + + this.logger.info("snapshot request accepted", { runnerId: opts.runnerId }); + return true; + } + + async deleteInstance(runnerId: string): Promise { + const url = `${this.opts.gatewayUrl}/api/instances/${runnerId}`; + + const [error, response] = await tryCatch( + fetch(url, { + method: "DELETE", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + }) + ); + + if (error) { + this.logger.error("delete instance failed", { + runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + if (!response.ok) { + this.logger.error("delete instance rejected", { + runnerId, + status: response.status, + }); + return false; + } + + this.logger.info("delete instance success", { runnerId }); + return true; + } + + async restore(snapshotId: string): Promise { + const url = `${this.opts.gatewayUrl}/api/snapshots/${snapshotId}/restore`; + + const [error, response] = await tryCatch( + fetch(url, { + method: "POST", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + }) + ); + + if (error) { + this.logger.error("restore request failed", { + snapshotId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + if (!response.ok) { + this.logger.error("restore request rejected", { + snapshotId, + status: response.status, + }); + return false; + } + + this.logger.info("restore request success", { snapshotId }); + return true; } } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 35d53d36099..6598da76a34 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -24,6 +24,7 @@ import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOn import { type IncomingMessage } from "node:http"; import { register } from "../metrics.js"; import { env } from "../env.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -53,15 +54,25 @@ type WorkloadServerEvents = { ]; }; +const ComputeSnapshotCallbackBody = z.object({ + snapshot_id: z.string(), + instance_id: z.string(), + status: z.enum(["completed", "failed"]), + error: z.string().optional(), + metadata: z.record(z.string()).optional(), +}); + type WorkloadServerOptions = { port: number; host?: string; workerClient: SupervisorHttpClient; checkpointClient?: CheckpointClient; + computeManager?: ComputeWorkloadManager; }; export class WorkloadServer extends EventEmitter { private checkpointClient?: CheckpointClient; + private computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("workload-server"); @@ -93,6 +104,7 @@ export class WorkloadServer extends EventEmitter { this.workerClient = opts.workerClient; this.checkpointClient = opts.checkpointClient; + this.computeManager = opts.computeManager; this.httpServer = this.createHttpServer({ host, port }); this.websocketServer = this.createWebsocketServer(); @@ -231,11 +243,19 @@ export class WorkloadServer extends EventEmitter { handler: async ({ reply, params, req }) => { this.logger.debug("Suspend request", { params, headers: req.headers }); - if (!this.checkpointClient) { + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + headers: req.headers, + }); reply.json( { ok: false, - error: "Checkpoints disabled", + error: "Invalid headers", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -243,19 +263,35 @@ export class WorkloadServer extends EventEmitter { return; } - const runnerId = this.runnerIdFromRequest(req); - const deploymentVersion = this.deploymentVersionFromRequest(req); - const projectRef = this.projectRefFromRequest(req); + if (this.computeManager) { + // Compute mode: fire-and-forget snapshot with callback + reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - if (!runnerId || !deploymentVersion || !projectRef) { - this.logger.error("Invalid headers for suspend request", { - ...params, - headers: req.headers, + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${ + env.TRIGGER_WORKLOAD_API_DOMAIN ?? "localhost" + }:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + const snapshotResult = await this.computeManager.snapshot({ + runnerId, + callbackUrl, + metadata: { + runId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + }, }); + + if (!snapshotResult) { + this.logger.error("Failed to request compute snapshot", { params, runnerId }); + } + + return; + } + + if (!this.checkpointClient) { reply.json( { ok: false, - error: "Invalid headers", + error: "Checkpoints disabled", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -394,6 +430,76 @@ export class WorkloadServer extends EventEmitter { }); } + // Compute snapshot callback endpoint + httpServer.route("/api/v1/compute/snapshot-complete", "POST", { + bodySchema: ComputeSnapshotCallbackBody, + handler: async ({ reply, body }) => { + this.logger.info("Compute snapshot callback", { + snapshotId: body.snapshot_id, + instanceId: body.instance_id, + status: body.status, + error: body.error, + metadata: body.metadata, + }); + + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + if (!runId || !snapshotFriendlyId) { + this.logger.error("Compute snapshot callback missing metadata", { body }); + reply.empty(400); + return; + } + + if (body.status === "completed") { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: true, + checkpoint: { + type: "KUBERNETES", + location: body.snapshot_id, + }, + }, + }); + + if (result.success) { + this.logger.info("Suspend completion submitted, deleting instance", { + runId, + instanceId: body.instance_id, + }); + await this.computeManager?.deleteInstance(body.instance_id); + } else { + this.logger.error("Failed to submit suspend completion", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } else { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: false, + error: body.error ?? "Snapshot failed", + }, + }); + + if (!result.success) { + this.logger.error("Failed to submit suspend failure", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } + + reply.empty(200); + }, + }); + return httpServer; } From c1511f9db1981980f0e1dd34ecc25d916cbb4610 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 20 Feb 2026 21:46:18 +0000 Subject: [PATCH 10/14] fix(cli): fix --load flag on local/self-hosted builds --- .changeset/fix-local-build-load.md | 5 +++++ packages/cli-v3/src/deploy/buildImage.ts | 26 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 .changeset/fix-local-build-load.md diff --git a/.changeset/fix-local-build-load.md b/.changeset/fix-local-build-load.md new file mode 100644 index 00000000000..13f91da9d6a --- /dev/null +++ b/.changeset/fix-local-build-load.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Fix `--load` flag being silently ignored on local/self-hosted builds. diff --git a/packages/cli-v3/src/deploy/buildImage.ts b/packages/cli-v3/src/deploy/buildImage.ts index 2225d7db056..31a2b658545 100644 --- a/packages/cli-v3/src/deploy/buildImage.ts +++ b/packages/cli-v3/src/deploy/buildImage.ts @@ -205,6 +205,7 @@ async function remoteBuildImage(options: DepotBuildImageOptions): Promise Date: Sat, 21 Feb 2026 02:28:43 +0000 Subject: [PATCH 11/14] feat(supervisor): pass name, metadata, and resources in compute restore request Restore calls now send a request body with the runner name, env override metadata, cpu, and memory so the agent can inject them before the VM resumes. The runner fetches these overrides from TRIGGER_METADATA_URL at restore time. runnerId is derived per restore cycle as runner-{runIdShort}-{checkpointSuffix}, matching iceman's pattern. --- apps/supervisor/src/index.ts | 17 +++++++-- .../supervisor/src/workloadManager/compute.ts | 37 ++++++++++++++++--- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 13c1ddb7f82..5ba2bbf7e2c 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -225,12 +225,23 @@ class ManagedSupervisor { if (this.isComputeMode && this.computeManager) { try { - const didRestore = await this.computeManager.restore(checkpoint.location); + // Derive runnerId unique per restore cycle (matches iceman's pattern) + const runIdShort = message.run.friendlyId.replace("run_", ""); + const checkpointSuffix = checkpoint.id.slice(-8); + const runnerId = `runner-${runIdShort}-${checkpointSuffix}`; + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + }); if (didRestore) { - this.logger.log("Compute restore successful", { runId: message.run.id }); + this.logger.log("Compute restore successful", { runId: message.run.id, runnerId }); } else { - this.logger.error("Compute restore failed", { runId: message.run.id }); + this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); } } catch (error) { this.logger.error("Failed to restore run (compute)", { error }); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 1e1a3462413..1d9e905ce5d 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -247,20 +247,43 @@ export class ComputeWorkloadManager implements WorkloadManager { return true; } - async restore(snapshotId: string): Promise { - const url = `${this.opts.gatewayUrl}/api/snapshots/${snapshotId}/restore`; + async restore(opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; + machine: { cpu: number; memory: number }; + }): Promise { + const url = `${this.opts.gatewayUrl}/api/snapshots/${opts.snapshotId}/restore`; + + const metadata: Record = { + TRIGGER_RUNNER_ID: opts.runnerId, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, + }; const [error, response] = await tryCatch( fetch(url, { method: "POST", headers: this.authHeaders, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_mb: opts.machine.memory * 1024, + }), }) ); if (error) { this.logger.error("restore request failed", { - snapshotId, + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, error: error instanceof Error ? error.message : String(error), }); return false; @@ -268,13 +291,17 @@ export class ComputeWorkloadManager implements WorkloadManager { if (!response.ok) { this.logger.error("restore request rejected", { - snapshotId, + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, status: response.status, }); return false; } - this.logger.info("restore request success", { snapshotId }); + this.logger.info("restore request success", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); return true; } } From 43327439445436824492ff32adf0e280db89223d Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 21 Feb 2026 11:28:26 +0000 Subject: [PATCH 12/14] feat(supervisor): add flag to enable compute snapshots Gates snapshot/restore behaviour independently of compute mode. When disabled, VMs won't receive the metadata URL and suspend/restore are no-ops. Defaults to off so compute mode can be used without snapshots. --- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 2 +- apps/supervisor/src/workloadManager/compute.ts | 18 +++++++++++------- apps/supervisor/src/workloadServer/index.ts | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index a8750221a87..3cf69513818 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -81,6 +81,7 @@ const Env = z.object({ COMPUTE_GATEWAY_URL: z.string().url().optional(), COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 5ba2bbf7e2c..dd91591aba6 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -223,7 +223,7 @@ class ManagedSupervisor { if (checkpoint) { this.logger.log("Restoring run", { runId: message.run.id }); - if (this.isComputeMode && this.computeManager) { + if (this.isComputeMode && this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { try { // Derive runnerId unique per restore cycle (matches iceman's pattern) const runIdShort = message.run.friendlyId.replace("run_", ""); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 1d9e905ce5d..ea7be55d1a9 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -51,7 +51,7 @@ export class ComputeWorkloadManager implements WorkloadManager { envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; } - if (this.opts.metadataUrl) { + if (env.COMPUTE_SNAPSHOTS_ENABLED && this.opts.metadataUrl) { envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; } @@ -266,17 +266,21 @@ export class ComputeWorkloadManager implements WorkloadManager { TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, }; + const body = { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_mb: opts.machine.memory * 1024, + }; + + this.logger.debug("restore request body", { url, body }); + const [error, response] = await tryCatch( fetch(url, { method: "POST", headers: this.authHeaders, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - name: opts.runnerId, - metadata, - cpu: opts.machine.cpu, - memory_mb: opts.machine.memory * 1024, - }), + body: JSON.stringify(body), }) ); diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 6598da76a34..0586c259a39 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -263,7 +263,7 @@ export class WorkloadServer extends EventEmitter { return; } - if (this.computeManager) { + if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { // Compute mode: fire-and-forget snapshot with callback reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); From 5089bba33577d68db15321b4e9ab20d1e1c184b5 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 23 Feb 2026 11:49:55 +0000 Subject: [PATCH 13/14] feat(supervisor): require metadata URL when compute snapshots enabled --- apps/supervisor/src/env.ts | 300 +++++++++++++++++++------------------ 1 file changed, 157 insertions(+), 143 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 3cf69513818..33478bab64b 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -3,148 +3,162 @@ import { env as stdEnv } from "std-env"; import { z } from "zod"; import { AdditionalEnvVars, BoolEnv } from "./envUtil.js"; -const Env = z.object({ - // This will come from `spec.nodeName` in k8s - TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), - TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), - - // Required settings - TRIGGER_API_URL: z.string().url(), - TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file - MANAGED_WORKER_SECRET: z.string(), - OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners - - // Workload API settings (coordinator mode) - the workload API is what the run controller connects to - TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), - TRIGGER_WORKLOAD_API_PROTOCOL: z - .string() - .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) - .default("http"), - TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default - TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), - TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on - TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller - - // Runner settings - RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) - RUNNER_PRETTY_LOGS: BoolEnv.default(false), - - // Dequeue settings (provider mode) - TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), - TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), - TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), - TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), - TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), - TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds - TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds - TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) - TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) - TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) - TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) - - // Optional services - TRIGGER_WARM_START_URL: z.string().optional(), - TRIGGER_CHECKPOINT_URL: z.string().optional(), - TRIGGER_METADATA_URL: z.string().optional(), - - // Used by the resource monitor - RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), - RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), - RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), - - // Docker settings - DOCKER_API_VERSION: z.string().optional(), - DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 - DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), - DOCKER_REGISTRY_USERNAME: z.string().optional(), - DOCKER_REGISTRY_PASSWORD: z.string().optional(), - DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 - DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), - DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), - /** - * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. - * Any other value is taken as a custom network's name to which all runners should connect to. - * - * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. - * - * **WARNING**: Specifying multiple networks will slightly increase startup times. - * - * @default "host" - */ - DOCKER_RUNNER_NETWORKS: z.string().default("host"), - - // Compute settings - COMPUTE_GATEWAY_URL: z.string().url().optional(), - COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), - COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), - COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), - - // Kubernetes settings - KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), - KUBERNETES_NAMESPACE: z.string().default("default"), - KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), - KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv - KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), - KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), - KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), - KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), - KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit - KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), - KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit - - // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO - KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO - KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB - KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods - KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= - - // Project affinity settings - pods from the same project prefer the same node - KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), - KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z.string().trim().min(1).default("kubernetes.io/hostname"), - - // Placement tags settings - PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), - PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), - - // Metrics - METRICS_ENABLED: BoolEnv.default(true), - METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), - METRICS_HOST: z.string().default("127.0.0.1"), - METRICS_PORT: z.coerce.number().int().default(9090), - - // Pod cleaner - POD_CLEANER_ENABLED: BoolEnv.default(true), - POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), - POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), - - // Failed pod handler - FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), - FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), - - // Debug - DEBUG: BoolEnv.default(false), - SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), -}); +const Env = z + .object({ + // This will come from `spec.nodeName` in k8s + TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), + TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + + // Required settings + TRIGGER_API_URL: z.string().url(), + TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file + MANAGED_WORKER_SECRET: z.string(), + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners + + // Workload API settings (coordinator mode) - the workload API is what the run controller connects to + TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), + TRIGGER_WORKLOAD_API_PROTOCOL: z + .string() + .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) + .default("http"), + TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default + TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), + TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on + TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller + + // Runner settings + RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_PRETTY_LOGS: BoolEnv.default(false), + + // Dequeue settings (provider mode) + TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), + TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), + TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), + TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), + TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds + TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds + TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) + TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) + TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) + TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + + // Optional services + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_CHECKPOINT_URL: z.string().optional(), + TRIGGER_METADATA_URL: z.string().optional(), + + // Used by the resource monitor + RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), + RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), + RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), + + // Docker settings + DOCKER_API_VERSION: z.string().optional(), + DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 + DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), + DOCKER_REGISTRY_USERNAME: z.string().optional(), + DOCKER_REGISTRY_PASSWORD: z.string().optional(), + DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 + DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), + DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), + /** + * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. + * Any other value is taken as a custom network's name to which all runners should connect to. + * + * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. + * + * **WARNING**: Specifying multiple networks will slightly increase startup times. + * + * @default "host" + */ + DOCKER_RUNNER_NETWORKS: z.string().default("host"), + + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + + // Kubernetes settings + KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), + KUBERNETES_NAMESPACE: z.string().default("default"), + KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), + KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv + KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), + KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), + KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), + KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), + KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit + KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), + KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit + + // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO + KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO + KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB + KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= + + // Project affinity settings - pods from the same project prefer the same node + KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), + KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z + .string() + .trim() + .min(1) + .default("kubernetes.io/hostname"), + + // Placement tags settings + PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), + PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), + + // Metrics + METRICS_ENABLED: BoolEnv.default(true), + METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), + METRICS_HOST: z.string().default("127.0.0.1"), + METRICS_PORT: z.coerce.number().int().default(9090), + + // Pod cleaner + POD_CLEANER_ENABLED: BoolEnv.default(true), + POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), + POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), + + // Failed pod handler + FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), + FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), + + // Debug + DEBUG: BoolEnv.default(false), + SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + }) + .superRefine((data, ctx) => { + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_METADATA_URL"], + }); + } + }); export const env = Env.parse(stdEnv); From 7ed92216627dc09268c582f46e33d4b65456bff4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 23 Feb 2026 13:05:22 +0000 Subject: [PATCH 14/14] fix(supervisor): require workload API domain when compute snapshots enabled Remove the silent `localhost` fallback for the snapshot callback URL, which would be unreachable from external compute gateways. Add env validation and a runtime guard matching the existing metadata URL pattern. --- apps/supervisor/src/env.ts | 7 +++++++ apps/supervisor/src/workloadServer/index.ts | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 33478bab64b..da7ea5b91f1 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -159,6 +159,13 @@ const Env = z path: ["TRIGGER_METADATA_URL"], }); } + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_WORKLOAD_API_DOMAIN) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_WORKLOAD_API_DOMAIN is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_WORKLOAD_API_DOMAIN"], + }); + } }); export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 0586c259a39..07801682bc0 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -264,12 +264,18 @@ export class WorkloadServer extends EventEmitter { } if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { + if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { + this.logger.error( + "TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create snapshot callback URL" + ); + reply.json({ error: "Snapshot callbacks not configured" }, false, 500); + return; + } + // Compute mode: fire-and-forget snapshot with callback reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${ - env.TRIGGER_WORKLOAD_API_DOMAIN ?? "localhost" - }:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; const snapshotResult = await this.computeManager.snapshot({ runnerId,