fstamatelopoulos · fstamatelopoulos · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/packages/cli/src/commands/agents.ts b/packages/cli/src/commands/agents.ts
@@ -0,0 +1,152 @@
+/**
+ * `cfcf agents` — manage loop-spawned agent processes.
+ *
+ * Today only `reap` is implemented: list + kill subprocesses
+ * registered with the cfcf server. Mirrors the UX of `cfcf server
+ * reap` (item 6.31) but scoped differently:
+ *
+ *   - `cfcf server reap` finds ORPHANS (PPID==1) — children of a
+ *     prior cfcf server that hard-crashed and got reparented to
+ *     init. Works without a running cfcf server.
+ *
+ *   - `cfcf agents reap` lists LIVE-SERVER CHILDREN — what the
+ *     current cfcf server is still tracking. Requires the server
+ *     to be running. Use case: a subprocess survived a `cfcf stop`
+ *     (rare, but the missing-signals follow-up + this command
+ *     close that gap), or the user wants to inspect what's still
+ *     attached to the server.
+ *
+ * **Safety**: this only ever lists / kills processes in the
+ * `active-processes` registry, which is scoped to loop-spawned
+ * agent roles (dev / judge / architect / documenter / reflection).
+ * PA (`cfcf spec`) and HA (`cfcf help assistant`) run interactively
+ * outside the cfcf server (`stdio: "inherit"`), are NOT in the
+ * registry, and CANNOT be killed via this command.
+ */
+
+import type { Command } from "commander";
+import { createInterface } from "node:readline";
+import { isServerReachable, get, post } from "../client.js";
+
+/**
+ * One entry of the `active` array returned by
+ * `GET /api/active-processes`, as consumed by `cfcf agents reap`.
+ */
+interface ActiveProcessSummary {
+  /** Workspace identifier; used as a path segment of the kill endpoint. */
+  workspaceId: string;
+  /** Workspace name; shown in the listing and in kill results. */
+  workspaceName: string;
+  /** Loop-spawned agent role that owns the process. */
+  role: "dev" | "judge" | "architect" | "documenter" | "reflection";
+  /** OS process id; may be absent, in which case the listing prints "pid ?". */
+  pid: number | undefined;
+  /** Start time as a string (presumably an ISO timestamp; not read by this file — confirm against the server). */
+  startedAt: string;
+  /** Elapsed runtime in milliseconds; rendered via `formatRuntime`. */
+  runtimeMs: number;
+  /** Log file name, or null when none; appended to the row as `log=<name>` when set. */
+  logFileName: string | null;
+}
+
+/**
+ * Print `question` and resolve with the user's reply, trimmed.
+ *
+ * Resolves with "" when stdin closes before a line is entered
+ * (EOF / Ctrl+D, or piped input that runs out). Callers treat ""
+ * as "no answer", so a `[y/N]` prompt falls through to its default
+ * instead of leaving the promise pending forever.
+ *
+ * @param question - Prompt text written to stdout.
+ * @returns The entered line with surrounding whitespace removed,
+ *   or "" if input ended first.
+ */
+function readLine(question: string): Promise<string> {
+  const rl = createInterface({ input: process.stdin, output: process.stdout });
+  return new Promise((resolve) => {
+    // readline does not invoke the question callback when the input
+    // stream ends first; without this handler the promise would
+    // never settle and the command would exit without any message.
+    rl.once("close", () => resolve(""));
+    rl.question(question, (answer) => {
+      // Settle before closing so the "close" handler's resolve("")
+      // is a no-op for the normal answered path.
+      resolve(answer.trim());
+      rl.close();
+    });
+  });
+}
+
+/**
+ * Render a duration in milliseconds as a compact label:
+ * `<h>h<m>m` from one hour up, `<m>m<s>s` from one minute up,
+ * otherwise `<s>s`. Sub-second remainders are dropped.
+ *
+ * Negative or non-finite input (e.g. clock skew, a malformed
+ * payload) is rendered as "0s" rather than "-3s" / "NaNs".
+ *
+ * @param ms - Duration in milliseconds.
+ * @returns The formatted duration.
+ */
+function formatRuntime(ms: number): string {
+  const safeMs = Number.isFinite(ms) && ms > 0 ? ms : 0;
+  const seconds = Math.floor(safeMs / 1000);
+  const minutes = Math.floor(seconds / 60);
+  const hours = Math.floor(minutes / 60);
+  if (hours > 0) return `${hours}h${minutes % 60}m`;
+  if (minutes > 0) return `${minutes}m${seconds % 60}s`;
+  return `${seconds}s`;
+}
+
+/**
+ * Build the indented one-line listing entry for a tracked process:
+ * workspace name, role, pid (or "pid ?" when unknown), runtime, and
+ * the log file name when one is set.
+ */
+function formatRow(p: ActiveProcessSummary): string {
+  const pidLabel = p.pid === undefined ? "pid ?" : `pid ${p.pid}`;
+  const logSuffix = p.logFileName ? ` log=${p.logFileName}` : "";
+  const elapsed = formatRuntime(p.runtimeMs);
+  return [
+    `  ${p.workspaceName} / ${p.role}: ${pidLabel}`,
+    `, running for ${elapsed}`,
+    logSuffix,
+  ].join("");
+}
+
+/**
+ * Register the `cfcf agents` command group on the CLI program.
+ *
+ * Currently adds a single subcommand, `agents reap`, which:
+ *   1. exits with code 1 if the cfcf server is not reachable;
+ *   2. fetches `/api/active-processes` (optionally filtered by
+ *      `--workspace`) and prints one row per tracked process;
+ *   3. asks for confirmation unless `-y/--yes` was passed;
+ *   4. POSTs one kill request per process, prints a per-process
+ *      result and a summary, and exits with code 1 if any failed.
+ *
+ * @param program - The commander program to attach the group to.
+ */
+export function registerAgentsCommands(program: Command): void {
+  const agents = program
+    .command("agents")
+    .description("Manage running agent subprocesses");
+
+  agents
+    .command("reap")
+    .description("List + interactively kill active agent processes (loop-spawned only — PA/HA are untouched)")
+    .option("--workspace <name>", "Limit to a single workspace (by name or ID)")
+    .option("-y, --yes", "Kill without prompting (non-interactive use)")
+    .action(async (opts: { workspace?: string; yes?: boolean }) => {
+      if (!(await isServerReachable())) {
+        console.error(
+          "cfcf server is not running. Start it with: cfcf server start",
+        );
+        console.error(
+          "(For orphans from a previously-crashed server, use `cfcf server reap` instead.)",
+        );
+        process.exit(1);
+      }
+
+      const query = opts.workspace
+        ? `?workspace=${encodeURIComponent(opts.workspace)}`
+        : "";
+      const res = await get<{ active: ActiveProcessSummary[] }>(
+        `/api/active-processes${query}`,
+      );
+      // Check the payload as well as `ok`, so a success response
+      // without data is reported instead of throwing a TypeError.
+      if (!res.ok || !res.data) {
+        console.error(
+          `Failed to list active processes: ${res.error ?? "empty response from server"}`,
+        );
+        process.exit(1);
+      }
+
+      const procs = res.data.active;
+      if (procs.length === 0) {
+        console.log(
+          opts.workspace
+            ? `No active agent processes for workspace "${opts.workspace}".`
+            : "No active agent processes.",
+        );
+        return;
+      }
+
+      console.log(
+        `Found ${procs.length} active agent process${procs.length === 1 ? "" : "es"}:`,
+      );
+      for (const p of procs) {
+        console.log(formatRow(p));
+      }
+      console.log();
+
+      let proceed = false;
+      if (opts.yes) {
+        proceed = true;
+      } else {
+        const answer = await readLine(
+          `Kill ${procs.length === 1 ? "this process" : `these ${procs.length} processes`}? [y/N]: `,
+        );
+        // Anything other than y / yes (case-insensitive) declines.
+        proceed = /^y(es)?$/i.test(answer);
+      }
+      if (!proceed) {
+        console.log("Aborted. No processes killed.");
+        return;
+      }
+
+      let killed = 0;
+      let failed = 0;
+      // Requests are sent one at a time so results print in listing order.
+      for (const p of procs) {
+        // Same "?" fallback as the listing, so an unknown pid is
+        // never printed as "pid undefined".
+        const label = `${p.workspaceName} / ${p.role} (pid ${p.pid ?? "?"})`;
+        const r = await post<{ ok: boolean; error?: string }>(
+          `/api/active-processes/${encodeURIComponent(p.workspaceId)}/${p.role}/kill`,
+          {},
+        );
+        if (r.ok && r.data?.ok) {
+          console.log(`  ✓ ${label}`);
+          killed++;
+        } else {
+          // Prefer the server-reported reason, then the transport error.
+          console.log(
+            `  ✗ ${label}: ${r.data?.error ?? r.error ?? "unknown error"}`,
+          );
+          failed++;
+        }
+      }
+      console.log();
+      console.log(`Reap complete: ${killed} signaled, ${failed} failed.`);
+      if (failed > 0) process.exit(1);
+    });
+}
diff --git a/packages/cli/src/commands/resume.ts b/packages/cli/src/commands/resume.ts
@@ -26,6 +26,11 @@ const RESUME_ACTIONS = [
   "stop_loop_now",
   "refine_plan",
   "consult_reflection",
+  // Re-spawn dev on the same iteration. Typical use: a
+  // `missing_signals` pause (quota cap, crash) is being unblocked
+  // after the underlying cause cleared. The harness rolls back the
+  // iteration counter and re-creates the branch.
+  "retry_iteration",
 ] as const;
 
 type ResumeAction = (typeof RESUME_ACTIONS)[number];
@@ -81,7 +86,7 @@ function buildActionOption(program: Command): Option {
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   const opt = (program as any).createOption(
     "--action <name>",
-    "Resume action: continue | finish_loop | stop_loop_now | refine_plan | consult_reflection",
+    "Resume action: continue | finish_loop | stop_loop_now | refine_plan | consult_reflection | retry_iteration",
   );
   opt.choices(RESUME_ACTIONS as readonly string[]);
   opt.default("continue");

diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts
@@ -42,6 +42,7 @@ import { registerSelfUpdateCommand } from "./commands/self-update.js";
 import { registerCompletionCommand } from "./commands/completion.js";
 import { registerHelpCommand } from "./commands/help.js";
 import { registerSpecCommand } from "./commands/spec.js";
+import { registerAgentsCommands } from "./commands/agents.js";
 import { maybePrintUpdateBanner } from "./update-banner.js";
 
 // --- Internal: run the server in-process ---
@@ -114,6 +115,7 @@ registerRunCommand(program);
 registerResumeCommand(program);
 registerStopCommand(program);
 registerSpecCommand(program);
+registerAgentsCommands(program);
 registerReviewCommand(program);
 registerDocumentCommand(program);
 registerReflectCommand(program);

diff --git a/packages/core/src/architect-runner.ts b/packages/core/src/architect-runner.ts
@@ -83,6 +83,31 @@ export function getReviewState(workspaceId: string): ReviewState | undefined {
   return reviewStore.get(workspaceId);
 }
 
+/**
+ * Flip this workspace's architect/review state to `failed`,
+ * recording `reason` as the error and stamping `completedAt` with
+ * the current time, then persist it via `setReviewState`.
+ *
+ * Used by `stopLoop()` + `cfcf agents reap` so the dashboard's
+ * "review running" indicator turns off right after the subprocess
+ * is killed. Counterpart of `markReflectStateFailed` /
+ * `markDocumentStateFailed`.
+ *
+ * Safe to call repeatedly: nothing happens when the workspace has
+ * no cached state or its status is not in `REVIEW_ACTIVE_STATUSES`
+ * (i.e. it is already terminal).
+ *
+ * @returns true when the state was flipped, false otherwise.
+ */
+export async function markReviewStateFailed(
+  workspaceId: string,
+  reason: string,
+): Promise<boolean> {
+  const state = reviewStore.get(workspaceId);
+  // Nothing cached, or already terminal: leave it untouched.
+  if (!state || !REVIEW_ACTIVE_STATUSES.has(state.status)) return false;
+
+  const finishedAt = new Date().toISOString();
+  state.status = "failed";
+  state.error = reason;
+  state.completedAt = finishedAt;
+  await setReviewState(state);
+  return true;
+}
+
 /**
  * Server-boot hook (item F.23): load every workspace's persisted
  * review state into the in-memory cache. Any state still in an active

diff --git a/packages/core/src/documenter-runner.ts b/packages/core/src/documenter-runner.ts
@@ -82,6 +82,31 @@ export function getDocumentState(workspaceId: string): DocumentState | undefined
   return documentStore.get(workspaceId);
 }
 
+/**
+ * Flip this workspace's documenter state to `failed`, recording
+ * `reason` as the error and stamping `completedAt` with the current
+ * time, then persist it via `setDocumentState`.
+ *
+ * Used by `stopLoop()` + `cfcf agents reap` so the dashboard's
+ * "documenting" indicator turns off right after the subprocess is
+ * killed. Counterpart of `markReflectStateFailed` /
+ * `markReviewStateFailed`.
+ *
+ * Safe to call repeatedly: nothing happens when the workspace has
+ * no cached state or its status is not in
+ * `DOCUMENT_ACTIVE_STATUSES` (i.e. it is already terminal).
+ *
+ * @returns true when the state was flipped, false otherwise.
+ */
+export async function markDocumentStateFailed(
+  workspaceId: string,
+  reason: string,
+): Promise<boolean> {
+  const state = documentStore.get(workspaceId);
+  // Nothing cached, or already terminal: leave it untouched.
+  if (!state || !DOCUMENT_ACTIVE_STATUSES.has(state.status)) return false;
+
+  const finishedAt = new Date().toISOString();
+  state.status = "failed";
+  state.error = reason;
+  state.completedAt = finishedAt;
+  await setDocumentState(state);
+  return true;
+}
+
 /**
  * Server-boot hook (item F.23): hydrate the in-memory cache from disk
  * and clean up any state still claiming to be active. Returns the

diff --git a/packages/core/src/iteration-loop.test.ts b/packages/core/src/iteration-loop.test.ts
@@ -13,6 +13,7 @@ import {
   stopLoop,
   shouldRunReflection,
   resolveEffectiveDetermination,
+  loopActivePhaseToRole,
   type LoopState,
 } from "./iteration-loop.js";
 import type { WorkspaceConfig, DevSignals, JudgeSignals, ReflectionSignals } from "./types.js";
@@ -947,6 +948,47 @@ describe("pauseReasonAllowedActions (item 6.25)", () => {
   test("scope_complete excludes consult_reflection (no iterations to reflect on)", () => {
     expect(pauseReasonAllowedActions("scope_complete")).not.toContain("consult_reflection");
   });
+
+  // harness-missing-signals: a dev or judge run that exits without
+  // writing its signals file leaves the iteration in an unknown
+  // state. Only three actions are meaningful then: redo the
+  // iteration (retry), skip it (continue), or abandon (stop).
+  // finish_loop / refine_plan / consult_reflection all presuppose a
+  // usable iteration result, which is exactly what is missing.
+  test("missing_signals: retry_iteration + continue + stop_loop_now", () => {
+    const allowed = pauseReasonAllowedActions("missing_signals");
+    expectActions(allowed, ["retry_iteration", "continue", "stop_loop_now"]);
+  });
+
+  test("missing_signals excludes finish_loop (no iter result to finish on)", () => {
+    const allowed = pauseReasonAllowedActions("missing_signals");
+    expect(allowed).not.toContain("finish_loop");
+  });
+
+  test("missing_signals excludes refine_plan + consult_reflection (no iter data to refine/reflect on)", () => {
+    const allowed = pauseReasonAllowedActions("missing_signals");
+    expect(allowed).not.toContain("refine_plan");
+    expect(allowed).not.toContain("consult_reflection");
+  });
+
+  test("retry_iteration is ONLY applicable to missing_signals (not offered for other pauseReasons)", () => {
+    // retry_iteration is the "agent exited without signals, redo
+    // the iteration" path. Every other pause class (cadence,
+    // anomaly with signals, ...) has a known iteration result and
+    // is handled by the other actions.
+    const otherReasons = [
+      undefined,
+      "cadence",
+      "anomaly",
+      "user_input_needed",
+      "max_iterations",
+      "scope_complete",
+    ] as const;
+    otherReasons.forEach((reason) => {
+      expect(pauseReasonAllowedActions(reason)).not.toContain("retry_iteration");
+    });
+  });
 });
 
 // 2026-05-02: Architect SCOPE_COMPLETE readiness verdict (item 6.25 follow-up).
@@ -1007,3 +1049,41 @@ describe("buildPreLoopBlockReason: SCOPE_COMPLETE message", () => {
     expect(msg).not.toContain("already implemented");
   });
 });
+
+
+// --- loopActivePhaseToRole (kill-on-stop feature) ---
+//
+// Maps the loop's `state.phase` to the AgentRole currently being
+// awaited, so stopLoop / `cfcf agents reap` know which subprocess
+// to signal. PA / HA are deliberately omitted — they run outside
+// the cfcf server (`stdio: "inherit"`) and are never in the
+// active-processes registry.
+
+describe("loopActivePhaseToRole", () => {
+  // One row per phase in which the loop awaits an agent subprocess:
+  // [test title, phase, expected role]. A test is registered per
+  // row, in this order.
+  const activeCases = [
+    ["pre_loop_reviewing → architect", "pre_loop_reviewing", "architect"],
+    ["dev_executing → dev", "dev_executing", "dev"],
+    ["judging → judge", "judging", "judge"],
+    ["reflecting → reflection (the iter-19 gmbot case)", "reflecting", "reflection"],
+    ["documenting → documenter", "documenting", "documenter"],
+  ] as const;
+  for (const [title, phase, role] of activeCases) {
+    test(title, () => {
+      expect(loopActivePhaseToRole(phase)).toBe(role);
+    });
+  }
+
+  test("non-active phases return null (nothing to kill)", () => {
+    // In these phases no subprocess is in flight, so stopLoop's
+    // kill loop should skip cleanly.
+    const idlePhases = [
+      "preparing",
+      "deciding",
+      "paused",
+      "completed",
+      "failed",
+      "stopped",
+    ] as const;
+    for (const phase of idlePhases) {
+      expect(loopActivePhaseToRole(phase)).toBeNull();
+    }
+  });
+});
+});