diff --git a/plugins/orchestrator/dist/server.js b/plugins/orchestrator/dist/server.js
index 6b6141f..1b60687 100644
--- a/plugins/orchestrator/dist/server.js
+++ b/plugins/orchestrator/dist/server.js
@@ -6519,7 +6519,7 @@ var require_dist = __commonJS((exports, module) => {
 
 // mcp/server.ts
 import { resolve, join as join5 } from "path";
-import { existsSync as existsSync6, readFileSync as readFileSync3, writeFileSync } from "fs";
+import { existsSync as existsSync6, readFileSync as readFileSync3, readdirSync as readdirSync2, unlinkSync as unlinkSync2, writeFileSync } from "fs";
 import { execSync } from "child_process";
 
 // node_modules/zod/v3/external.js
@@ -24629,22 +24629,22 @@ async function startSidecar() {
     }
   } catch {}
   try {
-    const { unlinkSync: unlinkSync2 } = await import("fs");
-    unlinkSync2(portFile);
+    const { unlinkSync: unlinkSync3 } = await import("fs");
+    unlinkSync3(portFile);
   } catch {}
   const baseArgs = ["--port", "0", "--port-file", portFile];
   let result = await trySpawn(["uvx", "--with-requirements", requirementsPath, "python", sidecarPath, ...baseArgs], portFile, "uvx", 60000);
   if (!result) {
     try {
-      const { unlinkSync: unlinkSync2 } = await import("fs");
-      unlinkSync2(portFile);
+      const { unlinkSync: unlinkSync3 } = await import("fs");
+      unlinkSync3(portFile);
     } catch {}
     result = await trySpawn(["python", sidecarPath, ...baseArgs], portFile, "python", 30000);
   }
   if (!result) {
     try {
-      const { unlinkSync: unlinkSync2 } = await import("fs");
-      unlinkSync2(portFile);
+      const { unlinkSync: unlinkSync3 } = await import("fs");
+      unlinkSync3(portFile);
     } catch {}
     result = await trySpawn(["python3", sidecarPath, ...baseArgs], portFile, "python3", 30000);
   }
@@ -26357,6 +26357,99 @@ foreach ($s in $siblings) {
 `);
   }
 }
+function reapStaleActiveSessionFiles(stateDir) {
+  if (!existsSync6(stateDir))
+    return;
+  let reaped = 0;
+  try {
+    const entries = readdirSync2(stateDir);
+    for (const entry of entries) {
+      const m = entry.match(/^active-session-(\d+)$/);
+      if (!m)
+        continue;
+      const pid = Number(m[1]);
+      if (!Number.isFinite(pid) || pid <= 0)
+        continue;
+      let alive = false;
+      try {
+        process.kill(pid, 0);
+        alive = true;
+      } catch (err) {
+        alive = err?.code === "EPERM";
+      }
+      if (!alive) {
+        try {
+          unlinkSync2(join5(stateDir, entry));
+          reaped++;
+        } catch {}
+      }
+    }
+  } catch {}
+  if (reaped > 0) {
+    process.stderr.write(`[orchestrator] startup hygiene: reaped ${reaped} stale active-session-<pid> file(s) in ${stateDir}
+`);
+  }
+}
+function warnAboutLikelyOrphanSiblings() {
+  if (process.platform !== "linux")
+    return;
+  const myPid = process.pid;
+  const distMarker = "orchestrator/dist/server.js";
+  let procDirs;
+  try {
+    procDirs = readdirSync2("/proc").filter((n) => /^\d+$/.test(n));
+  } catch {
+    return;
+  }
+  const orphanPids = [];
+  for (const pidStr of procDirs) {
+    const pid = Number(pidStr);
+    if (pid === myPid)
+      continue;
+    let isSiblingMcp = false;
+    try {
+      const cmdline = readFileSync3(`/proc/${pid}/cmdline`, "utf8");
+      isSiblingMcp = cmdline.includes(distMarker);
+    } catch {
+      continue;
+    }
+    if (!isSiblingMcp)
+      continue;
+    let walk = pid;
+    let foundClaude = false;
+    for (let depth = 0;depth < 8; depth++) {
+      try {
+        const stat = readFileSync3(`/proc/${walk}/stat`, "utf8");
+        const rparen = stat.lastIndexOf(")");
+        if (rparen < 0)
+          break;
+        const name = stat.slice(stat.indexOf("(") + 1, rparen).toLowerCase();
+        if (name === "claude" || name === "claude.exe") {
+          foundClaude = true;
+          break;
+        }
+        const fields = stat.slice(rparen + 2).split(/\s+/);
+        const ppid = parseInt(fields[1] ?? "0", 10);
+        if (!ppid || ppid === walk || ppid === 1)
+          break;
+        walk = ppid;
+      } catch {
+        break;
+      }
+    }
+    if (!foundClaude)
+      orphanPids.push(pid);
+  }
+  if (orphanPids.length > 0) {
+    process.stderr.write(`[orchestrator] startup hygiene: detected ${orphanPids.length} likely-orphan sibling MCP process(es): pid=${orphanPids.join(",")}. Their parent claude is no longer in the process tree, suggesting they outlived their owning session and may be running stale bytecode whose watchdog never fired. Diagnose with 'pstree -ps <pid>'; clean up with 'kill -9 <pid>' if confirmed orphan.
+`);
+  }
+}
+{
+  const startupProjectDir = process.env.ORCHESTRATOR_PROJECT_ROOT || process.env.CLAUDE_PROJECT_DIR || process.cwd();
+  reapStaleActiveSessionFiles(join5(startupProjectDir, ".orchestrator-state"));
+  warnAboutLikelyOrphanSiblings();
+}
 var initialParentClaudePid = findClaudeAncestorPid();
 var initialParentClaudeCreationTime = initialParentClaudePid !== null ? getProcessCreationTime(initialParentClaudePid) : null;
 if (initialParentClaudePid) {
diff --git a/plugins/orchestrator/mcp/server.ts b/plugins/orchestrator/mcp/server.ts
index 85eed16..a34e2c7 100644
--- a/plugins/orchestrator/mcp/server.ts
+++ b/plugins/orchestrator/mcp/server.ts
@@ -1,5 +1,5 @@
 import { resolve, join } from "node:path";
-import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { existsSync, readFileSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
 import { execSync } from "node:child_process";
 import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
@@ -2822,6 +2822,160 @@ foreach ($s in $siblings) {
   }
 }
 
+/**
+ * Startup hygiene: remove stale per-PID `active-session-<pid>` files
+ * whose owning claude process has exited. The per-PID file scheme
+ * (introduced in 0.30.19+) makes session_id lookup race-free for
+ * concurrent sessions, but nothing has been reaping these files when
+ * the claude process they belong to dies. On a developer machine with
+ * many short-lived sessions per day, they accumulate indefinitely.
+ *
+ * They are cosmetic - the legacy single `active-session` file remains
+ * the primary lookup - but a slow directory listing eventually becomes
+ * a real cost. This sweep runs once at MCP startup; it is cheap,
+ * idempotent, and race-safe (we only unlink files whose PID is verified
+ * gone via `process.kill(pid, 0)`).
+ */
+function reapStaleActiveSessionFiles(stateDir: string): void {
+  if (!existsSync(stateDir)) return;
+  let reaped = 0;
+  try {
+    const entries = readdirSync(stateDir);
+    for (const entry of entries) {
+      const m = entry.match(/^active-session-(\d+)$/);
+      if (!m) continue;
+      const pid = Number(m[1]);
+      if (!Number.isFinite(pid) || pid <= 0) continue;
+      // Liveness probe: process.kill(pid, 0) sends no signal but throws
+      // if the PID cannot be signalled. ESRCH = dead PID (reap). EPERM =
+      // alive but not ours to signal (rare for own state files), so the
+      // file is kept. Any other failure is still treated as dead, as
+      // before; a missed reap costs one extra orphan file at worst -
+      // next startup will retry.
+      let alive = false;
+      try {
+        process.kill(pid, 0);
+        alive = true;
+      } catch (err) {
+        alive = (err as { code?: string } | null)?.code === "EPERM";
+      }
+      if (!alive) {
+        try {
+          unlinkSync(join(stateDir, entry));
+          reaped++;
+        } catch {
+          // Lost a race with another session, or permission issue.
+          // Non-fatal; next startup will retry.
+        }
+      }
+    }
+  } catch {
+    // readdir failure - directory may not exist, or permission denied.
+    // Either way nothing to reap.
+  }
+  if (reaped > 0) {
+    process.stderr.write(
+      `[orchestrator] startup hygiene: reaped ${reaped} stale active-session-<pid> file(s) in ${stateDir}\n`,
+    );
+  }
+}
+
+/**
+ * Startup hygiene: detect sibling orchestrator MCP processes whose
+ * parent claude is no longer alive, suggesting they outlived their
+ * owning session and may be running stale bytecode whose orphan
+ * watchdog never fired.
+ *
+ * Logs a warning naming the suspect PIDs - does NOT auto-kill, because
+ * killing a sibling MCP can disrupt infrastructure shared across live
+ * sessions (e.g. the python sidecar bound to .sidecar-port is
+ * deliberately shared - killing a sibling can take it down). Detection
+ * surfaces the issue; the operator decides whether to clean up.
+ *
+ * This complements the orphan-bun watchdog (which catches "parent dies
+ * while I'm alive" cases for processes loaded with the watchdog code).
+ * It does not help against orphans whose loaded bytecode predates the
+ * watchdog improvements - those need manual cleanup - but it makes
+ * such orphans visible at the next session's startup.
+ *
+ * Linux only. Windows already has killOlderDuplicateMcps for a related
+ * but different case (siblings sharing our parent claude); the orphan
+ * case on Windows is rare because parent death usually reaps children.
+ */
+function warnAboutLikelyOrphanSiblings(): void {
+  if (process.platform !== "linux") return;
+  const myPid = process.pid;
+  // Look for any other bun process whose cmdline references the
+  // orchestrator dist - that's the canonical sibling-MCP signature.
+  // We use a path suffix rather than an absolute marker so the check
+  // works regardless of where the plugin marketplace lives.
+  const distMarker = "orchestrator/dist/server.js";
+  let procDirs: string[];
+  try {
+    procDirs = readdirSync("/proc").filter((n) => /^\d+$/.test(n));
+  } catch {
+    return;
+  }
+  const orphanPids: number[] = [];
+  for (const pidStr of procDirs) {
+    const pid = Number(pidStr);
+    if (pid === myPid) continue;
+    let isSiblingMcp = false;
+    try {
+      const cmdline = readFileSync(`/proc/${pid}/cmdline`, "utf8");
+      isSiblingMcp = cmdline.includes(distMarker);
+    } catch {
+      continue;
+    }
+    if (!isSiblingMcp) continue;
+    // Walk this sibling's parent chain looking for a live claude
+    // process. If we never find one in 8 hops, the sibling has no
+    // claude ancestor in its current tree - likely orphaned.
+    let walk = pid;
+    let foundClaude = false;
+    for (let depth = 0; depth < 8; depth++) {
+      try {
+        const stat = readFileSync(`/proc/${walk}/stat`, "utf8");
+        const rparen = stat.lastIndexOf(")");
+        if (rparen < 0) break;
+        const name = stat
+          .slice(stat.indexOf("(") + 1, rparen)
+          .toLowerCase();
+        if (name === "claude" || name === "claude.exe") {
+          foundClaude = true;
+          break;
+        }
+        const fields = stat.slice(rparen + 2).split(/\s+/);
+        const ppid = parseInt(fields[1] ?? "0", 10);
+        if (!ppid || ppid === walk || ppid === 1) break;
+        walk = ppid;
+      } catch {
+        break;
+      }
+    }
+    if (!foundClaude) orphanPids.push(pid);
+  }
+  if (orphanPids.length > 0) {
+    process.stderr.write(
+      `[orchestrator] startup hygiene: detected ${orphanPids.length} likely-orphan sibling MCP process(es): pid=${orphanPids.join(",")}. ` +
+        `Their parent claude is no longer in the process tree, suggesting they outlived their owning session and may be running stale bytecode whose watchdog never fired. ` +
+        `Diagnose with 'pstree -ps <pid>'; clean up with 'kill -9 <pid>' if confirmed orphan.\n`,
+    );
+  }
+}
+
+// Startup hygiene runs unconditionally - it doesn't depend on parent
+// claude resolution and benefits future startups even if THIS one is
+// about to exit (no-claude-ancestor case below).
+{
+  const startupProjectDir =
+    process.env.ORCHESTRATOR_PROJECT_ROOT ||
+    process.env.CLAUDE_PROJECT_DIR ||
+    process.cwd();
+  reapStaleActiveSessionFiles(join(startupProjectDir, ".orchestrator-state"));
+  warnAboutLikelyOrphanSiblings();
+}
+
 const initialParentClaudePid = findClaudeAncestorPid();
 // 0.30.38: also capture parent claude.exe's creation time so the watchdog
 // can defend against PID reuse. Without this, when the user closes one