Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 100 additions & 7 deletions plugins/orchestrator/dist/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -6519,7 +6519,7 @@ var require_dist = __commonJS((exports, module) => {

// mcp/server.ts
import { resolve, join as join5 } from "path";
import { existsSync as existsSync6, readFileSync as readFileSync3, writeFileSync } from "fs";
import { existsSync as existsSync6, readFileSync as readFileSync3, readdirSync as readdirSync2, unlinkSync as unlinkSync2, writeFileSync } from "fs";
import { execSync } from "child_process";

// node_modules/zod/v3/external.js
Expand Down Expand Up @@ -24629,22 +24629,22 @@ async function startSidecar() {
}
} catch {}
try {
const { unlinkSync: unlinkSync2 } = await import("fs");
unlinkSync2(portFile);
const { unlinkSync: unlinkSync3 } = await import("fs");
unlinkSync3(portFile);
} catch {}
const baseArgs = ["--port", "0", "--port-file", portFile];
let result = await trySpawn(["uvx", "--with-requirements", requirementsPath, "python", sidecarPath, ...baseArgs], portFile, "uvx", 60000);
if (!result) {
try {
const { unlinkSync: unlinkSync2 } = await import("fs");
unlinkSync2(portFile);
const { unlinkSync: unlinkSync3 } = await import("fs");
unlinkSync3(portFile);
} catch {}
result = await trySpawn(["python", sidecarPath, ...baseArgs], portFile, "python", 30000);
}
if (!result) {
try {
const { unlinkSync: unlinkSync2 } = await import("fs");
unlinkSync2(portFile);
const { unlinkSync: unlinkSync3 } = await import("fs");
unlinkSync3(portFile);
} catch {}
result = await trySpawn(["python3", sidecarPath, ...baseArgs], portFile, "python3", 30000);
}
Expand Down Expand Up @@ -26357,6 +26357,99 @@ foreach ($s in $siblings) {
`);
}
}
/**
 * Startup hygiene: delete `active-session-<pid>` files in `stateDir` whose
 * PID no longer refers to a running process.
 *
 * Liveness is probed with `process.kill(pid, 0)`, which delivers no signal.
 * A failure with code EPERM means the process exists but we are not allowed
 * to signal it, so the file is kept. Any other failure (typically ESRCH,
 * "no such process") is taken as proof the owner is gone and the file is
 * removed.
 *
 * Best-effort: a missing or unreadable directory and failed unlinks are
 * ignored, and the function never throws. When at least one file was
 * removed, a single summary line is written to stderr.
 *
 * @param {string} stateDir - Directory to sweep.
 * @returns {void}
 */
function reapStaleActiveSessionFiles(stateDir) {
  if (!existsSync6(stateDir))
    return;
  let reaped = 0;
  try {
    for (const entry of readdirSync2(stateDir)) {
      const m = entry.match(/^active-session-(\d+)$/);
      if (!m)
        continue;
      const pid = Number(m[1]);
      if (!Number.isFinite(pid) || pid <= 0)
        continue;
      let alive;
      try {
        process.kill(pid, 0);
        alive = true;
      } catch (err) {
        // EPERM: the PID exists but belongs to a process we may not signal.
        // That is a live process, so its session file must be left alone.
        alive = err?.code === "EPERM";
      }
      if (alive)
        continue;
      try {
        unlinkSync2(join5(stateDir, entry));
        reaped++;
      } catch {
        // Already removed by someone else, or not removable by us. Non-fatal.
      }
    }
  } catch {
    // Directory listing failed; nothing more to reap.
  }
  if (reaped > 0) {
    process.stderr.write(`[orchestrator] startup hygiene: reaped ${reaped} stale active-session-<pid> file(s) in ${stateDir}
`);
  }
}
/**
 * Startup hygiene (Linux only): scan `/proc` for other processes whose
 * command line mentions `orchestrator/dist/server.js` and report, on stderr,
 * those with no process named `claude` / `claude.exe` within 8 hops of their
 * parent chain. Detection only; nothing is killed.
 *
 * Any `/proc` read failure is treated as "cannot tell" for that process:
 * an unreadable cmdline skips the process, and an unreadable stat ends the
 * ancestor walk without a match.
 *
 * @returns {void}
 */
function warnAboutLikelyOrphanSiblings() {
  if (process.platform !== "linux")
    return;

  const maxHops = 8;
  const siblingSignature = "orchestrator/dist/server.js";
  const selfPid = process.pid;

  // True when the process's command line contains the sibling signature.
  const looksLikeSibling = (candidatePid) => {
    try {
      return readFileSync3(`/proc/${candidatePid}/cmdline`, "utf8").includes(siblingSignature);
    } catch {
      return false;
    }
  };

  // True when the process itself or one of its ancestors (bounded walk) is claude.
  const hasClaudeAncestor = (startPid) => {
    let current = startPid;
    for (let hops = 0; hops < maxHops; hops += 1) {
      let statText;
      try {
        statText = readFileSync3(`/proc/${current}/stat`, "utf8");
      } catch {
        return false;
      }
      // The comm field is parenthesised and may itself contain ")",
      // so anchor on the last closing paren.
      const closeIdx = statText.lastIndexOf(")");
      if (closeIdx < 0)
        return false;
      const comm = statText.slice(statText.indexOf("(") + 1, closeIdx).toLowerCase();
      if (comm === "claude" || comm === "claude.exe")
        return true;
      // After ") " come the state and then the parent PID.
      const trailing = statText.slice(closeIdx + 2).split(/\s+/);
      const parentPid = parseInt(trailing[1] ?? "0", 10);
      if (!parentPid || parentPid === current || parentPid === 1)
        return false;
      current = parentPid;
    }
    return false;
  };

  let numericEntries;
  try {
    numericEntries = readdirSync2("/proc").filter((name) => /^\d+$/.test(name));
  } catch {
    return;
  }

  const suspects = numericEntries
    .map((name) => Number(name))
    .filter((candidatePid) => candidatePid !== selfPid && looksLikeSibling(candidatePid) && !hasClaudeAncestor(candidatePid));

  if (suspects.length === 0)
    return;
  process.stderr.write(`[orchestrator] startup hygiene: detected ${suspects.length} likely-orphan sibling MCP process(es): pid=${suspects.join(",")}. Their parent claude is no longer in the process tree, suggesting they outlived their owning session and may be running stale bytecode whose watchdog never fired. Diagnose with 'pstree -ps <pid>'; clean up with 'kill -9 <pid>' if confirmed orphan.
`);
}
{
  // Startup hygiene: resolve the project directory (explicit override first,
  // then the Claude project dir, then the current working directory), sweep
  // its state directory, then report likely-orphan siblings.
  const { ORCHESTRATOR_PROJECT_ROOT: rootOverride, CLAUDE_PROJECT_DIR: claudeProjectDir } = process.env;
  const projectDir = rootOverride || claudeProjectDir || process.cwd();
  const stateDir = join5(projectDir, ".orchestrator-state");
  reapStaleActiveSessionFiles(stateDir);
  warnAboutLikelyOrphanSiblings();
}
var initialParentClaudePid = findClaudeAncestorPid();
var initialParentClaudeCreationTime = initialParentClaudePid !== null ? getProcessCreationTime(initialParentClaudePid) : null;
if (initialParentClaudePid) {
Expand Down
156 changes: 155 additions & 1 deletion plugins/orchestrator/mcp/server.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { resolve, join } from "node:path";
import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { existsSync, readFileSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
import { execSync } from "node:child_process";
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
Expand Down Expand Up @@ -2822,6 +2822,160 @@ foreach ($s in $siblings) {
}
}

/**
 * Startup hygiene: remove stale per-PID `active-session-<pid>` files
 * whose owning process has exited. The per-PID file scheme (introduced
 * in 0.30.19+) makes session_id lookup race-free for concurrent
 * sessions, but nothing else reaps these files when their owner dies,
 * so on a machine with many short-lived sessions they accumulate.
 *
 * They are cosmetic - the legacy single `active-session` file remains
 * the primary lookup - but an ever-growing directory eventually becomes
 * a real cost. This sweep runs once at MCP startup and is best-effort:
 * a missing or unreadable directory and failed unlinks are ignored, and
 * the function never throws.
 *
 * Liveness is probed with `process.kill(pid, 0)`, which delivers no
 * signal. EPERM means the process exists but is not ours to signal, so
 * the file is kept. Any other failure (typically ESRCH) means the PID
 * is gone and the file is removed.
 *
 * @param stateDir Directory to sweep.
 */
function reapStaleActiveSessionFiles(stateDir: string): void {
  if (!existsSync(stateDir)) return;
  let reaped = 0;
  try {
    for (const entry of readdirSync(stateDir)) {
      const m = entry.match(/^active-session-(\d+)$/);
      if (!m) continue;
      const pid = Number(m[1]);
      if (!Number.isFinite(pid) || pid <= 0) continue;
      let alive: boolean;
      try {
        process.kill(pid, 0);
        alive = true;
      } catch (err) {
        // EPERM = alive but not signalable by us: keep the file.
        // Everything else (ESRCH in practice) = dead PID: reap.
        alive = (err as NodeJS.ErrnoException | undefined)?.code === "EPERM";
      }
      if (alive) continue;
      try {
        unlinkSync(join(stateDir, entry));
        reaped++;
      } catch {
        // Lost a race with another session, or permission issue.
        // Non-fatal; next startup will retry.
      }
    }
  } catch {
    // readdir failure - directory may have vanished, or permission denied.
    // Either way nothing more to reap.
  }
  if (reaped > 0) {
    process.stderr.write(
      `[orchestrator] startup hygiene: reaped ${reaped} stale active-session-<pid> file(s) in ${stateDir}\n`,
    );
  }
}

/**
 * Startup hygiene: report sibling orchestrator MCP processes that have
 * no claude process in their ancestor chain, which suggests they
 * outlived the session that spawned them.
 *
 * This only logs a warning with the suspect PIDs; it never kills
 * anything. Killing a sibling MCP could disrupt infrastructure shared
 * across live sessions (e.g. the python sidecar bound to .sidecar-port
 * is deliberately shared), so cleanup is left to the operator.
 *
 * It complements the orphan-bun watchdog, which covers "parent dies
 * while I'm alive" for processes that loaded the watchdog code. Orphans
 * running bytecode that predates the watchdog improvements still need
 * manual cleanup; this check just makes them visible at the next
 * session's startup.
 *
 * Linux only (reads /proc). Windows already has killOlderDuplicateMcps
 * for a related but different case (siblings sharing our parent
 * claude); the orphan case there is rare because parent death usually
 * reaps children.
 */
function warnAboutLikelyOrphanSiblings(): void {
  if (process.platform !== "linux") return;

  const maxHops = 8;
  // A path suffix rather than an absolute path, so the match works
  // wherever the plugin marketplace happens to be installed.
  const siblingSignature = "orchestrator/dist/server.js";
  const selfPid = process.pid;

  // True when the process's command line contains the sibling signature.
  const looksLikeSibling = (candidatePid: number): boolean => {
    try {
      return readFileSync(`/proc/${candidatePid}/cmdline`, "utf8").includes(
        siblingSignature,
      );
    } catch {
      return false;
    }
  };

  // True when the process itself or one of its ancestors (bounded walk)
  // is named claude. Any unreadable or malformed stat ends the walk.
  const hasClaudeAncestor = (startPid: number): boolean => {
    let current = startPid;
    for (let hops = 0; hops < maxHops; hops += 1) {
      let statText: string;
      try {
        statText = readFileSync(`/proc/${current}/stat`, "utf8");
      } catch {
        return false;
      }
      // comm is parenthesised and may itself contain ")", so anchor on
      // the last closing paren.
      const closeIdx = statText.lastIndexOf(")");
      if (closeIdx < 0) return false;
      const comm = statText
        .slice(statText.indexOf("(") + 1, closeIdx)
        .toLowerCase();
      if (comm === "claude" || comm === "claude.exe") return true;
      // After ") " come the state and then the parent PID.
      const trailing = statText.slice(closeIdx + 2).split(/\s+/);
      const parentPid = parseInt(trailing[1] ?? "0", 10);
      if (!parentPid || parentPid === current || parentPid === 1) return false;
      current = parentPid;
    }
    return false;
  };

  let numericEntries: string[];
  try {
    numericEntries = readdirSync("/proc").filter((name) => /^\d+$/.test(name));
  } catch {
    return;
  }

  const suspects: number[] = numericEntries
    .map((name) => Number(name))
    .filter(
      (candidatePid) =>
        candidatePid !== selfPid &&
        looksLikeSibling(candidatePid) &&
        !hasClaudeAncestor(candidatePid),
    );

  if (suspects.length === 0) return;
  process.stderr.write(
    `[orchestrator] startup hygiene: detected ${suspects.length} likely-orphan sibling MCP process(es): pid=${suspects.join(",")}. ` +
      `Their parent claude is no longer in the process tree, suggesting they outlived their owning session and may be running stale bytecode whose watchdog never fired. ` +
      `Diagnose with 'pstree -ps <pid>'; clean up with 'kill -9 <pid>' if confirmed orphan.\n`,
  );
}

// Startup hygiene runs unconditionally, ahead of the parent-claude
// lookup: it does not depend on that lookup's result, and the cleanup
// still benefits future startups even if THIS process is about to exit
// (no-claude-ancestor case below).
{
  // Precedence: explicit override, then the Claude project dir, then cwd.
  const {
    ORCHESTRATOR_PROJECT_ROOT: rootOverride,
    CLAUDE_PROJECT_DIR: claudeProjectDir,
  } = process.env;
  const projectDir = rootOverride || claudeProjectDir || process.cwd();
  const stateDir = join(projectDir, ".orchestrator-state");
  reapStaleActiveSessionFiles(stateDir);
  warnAboutLikelyOrphanSiblings();
}

const initialParentClaudePid = findClaudeAncestorPid();
// 0.30.38: also capture parent claude.exe's creation time so the watchdog
// can defend against PID reuse. Without this, when the user closes one
Expand Down