diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 2cffd56..55f6547 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -6,13 +6,13 @@ "url": "https://github.com/jjackson" }, "metadata": { - "version": "0.13.278" + "version": "0.13.280" }, "plugins": [ { "name": "ace", "source": "./", - "version": "0.13.279", + "version": "0.13.280", "description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout" } ] diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 68565c1..5235fb1 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "ace", - "version": "0.13.279", + "version": "0.13.280", "description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout", "author": { "name": "Jonathan Jackson", diff --git a/VERSION b/VERSION index 2054863..9d7f468 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.13.279 +0.13.280 diff --git a/mcp/mobile/backends/avd.ts b/mcp/mobile/backends/avd.ts index a2dc488..3c782c7 100644 --- a/mcp/mobile/backends/avd.ts +++ b/mcp/mobile/backends/avd.ts @@ -525,13 +525,27 @@ export class AvdBackend { * we already have. */ private async sweepStaleEmulatorState(): Promise { - // Step 1: orphan qemu sweep. + // Ordering is load-bearing — orphan-qemu kill MUST land BEFORE + // adb-server restart, with a brief socket-release window between + // them. If adb-server starts up while orphan qemu instances are + // still holding emulator-NNNN TCP sockets, the freshly-restarted + // daemon adopts the wedged-port state and we end up back in the + // same "package service did not bind" failure. The attempt-12 + // reproducer (malaria-itn-fgd 20260515-1645): 2 orphan qemu PIDs + // + 3 stale adb daemons; even with PR #349's adb-restart in place, + // the next `ensureAvdRunning` failed twice because the qemu PIDs + // were still alive when the daemon came back up. Operator had to + // run a second `adb kill-server`/`start-server` AFTER manually + // pkill'ing qemu before the third attempt finally booted. + // + // Step 1: orphan qemu sweep (must run first). // // Get all qemu-system PIDs. `pgrep -af` would give us the command- // line too, but we don't need it for the kill decision — the // adb-devices cross-reference is the authoritative signal. On // Windows we no-op (pgrep doesn't exist, and ACE mobile dev is // Mac/Linux-only in practice). + let orphansKilled = 0; if (process.platform !== 'win32') { try { const pgrep = await this.shell('pgrep', ['-f', 'qemu-system']).catch(() => null); @@ -564,28 +578,25 @@ export class AvdBackend { for (const pid of qemuPids) { // eslint-disable-next-line no-console console.warn(`[ace-mobile] sweepStaleEmulatorState: killing orphan qemu pid=${pid} (no adb devices visible)`); - try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ } + try { process.kill(pid, 'SIGKILL'); orphansKilled++; } catch { /* already gone */ } } } else if (qemuPids.length > liveCount) { - // Some qemu PIDs aren't matched by adb devices — best- - // effort: kill the excess. We can't precisely identify - // which PIDs are orphans without per-process port probing, - // but if pgrep shows N PIDs and adb sees M < N devices, - // (N - M) PIDs are orphans. We kill the lowest-PID - // excess (most likely to be the older orphans). - // - // Conservative: only kill if pgrep > 2 * live (i.e. we're - // confident there are MULTIPLE orphans, not a single - // legitimate emulator with a stale pgrep echo). This - // avoids killing a healthy concurrent emulator on a - // shared box. - if (qemuPids.length >= liveCount + 2) { - const excess = qemuPids.slice(0, qemuPids.length - liveCount); - for (const pid of excess) { - // eslint-disable-next-line no-console - console.warn(`[ace-mobile] sweepStaleEmulatorState: killing likely-orphan qemu pid=${pid} (${qemuPids.length} qemu PIDs, ${liveCount} adb devices)`); - try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ } - } + // Some qemu PIDs aren't matched by adb devices — these are + // orphans (a live qemu always has an adb-devices entry once + // the daemon has scanned the port range). If pgrep shows N + // PIDs and adb sees M < N devices, (N - M) PIDs are + // orphans. Kill the excess (lowest-PID first — most likely + // to be the older orphans). Tightened from PR #349's + // `qemuPids.length >= liveCount + 2` guard: that conservative + // threshold meant a 2-orphan + 1-stale-adb-entry state + // (the attempt-12 signature) was BELOW the kill threshold + // and got passed through to the adb-restart step with the + // orphans still alive — which is exactly the bug. + const excess = qemuPids.slice(0, qemuPids.length - liveCount); + for (const pid of excess) { + // eslint-disable-next-line no-console + console.warn(`[ace-mobile] sweepStaleEmulatorState: killing likely-orphan qemu pid=${pid} (${qemuPids.length} qemu PIDs, ${liveCount} adb devices)`); + try { process.kill(pid, 'SIGKILL'); orphansKilled++; } catch { /* already gone */ } } } } @@ -595,12 +606,29 @@ export class AvdBackend { } } + // Step 1.5: brief socket-release wait if we actually killed + // anything. SIGKILL is synchronous in the sense that process.kill + // returns immediately, but the kernel's TCP socket teardown for + // the emulator-NNNN ports the qemu was holding completes a few + // hundred ms later. Restarting adb-server BEFORE the sockets are + // released causes the daemon to either re-adopt the half-closed + // ports OR bind to a different set, both of which leave the + // subsequent emulator spawn unable to land cleanly. ~500ms is + // enough — empirically matches the SO_LINGER + TIME_WAIT + // shortened window on Darwin and Linux for the loopback + // sockets qemu uses for emulator-NNNN. + if (orphansKilled > 0) { + await new Promise((r) => setTimeout(r, 500)); + } + // Step 2: always restart adb-server. Cheap (~500ms) and resets // the daemon to a known-clean state independent of whether we // detected wedge symptoms. Cited in three observed instances // (malaria-itn-fgd attempts 8, 10, 11) as the manual recovery // operators ran — owning it inside the heal removes the - // operator-in-the-loop step. + // operator-in-the-loop step. MUST run AFTER the orphan-qemu + // kill + socket-release wait so the freshly-restarted daemon + // sees a clean port landscape. await this.shell('adb', ['kill-server']).catch(() => {}); await this.shell('adb', ['start-server']).catch(() => {}); } diff --git a/mcp/mobile/recipes/static/learn-tap-module.yaml b/mcp/mobile/recipes/static/learn-tap-module.yaml index ed298a9..0f820b8 100644 --- a/mcp/mobile/recipes/static/learn-tap-module.yaml +++ b/mcp/mobile/recipes/static/learn-tap-module.yaml @@ -60,16 +60,42 @@ appId: org.commcare.dalvik # case when module-name != form-name). No further action needed — # downstream `form-advance.yaml` takes over. # B. `screen_suite_menu_list` still visible AND `nav_btn_next` NOT -# visible → CommCare suppressed the auto-skip because module-name -# == form-name. Device is sitting on the intermediate one-row -# form-list; tap the only row (still text:${MODULE_NAME} since the -# form shares the module's name) to enter the form, then assert -# `nav_btn_next` visible so any further halt is fast and named. -# Otherwise (drilling from suite-root into a form-LIST, neither -# branch fires): the recipe falls through with the form-list still -# visible — caller is expected to invoke learn-tap-module again with -# the form's row name. This preserves the existing two-call drill -# pattern used by J1-style palettes. +# visible AND a row matching `${MODULE_NAME}` is rendered inside +# the menu-list body → CommCare suppressed the auto-skip because +# module-name == form-name. Device is sitting on the intermediate +# one-row form-list; the form row carries text:${MODULE_NAME} +# (since the form shares the module's name) — tap it to enter the +# form, then assert `nav_btn_next` visible so any further halt is +# fast and named. +# +# The `below: id: screen_suite_menu_list` scoping on the visible- +# text guard is load-bearing: when form-name != module-name (e.g. +# malaria-itn-fgd J1: module "Briefing Acknowledgement" → form +# "Acknowledge Readiness"), the form row's text does NOT match +# `${MODULE_NAME}`, so the guard fails and Branch B skips — letting +# the caller's next learn-tap-module invocation (with FORM_NAME) +# drill the form row by its own label. Without the body scoping, +# Branch B's bare `visible: text: ${MODULE_NAME}` would match the +# toolbar title (which still reads "Briefing Acknowledgement" on +# the form-list screen), the re-tap would land on the non-tappable +# toolbar, and the extendedWaitUntil would expire on the unchanged +# form-list. Live-reproduced 2026-05-19 (malaria-itn-fgd +# 20260515-1645 Phase 6 attempt 12) — see screenshot +# ❌-1779218294468-(chunk-0.yaml).png. +# +# Otherwise (drilling from suite-root into a form-LIST whose form +# name != module name, neither branch fires): the recipe falls +# through with the form-list still visible — caller is expected to +# invoke learn-tap-module again with the form's row name. This +# preserves the existing two-call drill pattern used by J1-style +# palettes. +# Outer guard: intermediate form-list still visible (auto-skip +# suppressed). Inner guard: same-name case specifically — a row +# matching `${MODULE_NAME}` is rendered inside the menu-list body. +# Nested rather than a single `when:` because Maestro `when:` clauses +# take ONE `visible:` element selector; expressing "form-list visible +# AND text-in-list-body visible" requires two predicates, which YAML +# only supports as a nested flow. - runFlow: when: visible: @@ -77,14 +103,26 @@ appId: org.commcare.dalvik notVisible: id: "org.commcare.dalvik:id/nav_btn_next" commands: - - takeScreenshot: "learn-tap-module-intermediate-${MODULE_NAME}" - - tapOn: - ${SELECTOR:learn-suite-row-by-name} - - waitForAnimationToEnd: - timeout: 5000 - - extendedWaitUntil: - visible: - id: "org.commcare.dalvik:id/nav_btn_next" - timeout: 10000 + - runFlow: + when: + # Same-name-only guard: the text-match is scoped to the + # list body so the toolbar title (which still reads + # ${MODULE_NAME} on the form-list screen even when the + # form has a different label) does NOT satisfy this + # clause. See header comment for full rationale. + visible: + text: "${MODULE_NAME}" + below: + id: "org.commcare.dalvik:id/screen_suite_menu_list" + commands: + - takeScreenshot: "learn-tap-module-intermediate-${MODULE_NAME}" + - tapOn: + ${SELECTOR:learn-suite-row-by-name} + - waitForAnimationToEnd: + timeout: 5000 + - extendedWaitUntil: + visible: + id: "org.commcare.dalvik:id/nav_btn_next" + timeout: 10000 - takeScreenshot: "learn-tap-module-after-${MODULE_NAME}" diff --git a/package.json b/package.json index 2921a85..31947fa 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ace", - "version": "0.13.279", + "version": "0.13.280", "description": "AI Connect Engine - orchestrator for building Connect Opps using AI", "type": "module", "scripts": { diff --git a/test/mcp/mobile/avd.test.ts b/test/mcp/mobile/avd.test.ts index 1ec23c7..d8cfc25 100644 --- a/test/mcp/mobile/avd.test.ts +++ b/test/mcp/mobile/avd.test.ts @@ -335,6 +335,126 @@ describe('AvdBackend.ensureAvdRunning', () => { } }); + it('sweepStaleEmulatorState: orphan-qemu kill runs BEFORE adb kill-server with a socket-release wait between', async () => { + if (process.platform === 'win32') return; + // Regression guard for the 2026-05-19 malaria-itn-fgd attempt-12 + // re-occurrence (run 20260515-1645 Phase 6): PR #349 wired up + // both the orphan-qemu sweep and the adb-server restart, but + // without (a) firm ordering and (b) a socket-release wait between + // them. The bug pattern: SIGKILL on qemu returns synchronously, + // but the kernel takes a few hundred ms to release the + // emulator-NNNN TCP sockets the qemu was holding. When the + // adb-restart fires immediately after the kill, the freshly- + // restarted daemon re-adopts the wedged-port state and the next + // `ensureAvdRunning` still fails with "package service did not + // bind." Empirically 2 attempts failed even after PR #349 was + // live, before a *second* `adb kill-server`/`start-server` + // inside the dispatch cleared it. + // + // Structural fix: orphan-qemu kill MUST run first, then a brief + // (~500ms) wait, then adb-server restart. + const events: { name: string; t: number }[] = []; + const start = Date.now(); + const realKill = process.kill; + (process as { kill: typeof process.kill }).kill = ((pid: number, _sig?: string | number) => { + events.push({ name: `process.kill(${pid})`, t: Date.now() - start }); + return true; + }) as typeof process.kill; + try { + const shell = vi.fn(async (cmd: string, args: string[]) => { + const key = `${cmd} ${args.join(' ')}`; + events.push({ name: key, t: Date.now() - start }); + if (key === 'pgrep -f qemu-system') { + // Two orphan qemu PIDs (matches attempt-12 signature). + return { stdout: '90011\n90012\n', stderr: '', exitCode: 0 }; + } + if (key === 'adb devices') { + return { stdout: 'List of devices attached\n', stderr: '', exitCode: 0 }; + } + if (key === 'emulator -list-avds') { + // Unknown AVD short-circuits the rest of the boot path — + // we only care about the order of the sweep steps here. + return { stdout: 'Other_AVD\n', stderr: '', exitCode: 0 }; + } + return { stdout: '', stderr: '', exitCode: 0 }; + }); + const backend = new AvdBackend({ shell }); + await expect(backend.ensureAvdRunning('ACE_Pixel_API_34')).rejects.toBeInstanceOf(AvdBootError); + + const firstOrphanKill = events.findIndex((e) => e.name.startsWith('process.kill(')); + const adbKillSrv = events.findIndex((e) => e.name === 'adb kill-server'); + const adbStartSrv = events.findIndex((e) => e.name === 'adb start-server'); + expect(firstOrphanKill, 'expected at least one orphan-qemu kill').toBeGreaterThan(-1); + expect(adbKillSrv, 'expected adb kill-server in heal').toBeGreaterThan(-1); + expect(adbStartSrv, 'expected adb start-server in heal').toBeGreaterThan(adbKillSrv); + // ORDER: orphan kill must precede adb kill-server. + expect( + firstOrphanKill, + 'orphan-qemu kill must run BEFORE adb kill-server so the freshly-restarted daemon sees a clean port landscape', + ).toBeLessThan(adbKillSrv); + // SOCKET-RELEASE WAIT: at least ~400ms must pass between the + // last orphan kill and adb kill-server (we wait 500ms; allow a + // 100ms scheduling slack so the test isn't flaky on slow CI). + const lastOrphanKillT = [...events] + .reverse() + .find((e) => e.name.startsWith('process.kill('))!.t; + const adbKillSrvT = events[adbKillSrv].t; + expect( + adbKillSrvT - lastOrphanKillT, + 'expected ≥400ms socket-release wait between orphan-qemu kill and adb kill-server', + ).toBeGreaterThanOrEqual(400); + } finally { + (process as { kill: typeof process.kill }).kill = realKill; + } + }); + + it('sweepStaleEmulatorState: kills orphan qemu PIDs when adb sees only a partial subset (no +2 threshold)', async () => { + if (process.platform === 'win32') return; + // Regression guard against re-introducing PR #349's conservative + // `qemuPids.length >= liveCount + 2` guard. Attempt-12 signature: + // 2 orphan qemu PIDs + 1 stale adb-devices entry → length 2, + // liveCount 1 → 2 >= 1+2 is FALSE → no kill fires, orphans + // survive into the next ensureAvdRunning. Loosened to + // `qemuPids.length > liveCount`: with N > M, (N-M) PIDs are + // orphan and must be killed. + const killed: number[] = []; + const realKill = process.kill; + (process as { kill: typeof process.kill }).kill = ((pid: number, _sig?: string | number) => { + killed.push(pid); + return true; + }) as typeof process.kill; + try { + const shell = vi.fn(async (cmd: string, args: string[]) => { + const key = `${cmd} ${args.join(' ')}`; + if (key === 'pgrep -f qemu-system') { + return { stdout: '90021\n90022\n', stderr: '', exitCode: 0 }; + } + if (key === 'adb devices') { + // One legitimate-looking adb device entry; 2 qemu PIDs. + // PR #349 +2 guard wouldn't fire here — confirming the + // loosened guard fires the kill for the orphan. + return { + stdout: 'List of devices attached\nemulator-5554\tdevice\n', + stderr: '', + exitCode: 0, + }; + } + if (key === 'emulator -list-avds') { + return { stdout: 'Other_AVD\n', stderr: '', exitCode: 0 }; + } + return { stdout: '', stderr: '', exitCode: 0 }; + }); + const backend = new AvdBackend({ shell }); + await expect(backend.ensureAvdRunning('ACE_Pixel_API_34')).rejects.toBeInstanceOf(AvdBootError); + // (qemu length 2) - (live count 1) = 1 expected orphan kill. + expect(killed.length).toBeGreaterThanOrEqual(1); + // Killed the lowest PID first (the older, more-likely-orphan). + expect(killed[0]).toBe(90021); + } finally { + (process as { kill: typeof process.kill }).kill = realKill; + } + }); + it('sweepStaleEmulatorState: tolerates pgrep/kill failures (best-effort)', async () => { if (process.platform === 'win32') return; const realKill = process.kill; diff --git a/test/mcp/mobile/static-recipe-invariants.test.ts b/test/mcp/mobile/static-recipe-invariants.test.ts index a559565..82a48e6 100644 --- a/test/mcp/mobile/static-recipe-invariants.test.ts +++ b/test/mcp/mobile/static-recipe-invariants.test.ts @@ -213,4 +213,34 @@ describe('learn-tap-module.yaml', () => { /extendedWaitUntil:\s*\n\s*visible:\s*\n\s*id: "org\.commcare\.dalvik:id\/nav_btn_next"/, ); }); + + it('Branch B only fires when the form row text matches MODULE_NAME (same-name case)', () => { + // Regression guard for the 2026-05-19 malaria-itn-fgd run halt + // (run 20260515-1645 Phase 6 attempt 12) on J1 module + // "Briefing Acknowledgement" → form "Acknowledge Readiness": + // Branch B's pre-fix `when:` clause only required the + // intermediate form-list to be visible + nav_btn_next NOT visible, + // so it fired regardless of whether the form's display name + // matched the module name. On the form-list screen the toolbar + // still reads ${MODULE_NAME}, so the inner re-tap of + // `learn-suite-row-by-name` (text-anchored on ${MODULE_NAME}) + // landed on the non-tappable toolbar TextView; the subsequent + // extendedWaitUntil on nav_btn_next then expired against the + // unchanged form-list. + // + // Structural fix: Branch B's effective trigger must additionally + // require a `${MODULE_NAME}`-matching node SCOPED to the menu-list + // body (via `below: id: screen_suite_menu_list`). When form-name + // != module-name, no such body node exists, so Branch B skips and + // the caller's next learn-tap-module invocation (with FORM_NAME) + // drills the form row by its own label. + // + // Expressed as a nested runFlow because Maestro `when:` clauses + // accept ONE `visible:` element selector — combining two visible- + // predicate semantics (outer: list-id visible; inner: text-in- + // body visible) requires a nested flow. + expect(yaml).toMatch( + /visible:\s*\n\s*text: "\$\{MODULE_NAME\}"\s*\n\s*below:\s*\n\s*id: "org\.commcare\.dalvik:id\/screen_suite_menu_list"/, + ); + }); });