Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .claude-plugin/marketplace.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
"url": "https://github.com/jjackson"
},
"metadata": {
"version": "0.13.278"
"version": "0.13.280"
},
"plugins": [
{
"name": "ace",
"source": "./",
"version": "0.13.279",
"version": "0.13.280",
"description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout"
}
]
Expand Down
2 changes: 1 addition & 1 deletion .claude-plugin/plugin.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ace",
"version": "0.13.279",
"version": "0.13.280",
"description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout",
"author": {
"name": "Jonathan Jackson",
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.13.279
0.13.280
72 changes: 50 additions & 22 deletions mcp/mobile/backends/avd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -525,13 +525,27 @@ export class AvdBackend {
* we already have.
*/
private async sweepStaleEmulatorState(): Promise<void> {
// Step 1: orphan qemu sweep.
// Ordering is load-bearing — orphan-qemu kill MUST land BEFORE
// adb-server restart, with a brief socket-release window between
// them. If adb-server starts up while orphan qemu instances are
// still holding emulator-NNNN TCP sockets, the freshly-restarted
// daemon adopts the wedged-port state and we end up back in the
// same "package service did not bind" failure. The attempt-12
// reproducer (malaria-itn-fgd 20260515-1645): 2 orphan qemu PIDs
// + 3 stale adb daemons; even with PR #349's adb-restart in place,
// the next `ensureAvdRunning` failed twice because the qemu PIDs
// were still alive when the daemon came back up. Operator had to
// run a second `adb kill-server`/`start-server` AFTER manually
// pkill'ing qemu before the third attempt finally booted.
//
// Step 1: orphan qemu sweep (must run first).
//
// Get all qemu-system PIDs. `pgrep -af` would give us the command-
// line too, but we don't need it for the kill decision — the
// adb-devices cross-reference is the authoritative signal. On
// Windows we no-op (pgrep doesn't exist, and ACE mobile dev is
// Mac/Linux-only in practice).
let orphansKilled = 0;
if (process.platform !== 'win32') {
try {
const pgrep = await this.shell('pgrep', ['-f', 'qemu-system']).catch(() => null);
Expand Down Expand Up @@ -564,28 +578,25 @@ export class AvdBackend {
for (const pid of qemuPids) {
// eslint-disable-next-line no-console
console.warn(`[ace-mobile] sweepStaleEmulatorState: killing orphan qemu pid=${pid} (no adb devices visible)`);
try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ }
try { process.kill(pid, 'SIGKILL'); orphansKilled++; } catch { /* already gone */ }
}
} else if (qemuPids.length > liveCount) {
// Some qemu PIDs aren't matched by adb devices — best-
// effort: kill the excess. We can't precisely identify
// which PIDs are orphans without per-process port probing,
// but if pgrep shows N PIDs and adb sees M < N devices,
// (N - M) PIDs are orphans. We kill the lowest-PID
// excess (most likely to be the older orphans).
//
// Conservative: only kill if pgrep > 2 * live (i.e. we're
// confident there are MULTIPLE orphans, not a single
// legitimate emulator with a stale pgrep echo). This
// avoids killing a healthy concurrent emulator on a
// shared box.
if (qemuPids.length >= liveCount + 2) {
const excess = qemuPids.slice(0, qemuPids.length - liveCount);
for (const pid of excess) {
// eslint-disable-next-line no-console
console.warn(`[ace-mobile] sweepStaleEmulatorState: killing likely-orphan qemu pid=${pid} (${qemuPids.length} qemu PIDs, ${liveCount} adb devices)`);
try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ }
}
// Some qemu PIDs aren't matched by adb devices — these are
// orphans (a live qemu always has an adb-devices entry once
// the daemon has scanned the port range). If pgrep shows N
// PIDs and adb sees M < N devices, (N - M) PIDs are
// orphans. Kill the excess (lowest-PID first — most likely
// to be the older orphans). Tightened from PR #349's
// `qemuPids.length >= liveCount + 2` guard: that conservative
// threshold meant a 2-orphan + 1-stale-adb-entry state
// (the attempt-12 signature) was BELOW the kill threshold
// and got passed through to the adb-restart step with the
// orphans still alive — which is exactly the bug.
const excess = qemuPids.slice(0, qemuPids.length - liveCount);
for (const pid of excess) {
// eslint-disable-next-line no-console
console.warn(`[ace-mobile] sweepStaleEmulatorState: killing likely-orphan qemu pid=${pid} (${qemuPids.length} qemu PIDs, ${liveCount} adb devices)`);
try { process.kill(pid, 'SIGKILL'); orphansKilled++; } catch { /* already gone */ }
}
}
}
Expand All @@ -595,12 +606,29 @@ export class AvdBackend {
}
}

// Step 1.5: brief socket-release wait if we actually killed
// anything. SIGKILL is synchronous in the sense that process.kill
// returns immediately, but the kernel's TCP socket teardown for
// the emulator-NNNN ports the qemu was holding completes a few
// hundred ms later. Restarting adb-server BEFORE the sockets are
// released causes the daemon to either re-adopt the half-closed
// ports OR bind to a different set, both of which leave the
// subsequent emulator spawn unable to land cleanly. ~500ms is
// enough — empirically matches the SO_LINGER + TIME_WAIT
// shortened window on Darwin and Linux for the loopback
// sockets qemu uses for emulator-NNNN.
if (orphansKilled > 0) {
await new Promise((r) => setTimeout(r, 500));
}

// Step 2: always restart adb-server. Cheap (~500ms) and resets
// the daemon to a known-clean state independent of whether we
// detected wedge symptoms. Cited in three observed instances
// (malaria-itn-fgd attempts 8, 10, 11) as the manual recovery
// operators ran — owning it inside the heal removes the
// operator-in-the-loop step.
// operator-in-the-loop step. MUST run AFTER the orphan-qemu
// kill + socket-release wait so the freshly-restarted daemon
// sees a clean port landscape.
await this.shell('adb', ['kill-server']).catch(() => {});
await this.shell('adb', ['start-server']).catch(() => {});
}
Expand Down
76 changes: 57 additions & 19 deletions mcp/mobile/recipes/static/learn-tap-module.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,31 +60,69 @@ appId: org.commcare.dalvik
# case when module-name != form-name). No further action needed —
# downstream `form-advance.yaml` takes over.
# B. `screen_suite_menu_list` still visible AND `nav_btn_next` NOT
# visible → CommCare suppressed the auto-skip because module-name
# == form-name. Device is sitting on the intermediate one-row
# form-list; tap the only row (still text:${MODULE_NAME} since the
# form shares the module's name) to enter the form, then assert
# `nav_btn_next` visible so any further halt is fast and named.
# Otherwise (drilling from suite-root into a form-LIST, neither
# branch fires): the recipe falls through with the form-list still
# visible — caller is expected to invoke learn-tap-module again with
# the form's row name. This preserves the existing two-call drill
# pattern used by J1-style palettes.
# visible AND a row matching `${MODULE_NAME}` is rendered inside
# the menu-list body → CommCare suppressed the auto-skip because
# module-name == form-name. Device is sitting on the intermediate
# one-row form-list; the form row carries text:${MODULE_NAME}
# (since the form shares the module's name) — tap it to enter the
# form, then assert `nav_btn_next` visible so any further halt is
# fast and named.
#
# The `below: id: screen_suite_menu_list` scoping on the visible-
# text guard is load-bearing: when form-name != module-name (e.g.
# malaria-itn-fgd J1: module "Briefing Acknowledgement" → form
# "Acknowledge Readiness"), the form row's text does NOT match
# `${MODULE_NAME}`, so the guard fails and Branch B skips — letting
# the caller's next learn-tap-module invocation (with FORM_NAME)
# drill the form row by its own label. Without the body scoping,
# Branch B's bare `visible: text: ${MODULE_NAME}` would match the
# toolbar title (which still reads "Briefing Acknowledgement" on
# the form-list screen), the re-tap would land on the non-tappable
# toolbar, and the extendedWaitUntil would expire on the unchanged
# form-list. Live-reproduced 2026-05-19 (malaria-itn-fgd
# 20260515-1645 Phase 6 attempt 12) — see screenshot
# ❌-1779218294468-(chunk-0.yaml).png.
#
# Otherwise (drilling from suite-root into a form-LIST whose form
# name != module name, neither branch fires): the recipe falls
# through with the form-list still visible — caller is expected to
# invoke learn-tap-module again with the form's row name. This
# preserves the existing two-call drill pattern used by J1-style
# palettes.
# Outer guard: intermediate form-list still visible (auto-skip
# suppressed). Inner guard: same-name case specifically — a row
# matching `${MODULE_NAME}` is rendered inside the menu-list body.
# Nested rather than a single `when:` because Maestro `when:` clauses
# take ONE `visible:` element selector; expressing "form-list visible
# AND text-in-list-body visible" requires two predicates, which YAML
# only supports as a nested flow.
- runFlow:
when:
visible:
id: "org.commcare.dalvik:id/screen_suite_menu_list"
notVisible:
id: "org.commcare.dalvik:id/nav_btn_next"
commands:
- takeScreenshot: "learn-tap-module-intermediate-${MODULE_NAME}"
- tapOn:
${SELECTOR:learn-suite-row-by-name}
- waitForAnimationToEnd:
timeout: 5000
- extendedWaitUntil:
visible:
id: "org.commcare.dalvik:id/nav_btn_next"
timeout: 10000
- runFlow:
when:
# Same-name-only guard: the text-match is scoped to the
# list body so the toolbar title (which still reads
# ${MODULE_NAME} on the form-list screen even when the
# form has a different label) does NOT satisfy this
# clause. See header comment for full rationale.
visible:
text: "${MODULE_NAME}"
below:
id: "org.commcare.dalvik:id/screen_suite_menu_list"
commands:
- takeScreenshot: "learn-tap-module-intermediate-${MODULE_NAME}"
- tapOn:
${SELECTOR:learn-suite-row-by-name}
- waitForAnimationToEnd:
timeout: 5000
- extendedWaitUntil:
visible:
id: "org.commcare.dalvik:id/nav_btn_next"
timeout: 10000

- takeScreenshot: "learn-tap-module-after-${MODULE_NAME}"
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ace",
"version": "0.13.279",
"version": "0.13.280",
"description": "AI Connect Engine - orchestrator for building Connect Opps using AI",
"type": "module",
"scripts": {
Expand Down
120 changes: 120 additions & 0 deletions test/mcp/mobile/avd.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,126 @@ describe('AvdBackend.ensureAvdRunning', () => {
}
});

it('sweepStaleEmulatorState: orphan-qemu kill runs BEFORE adb kill-server with a socket-release wait between', async () => {
if (process.platform === 'win32') return;
// Regression guard for the 2026-05-19 malaria-itn-fgd attempt-12
// re-occurrence (run 20260515-1645 Phase 6): PR #349 wired up
// both the orphan-qemu sweep and the adb-server restart, but
// without (a) firm ordering and (b) a socket-release wait between
// them. The bug pattern: SIGKILL on qemu returns synchronously,
// but the kernel takes a few hundred ms to release the
// emulator-NNNN TCP sockets the qemu was holding. When the
// adb-restart fires immediately after the kill, the freshly-
// restarted daemon re-adopts the wedged-port state and the next
// `ensureAvdRunning` still fails with "package service did not
// bind." Empirically 2 attempts failed even after PR #349 was
// live, before a *second* `adb kill-server`/`start-server`
// inside the dispatch cleared it.
//
// Structural fix: orphan-qemu kill MUST run first, then a brief
// (~500ms) wait, then adb-server restart.
const events: { name: string; t: number }[] = [];
const start = Date.now();
const realKill = process.kill;
(process as { kill: typeof process.kill }).kill = ((pid: number, _sig?: string | number) => {
events.push({ name: `process.kill(${pid})`, t: Date.now() - start });
return true;
}) as typeof process.kill;
try {
const shell = vi.fn(async (cmd: string, args: string[]) => {
const key = `${cmd} ${args.join(' ')}`;
events.push({ name: key, t: Date.now() - start });
if (key === 'pgrep -f qemu-system') {
// Two orphan qemu PIDs (matches attempt-12 signature).
return { stdout: '90011\n90012\n', stderr: '', exitCode: 0 };
}
if (key === 'adb devices') {
return { stdout: 'List of devices attached\n', stderr: '', exitCode: 0 };
}
if (key === 'emulator -list-avds') {
// Unknown AVD short-circuits the rest of the boot path —
// we only care about the order of the sweep steps here.
return { stdout: 'Other_AVD\n', stderr: '', exitCode: 0 };
}
return { stdout: '', stderr: '', exitCode: 0 };
});
const backend = new AvdBackend({ shell });
await expect(backend.ensureAvdRunning('ACE_Pixel_API_34')).rejects.toBeInstanceOf(AvdBootError);

const firstOrphanKill = events.findIndex((e) => e.name.startsWith('process.kill('));
const adbKillSrv = events.findIndex((e) => e.name === 'adb kill-server');
const adbStartSrv = events.findIndex((e) => e.name === 'adb start-server');
expect(firstOrphanKill, 'expected at least one orphan-qemu kill').toBeGreaterThan(-1);
expect(adbKillSrv, 'expected adb kill-server in heal').toBeGreaterThan(-1);
expect(adbStartSrv, 'expected adb start-server in heal').toBeGreaterThan(adbKillSrv);
// ORDER: orphan kill must precede adb kill-server.
expect(
firstOrphanKill,
'orphan-qemu kill must run BEFORE adb kill-server so the freshly-restarted daemon sees a clean port landscape',
).toBeLessThan(adbKillSrv);
// SOCKET-RELEASE WAIT: at least ~400ms must pass between the
// last orphan kill and adb kill-server (we wait 500ms; allow a
// 100ms scheduling slack so the test isn't flaky on slow CI).
const lastOrphanKillT = [...events]
.reverse()
.find((e) => e.name.startsWith('process.kill('))!.t;
const adbKillSrvT = events[adbKillSrv].t;
expect(
adbKillSrvT - lastOrphanKillT,
'expected ≥400ms socket-release wait between orphan-qemu kill and adb kill-server',
).toBeGreaterThanOrEqual(400);
} finally {
(process as { kill: typeof process.kill }).kill = realKill;
}
});

it('sweepStaleEmulatorState: kills orphan qemu PIDs when adb sees only a partial subset (no +2 threshold)', async () => {
if (process.platform === 'win32') return;
// Regression guard against re-introducing PR #349's conservative
// `qemuPids.length >= liveCount + 2` guard. Attempt-12 signature:
// 2 orphan qemu PIDs + 1 stale adb-devices entry → length 2,
// liveCount 1 → 2 >= 1+2 is FALSE → no kill fires, orphans
// survive into the next ensureAvdRunning. Loosened to
// `qemuPids.length > liveCount`: with N > M, (N-M) PIDs are
// orphan and must be killed.
const killed: number[] = [];
const realKill = process.kill;
(process as { kill: typeof process.kill }).kill = ((pid: number, _sig?: string | number) => {
killed.push(pid);
return true;
}) as typeof process.kill;
try {
const shell = vi.fn(async (cmd: string, args: string[]) => {
const key = `${cmd} ${args.join(' ')}`;
if (key === 'pgrep -f qemu-system') {
return { stdout: '90021\n90022\n', stderr: '', exitCode: 0 };
}
if (key === 'adb devices') {
// One legitimate-looking adb device entry; 2 qemu PIDs.
// PR #349 +2 guard wouldn't fire here — confirming the
// loosened guard fires the kill for the orphan.
return {
stdout: 'List of devices attached\nemulator-5554\tdevice\n',
stderr: '',
exitCode: 0,
};
}
if (key === 'emulator -list-avds') {
return { stdout: 'Other_AVD\n', stderr: '', exitCode: 0 };
}
return { stdout: '', stderr: '', exitCode: 0 };
});
const backend = new AvdBackend({ shell });
await expect(backend.ensureAvdRunning('ACE_Pixel_API_34')).rejects.toBeInstanceOf(AvdBootError);
// (qemu length 2) - (live count 1) = 1 expected orphan kill.
expect(killed.length).toBeGreaterThanOrEqual(1);
// Killed the lowest PID first (the older, more-likely-orphan).
expect(killed[0]).toBe(90021);
} finally {
(process as { kill: typeof process.kill }).kill = realKill;
}
});

it('sweepStaleEmulatorState: tolerates pgrep/kill failures (best-effort)', async () => {
if (process.platform === 'win32') return;
const realKill = process.kill;
Expand Down
30 changes: 30 additions & 0 deletions test/mcp/mobile/static-recipe-invariants.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,34 @@ describe('learn-tap-module.yaml', () => {
/extendedWaitUntil:\s*\n\s*visible:\s*\n\s*id: "org\.commcare\.dalvik:id\/nav_btn_next"/,
);
});

it('Branch B only fires when the form row text matches MODULE_NAME (same-name case)', () => {
// Regression guard for the 2026-05-19 malaria-itn-fgd run halt
// (run 20260515-1645 Phase 6 attempt 12) on J1 module
// "Briefing Acknowledgement" → form "Acknowledge Readiness":
// Branch B's pre-fix `when:` clause only required the
// intermediate form-list to be visible + nav_btn_next NOT visible,
// so it fired regardless of whether the form's display name
// matched the module name. On the form-list screen the toolbar
// still reads ${MODULE_NAME}, so the inner re-tap of
// `learn-suite-row-by-name` (text-anchored on ${MODULE_NAME})
// landed on the non-tappable toolbar TextView; the subsequent
// extendedWaitUntil on nav_btn_next then expired against the
// unchanged form-list.
//
// Structural fix: Branch B's effective trigger must additionally
// require a `${MODULE_NAME}`-matching node SCOPED to the menu-list
// body (via `below: id: screen_suite_menu_list`). When form-name
// != module-name, no such body node exists, so Branch B skips and
// the caller's next learn-tap-module invocation (with FORM_NAME)
// drills the form row by its own label.
//
// Expressed as a nested runFlow because Maestro `when:` clauses
// accept ONE `visible:` element selector — combining two visible-
// predicate semantics (outer: list-id visible; inner: text-in-
// body visible) requires a nested flow.
expect(yaml).toMatch(
/visible:\s*\n\s*text: "\$\{MODULE_NAME\}"\s*\n\s*below:\s*\n\s*id: "org\.commcare\.dalvik:id\/screen_suite_menu_list"/,
);
});
});
Loading