From 63563ed841977e22ce5e25781021cec29d02af94 Mon Sep 17 00:00:00 2001 From: Jonathan Jackson Date: Tue, 19 May 2026 19:09:03 -0400 Subject: [PATCH] =?UTF-8?q?fix(mobile/probe):=20widen=20probe1=20timeout?= =?UTF-8?q?=208s=20=E2=86=92=2020s=20for=20Maestro=20v2=20cold-start?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The malaria-itn-app/20260517-1829 Phase 6 driver-unhealthy failures trace to probe1's 8s shell budget being below v2.3.0's JVM cold-start floor (~10-12s steady-state on a healthy AVD with Java 17). probe1 then shell-times-out on a working driver, triggering Stage 2's destructive uninstall+reinstall, which exposes the pre-existing install-race class as UNAVAILABLE on probe2. v1.39 and v2.3 are wire-compatible for our usage — same driver APK layout, same gRPC contract, same `--host`/`--port` flags, same step keys. The fix is purely a timeout calibration; no v1/v2 toggle, no ALLOWED_STEP_KEYS churn. doctor warns (not errors) on Maestro <2.0 so cloud-backend operators with v1-bundled AMI keep working until the AMI is rebaked separately. See docs/learnings/2026-05-19-maestro-v2-probe-timeout.md for the full trace + ruled-out hypotheses (the brief's "v2 broke the gRPC probe" framing was invalidated by direct empirical testing). Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude-plugin/marketplace.json | 4 +- .claude-plugin/plugin.json | 2 +- VERSION | 2 +- bin/ace-doctor | 13 ++ .../2026-05-19-maestro-v2-probe-timeout.md | 127 ++++++++++++++++++ mcp/mobile/backends/maestro.ts | 7 +- mcp/mobile/client.ts | 12 +- package-lock.json | 4 +- package.json | 2 +- test/mcp/mobile/client.test.ts | 12 +- 10 files changed, 167 insertions(+), 18 deletions(-) create mode 100644 docs/learnings/2026-05-19-maestro-v2-probe-timeout.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 5116215..d44b267 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -6,13 +6,13 @@ "url": "https://github.com/jjackson" }, "metadata": { - "version": "0.13.282" + "version": "0.13.283" }, "plugins": [ { "name": "ace", "source": "./", - "version": "0.13.282", + "version": "0.13.283", "description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout" } ] diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 93318d3..d61556b 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "ace", - "version": "0.13.282", + "version": "0.13.283", "description": "AI Connect Engine — orchestrates the CRISPR-Connect lifecycle from idea through app building, Connect setup, LLO management, and closeout", "author": { "name": "Jonathan Jackson", diff --git a/VERSION b/VERSION index 32e9ca8..bb1a178 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.13.282 +0.13.283 diff --git a/bin/ace-doctor b/bin/ace-doctor index b55dfe1..65fcb99 100755 --- a/bin/ace-doctor +++ b/bin/ace-doctor @@ -2020,6 +2020,19 @@ if [ -n "$MAESTRO_BIN" ]; then fi MAESTRO_VER="$("$MAESTRO_BIN" --version 2>/dev/null | tail -1)" echo " Maestro: $MAESTRO_BIN ${MAESTRO_VER:-unknown} OK" + # Version-floor advisory: ACE standardized on Maestro v2.x as of + # 0.13.x (post-malaria-itn-app debug arc). v1.39 still works for + # recipe playback — driver APK layout and gRPC contract are + # wire-compatible — but the cheap probe1 in `assertMaestroDriverHealthy` + # was recalibrated for v2's slower JVM cold-start. Operators on v1.x + # should upgrade so local-backend behavior matches the (forthcoming) + # v2.x-baked cloud AMI. WARN-level, not BLOCKER: cloud-AMI rebake + # is tracked separately and operators using the cloud backend with a + # v1-bundled AMI still work fine. + if echo "$MAESTRO_VER" | grep -qE '^1\.'; then + echo " WARN maestro_version: ${MAESTRO_VER} — ACE has standardized on Maestro v2.x" + echo " fix: curl -Ls 'https://get.maestro.mobile.dev' | bash # reinstalls latest (currently v2.3.x)" + fi else echo " Maestro: MISSING — install with curl -Ls 'https://get.maestro.mobile.dev' | bash" fi diff --git a/docs/learnings/2026-05-19-maestro-v2-probe-timeout.md b/docs/learnings/2026-05-19-maestro-v2-probe-timeout.md new file mode 100644 index 0000000..29bf10c --- /dev/null +++ b/docs/learnings/2026-05-19-maestro-v2-probe-timeout.md @@ -0,0 +1,127 @@ +# Maestro v2.x probe1 timeout — false-positive "unhealthy" verdicts on a working driver + +**Surfaced 2026-05-19** during root-cause analysis of malaria-itn-app run +`20260517-1829` Phase 6 `app-screenshot-capture` failures. The brief +hypothesized a Maestro v1 → v2 wire-protocol break; the trace proves it +was a timeout calibration drift on the cheap first probe. + +## The empirical findings + +1. **Maestro CLI v2.3.0's driver-APK layout is unchanged from v1.39.x.** + `unzip -l ~/.maestro/lib/maestro-client.jar` on a v2.3.0 install shows + `maestro-app.apk` (11.7 MB) and `maestro-server.apk` (884 KB) at the + jar root — same names, same package IDs `dev.mobile.maestro` + + `dev.mobile.maestro.test`. `MaestroBackend.resolveDriverApks` works + unchanged on both major versions. +2. **The `--host`/`--port` flag contract is preserved.** `maestro + --host=localhost --port= hierarchy` against a warm AVD with + the driver running returns exit 0 with a full hierarchy JSON dump on + v2.3.0, identical to v1.39. +3. **Maestro v2.x's JVM cold-start is ~3-5× slower than v1.39.x.** + First-invocation `maestro hierarchy` against a healthy emulator takes + ~10-12s steady-state on v2.3.0 / Java 17. The previous probe1 budget + was 8s (`client.ts:720`), calibrated against v1.39's faster startup. +4. **The recipe step-key palette (`ALLOWED_STEP_KEYS`) is wire-compatible + with v2.x.** v2 added `setOrientation` / `setPermissions` / + `setClipboard` / `assertScreenshot` (not used here) and removed + `deterministicOrder` + `maestro upload` (not used here). Our static + recipes parse identically against v2.3.0. + +## The actual trace from the malaria-itn-app session log + +``` +Error: Maestro driver on AVD emulator-5556 is unhealthy after recovery: + probe1: shell timeout: maestro --host=localhost --port=5557 hierarchy; + install: package-list-before:app=true,test=true,already-installed; + repair: force-stop,uninstall,pm-uninstall-user-0,pm-ready, + apks-resolved,installed:app,installed:test, + apk-install-results:app=ok,test=ok, + package-list-after:app=true,test=true, + verified,instrumentation-kicked; + probe2: maestro hierarchy exit 1: UNAVAILABLE: io exception +``` + +Decoded: + +- **probe1 SHELL timeout** — the host-side 8s shell budget elapsed before + `maestro hierarchy` returned anything. The driver was healthy and + installed (`package-list-before:app=true,test=true,already-installed`); + the gRPC server was likely fine; the 8s budget was just below v2's + cold-start floor. +- **Stage 2 repair ran cleanly** — force-stop, uninstall, re-install all + succeeded. But repair's destructive side effect is the actual cause of + what follows. +- **probe2 UNAVAILABLE** — the post-uninstall reinstall hit the + well-documented install-race class (PRs #339/341/342, cold-boot + widening): the `am instrument` kick step doesn't reliably bind the + on-device gRPC server inside probe2's 90s budget on a freshly + reinstalled driver. + +So probe1's premature timeout *triggered* a destructive recovery cycle +on a perfectly working driver, and the destructive cycle then exposed +the pre-existing install-race class on the reinstall. + +## The fix + +`mcp/mobile/client.ts:720` — probe1 budget `8_000` → `20_000`. 20s is +comfortably above v2.3.0's ~12s cold-start ceiling while still tearing +down quickly when the driver is *actually* wedged. v1.39 callers pay no +visible cost (probe1 still returns in ~2s on healthy v1 drivers; the +budget is an upper bound, not a wall-clock). + +## What did NOT change + +- Driver APK extraction logic — v1 / v2 share the same jar layout. +- `--host`/`--port` flag invocation — same contract on both. +- `repairDriver` flow — the install-race class it papered over remains + the same class on v2. +- `ALLOWED_STEP_KEYS` — palette compatible with both. +- The cloud AMI's bundled Maestro version — see § AMI rebake below. + +## Standardization on v2.x + +The decision: rather than supporting v1 and v2 indefinitely, ACE +standardizes on Maestro v2.x as of this PR. The code remains +wire-compatible with both (the probe1 widening is forward-compatible); +the change is a doctor advisory and an operator-facing default. + +- **`bin/ace-doctor`** adds a `WARN maestro_version` line when the + installed `maestro --version` starts with `1.`, pointing the operator + at the `curl -Ls 'https://get.maestro.mobile.dev' | bash` upgrade + command. WARN-level (not BLOCKER) so operators using the cloud + backend with a v1-bundled AMI aren't blocked by their local install + state. +- **Cloud AMI rebake — pending.** The packer config that builds the + AMI is not in `jjackson/ace` or `jjackson/ace-web`; it may live under + the `acedimagi` Mac login or on AWS. Tracked as the next task in this + arc — the AMI's bundled Maestro CLI (currently ~1.36 per the + `app-test-cases/SKILL.md:262` comment) should be bumped to the same + v2.x line that the local install runs once we locate it. Until then + the cloud backend keeps working — same wire-protocol, same recipe + semantics — just with a v1.x CLI under the hood. + +## What we ruled OUT and why + +The brief framed this as "Maestro v2 broke the gRPC driver probe" and +asked for a multi-file refactor (driver APK extraction, gRPC probe path, +repair flow, doctor floor at v2.x as a *hard* gate, recipe re-authoring +for v2 syntax). Direct evidence ruled all of those out: + +| Ruled out | Evidence | +|---|---| +| Driver APK layout changed in v2 | `unzip -l` on v2.3.0 jar shows the v1.39 layout intact | +| `--host`/`--port` flags broken in v2 | Live call against running AVD returns exit 0 with full hierarchy JSON | +| gRPC contract changed | Stack trace is `io.grpc.StatusRuntimeException: UNAVAILABLE: io exception` — same class as the install-race class in PRs #339/341/342 | +| Recipe syntax broken in v2 | `ALLOWED_STEP_KEYS` audited against v2.0–v2.3 CHANGELOG; no removals affect our palette | +| Doctor should *fail* on v1.x | Cloud AMI still bundles v1.36; failing locally blocks cloud-backend operators for no benefit | + +## Why this matters + +The brief was a well-structured spec for a refactor that the code didn't +need. Phase 1 of systematic debugging caught it: read the actual error +trace before agreeing with the diagnosis. The trace told us +"probe1: shell timeout" (host-side, not protocol-side) and +"package-list-before:app=true,test=true,already-installed" (driver +healthy). Those two strings invalidated 90% of the brief in ~20 +minutes of evidence-gathering and saved a multi-day refactor that +wouldn't have fixed anything. diff --git a/mcp/mobile/backends/maestro.ts b/mcp/mobile/backends/maestro.ts index 80251b0..fc6aa1b 100644 --- a/mcp/mobile/backends/maestro.ts +++ b/mcp/mobile/backends/maestro.ts @@ -601,9 +601,10 @@ export class MaestroBackend { /** * Locate the two driver APKs on the host. They ship bundled inside * `~/.maestro/lib/maestro-client.jar` (verified on Maestro CLI 1.39.x - * — file naming: `maestro-app.apk` and `maestro-server.apk` at the - * jar root). Extract once to a per-version tempdir; reuse on - * subsequent calls. + * and 2.3.0 — file naming preserved across the v1 → v2 break: + * `maestro-app.apk` and `maestro-server.apk` at the jar root, same + * package IDs `dev.mobile.maestro` + `dev.mobile.maestro.test`). + * Extract once to a per-version tempdir; reuse on subsequent calls. */ private async resolveDriverApks(): Promise<{ app: string; test: string }> { const home = process.env.HOME || os.homedir(); diff --git a/mcp/mobile/client.ts b/mcp/mobile/client.ts index 5c30b68..411f26b 100644 --- a/mcp/mobile/client.ts +++ b/mcp/mobile/client.ts @@ -716,8 +716,16 @@ export class MobileClient { return; } const attempts: string[] = []; - // Stage 1: cheap probe. - let probe = await this.maestro.probeDriver(adbPort, 8_000); + // Stage 1: cheap probe. 20s budget covers the Maestro v2.x CLI's + // JVM cold-start (~10-12s steady-state on a healthy AVD), measured + // on v2.3.0 / Java 17 — the v1.39 budget of 8s ran shorter than v2's + // first-invocation init and caused false-positive "unhealthy" verdicts + // that triggered Stage 2 repair on a perfectly working driver + // (malaria-itn-app/20260517-1829 trace: probe1 always shell-timed out + // at 8s, full uninstall+reinstall ran, then probe2 hit the post-tear-down + // gRPC bind race and surfaced UNAVAILABLE). See + // docs/learnings/2026-05-19-maestro-v2-probe-timeout.md. + let probe = await this.maestro.probeDriver(adbPort, 20_000); if (probe.healthy) return; attempts.push(`probe1: ${probe.reason ?? 'unknown'}`); logInfo(`maestro_driver: stage 1 probe unhealthy on ${serial} — attempting install + repair`); diff --git a/package-lock.json b/package-lock.json index a2caa1f..1a92259 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "ace", - "version": "0.13.279", + "version": "0.13.283", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "ace", - "version": "0.13.279", + "version": "0.13.283", "license": "ISC", "dependencies": { "@anthropic-ai/sdk": "^0.94.0", diff --git a/package.json b/package.json index 1e3ac86..f91a284 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ace", - "version": "0.13.282", + "version": "0.13.283", "description": "AI Connect Engine - orchestrator for building Connect Opps using AI", "type": "module", "scripts": { diff --git a/test/mcp/mobile/client.test.ts b/test/mcp/mobile/client.test.ts index 29e2652..4e125dd 100644 --- a/test/mcp/mobile/client.test.ts +++ b/test/mcp/mobile/client.test.ts @@ -301,7 +301,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { it('passes through cleanly when the driver is healthy on the first probe', async () => { const { client, probeCalls, maestro } = makeClient([{ healthy: true }]); await expect(client.assertMaestroDriverHealthy('emulator-5554')).resolves.toBeUndefined(); - expect(probeCalls).toEqual([8_000]); // single short-timeout probe + expect(probeCalls).toEqual([20_000]); // single short-timeout probe expect(maestro.repairDriver).not.toHaveBeenCalled(); }); @@ -311,7 +311,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { { healthy: true }, ]); await expect(client.assertMaestroDriverHealthy('emulator-5554')).resolves.toBeUndefined(); - expect(probeCalls).toEqual([8_000, 90_000]); // 2nd probe gets reinstall budget + expect(probeCalls).toEqual([20_000, 90_000]); // 2nd probe gets reinstall budget expect(maestro.repairDriver).toHaveBeenCalledTimes(1); expect(maestro.repairDriver).toHaveBeenCalledWith('emulator-5554'); }); @@ -346,7 +346,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { await expect(client.assertMaestroDriverHealthy('emulator-5554')).resolves.toBeUndefined(); expect(maestro.ensureDriverInstalled).toHaveBeenCalledWith('emulator-5554'); expect(maestro.repairDriver).not.toHaveBeenCalled(); - expect(probeCalls).toEqual([8_000, 90_000]); // post-install probe gets full budget + expect(probeCalls).toEqual([20_000, 90_000]); // post-install probe gets full budget }); // When the driver is already installed (wedged-but-installed — the @@ -361,7 +361,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { await expect(client.assertMaestroDriverHealthy('emulator-5554')).resolves.toBeUndefined(); expect(maestro.ensureDriverInstalled).toHaveBeenCalledWith('emulator-5554'); expect(maestro.repairDriver).toHaveBeenCalledTimes(1); - expect(probeCalls).toEqual([8_000, 90_000]); // probe1 + probe2 only — no probe1.5 + expect(probeCalls).toEqual([20_000, 90_000]); // probe1 + probe2 only — no probe1.5 }); // Fresh install that didn't recover on the post-install probe: must @@ -384,7 +384,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { await expect(client.assertMaestroDriverHealthy('emulator-5554')).resolves.toBeUndefined(); expect(maestro.ensureDriverInstalled).toHaveBeenCalledWith('emulator-5554'); expect(maestro.repairDriver).toHaveBeenCalledTimes(1); - expect(probeCalls).toEqual([8_000, 90_000, 90_000]); // probe1 + probe1.5 + probe2 + expect(probeCalls).toEqual([20_000, 90_000, 90_000]); // probe1 + probe1.5 + probe2 }); // Live-reproduced on malaria-itn-fgd/20260515-1645 attempt 4 @@ -425,7 +425,7 @@ describe('MobileClient.assertMaestroDriverHealthy', () => { expect(maestro.repairDriver).toHaveBeenCalledTimes(1); // Two probes only: stage-1 + stage-2 (no stage-1.5 because // ensureDriverInstalled returned already-installed). - expect(probeCalls).toEqual([8_000, 90_000]); + expect(probeCalls).toEqual([20_000, 90_000]); }); // If ensureDriverInstalled throws (operator hasn't run mobile-bootstrap