From af7610d18c0765ea22ed98eed3a5ca4968024cc1 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 25 May 2026 14:37:56 -0400 Subject: [PATCH 1/3] Fix flaky E2E tests Make permission handler error coverage assert deterministic replayed tool results instead of waiting for final assistant text, and ensure background-agent tests wait for the completion notification before cleanup. Normalize equivalent replay proxy notification wording across SDK suites. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- dotnet/test/E2E/PermissionE2ETests.cs | 40 ++++++++++++++----- .../test/E2E/RpcTasksAndHandlersE2ETests.cs | 17 ++++++++ python/e2e/test_rpc_tasks_and_handlers_e2e.py | 18 +++++++++ test/harness/replayingCapiProxy.test.ts | 32 +++++++++++++++ test/harness/replayingCapiProxy.ts | 4 ++ 5 files changed, 102 insertions(+), 9 deletions(-) diff --git a/dotnet/test/E2E/PermissionE2ETests.cs b/dotnet/test/E2E/PermissionE2ETests.cs index 53b52c31c..3da42ab8f 100644 --- a/dotnet/test/E2E/PermissionE2ETests.cs +++ b/dotnet/test/E2E/PermissionE2ETests.cs @@ -226,24 +226,46 @@ await session2.SendAndWaitAsync(new MessageOptions [Fact] public async Task Should_Handle_Permission_Handler_Errors_Gracefully() { + var permissionRequestReceived = + new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); var session = await CreateSessionAsync(new SessionConfig { OnPermissionRequest = (request, invocation) => { - // Simulate an error in the handler + permissionRequestReceived.TrySetResult(request); throw new InvalidOperationException("Handler error"); } }); - await session.SendAsync(new MessageOptions + try { - Prompt = "Run 'echo test'. If you can't, say 'failed'." - }); - - var message = await TestHelper.GetFinalAssistantMessageAsync(session); - - // Should handle the error and deny permission - Assert.Matches("fail|cannot|unable|permission", message?.Data.Content?.ToLowerInvariant() ?? string.Empty); + var exchanges = await SendAndWaitForExchangesAsync( + session, + new MessageOptions + { + Prompt = "Run 'echo test'. If you can't, say 'failed'." + }, + minimumCount: 2); + + var permissionRequest = await permissionRequestReceived.Task.WaitAsync(TimeSpan.FromSeconds(30)); + Assert.IsType(permissionRequest); + + var toolResultMessage = exchanges + .SelectMany(exchange => exchange.Request.Messages) + .LastOrDefault(message => + message.Role == "tool" && + message.StringContent?.Contains("Permission denied", StringComparison.OrdinalIgnoreCase) == true); + + Assert.NotNull(toolResultMessage); + Assert.Contains( + "could not request permission", + toolResultMessage.StringContent ?? string.Empty, + StringComparison.OrdinalIgnoreCase); + } + finally + { + await session.DisposeAsync(); + } } [Fact] diff --git a/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs b/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs index 61c8d5878..d31237690 100644 --- a/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs +++ b/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs @@ -80,6 +80,22 @@ public async Task Should_Start_Background_Agent_And_Report_Task_Details() }); Assert.Contains("TASK_AGENT_READY", ready?.Data.Content ?? string.Empty, StringComparison.Ordinal); + var taskCompletionNotification = + new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + using var subscription = session.On(evt => + { + switch (evt) + { + case AssistantMessageEvent assistantMessage + when assistantMessage.Data.Content?.Contains("TASK_AGENT_DONE", StringComparison.Ordinal) == true: + taskCompletionNotification.TrySetResult(assistantMessage); + break; + case SessionErrorEvent error: + taskCompletionNotification.TrySetException(new Exception(error.Data.Message ?? "session error")); + break; + } + }); + var started = await session.Rpc.Tasks.StartAgentAsync( agentType: "general-purpose", prompt: "Reply with TASK_AGENT_DONE exactly.", @@ -123,6 +139,7 @@ await TestHelper.WaitForConditionAsync( Assert.NotNull(task); Assert.Contains("TASK_AGENT_DONE", task.LatestResponse ?? task.Result ?? string.Empty); + await taskCompletionNotification.Task.WaitAsync(TimeSpan.FromSeconds(30)); if (task.Status == GitHub.Copilot.Rpc.TaskStatus.Idle) { diff --git a/python/e2e/test_rpc_tasks_and_handlers_e2e.py b/python/e2e/test_rpc_tasks_and_handlers_e2e.py index 23b9f9896..e735f7db7 100644 --- a/python/e2e/test_rpc_tasks_and_handlers_e2e.py +++ b/python/e2e/test_rpc_tasks_and_handlers_e2e.py @@ -29,6 +29,7 @@ UIElicitationResponseAction, UIHandlePendingElicitationRequest, ) +from copilot.generated.session_events import AssistantMessageData, SessionErrorData from copilot.session import PermissionHandler from .testharness import E2ETestContext @@ -211,6 +212,21 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx: session = await ctx.client.create_session( on_permission_request=PermissionHandler.approve_all, ) + task_completion_notification = asyncio.get_running_loop().create_future() + + def on_event(event): + if isinstance(event.data, AssistantMessageData) and "TASK_AGENT_DONE" in ( + event.data.content or "" + ): + if not task_completion_notification.done(): + task_completion_notification.set_result(event) + elif isinstance(event.data, SessionErrorData): + if not task_completion_notification.done(): + task_completion_notification.set_exception( + RuntimeError(event.data.message or "session error") + ) + + unsubscribe = session.on(on_event) try: ready = await session.send_and_wait( "Reply with TASK_AGENT_READY exactly.", @@ -262,6 +278,7 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx: ) assert found_task is not None, f"Task {task_id} disappeared before it completed" assert "TASK_AGENT_DONE" in (found_task.latest_response or found_task.result or "") + await asyncio.wait_for(task_completion_notification, timeout=30.0) if found_task.status == TaskInfoStatus.IDLE: cancel = await session.rpc.tasks.cancel(TasksCancelRequest(id=task_id)) @@ -274,4 +291,5 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx: after_remove = await session.rpc.tasks.list() assert not any(t.id == task_id for t in (after_remove.tasks or [])) finally: + unsubscribe() await session.disconnect() diff --git a/test/harness/replayingCapiProxy.test.ts b/test/harness/replayingCapiProxy.test.ts index 2e47d7f1d..dca713b0b 100644 --- a/test/harness/replayingCapiProxy.test.ts +++ b/test/harness/replayingCapiProxy.test.ts @@ -324,6 +324,38 @@ describe("ReplayingCapiProxy", () => { expect(result.conversations[0].messages[0].content).toBe("What is 2+2?"); }); + test("normalizes task completion notification wording", async () => { + const unreadNotification = [ + "", + 'Agent "sdk-background-agent" (general-purpose) has completed successfully. Use read_agent with agent_id "sdk-background-agent" to retrieve unread results.', + "", + ].join("\n"); + const fullNotification = [ + "", + 'Agent "sdk-background-agent" (general-purpose) has completed successfully. Use read_agent with agent_id "sdk-background-agent" to retrieve the full results.', + "", + ].join("\n"); + + const requestBody = JSON.stringify({ + messages: [ + { + role: "user", + content: unreadNotification, + }, + ], + }); + const responseBody = JSON.stringify({ + choices: [{ message: { role: "assistant", content: "Done" } }], + }); + + const outputPath = await createProxy([ + { url: "/chat/completions", requestBody, responseBody }, + ]); + + const result = await readYamlOutput(outputPath); + expect(result.conversations[0].messages[0].content).toBe(fullNotification); + }); + test("strips agent_instructions from user messages", async () => { const requestBody = JSON.stringify({ messages: [ diff --git a/test/harness/replayingCapiProxy.ts b/test/harness/replayingCapiProxy.ts index 3e174aa2f..74037eeca 100644 --- a/test/harness/replayingCapiProxy.ts +++ b/test/harness/replayingCapiProxy.ts @@ -1034,6 +1034,10 @@ function transformOpenAIRequestMessage( function normalizeUserMessage(content: string): string { return normalizeSkillContextFrontmatter(content) + .replace( + /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g, + 'Use read_agent with agent_id "$1" to retrieve the full results.', + ) .replace(/.*?<\/current_datetime>/g, "") .replace(/[\s\S]*?<\/reminder>/g, "") .replace(/[\s\S]*?<\/system_reminder>/g, "") From 5dcca3d8d4f74858014d88315e53813ab6d778e5 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 25 May 2026 14:54:55 -0400 Subject: [PATCH 2/3] Normalize cached task notification wording Normalize task-completion notification wording in stored replay snapshots as well as incoming requests so older snapshots using the unread-results wording continue to match. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/harness/replayingCapiProxy.test.ts | 54 +++++++++++++++++++++++++ test/harness/replayingCapiProxy.ts | 24 +++++++++-- 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/test/harness/replayingCapiProxy.test.ts b/test/harness/replayingCapiProxy.test.ts index dca713b0b..921fa1fa3 100644 --- a/test/harness/replayingCapiProxy.test.ts +++ b/test/harness/replayingCapiProxy.test.ts @@ -843,6 +843,60 @@ Always include PINEAPPLE_COCONUT_42. } }); + test("matches cached task completion notification wording variants", async () => { + const cachePath = path.join(tempDir, "cache.yaml"); + const unreadNotification = [ + "", + 'Agent "read-file" (explore) has completed successfully. Use read_agent with agent_id "read-file" to retrieve unread results.', + "", + ].join("\n"); + + const cacheContent = yaml.stringify({ + models: ["test-model"], + conversations: [ + { + messages: [ + { role: "system", content: "${system}" }, + { role: "user", content: "Hello" }, + { role: "assistant", content: "Hi!" }, + { role: "user", content: unreadNotification }, + { role: "assistant", content: "Read agent completed." }, + ], + }, + ], + } satisfies NormalizedData); + await writeFile(cachePath, cacheContent); + + const proxy = new ReplayingCapiProxy( + "http://localhost:9999", + cachePath, + workDir, + ); + const proxyUrl = await proxy.start(); + + try { + const response = await makeRequest(proxyUrl, "/chat/completions", { + body: { + model: "test-model", + messages: [ + { role: "system", content: "Be helpful" }, + { role: "user", content: "Hello" }, + { role: "assistant", content: "Hi!" }, + { role: "user", content: unreadNotification }, + ], + }, + }); + + expect(response.status).toBe(200); + expect( + (JSON.parse(response.body) as ChatCompletion).choices[0].message + .content, + ).toBe("Read agent completed."); + } finally { + await proxy.stop(); + } + }); + test("matches parallel tool results regardless of arrival order", async () => { const cachePath = path.join(tempDir, "cache.yaml"); const cacheContent = yaml.stringify({ diff --git a/test/harness/replayingCapiProxy.ts b/test/harness/replayingCapiProxy.ts index 74037eeca..12a8b04b5 100644 --- a/test/harness/replayingCapiProxy.ts +++ b/test/harness/replayingCapiProxy.ts @@ -130,6 +130,7 @@ export class ReplayingCapiProxy extends CapturingHttpProxy { const content = await readFile(this.state.filePath, "utf-8"); this.state.storedData = yaml.parse(content) as NormalizedData; normalizeToolResultOrder(this.state.storedData.conversations); + normalizeStoredUserMessages(this.state.storedData.conversations); } } @@ -1034,10 +1035,7 @@ function transformOpenAIRequestMessage( function normalizeUserMessage(content: string): string { return normalizeSkillContextFrontmatter(content) - .replace( - /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g, - 'Use read_agent with agent_id "$1" to retrieve the full results.', - ) + .replace(taskCompletionNotificationPattern, taskCompletionNotificationReplacement) .replace(/.*?<\/current_datetime>/g, "") .replace(/[\s\S]*?<\/reminder>/g, "") .replace(/[\s\S]*?<\/system_reminder>/g, "") @@ -1049,6 +1047,24 @@ function normalizeUserMessage(content: string): string { .trim(); } +const taskCompletionNotificationPattern = + /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g; +const taskCompletionNotificationReplacement = + 'Use read_agent with agent_id "$1" to retrieve the full results.'; + +function normalizeStoredUserMessages(conversations: NormalizedConversation[]) { + for (const conversation of conversations) { + for (const message of conversation.messages) { + if (message.role === "user" && typeof message.content === "string") { + message.content = message.content.replace( + taskCompletionNotificationPattern, + taskCompletionNotificationReplacement, + ); + } + } + } +} + function normalizeSkillContextFrontmatter(content: string): string { // Runtime versions may include or omit SKILL.md metadata in the prompt context. return content.replace( From 4ecdc8443d237a957d85f87973cf63715924233e Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 25 May 2026 15:07:20 -0400 Subject: [PATCH 3/3] Harden Python permission handler E2E Assert Python permission handler errors via the replayed denied tool result instead of final assistant prose, matching the deterministic .NET coverage. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/e2e/test_permissions_e2e.py | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/python/e2e/test_permissions_e2e.py b/python/e2e/test_permissions_e2e.py index dbea6d384..84aeec3a2 100644 --- a/python/e2e/test_permissions_e2e.py +++ b/python/e2e/test_permissions_e2e.py @@ -236,22 +236,38 @@ def on_permission_request( async def test_should_handle_permission_handler_errors_gracefully(self, ctx: E2ETestContext): """Test that permission handler errors are handled gracefully""" + permission_request_received = asyncio.get_running_loop().create_future() def on_permission_request( request: PermissionRequest, invocation: dict ) -> PermissionRequestResult: + if not permission_request_received.done(): + permission_request_received.set_result(request) raise RuntimeError("Handler error") session = await ctx.client.create_session(on_permission_request=on_permission_request) + try: + await session.send("Run 'echo test'. If you can't, say 'failed'.") - message = await session.send_and_wait("Run 'echo test'. If you can't, say 'failed'.") - - # Should handle the error and deny permission - assert message is not None - content_lower = message.data.content.lower() - assert any(word in content_lower for word in ["fail", "cannot", "unable", "permission"]) - - await session.disconnect() + permission_request = await asyncio.wait_for( + permission_request_received, + timeout=30, + ) + assert permission_request.kind == "shell" + + exchanges = await ctx.wait_for_exchanges(2, timeout=60) + tool_messages = [ + message + for exchange in exchanges + for message in exchange["request"]["messages"] + if message.get("role") == "tool" + and "Permission denied" in str(message.get("content", "")) + ] + + assert tool_messages + assert "could not request permission" in tool_messages[-1]["content"] + finally: + await session.disconnect() async def test_should_receive_toolcallid_in_permission_requests(self, ctx: E2ETestContext): """Test that toolCallId is included in permission requests"""