From af7610d18c0765ea22ed98eed3a5ca4968024cc1 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Mon, 25 May 2026 14:37:56 -0400
Subject: [PATCH 1/3] Fix flaky E2E tests

Make permission handler error coverage assert deterministic replayed tool results instead of waiting for final assistant text, and ensure background-agent tests wait for the completion notification before cleanup. Normalize equivalent replay proxy notification wording across SDK suites.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 dotnet/test/E2E/PermissionE2ETests.cs         | 40 ++++++++++++++-----
 .../test/E2E/RpcTasksAndHandlersE2ETests.cs   | 17 ++++++++
 python/e2e/test_rpc_tasks_and_handlers_e2e.py | 18 +++++++++
 test/harness/replayingCapiProxy.test.ts       | 32 +++++++++++++++
 test/harness/replayingCapiProxy.ts            |  4 ++
 5 files changed, 102 insertions(+), 9 deletions(-)
diff --git a/dotnet/test/E2E/PermissionE2ETests.cs b/dotnet/test/E2E/PermissionE2ETests.cs
index 53b52c31c..3da42ab8f 100644
--- a/dotnet/test/E2E/PermissionE2ETests.cs
+++ b/dotnet/test/E2E/PermissionE2ETests.cs
@@ -226,24 +226,46 @@ await session2.SendAndWaitAsync(new MessageOptions
     [Fact]
     public async Task Should_Handle_Permission_Handler_Errors_Gracefully()
     {
+        var permissionRequestReceived =
+            new TaskCompletionSource<PermissionRequest>(TaskCreationOptions.RunContinuationsAsynchronously);
         var session = await CreateSessionAsync(new SessionConfig
         {
             OnPermissionRequest = (request, invocation) =>
             {
-                // Simulate an error in the handler
+                permissionRequestReceived.TrySetResult(request);
                 throw new InvalidOperationException("Handler error");
             }
         });
 
-        await session.SendAsync(new MessageOptions
+        try
         {
-            Prompt = "Run 'echo test'. If you can't, say 'failed'."
-        });
-
-        var message = await TestHelper.GetFinalAssistantMessageAsync(session);
-
-        // Should handle the error and deny permission
-        Assert.Matches("fail|cannot|unable|permission", message?.Data.Content?.ToLowerInvariant() ?? string.Empty);
+            var exchanges = await SendAndWaitForExchangesAsync(
+                session,
+                new MessageOptions
+                {
+                    Prompt = "Run 'echo test'. If you can't, say 'failed'."
+                },
+                minimumCount: 2);
+
+            var permissionRequest = await permissionRequestReceived.Task.WaitAsync(TimeSpan.FromSeconds(30));
+            Assert.IsType<PermissionRequestShell>(permissionRequest);
+
+            var toolResultMessage = exchanges
+                .SelectMany(exchange => exchange.Request.Messages)
+                .LastOrDefault(message =>
+                    message.Role == "tool" &&
+                    message.StringContent?.Contains("Permission denied", StringComparison.OrdinalIgnoreCase) == true);
+
+            Assert.NotNull(toolResultMessage);
+            Assert.Contains(
+                "could not request permission",
+                toolResultMessage.StringContent ?? string.Empty,
+                StringComparison.OrdinalIgnoreCase);
+        }
+        finally
+        {
+            await session.DisposeAsync();
+        }
     }
 
     [Fact]
diff --git a/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs b/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs
index 61c8d5878..d31237690 100644
--- a/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs
+++ b/dotnet/test/E2E/RpcTasksAndHandlersE2ETests.cs
@@ -80,6 +80,22 @@ public async Task Should_Start_Background_Agent_And_Report_Task_Details()
         });
         Assert.Contains("TASK_AGENT_READY", ready?.Data.Content ?? string.Empty, StringComparison.Ordinal);
 
+        var taskCompletionNotification =
+            new TaskCompletionSource<AssistantMessageEvent>(TaskCreationOptions.RunContinuationsAsynchronously);
+        using var subscription = session.On<SessionEvent>(evt =>
+        {
+            switch (evt)
+            {
+                case AssistantMessageEvent assistantMessage
+                    when assistantMessage.Data.Content?.Contains("TASK_AGENT_DONE", StringComparison.Ordinal) == true:
+                    taskCompletionNotification.TrySetResult(assistantMessage);
+                    break;
+                case SessionErrorEvent error:
+                    taskCompletionNotification.TrySetException(new Exception(error.Data.Message ?? "session error"));
+                    break;
+            }
+        });
+
         var started = await session.Rpc.Tasks.StartAgentAsync(
             agentType: "general-purpose",
             prompt: "Reply with TASK_AGENT_DONE exactly.",
@@ -123,6 +139,7 @@ await TestHelper.WaitForConditionAsync(
 
         Assert.NotNull(task);
         Assert.Contains("TASK_AGENT_DONE", task.LatestResponse ?? task.Result ?? string.Empty);
+        await taskCompletionNotification.Task.WaitAsync(TimeSpan.FromSeconds(30));
 
         if (task.Status == GitHub.Copilot.Rpc.TaskStatus.Idle)
         {
diff --git a/python/e2e/test_rpc_tasks_and_handlers_e2e.py b/python/e2e/test_rpc_tasks_and_handlers_e2e.py
index 23b9f9896..e735f7db7 100644
--- a/python/e2e/test_rpc_tasks_and_handlers_e2e.py
+++ b/python/e2e/test_rpc_tasks_and_handlers_e2e.py
@@ -29,6 +29,7 @@
     UIElicitationResponseAction,
     UIHandlePendingElicitationRequest,
 )
+from copilot.generated.session_events import AssistantMessageData, SessionErrorData
 from copilot.session import PermissionHandler
 
 from .testharness import E2ETestContext
@@ -211,6 +212,21 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx:
         session = await ctx.client.create_session(
             on_permission_request=PermissionHandler.approve_all,
         )
+        task_completion_notification = asyncio.get_running_loop().create_future()
+
+        def on_event(event):
+            if isinstance(event.data, AssistantMessageData) and "TASK_AGENT_DONE" in (
+                event.data.content or ""
+            ):
+                if not task_completion_notification.done():
+                    task_completion_notification.set_result(event)
+            elif isinstance(event.data, SessionErrorData):
+                if not task_completion_notification.done():
+                    task_completion_notification.set_exception(
+                        RuntimeError(event.data.message or "session error")
+                    )
+
+        unsubscribe = session.on(on_event)
         try:
             ready = await session.send_and_wait(
                 "Reply with TASK_AGENT_READY exactly.",
@@ -262,6 +278,7 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx:
             )
             assert found_task is not None, f"Task {task_id} disappeared before it completed"
             assert "TASK_AGENT_DONE" in (found_task.latest_response or found_task.result or "")
+            await asyncio.wait_for(task_completion_notification, timeout=30.0)
 
             if found_task.status == TaskInfoStatus.IDLE:
                 cancel = await session.rpc.tasks.cancel(TasksCancelRequest(id=task_id))
@@ -274,4 +291,5 @@ async def test_should_start_background_agent_and_report_task_details(self, ctx:
             after_remove = await session.rpc.tasks.list()
             assert not any(t.id == task_id for t in (after_remove.tasks or []))
         finally:
+            unsubscribe()
             await session.disconnect()
diff --git a/test/harness/replayingCapiProxy.test.ts b/test/harness/replayingCapiProxy.test.ts
index 2e47d7f1d..dca713b0b 100644
--- a/test/harness/replayingCapiProxy.test.ts
+++ b/test/harness/replayingCapiProxy.test.ts
@@ -324,6 +324,38 @@ describe("ReplayingCapiProxy", () => {
     expect(result.conversations[0].messages[0].content).toBe("What is 2+2?");
   });
 
+  test("normalizes task completion notification wording", async () => {
+    const unreadNotification = [
+      "<system_notification>",
+      'Agent "sdk-background-agent" (general-purpose) has completed successfully. Use read_agent with agent_id "sdk-background-agent" to retrieve unread results.',
+      "</system_notification>",
+    ].join("\n");
+    const fullNotification = [
+      "<system_notification>",
+      'Agent "sdk-background-agent" (general-purpose) has completed successfully. Use read_agent with agent_id "sdk-background-agent" to retrieve the full results.',
+      "</system_notification>",
+    ].join("\n");
+
+    const requestBody = JSON.stringify({
+      messages: [
+        {
+          role: "user",
+          content: unreadNotification,
+        },
+      ],
+    });
+    const responseBody = JSON.stringify({
+      choices: [{ message: { role: "assistant", content: "Done" } }],
+    });
+
+    const outputPath = await createProxy([
+      { url: "/chat/completions", requestBody, responseBody },
+    ]);
+
+    const result = await readYamlOutput(outputPath);
+    expect(result.conversations[0].messages[0].content).toBe(fullNotification);
+  });
+
   test("strips agent_instructions from user messages", async () => {
     const requestBody = JSON.stringify({
       messages: [
diff --git a/test/harness/replayingCapiProxy.ts b/test/harness/replayingCapiProxy.ts
index 3e174aa2f..74037eeca 100644
--- a/test/harness/replayingCapiProxy.ts
+++ b/test/harness/replayingCapiProxy.ts
@@ -1034,6 +1034,10 @@ function transformOpenAIRequestMessage(
 
 function normalizeUserMessage(content: string): string {
   return normalizeSkillContextFrontmatter(content)
+    .replace(
+      /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g,
+      'Use read_agent with agent_id "$1" to retrieve the full results.',
+    )
     .replace(/<current_datetime>.*?<\/current_datetime>/g, "")
     .replace(/<reminder>[\s\S]*?<\/reminder>/g, "")
     .replace(/<system_reminder>[\s\S]*?<\/system_reminder>/g, "")

From 5dcca3d8d4f74858014d88315e53813ab6d778e5 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Mon, 25 May 2026 14:54:55 -0400
Subject: [PATCH 2/3] Normalize cached task notification wording

Normalize task-completion notification wording in stored replay snapshots as well as incoming requests so older snapshots using the unread-results wording continue to match.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 test/harness/replayingCapiProxy.test.ts | 54 +++++++++++++++++++++++++
 test/harness/replayingCapiProxy.ts      | 24 +++++++++--
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/test/harness/replayingCapiProxy.test.ts b/test/harness/replayingCapiProxy.test.ts
index dca713b0b..921fa1fa3 100644
--- a/test/harness/replayingCapiProxy.test.ts
+++ b/test/harness/replayingCapiProxy.test.ts
@@ -843,6 +843,60 @@ Always include PINEAPPLE_COCONUT_42.
       }
     });
 
+    test("matches cached task completion notification wording variants", async () => {
+      const cachePath = path.join(tempDir, "cache.yaml");
+      const unreadNotification = [
+        "<system_notification>",
+        'Agent "read-file" (explore) has completed successfully. Use read_agent with agent_id "read-file" to retrieve unread results.',
+        "</system_notification>",
+      ].join("\n");
+
+      const cacheContent = yaml.stringify({
+        models: ["test-model"],
+        conversations: [
+          {
+            messages: [
+              { role: "system", content: "${system}" },
+              { role: "user", content: "Hello" },
+              { role: "assistant", content: "Hi!" },
+              { role: "user", content: unreadNotification },
+              { role: "assistant", content: "Read agent completed." },
+            ],
+          },
+        ],
+      } satisfies NormalizedData);
+      await writeFile(cachePath, cacheContent);
+
+      const proxy = new ReplayingCapiProxy(
+        "http://localhost:9999",
+        cachePath,
+        workDir,
+      );
+      const proxyUrl = await proxy.start();
+
+      try {
+        const response = await makeRequest(proxyUrl, "/chat/completions", {
+          body: {
+            model: "test-model",
+            messages: [
+              { role: "system", content: "Be helpful" },
+              { role: "user", content: "Hello" },
+              { role: "assistant", content: "Hi!" },
+              { role: "user", content: unreadNotification },
+            ],
+          },
+        });
+
+        expect(response.status).toBe(200);
+        expect(
+          (JSON.parse(response.body) as ChatCompletion).choices[0].message
+            .content,
+        ).toBe("Read agent completed.");
+      } finally {
+        await proxy.stop();
+      }
+    });
+
     test("matches parallel tool results regardless of arrival order", async () => {
       const cachePath = path.join(tempDir, "cache.yaml");
       const cacheContent = yaml.stringify({
diff --git a/test/harness/replayingCapiProxy.ts b/test/harness/replayingCapiProxy.ts
index 74037eeca..12a8b04b5 100644
--- a/test/harness/replayingCapiProxy.ts
+++ b/test/harness/replayingCapiProxy.ts
@@ -130,6 +130,7 @@ export class ReplayingCapiProxy extends CapturingHttpProxy {
       const content = await readFile(this.state.filePath, "utf-8");
       this.state.storedData = yaml.parse(content) as NormalizedData;
       normalizeToolResultOrder(this.state.storedData.conversations);
+      normalizeStoredUserMessages(this.state.storedData.conversations);
     }
   }
 
@@ -1034,10 +1035,7 @@ function transformOpenAIRequestMessage(
 
 function normalizeUserMessage(content: string): string {
   return normalizeSkillContextFrontmatter(content)
-    .replace(
-      /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g,
-      'Use read_agent with agent_id "$1" to retrieve the full results.',
-    )
+    .replace(taskCompletionNotificationPattern, taskCompletionNotificationReplacement)
     .replace(/<current_datetime>.*?<\/current_datetime>/g, "")
     .replace(/<reminder>[\s\S]*?<\/reminder>/g, "")
     .replace(/<system_reminder>[\s\S]*?<\/system_reminder>/g, "")
@@ -1049,6 +1047,24 @@ function normalizeUserMessage(content: string): string {
     .trim();
 }
 
+const taskCompletionNotificationPattern =
+  /Use read_agent with agent_id "([^"]+)" to retrieve unread results\./g;
+const taskCompletionNotificationReplacement =
+  'Use read_agent with agent_id "$1" to retrieve the full results.';
+
+function normalizeStoredUserMessages(conversations: NormalizedConversation[]) {
+  for (const conversation of conversations) {
+    for (const message of conversation.messages) {
+      if (message.role === "user" && typeof message.content === "string") {
+        message.content = message.content.replace(
+          taskCompletionNotificationPattern,
+          taskCompletionNotificationReplacement,
+        );
+      }
+    }
+  }
+}
+
 function normalizeSkillContextFrontmatter(content: string): string {
   // Runtime versions may include or omit SKILL.md metadata in the prompt context.
   return content.replace(

From 4ecdc8443d237a957d85f87973cf63715924233e Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Mon, 25 May 2026 15:07:20 -0400
Subject: [PATCH 3/3] Harden Python permission handler E2E

Assert Python permission handler errors via the replayed denied tool result instead of final assistant prose, matching the deterministic .NET coverage.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 python/e2e/test_permissions_e2e.py | 32 ++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/python/e2e/test_permissions_e2e.py b/python/e2e/test_permissions_e2e.py
index dbea6d384..84aeec3a2 100644
--- a/python/e2e/test_permissions_e2e.py
+++ b/python/e2e/test_permissions_e2e.py
@@ -236,22 +236,38 @@ def on_permission_request(
 
     async def test_should_handle_permission_handler_errors_gracefully(self, ctx: E2ETestContext):
         """Test that permission handler errors are handled gracefully"""
+        permission_request_received = asyncio.get_running_loop().create_future()
 
         def on_permission_request(
             request: PermissionRequest, invocation: dict
         ) -> PermissionRequestResult:
+            if not permission_request_received.done():
+                permission_request_received.set_result(request)
             raise RuntimeError("Handler error")
 
         session = await ctx.client.create_session(on_permission_request=on_permission_request)
+        try:
+            await session.send("Run 'echo test'. If you can't, say 'failed'.")
 
-        message = await session.send_and_wait("Run 'echo test'. If you can't, say 'failed'.")
-
-        # Should handle the error and deny permission
-        assert message is not None
-        content_lower = message.data.content.lower()
-        assert any(word in content_lower for word in ["fail", "cannot", "unable", "permission"])
-
-        await session.disconnect()
+            permission_request = await asyncio.wait_for(
+                permission_request_received,
+                timeout=30,
+            )
+            assert permission_request.kind == "shell"
+
+            exchanges = await ctx.wait_for_exchanges(2, timeout=60)
+            tool_messages = [
+                message
+                for exchange in exchanges
+                for message in exchange["request"]["messages"]
+                if message.get("role") == "tool"
+                and "Permission denied" in str(message.get("content", ""))
+            ]
+
+            assert tool_messages
+            assert "could not request permission" in tool_messages[-1]["content"]
+        finally:
+            await session.disconnect()
 
     async def test_should_receive_toolcallid_in_permission_requests(self, ctx: E2ETestContext):
         """Test that toolCallId is included in permission requests"""