diff --git a/Lis.Tests/Harness/AssertionTests.cs b/Lis.Tests/Harness/AssertionTests.cs new file mode 100644 index 0000000..742df13 --- /dev/null +++ b/Lis.Tests/Harness/AssertionTests.cs @@ -0,0 +1,298 @@ +namespace Lis.Tests.Harness; + +public class AssertionTests +{ + private static HarnessResult MakeResult( + string response = "Hello world", + List? toolCalls = null, + int outputTokens = 10) => new() + { + Response = response, + ToolCalls = toolCalls ?? [], + OutputTokens = outputTokens, + Duration = TimeSpan.FromMilliseconds(100), + History = [] + }; + + // ── ShouldCallTool ────────────────────────────────────────── + + [Fact] + public void ShouldCallTool_ToolPresent_Passes() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("mem", "create_memory", new() { ["content"] = "birthday" }) + ]); + + result.ShouldCallTool("mem", "create_memory"); + } + + [Fact] + public void ShouldCallTool_ToolAbsent_Throws() + { + HarnessResult result = MakeResult(); + + Assert.ThrowsAny(() => result.ShouldCallTool("mem", "create_memory")); + } + + [Fact] + public void ShouldCallTool_WrongPlugin_Throws() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("dt", "get_current_datetime", []) + ]); + + Assert.ThrowsAny(() => result.ShouldCallTool("mem", "create_memory")); + } + + // ── ShouldCallToolWithArg ─────────────────────────────────── + + [Fact] + public void ShouldCallToolWithArg_ArgContainsValue_Passes() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("mem", "create_memory", new() { ["content"] = "my birthday is Jan 1" }) + ]); + + result.ShouldCallToolWithArg("mem", "create_memory", "content", "birthday"); + } + + [Fact] + public void ShouldCallToolWithArg_ArgMissing_Throws() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("mem", "create_memory", new() { ["content"] = "hello" }) + ]); + + Assert.ThrowsAny(() => + result.ShouldCallToolWithArg("mem", "create_memory", "content", 
"birthday")); + } + + [Fact] + public void ShouldCallToolWithArg_CaseInsensitive() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("mem", "create_memory", new() { ["content"] = "My BIRTHDAY" }) + ]); + + result.ShouldCallToolWithArg("mem", "create_memory", "content", "birthday"); + } + + // ── ShouldNotCallTool ─────────────────────────────────────── + + [Fact] + public void ShouldNotCallTool_ToolAbsent_Passes() + { + HarnessResult result = MakeResult(); + + result.ShouldNotCallTool("exec", "run_command"); + } + + [Fact] + public void ShouldNotCallTool_ToolPresent_Throws() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("exec", "run_command", new() { ["command"] = "rm -rf /" }) + ]); + + Assert.ThrowsAny(() => result.ShouldNotCallTool("exec", "run_command")); + } + + // ── ShouldNotCallAnyTools ─────────────────────────────────── + + [Fact] + public void ShouldNotCallAnyTools_NoTools_Passes() + { + HarnessResult result = MakeResult(); + + result.ShouldNotCallAnyTools(); + } + + [Fact] + public void ShouldNotCallAnyTools_HasTools_Throws() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("dt", "get_current_datetime", []) + ]); + + Assert.ThrowsAny(() => result.ShouldNotCallAnyTools()); + } + + // ── ShouldRespondWithin ───────────────────────────────────── + + [Fact] + public void ShouldRespondWithin_UnderBudget_Passes() + { + HarnessResult result = MakeResult(outputTokens: 100); + + result.ShouldRespondWithin(maxTokens: 500); + } + + [Fact] + public void ShouldRespondWithin_OverBudget_Throws() + { + HarnessResult result = MakeResult(outputTokens: 600); + + Assert.ThrowsAny(() => result.ShouldRespondWithin(maxTokens: 500)); + } + + [Fact] + public void ShouldRespondWithin_ExactBudget_Passes() + { + HarnessResult result = MakeResult(outputTokens: 500); + + result.ShouldRespondWithin(maxTokens: 500); + } + + // ── ShouldContain ─────────────────────────────────────────── + + [Fact] + public 
void ShouldContain_KeywordPresent_Passes() + { + HarnessResult result = MakeResult(response: "Hello world, how are you?"); + + result.ShouldContain("world"); + } + + [Fact] + public void ShouldContain_CaseInsensitive() + { + HarnessResult result = MakeResult(response: "Hello World"); + + result.ShouldContain("hello"); + } + + [Fact] + public void ShouldContain_KeywordAbsent_Throws() + { + HarnessResult result = MakeResult(response: "Goodbye"); + + Assert.ThrowsAny(() => result.ShouldContain("hello")); + } + + // ── ShouldMatch ───────────────────────────────────────────── + + [Fact] + public void ShouldMatch_PatternMatches_Passes() + { + HarnessResult result = MakeResult(response: "Memory #42 saved."); + + result.ShouldMatch(@"Memory #\d+ saved\."); + } + + [Fact] + public void ShouldMatch_PatternDoesNotMatch_Throws() + { + HarnessResult result = MakeResult(response: "No match here"); + + Assert.ThrowsAny(() => result.ShouldMatch(@"^Memory #\d+$")); + } + + // ── ShouldNotContain ──────────────────────────────────────── + + [Fact] + public void ShouldNotContain_KeywordAbsent_Passes() + { + HarnessResult result = MakeResult(response: "All good"); + + result.ShouldNotContain("error"); + } + + [Fact] + public void ShouldNotContain_KeywordPresent_Throws() + { + HarnessResult result = MakeResult(response: "An error occurred"); + + Assert.ThrowsAny(() => result.ShouldNotContain("error")); + } + + // ── ResponseShouldNotBeEmpty ──────────────────────────────── + + [Fact] + public void ResponseShouldNotBeEmpty_HasContent_Passes() + { + HarnessResult result = MakeResult(response: "Hello"); + + result.ResponseShouldNotBeEmpty(); + } + + [Fact] + public void ResponseShouldNotBeEmpty_Empty_Throws() + { + HarnessResult result = MakeResult(response: ""); + + Assert.ThrowsAny(() => result.ResponseShouldNotBeEmpty()); + } + + [Fact] + public void ResponseShouldNotBeEmpty_Whitespace_Throws() + { + HarnessResult result = MakeResult(response: " "); + + Assert.ThrowsAny(() => 
result.ResponseShouldNotBeEmpty()); + } + + // ── ShouldHaveToolCallCount ───────────────────────────────── + + [Fact] + public void ShouldHaveToolCallCount_Correct_Passes() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("mem", "create_memory", []), + new HarnessToolCall("dt", "get_current_datetime", []) + ]); + + result.ShouldHaveToolCallCount(2); + } + + [Fact] + public void ShouldHaveToolCallCount_Wrong_Throws() + { + HarnessResult result = MakeResult(); + + Assert.ThrowsAny(() => result.ShouldHaveToolCallCount(1)); + } + + // ── Chaining ──────────────────────────────────────────────── + + [Fact] + public void Assertions_CanBeChained() + { + HarnessResult result = MakeResult( + response: "Memory #1 saved successfully.", + toolCalls: [new HarnessToolCall("mem", "create_memory", new() { ["content"] = "birthday" })], + outputTokens: 20); + + result + .ShouldCallTool("mem", "create_memory") + .ShouldCallToolWithArg("mem", "create_memory", "content", "birthday") + .ShouldNotCallTool("exec", "run_command") + .ShouldContain("saved") + .ShouldNotContain("error") + .ShouldMatch(@"Memory #\d+") + .ShouldRespondWithin(maxTokens: 100) + .ResponseShouldNotBeEmpty() + .ShouldHaveToolCallCount(1); + } + + // ── Edge cases ────────────────────────────────────────────── + + [Fact] + public void NullResponseInToolCalls_EmptyArgs() + { + HarnessResult result = MakeResult(toolCalls: [ + new HarnessToolCall("test", "func", []) + ]); + + result.ShouldCallTool("test", "func"); + Assert.ThrowsAny(() => result.ShouldCallToolWithArg("test", "func", "key", "value")); + } + + [Fact] + public void EmptyToolCalls_ShouldNotCallAnyPasses() + { + HarnessResult result = MakeResult(toolCalls: []); + + result.ShouldNotCallAnyTools(); + result.ShouldHaveToolCallCount(0); + } +} diff --git a/Lis.Tests/Harness/HarnessAssertions.cs b/Lis.Tests/Harness/HarnessAssertions.cs new file mode 100644 index 0000000..d174f9d --- /dev/null +++ b/Lis.Tests/Harness/HarnessAssertions.cs 
@@ -0,0 +1,114 @@
+namespace Lis.Tests.Harness;
+
+/// <summary>
+/// Fluent assertion extensions for <see cref="HarnessResult"/>.
+/// Each method returns the result for chaining and throws on failure.
+/// </summary>
+public static class HarnessAssertions
+{
+    /// <summary>Assert that the specified tool was called at least once.</summary>
+    public static HarnessResult ShouldCallTool(this HarnessResult result, string pluginName, string functionName)
+    {
+        bool found = result.ToolCalls.Any(tc =>
+            tc.PluginName == pluginName && tc.FunctionName == functionName);
+
+        Assert.True(found,
+            $"Expected tool '{pluginName}.{functionName}' to be called, but it was not. " +
+            $"Actual tool calls: [{FormatToolCalls(result.ToolCalls)}]");
+
+        return result;
+    }
+
+    /// <summary>Assert that the specified tool was called with a specific argument containing a value.</summary>
+    /// <remarks>
+    /// The argument value is matched as a case-insensitive (ordinal) substring. On failure the
+    /// message names the expected call and lists the tool calls that were actually captured.
+    /// </remarks>
+    public static HarnessResult ShouldCallToolWithArg(
+        this HarnessResult result, string pluginName, string functionName,
+        string argName, string containing)
+    {
+        bool found = result.ToolCalls.Any(tc =>
+            tc.PluginName == pluginName && tc.FunctionName == functionName
+            && tc.Arguments.TryGetValue(argName, out string? value)
+            && value.Contains(containing, StringComparison.OrdinalIgnoreCase));
+
+        // Spell out the expectation and the captured calls so a failure is diagnosable from the log alone.
+        Assert.True(found,
+            $"Expected tool '{pluginName}.{functionName}' to be called with argument '{argName}' " +
+            $"containing '{containing}', but no such call was found. " +
+            $"Actual tool calls: [{FormatToolCalls(result.ToolCalls)}]");
+
+        return result;
+    }
+
+    /// <summary>Assert that the specified tool was NOT called.</summary>
+    public static HarnessResult ShouldNotCallTool(this HarnessResult result, string pluginName, string functionName)
+    {
+        bool found = result.ToolCalls.Any(tc =>
+            tc.PluginName == pluginName && tc.FunctionName == functionName);
+
+        Assert.False(found,
+            $"Expected tool '{pluginName}.{functionName}' to NOT be called, but it was.");
+
+        return result;
+    }
+
+    /// <summary>Assert that no tools were called at all.</summary>
+    public static HarnessResult ShouldNotCallAnyTools(this HarnessResult result)
+    {
+        Assert.Empty(result.ToolCalls);
+        return result;
+    }
+
+    /// <summary>Assert the response is within a token budget.</summary>
+ public static HarnessResult ShouldRespondWithin(this HarnessResult result, int maxTokens) + { + Assert.True(result.OutputTokens <= maxTokens, + $"Expected response within {maxTokens} tokens, but got {result.OutputTokens}."); + + return result; + } + + /// Assert the response contains the given keyword (case-insensitive). + public static HarnessResult ShouldContain(this HarnessResult result, string keyword) + { + Assert.Contains(keyword, result.Response, StringComparison.OrdinalIgnoreCase); + return result; + } + + /// Assert the response matches a regex pattern. + public static HarnessResult ShouldMatch(this HarnessResult result, string pattern) + { + Assert.Matches(pattern, result.Response); + return result; + } + + /// Assert the response does NOT contain the given keyword (case-insensitive). + public static HarnessResult ShouldNotContain(this HarnessResult result, string keyword) + { + Assert.DoesNotContain(keyword, result.Response, StringComparison.OrdinalIgnoreCase); + return result; + } + + /// Assert the response is not empty or whitespace. + public static HarnessResult ResponseShouldNotBeEmpty(this HarnessResult result) + { + Assert.False(string.IsNullOrWhiteSpace(result.Response), + "Expected a non-empty response, but the response was empty or whitespace."); + + return result; + } + + /// Assert the exact number of tool calls made. + public static HarnessResult ShouldHaveToolCallCount(this HarnessResult result, int expectedCount) + { + Assert.Equal(expectedCount, result.ToolCalls.Count); + return result; + } + + private static string FormatToolCalls(List toolCalls) => + toolCalls.Count == 0 + ? 
"(none)" + : string.Join(", ", toolCalls.Select(tc => $"{tc.PluginName}.{tc.FunctionName}")); +} diff --git a/Lis.Tests/Harness/HarnessExampleTests.cs b/Lis.Tests/Harness/HarnessExampleTests.cs new file mode 100644 index 0000000..151f6a9 --- /dev/null +++ b/Lis.Tests/Harness/HarnessExampleTests.cs @@ -0,0 +1,148 @@ +namespace Lis.Tests.Harness; + +/// +/// Example tests demonstrating the LLM test harness API. +/// These show how to write tests for AI-powered features. +/// +public class HarnessExampleTests +{ + // ── Greeting: should NOT trigger any tools ────────────────── + + [Fact] + public async Task Greeting_DoesNotTriggerTools() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Hello! How can I help you today?"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Hi there!"); + + result + .ShouldNotCallAnyTools() + .ShouldContain("Hello") + .ResponseShouldNotBeEmpty(); + } + + // ── Memory creation: should call mem.create_memory ────────── + + [Fact] + public async Task RememberBirthday_TriggersMemoryTool() + { + MockChatCompletionService mock = new(); + mock.QueueToolCallResponse("mem", "create_memory", new() + { + ["content"] = "User's birthday is January 1st" + }); + mock.QueueTextResponse("Got it! 
I'll remember your birthday is January 1st."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Remember my birthday is January 1st"); + + result + .ShouldCallTool("mem", "create_memory") + .ShouldCallToolWithArg("mem", "create_memory", "content", "birthday") + .ShouldNotCallTool("exec", "run_command") + .ShouldContain("remember") + .ShouldHaveToolCallCount(1); + } + + // ── Token budget: response should stay under limit ────────── + + [Fact] + public async Task Response_StaysUnderTokenBudget() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Short answer."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("What time is it?"); + + result.ShouldRespondWithin(maxTokens: 500); + } + + // ── Snapshot workflow example ──────────────────────────────── + + [Fact] + public async Task Snapshot_SaveAndCompare() + { + string tempDir = Path.Combine(Path.GetTempPath(), $"lis_example_{Guid.NewGuid():N}"); + try + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("The current date is 2025-01-15."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("What's the date?"); + + SnapshotManager snapshots = new(tempDir); + + // First run: creates new snapshot + SnapshotComparison firstRun = snapshots.CompareWithSnapshot("date_query", result); + Assert.False(firstRun.SnapshotExists); + + // Approve it + snapshots.ApproveSnapshot("date_query"); + + // Second run with same output: matches + SnapshotComparison secondRun = snapshots.CompareWithSnapshot("date_query", result); + Assert.True(secondRun.IsMatch); + } + finally + { + if (Directory.Exists(tempDir)) + Directory.Delete(tempDir, recursive: true); + } + } + + // ── Multi-tool call ───────────────────────────────────────── + + [Fact] + public async Task MultiToolCall_BothToolsCaptured() + { + MockChatCompletionService mock = new(); + 
mock.QueueMultiToolCallResponse( + new MockToolCall("dt", "get_current_datetime", []), + new MockToolCall("mem", "search_memories", new() { ["query"] = "meeting" }) + ); + mock.QueueTextResponse("Your next meeting is at 3 PM."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("When is my next meeting?"); + + result + .ShouldCallTool("dt", "get_current_datetime") + .ShouldCallTool("mem", "search_memories") + .ShouldHaveToolCallCount(2) + .ShouldContain("meeting"); + } + + // ── Custom system prompt ──────────────────────────────────── + + [Fact] + public async Task CustomSystemPrompt_IncludedInHistory() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("I am Lis, your AI assistant."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Who are you?", + opts => opts.SystemPrompt = "You are Lis, a personal AI assistant."); + + Assert.True(result.History.Count >= 3); // system + user + assistant + Assert.Equal("You are Lis, a personal AI assistant.", result.History[0].Content); + } + + // ── Regex matching ────────────────────────────────────────── + + [Fact] + public async Task Response_MatchesExpectedPattern() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Memory #42 saved successfully."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Save this note"); + + result.ShouldMatch(@"Memory #\d+ saved"); + } +} diff --git a/Lis.Tests/Harness/HarnessResult.cs b/Lis.Tests/Harness/HarnessResult.cs new file mode 100644 index 0000000..58514f4 --- /dev/null +++ b/Lis.Tests/Harness/HarnessResult.cs @@ -0,0 +1,32 @@ +using Microsoft.SemanticKernel.ChatCompletion; + +namespace Lis.Tests.Harness; + +/// +/// Captures the full outcome of a simulated AI conversation turn. +/// +public sealed class HarnessResult +{ + /// Final assistant text response. 
+ public string Response { get; init; } = string.Empty; + + /// Every tool invocation requested by the assistant. + public List ToolCalls { get; init; } = []; + + /// Estimated output token count (BPE o200k_base). + public int OutputTokens { get; init; } + + /// Wall-clock duration of the simulated turn. + public TimeSpan Duration { get; init; } + + /// Full chat history including system, user, assistant, and tool messages. + public ChatHistory History { get; init; } = []; +} + +/// +/// Represents a single tool invocation captured during the harness run. +/// +public sealed record HarnessToolCall( + string PluginName, + string FunctionName, + Dictionary Arguments); diff --git a/Lis.Tests/Harness/HarnessTests.cs b/Lis.Tests/Harness/HarnessTests.cs new file mode 100644 index 0000000..a6bf203 --- /dev/null +++ b/Lis.Tests/Harness/HarnessTests.cs @@ -0,0 +1,228 @@ +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; + +namespace Lis.Tests.Harness; + +/// +/// Tests for and internals. +/// +public class HarnessTests +{ + // ── MockChatCompletionService ──────────────────────────────── + + [Fact] + public async Task Mock_TextResponse_ReturnsContent() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Hello!"); + + IReadOnlyList results = + await mock.GetChatMessageContentsAsync([]); + + Assert.Single(results); + Assert.Equal(AuthorRole.Assistant, results[0].Role); + Assert.Equal("Hello!", results[0].Content); + } + + [Fact] + public async Task Mock_ToolCallResponse_ReturnsFunctionCallContent() + { + MockChatCompletionService mock = new(); + mock.QueueToolCallResponse("mem", "create_memory", new() { ["content"] = "test" }); + + IReadOnlyList results = + await mock.GetChatMessageContentsAsync([]); + + Assert.Single(results); + FunctionCallContent? 
call = results[0].Items.OfType().FirstOrDefault(); + Assert.NotNull(call); + Assert.Equal("mem", call.PluginName); + Assert.Equal("create_memory", call.FunctionName); + } + + [Fact] + public async Task Mock_MultipleQueued_ReturnsInOrder() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("First"); + mock.QueueTextResponse("Second"); + + IReadOnlyList first = await mock.GetChatMessageContentsAsync([]); + IReadOnlyList second = await mock.GetChatMessageContentsAsync([]); + + Assert.Equal("First", first[0].Content); + Assert.Equal("Second", second[0].Content); + } + + [Fact] + public async Task Mock_EmptyQueue_Throws() + { + MockChatCompletionService mock = new(); + + await Assert.ThrowsAsync(() => + mock.GetChatMessageContentsAsync([])); + } + + [Fact] + public async Task Mock_RecordsToolCalls() + { + MockChatCompletionService mock = new(); + mock.QueueToolCallResponse("dt", "get_current_datetime"); + + await mock.GetChatMessageContentsAsync([]); + + Assert.Single(mock.RecordedToolCalls); + Assert.Equal("dt", mock.RecordedToolCalls[0].PluginName); + Assert.Equal("get_current_datetime", mock.RecordedToolCalls[0].FunctionName); + } + + [Fact] + public async Task Mock_MultiToolCallResponse_AllRecorded() + { + MockChatCompletionService mock = new(); + mock.QueueMultiToolCallResponse( + new MockToolCall("dt", "get_current_datetime", []), + new MockToolCall("mem", "search_memories", new() { ["query"] = "test" }) + ); + + await mock.GetChatMessageContentsAsync([]); + + Assert.Equal(2, mock.RecordedToolCalls.Count); + } + + [Fact] + public async Task Mock_StreamingResponse_YieldsContent() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Streamed"); + + List chunks = []; + await foreach (StreamingChatMessageContent chunk in mock.GetStreamingChatMessageContentsAsync([])) + chunks.Add(chunk); + + Assert.Single(chunks); + Assert.Equal("Streamed", chunks[0].Content); + } + + // ── LlmTestHarness ───────────────────────────────────────── 
+ + [Fact] + public async Task Harness_SimpleMessage_CapturesResponse() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Hi there!"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Hello"); + + Assert.Equal("Hi there!", result.Response); + Assert.Empty(result.ToolCalls); + Assert.True(result.OutputTokens > 0); + Assert.True(result.Duration > TimeSpan.Zero); + } + + [Fact] + public async Task Harness_ToolCall_CapturedAndRecorded() + { + MockChatCompletionService mock = new(); + mock.QueueToolCallResponse("mem", "create_memory", new() { ["content"] = "test data" }); + mock.QueueTextResponse("Saved!"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Remember this"); + + Assert.Single(result.ToolCalls); + Assert.Equal("mem", result.ToolCalls[0].PluginName); + Assert.Equal("create_memory", result.ToolCalls[0].FunctionName); + Assert.Equal("test data", result.ToolCalls[0].Arguments["content"]); + Assert.Equal("Saved!", result.Response); + } + + [Fact] + public async Task Harness_MultipleToolCalls_AllCaptured() + { + MockChatCompletionService mock = new(); + mock.QueueMultiToolCallResponse( + new MockToolCall("dt", "get_current_datetime", []), + new MockToolCall("mem", "search_memories", new() { ["query"] = "test" }) + ); + mock.QueueTextResponse("Done."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Search"); + + Assert.Equal(2, result.ToolCalls.Count); + Assert.Equal("Done.", result.Response); + } + + [Fact] + public async Task Harness_HistoryContainsAllMessages() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Reply"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Test"); + + // system + user + assistant = 3 + Assert.Equal(3, result.History.Count); + Assert.Equal(AuthorRole.System, result.History[0].Role); + 
Assert.Equal(AuthorRole.User, result.History[1].Role); + Assert.Equal(AuthorRole.Assistant, result.History[2].Role); + } + + [Fact] + public async Task Harness_CustomSystemPrompt_Applied() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("Response"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Hello", + opts => opts.SystemPrompt = "Custom prompt"); + + Assert.Equal("Custom prompt", result.History[0].Content); + } + + [Fact] + public async Task Harness_TokenEstimation_Works() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse("This is a test response with several words."); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Test"); + + Assert.True(result.OutputTokens > 0); + } + + [Fact] + public async Task Harness_EmptyResponse_ZeroTokens() + { + MockChatCompletionService mock = new(); + mock.QueueTextResponse(""); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Test"); + + Assert.Equal(0, result.OutputTokens); + Assert.Equal(string.Empty, result.Response); + } + + [Fact] + public async Task Harness_MaxIterations_Respected() + { + MockChatCompletionService mock = new(); + // Queue more tool calls than max iterations + mock.QueueToolCallResponse("dt", "get_current_datetime"); + mock.QueueToolCallResponse("dt", "get_current_datetime"); + mock.QueueToolCallResponse("dt", "get_current_datetime"); + + LlmTestHarness harness = new(mock); + HarnessResult result = await harness.SimulateMessageAsync("Test", + opts => opts.MaxIterations = 2); + + // Should stop after 2 iterations even though there are more responses + Assert.True(result.ToolCalls.Count <= 2); + } +} diff --git a/Lis.Tests/Harness/LlmTestHarness.cs b/Lis.Tests/Harness/LlmTestHarness.cs new file mode 100644 index 0000000..66bf0cd --- /dev/null +++ b/Lis.Tests/Harness/LlmTestHarness.cs @@ -0,0 +1,149 @@ +using 
System.ComponentModel; +using System.Diagnostics; + +using Lis.Agent; + +using Microsoft.Extensions.DependencyInjection; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; + +namespace Lis.Tests.Harness; + +/// +/// Options to configure a harness simulation run. +/// +public sealed class HarnessOptions +{ + /// System prompt prepended to the chat history. + public string SystemPrompt { get; set; } = "You are a helpful assistant."; + + /// Maximum tool call iterations before stopping. + public int MaxIterations { get; set; } = 5; +} + +/// +/// Core test runner that simulates AI conversations using a . +/// Creates a minimal Semantic Kernel, sends user messages, captures responses and tool calls. +/// +public sealed class LlmTestHarness +{ + private readonly MockChatCompletionService _mockService; + private readonly Kernel _kernel; + + public LlmTestHarness(MockChatCompletionService mockService, Action? configureKernel = null) + { + this._mockService = mockService; + + IKernelBuilder builder = Kernel.CreateBuilder(); + builder.Services.AddSingleton(mockService); + configureKernel?.Invoke(builder); + this._kernel = builder.Build(); + } + + /// + /// Simulate sending a user message and capture the AI response. + /// + public Task SimulateMessageAsync(string userMessage) => + this.SimulateMessageAsync(userMessage, _ => { }); + + /// + /// Simulate sending a user message with custom options. 
+    ///
+    /// <param name="userMessage">Text appended to the chat history as the user turn.</param>
+    /// <param name="configure">Callback that adjusts the <see cref="HarnessOptions"/> used for this run.</param>
+    /// <returns>
+    /// The last plain-text assistant reply, every tool call that was requested, a token estimate
+    /// for the reply, the elapsed time and the full chat history.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
+    public async Task<HarnessResult> SimulateMessageAsync(string userMessage, Action<HarnessOptions> configure)
+    {
+        ArgumentNullException.ThrowIfNull(configure);
+
+        HarnessOptions options = new();
+        configure(options);
+
+        Stopwatch sw = Stopwatch.StartNew();
+
+        ChatHistory history = [];
+        history.AddSystemMessage(options.SystemPrompt);
+        history.AddUserMessage(userMessage);
+
+        List<HarnessToolCall> allToolCalls = [];
+        string finalResponse = string.Empty;
+
+        for (int i = 0; i < options.MaxIterations; i++)
+        {
+            IReadOnlyList<ChatMessageContent> results =
+                await this._mockService.GetChatMessageContentsAsync(history, kernel: this._kernel);
+
+            // Reflects the LAST message of this round only: true when it was a plain assistant reply.
+            bool turnComplete = false;
+
+            foreach (ChatMessageContent msg in results)
+            {
+                history.Add(msg);
+
+                List<FunctionCallContent> functionCalls = msg.Items
+                    .OfType<FunctionCallContent>()
+                    .ToList();
+
+                if (functionCalls.Count > 0)
+                {
+                    foreach (FunctionCallContent call in functionCalls)
+                    {
+                        allToolCalls.Add(ToHarnessToolCall(call));
+
+                        string toolResult = await this.InvokeOrSimulateAsync(call);
+
+                        ChatMessageContent toolMessage = new(AuthorRole.Tool, toolResult);
+                        toolMessage.Items.Add(new FunctionResultContent(call, toolResult));
+                        history.Add(toolMessage);
+                    }
+
+                    // Tool results were appended, so another round is needed for the follow-up reply.
+                    turnComplete = false;
+                }
+                else
+                {
+                    turnComplete = msg.Role == AuthorRole.Assistant;
+
+                    // Plain text response — remember it as the answer
+                    if (msg.Content is not null)
+                        finalResponse = msg.Content;
+                }
+            }
+
+            if (turnComplete)
+                break;
+        }
+
+        sw.Stop();
+
+        return new HarnessResult
+        {
+            Response = finalResponse,
+            ToolCalls = allToolCalls,
+            OutputTokens = TokenEstimator.Count(finalResponse),
+            Duration = sw.Elapsed,
+            History = history
+        };
+    }
+
+    /// <summary>Copies a function call into a <see cref="HarnessToolCall"/>, converting each argument value to a string.</summary>
+    private static HarnessToolCall ToHarnessToolCall(FunctionCallContent call)
+    {
+        Dictionary<string, string> args = [];
+        if (call.Arguments is not null)
+        {
+            foreach (KeyValuePair<string, object?> kvp in call.Arguments)
+                args[kvp.Key] = kvp.Value?.ToString() ?? string.Empty;
+        }
+
+        return new HarnessToolCall(call.PluginName ?? string.Empty, call.FunctionName, args);
+    }
+
+    /// <summary>
+    /// Tries to invoke the call against the kernel and returns its result as text. When the
+    /// invocation throws, a fixed placeholder is returned so the simulated conversation continues.
+    /// </summary>
+    private async Task<string> InvokeOrSimulateAsync(FunctionCallContent call)
+    {
+        try
+        {
+            FunctionResultContent result = await call.InvokeAsync(this._kernel);
+            return result.Result?.ToString() ?? string.Empty;
+        }
+        catch (Exception ex) when (ex is not OperationCanceledException)
+        {
+            // Deliberately best-effort: a failed invocation is treated as a simulated tool.
+            // Cancellation is excluded by the filter so it is never masked as a tool result.
+            return "Tool execution simulated.";
+        }
+    }
+}
+
+/// <summary>
+/// A simple kernel function for testing — echoes back the input.
+/// </summary>
+public sealed class EchoPlugin
+{
+    [KernelFunction("echo")]
+    [Description("Echoes the input back.")]
+    public string Echo([Description("Text to echo")] string input) => $"Echo: {input}";
+}
diff --git a/Lis.Tests/Harness/MockChatCompletionService.cs b/Lis.Tests/Harness/MockChatCompletionService.cs
new file mode 100644
index 0000000..437a4d8
--- /dev/null
+++ b/Lis.Tests/Harness/MockChatCompletionService.cs
@@ -0,0 +1,124 @@
+using System.Runtime.CompilerServices;
+
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+
+namespace Lis.Tests.Harness;
+
+/// <summary>
+/// A configurable mock <see cref="IChatCompletionService"/> that queues pre-defined responses
+/// and records tool calls for assertion. Supports multi-turn conversations with tool call simulation.
+/// </summary>
+public sealed class MockChatCompletionService : IChatCompletionService
+{
+    private readonly Queue<MockResponse> _responses = new();
+    private readonly List<HarnessToolCall> _recordedToolCalls = [];
+
+    /// <summary>Tool calls captured during the conversation.</summary>
+ public IReadOnlyList RecordedToolCalls => this._recordedToolCalls; + + public IReadOnlyDictionary Attributes => new Dictionary(); + + /// + /// Enqueue a plain text response. + /// + public MockChatCompletionService QueueTextResponse(string text) + { + this._responses.Enqueue(new MockResponse { Text = text }); + return this; + } + + /// + /// Enqueue a response that includes tool calls followed by a text response. + /// The harness will invoke the tool calls, then the next GetChatMessageContentsAsync + /// call should return the follow-up text. + /// + public MockChatCompletionService QueueToolCallResponse( + string pluginName, string functionName, Dictionary? arguments = null) + { + this._responses.Enqueue(new MockResponse + { + ToolCalls = [new MockToolCall(pluginName, functionName, arguments ?? [])] + }); + return this; + } + + /// + /// Enqueue a response with multiple tool calls. + /// + public MockChatCompletionService QueueMultiToolCallResponse( + params MockToolCall[] toolCalls) + { + this._responses.Enqueue(new MockResponse { ToolCalls = [.. toolCalls] }); + return this; + } + + public Task> GetChatMessageContentsAsync( + ChatHistory chatHistory, + PromptExecutionSettings? executionSettings = null, + Kernel? 
kernel = null, + CancellationToken cancellationToken = default) + { + if (this._responses.Count == 0) + throw new InvalidOperationException("No more queued responses in MockChatCompletionService."); + + MockResponse response = this._responses.Dequeue(); + List results = []; + + if (response.ToolCalls.Count > 0) + { + ChatMessageContent toolCallMessage = new(AuthorRole.Assistant, content: null); + foreach (MockToolCall tc in response.ToolCalls) + { + this._recordedToolCalls.Add(new HarnessToolCall( + tc.PluginName, tc.FunctionName, tc.Arguments)); + + KernelArguments args = []; + foreach (KeyValuePair kvp in tc.Arguments) + args[kvp.Key] = kvp.Value; + + toolCallMessage.Items.Add(new FunctionCallContent( + functionName: tc.FunctionName, + pluginName: tc.PluginName, + id: Guid.NewGuid().ToString("N"), + arguments: args)); + } + results.Add(toolCallMessage); + } + else + { + results.Add(new ChatMessageContent(AuthorRole.Assistant, response.Text ?? string.Empty)); + } + + return Task.FromResult>(results); + } + + public async IAsyncEnumerable GetStreamingChatMessageContentsAsync( + ChatHistory chatHistory, + PromptExecutionSettings? executionSettings = null, + Kernel? kernel = null, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + IReadOnlyList messages = + await this.GetChatMessageContentsAsync(chatHistory, executionSettings, kernel, cancellationToken); + + foreach (ChatMessageContent msg in messages) + { + yield return new StreamingChatMessageContent(msg.Role, msg.Content); + } + } + + private sealed class MockResponse + { + public string? Text { get; init; } + public List ToolCalls { get; init; } = []; + } +} + +/// +/// Represents a tool call to be simulated by the mock. 
+/// <param name="PluginName">Plugin name placed on the emitted function call.</param>
+/// <param name="FunctionName">Function name placed on the emitted function call.</param>
+/// <param name="Arguments">Arguments copied onto the emitted function call.</param>
+public sealed record MockToolCall(
+    string PluginName,
+    string FunctionName,
+    Dictionary<string, string> Arguments);
diff --git a/Lis.Tests/Harness/SnapshotManager.cs b/Lis.Tests/Harness/SnapshotManager.cs
new file mode 100644
index 0000000..7a7d3a7
--- /dev/null
+++ b/Lis.Tests/Harness/SnapshotManager.cs
@@ -0,0 +1,211 @@
+using System.Security.Cryptography;
+using System.Text;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace Lis.Tests.Harness;
+
+// NOTE(review): generic type arguments in this section were lost in extraction and are
+// restored from usage; Dictionary<string, string> for tool arguments is presumed —
+// confirm against HarnessToolCall.
+
+/// <summary>
+/// Comparison result between a current run and an approved snapshot.
+/// </summary>
+public sealed class SnapshotComparison
+{
+    /// <summary>True when the current content hash equals the approved hash.</summary>
+    public bool IsMatch { get; init; }
+
+    /// <summary>False when no snapshot file existed before the comparison.</summary>
+    public bool SnapshotExists { get; init; }
+
+    /// <summary>True when the stored snapshot carried an approved hash.</summary>
+    public bool IsApproved { get; init; }
+
+    /// <summary>Description of why the comparison did not match; null on a match.</summary>
+    public string? Difference { get; init; }
+}
+
+/// <summary>
+/// Persisted snapshot data for a test.
+/// </summary>
+public sealed class SnapshotData
+{
+    /// <summary>Response text of the run.</summary>
+    [JsonPropertyName("response")]
+    public string Response { get; init; } = string.Empty;
+
+    /// <summary>Tool calls of the run.</summary>
+    [JsonPropertyName("tool_calls")]
+    public List<SnapshotToolCall> ToolCalls { get; init; } = [];
+
+    /// <summary>When the snapshot was approved; null while unapproved.</summary>
+    [JsonPropertyName("approved_at")]
+    public DateTimeOffset? ApprovedAt { get; init; }
+
+    /// <summary>Content hash recorded at approval; null while unapproved.</summary>
+    [JsonPropertyName("approved_hash")]
+    public string? ApprovedHash { get; init; }
+}
+
+/// <summary>
+/// Tool call data stored in a snapshot.
+/// </summary>
+public sealed class SnapshotToolCall
+{
+    [JsonPropertyName("plugin_name")]
+    public string PluginName { get; init; } = string.Empty;
+
+    [JsonPropertyName("function_name")]
+    public string FunctionName { get; init; } = string.Empty;
+
+    [JsonPropertyName("arguments")]
+    public Dictionary<string, string> Arguments { get; init; } = [];
+}
+
+/// <summary>
+/// Manages snapshot files for LLM test output comparison.
+/// Snapshots are saved as JSON and can be approved for future regression checks.
+/// </summary>
+/// <remarks>
+/// Approval is tracked by a SHA-256 hash over the response and tool calls
+/// (see <see cref="ComputeHash"/>); the approval fields themselves are not hashed.
+/// </remarks>
+public sealed class SnapshotManager
+{
+    private readonly string _snapshotDirectory;
+
+    // Human-readable formatting for the snapshot files written to disk.
+    private static readonly JsonSerializerOptions JsonOptions = new()
+    {
+        WriteIndented = true,
+        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
+    };
+
+    // Compact formatting used only for hashing. Indented output embeds the platform newline,
+    // so a hash approved on one OS would not match the same content hashed on another.
+    private static readonly JsonSerializerOptions HashJsonOptions = new()
+    {
+        WriteIndented = false,
+        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
+    };
+
+    /// <summary>Creates a manager rooted at the given directory.</summary>
+    /// <param name="snapshotDirectory">
+    /// Directory holding snapshot files; defaults to "__snapshots__" under
+    /// <see cref="AppContext.BaseDirectory"/>.
+    /// </param>
+    public SnapshotManager(string? snapshotDirectory = null)
+    {
+        this._snapshotDirectory = snapshotDirectory
+            ?? Path.Combine(AppContext.BaseDirectory, "__snapshots__");
+    }
+
+    /// <summary>Save a snapshot for the given test name.</summary>
+    /// <remarks>The written file carries no approval fields, even if it replaces an approved one.</remarks>
+    public void SaveSnapshot(string testName, HarnessResult result)
+    {
+        Directory.CreateDirectory(this._snapshotDirectory);
+
+        SnapshotData data = ToSnapshotData(result);
+        string json = JsonSerializer.Serialize(data, JsonOptions);
+        string path = this.GetSnapshotPath(testName);
+        File.WriteAllText(path, json);
+    }
+
+    /// <summary>Load a previously saved snapshot, or null if none exists.</summary>
+    public SnapshotData? LoadSnapshot(string testName)
+    {
+        string path = this.GetSnapshotPath(testName);
+        if (!File.Exists(path)) return null;
+
+        string json = File.ReadAllText(path);
+        return JsonSerializer.Deserialize<SnapshotData>(json, JsonOptions);
+    }
+
+    /// <summary>Compare current result with the approved snapshot.</summary>
+    /// <returns>
+    /// A match only when an approved snapshot exists and its hash equals the hash of
+    /// <paramref name="result"/>; otherwise a non-match with a description.
+    /// </returns>
+    public SnapshotComparison CompareWithSnapshot(string testName, HarnessResult result)
+    {
+        SnapshotData? existing = this.LoadSnapshot(testName);
+
+        if (existing is null)
+        {
+            // First run — save and indicate new snapshot
+            this.SaveSnapshot(testName, result);
+            return new SnapshotComparison
+            {
+                IsMatch = false,
+                SnapshotExists = false,
+                IsApproved = false,
+                Difference = "New snapshot created — needs approval."
+            };
+        }
+
+        if (existing.ApprovedHash is null)
+        {
+            return new SnapshotComparison
+            {
+                IsMatch = false,
+                SnapshotExists = true,
+                IsApproved = false,
+                Difference = "Snapshot exists but has not been approved yet."
+            };
+        }
+
+        // Compare by content hash
+        SnapshotData currentData = ToSnapshotData(result);
+        string currentHash = ComputeHash(currentData);
+
+        if (currentHash == existing.ApprovedHash)
+        {
+            return new SnapshotComparison
+            {
+                IsMatch = true,
+                SnapshotExists = true,
+                IsApproved = true
+            };
+        }
+
+        // Determine difference
+        string difference = BuildDifference(existing, currentData);
+
+        // Save the new result for review.
+        // NOTE(review): this replaces the approved file with unapproved data, so the next
+        // comparison reports "not been approved" until ApproveSnapshot is called again.
+        this.SaveSnapshot(testName, result);
+
+        return new SnapshotComparison
+        {
+            IsMatch = false,
+            SnapshotExists = true,
+            IsApproved = true,
+            Difference = difference
+        };
+    }
+
+    /// <summary>Mark the current snapshot as approved.</summary>
+    /// <exception cref="InvalidOperationException">No snapshot exists for the test name.</exception>
+    public void ApproveSnapshot(string testName)
+    {
+        SnapshotData? existing = this.LoadSnapshot(testName);
+        if (existing is null)
+            throw new InvalidOperationException($"No snapshot found for '{testName}'.");
+
+        string hash = ComputeHash(existing);
+        SnapshotData approved = new()
+        {
+            Response = existing.Response,
+            ToolCalls = existing.ToolCalls,
+            ApprovedAt = DateTimeOffset.UtcNow,
+            ApprovedHash = hash
+        };
+
+        string json = JsonSerializer.Serialize(approved, JsonOptions);
+        string path = this.GetSnapshotPath(testName);
+        File.WriteAllText(path, json);
+    }
+
+    /// <summary>Maps a test name to its snapshot file path.</summary>
+    /// <exception cref="ArgumentException">The test name is null, empty, or whitespace.</exception>
+    private string GetSnapshotPath(string testName)
+    {
+        // A blank name would otherwise resolve to a bare ".json" file shared by every such caller.
+        ArgumentException.ThrowIfNullOrWhiteSpace(testName);
+
+        // Sanitize test name for file system
+        string safe = string.Join("_", testName.Split(Path.GetInvalidFileNameChars()));
+        return Path.Combine(this._snapshotDirectory, $"{safe}.json");
+    }
+
+    // Copies the response and tool calls of a run into the persisted shape (no approval fields).
+    private static SnapshotData ToSnapshotData(HarnessResult result) => new()
+    {
+        Response = result.Response,
+        ToolCalls = result.ToolCalls.Select(tc => new SnapshotToolCall
+        {
+            PluginName = tc.PluginName,
+            FunctionName = tc.FunctionName,
+            Arguments = tc.Arguments
+        }).ToList()
+    };
+
+    /// <summary>
+    /// Lower-case hex SHA-256 over the compact JSON of the response and tool calls.
+    /// ApprovedAt and ApprovedHash are excluded, so approving does not change the hash.
+    /// </summary>
+    internal static string ComputeHash(SnapshotData data)
+    {
+        string json = JsonSerializer.Serialize(new { data.Response, data.ToolCalls }, HashJsonOptions);
+        byte[] bytes = SHA256.HashData(Encoding.UTF8.GetBytes(json));
+        return Convert.ToHexStringLower(bytes);
+    }
+
+    // Builds a short description of what differs between the approved and current data.
+    private static string BuildDifference(SnapshotData approved, SnapshotData current)
+    {
+        List<string> diffs = [];
+
+        if (approved.Response != current.Response)
+            diffs.Add($"Response changed: '{Truncate(approved.Response)}' → '{Truncate(current.Response)}'");
+
+        if (approved.ToolCalls.Count != current.ToolCalls.Count)
+        {
+            diffs.Add($"Tool call count: {approved.ToolCalls.Count} → {current.ToolCalls.Count}");
+        }
+        else
+        {
+            // Same number of calls: report positions where a different tool was called.
+            for (int i = 0; i < approved.ToolCalls.Count; i++)
+            {
+                SnapshotToolCall before = approved.ToolCalls[i];
+                SnapshotToolCall after = current.ToolCalls[i];
+                if (before.PluginName != after.PluginName || before.FunctionName != after.FunctionName)
+                    diffs.Add($"Tool call {i}: {before.PluginName}.{before.FunctionName} → {after.PluginName}.{after.FunctionName}");
+            }
+        }
+
+        // Anything not itemized above (e.g. argument changes) still failed the hash comparison.
+        return diffs.Count > 0 ? string.Join("; ", diffs) : "Content hash mismatch (subtle difference).";
+    }
+
+    // Shortens long text for display, marking the cut with an ellipsis.
+    private static string Truncate(string text, int maxLength = 80) =>
+        text.Length <= maxLength ? text : text[..maxLength] + "…";
+}
diff --git a/Lis.Tests/Harness/SnapshotTests.cs b/Lis.Tests/Harness/SnapshotTests.cs
new file mode 100644
index 0000000..7837385
--- /dev/null
+++ b/Lis.Tests/Harness/SnapshotTests.cs
@@ -0,0 +1,229 @@
+namespace Lis.Tests.Harness;
+
+/// <summary>
+/// Tests for <see cref="SnapshotManager"/>; each test instance works in its own temp directory.
+/// </summary>
+public class SnapshotTests : IDisposable
+{
+    private readonly string _tempDir;
+    private readonly SnapshotManager _sut;
+
+    public SnapshotTests()
+    {
+        this._tempDir = Path.Combine(Path.GetTempPath(), $"lis_snapshots_{Guid.NewGuid():N}");
+        this._sut = new SnapshotManager(this._tempDir);
+    }
+
+    public void Dispose()
+    {
+        if (Directory.Exists(this._tempDir))
+            Directory.Delete(this._tempDir, recursive: true);
+        GC.SuppressFinalize(this);
+    }
+
+    private static HarnessResult MakeResult(string response = "Hello", List<HarnessToolCall>? toolCalls = null) => new()
+    {
+        Response = response,
+        ToolCalls = toolCalls ?? [],
+        OutputTokens = 5,
+        Duration = TimeSpan.FromMilliseconds(50),
+        History = []
+    };
+
+    // ── SaveSnapshot / LoadSnapshot ─────────────────────────────
+
+    [Fact]
+    public void SaveAndLoad_RoundTrips()
+    {
+        HarnessResult result = MakeResult("Test response", [
+            new HarnessToolCall("mem", "create_memory", new() { ["content"] = "birthday" })
+        ]);
+
+        this._sut.SaveSnapshot("save_load_test", result);
+        SnapshotData? loaded = this._sut.LoadSnapshot("save_load_test");
+
+        Assert.NotNull(loaded);
+        Assert.Equal("Test response", loaded.Response);
+        Assert.Single(loaded.ToolCalls);
+        Assert.Equal("mem", loaded.ToolCalls[0].PluginName);
+        Assert.Equal("create_memory", loaded.ToolCalls[0].FunctionName);
+        Assert.Equal("birthday", loaded.ToolCalls[0].Arguments["content"]);
+    }
+
+    [Fact]
+    public void LoadSnapshot_NonExistent_ReturnsNull()
+    {
+        SnapshotData? loaded = this._sut.LoadSnapshot("does_not_exist");
+
+        Assert.Null(loaded);
+    }
+
+    [Fact]
+    public void SaveSnapshot_CreatesDirectory()
+    {
+        Assert.False(Directory.Exists(this._tempDir));
+
+        this._sut.SaveSnapshot("dir_test", MakeResult());
+
+        Assert.True(Directory.Exists(this._tempDir));
+    }
+
+    [Fact]
+    public void SaveSnapshot_OverwritesExisting()
+    {
+        this._sut.SaveSnapshot("overwrite_test", MakeResult("First"));
+        this._sut.SaveSnapshot("overwrite_test", MakeResult("Second"));
+
+        SnapshotData? loaded = this._sut.LoadSnapshot("overwrite_test");
+        Assert.NotNull(loaded);
+        Assert.Equal("Second", loaded.Response);
+    }
+
+    // ── CompareWithSnapshot ─────────────────────────────────────
+
+    [Fact]
+    public void Compare_NoExistingSnapshot_CreatesNew()
+    {
+        HarnessResult result = MakeResult("New response");
+
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("new_test", result);
+
+        Assert.False(comparison.IsMatch);
+        Assert.False(comparison.SnapshotExists);
+        Assert.False(comparison.IsApproved);
+        Assert.Contains("needs approval", comparison.Difference);
+    }
+
+    [Fact]
+    public void Compare_ExistingButNotApproved_ReportsUnapproved()
+    {
+        this._sut.SaveSnapshot("unapproved_test", MakeResult("Response"));
+
+        HarnessResult result = MakeResult("Response");
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("unapproved_test", result);
+
+        Assert.False(comparison.IsMatch);
+        Assert.True(comparison.SnapshotExists);
+        Assert.False(comparison.IsApproved);
+        Assert.Contains("not been approved", comparison.Difference);
+    }
+
+    [Fact]
+    public void Compare_ApprovedAndMatching_ReturnsMatch()
+    {
+        this._sut.SaveSnapshot("approved_test", MakeResult("Stable response"));
+        this._sut.ApproveSnapshot("approved_test");
+
+        HarnessResult result = MakeResult("Stable response");
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("approved_test", result);
+
+        Assert.True(comparison.IsMatch);
+        Assert.True(comparison.SnapshotExists);
+        Assert.True(comparison.IsApproved);
+        Assert.Null(comparison.Difference);
+    }
+
+    [Fact]
+    public void Compare_ApprovedButChanged_ReportsDifference()
+    {
+        this._sut.SaveSnapshot("changed_test", MakeResult("Original"));
+        this._sut.ApproveSnapshot("changed_test");
+
+        HarnessResult changed = MakeResult("Modified");
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("changed_test", changed);
+
+        Assert.False(comparison.IsMatch);
+        Assert.True(comparison.SnapshotExists);
+        Assert.True(comparison.IsApproved);
+        Assert.Contains("Response changed", comparison.Difference);
+    }
+
+    [Fact]
+    public void Compare_ToolCallCountChanged_ReportsDifference()
+    {
+        HarnessResult original = MakeResult("Same", [
+            new HarnessToolCall("mem", "search", [])
+        ]);
+        this._sut.SaveSnapshot("tools_changed", original);
+        this._sut.ApproveSnapshot("tools_changed");
+
+        HarnessResult changed = MakeResult("Same", [
+            new HarnessToolCall("mem", "search", []),
+            new HarnessToolCall("mem", "create_memory", [])
+        ]);
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("tools_changed", changed);
+
+        Assert.False(comparison.IsMatch);
+        Assert.Contains("Tool call count", comparison.Difference);
+    }
+
+    [Fact]
+    public void Compare_ToolCallNameChanged_ReportsDifference()
+    {
+        HarnessResult original = MakeResult("Same", [
+            new HarnessToolCall("mem", "search", [])
+        ]);
+        this._sut.SaveSnapshot("tool_name_changed", original);
+        this._sut.ApproveSnapshot("tool_name_changed");
+
+        HarnessResult changed = MakeResult("Same", [
+            new HarnessToolCall("mem", "create_memory", [])
+        ]);
+        SnapshotComparison comparison = this._sut.CompareWithSnapshot("tool_name_changed", changed);
+
+        Assert.False(comparison.IsMatch);
+        Assert.Contains("mem.search", comparison.Difference);
+        Assert.Contains("mem.create_memory", comparison.Difference);
+    }
+
+    // ── ApproveSnapshot ─────────────────────────────────────────
+
+    [Fact]
+    public void ApproveSnapshot_SetsApprovedFields()
+    {
+        this._sut.SaveSnapshot("approve_test", MakeResult("Approved response"));
+
+        this._sut.ApproveSnapshot("approve_test");
+
+        SnapshotData? loaded = this._sut.LoadSnapshot("approve_test");
+        Assert.NotNull(loaded);
+        Assert.NotNull(loaded.ApprovedAt);
+        Assert.NotNull(loaded.ApprovedHash);
+        Assert.Equal("Approved response", loaded.Response);
+    }
+
+    [Fact]
+    public void ApproveSnapshot_NonExistent_Throws()
+    {
+        Assert.Throws<InvalidOperationException>(() =>
+            this._sut.ApproveSnapshot("nonexistent"));
+    }
+
+    // ── ComputeHash ─────────────────────────────────────────────
+
+    [Fact]
+    public void ComputeHash_SameContent_SameHash()
+    {
+        SnapshotData a = new() { Response = "Hello", ToolCalls = [] };
+        SnapshotData b = new() { Response = "Hello", ToolCalls = [] };
+
+        string hashA = SnapshotManager.ComputeHash(a);
+        string hashB = SnapshotManager.ComputeHash(b);
+
+        Assert.Equal(hashA, hashB);
+    }
+
+    [Fact]
+    public void ComputeHash_DifferentContent_DifferentHash()
+    {
+        SnapshotData a = new() { Response = "Hello", ToolCalls = [] };
+        SnapshotData b = new() { Response = "Goodbye", ToolCalls = [] };
+
+        string hashA = SnapshotManager.ComputeHash(a);
+        string hashB = SnapshotManager.ComputeHash(b);
+
+        Assert.NotEqual(hashA, hashB);
+    }
+
+    [Fact]
+    public void ComputeHash_IgnoresApprovedFields()
+    {
+        SnapshotData a = new() { Response = "Test", ToolCalls = [] };
+        SnapshotData b = new() { Response = "Test", ToolCalls = [], ApprovedAt = DateTimeOffset.UtcNow, ApprovedHash = "abc" };
+
+        string hashA = SnapshotManager.ComputeHash(a);
+        string hashB = SnapshotManager.ComputeHash(b);
+
+        Assert.Equal(hashA, hashB);
+    }
+
+    // ── File naming ─────────────────────────────────────────────
+
+    [Fact]
+    public void SaveSnapshot_HandlesSpecialCharsInName()
+    {
+        this._sut.SaveSnapshot("test:with/special", MakeResult());
+
+        // Should not throw — chars are sanitized
+        SnapshotData? loaded = this._sut.LoadSnapshot("test:with/special");
+        Assert.NotNull(loaded);
+    }
+
+    [Fact]
+    public void SaveSnapshot_BlankName_Throws()
+    {
+        Assert.ThrowsAny<ArgumentException>(() =>
+            this._sut.SaveSnapshot(" ", MakeResult()));
+    }
+}
diff --git a/global.json b/global.json
index 058bafa..816dd81 100644
--- a/global.json
+++ b/global.json
@@ -1,5 +1,6 @@
 {
   "sdk": {
-    "version": "10.0.103"
+    "version": "10.0.103",
+    "rollForward": "latestMinor"
   }
 }