diff --git a/EnterpriseIntegrationPlatform/rules/milestones.md b/EnterpriseIntegrationPlatform/rules/milestones.md index 08538dd..c2a6d6c 100644 --- a/EnterpriseIntegrationPlatform/rules/milestones.md +++ b/EnterpriseIntegrationPlatform/rules/milestones.md @@ -22,57 +22,17 @@ ## Completed Phases -✅ Phases 1–21 complete — see `rules/completion-log.md` for full history. +✅ Phases 1–24 complete — see `rules/completion-log.md` for full history. -**Current stats:** 1,518 UnitTests + 58 Contract + 29 Workflow + 17 Integration + 10 Load + 19 Vitest = **1,651 total tests**. 48 src projects. +48 src projects. All 50 tutorials rewritten with BizTalk-style Lab + Exam exercises focused on EIP patterns, scalability, and atomicity. -**Next chunk:** Phase 22 complete — all 13 chunks (080-092) done. +**Next chunk:** (none — all current work complete) --- -### Phase 19 — Tutorial Audit as New Developer (Round 6) - -✅ Phase 19 complete — see `rules/completion-log.md`. - -### Phase 20 — Tutorial Audit as New Developer (Round 7) - -✅ Phase 20 complete — fixed 7 tutorials (03, 17, 26, 28, 29, 45, 48) plus INormalizer.cs xmldoc. - -### Phase 21 — Tutorial Code Snippet Accuracy Audit - -✅ Phase 21 complete — fixed 4 tutorials (26, 31, 35, 38) with code snippets mismatched against actual source code. - ---- - -### Phase 22 — Implement Unfulfilled Tutorial Promises - -**Scope:** Audit of all 50 tutorials against source code found 13 features that tutorials promise but are not implemented. These chunks implement the missing features so that every tutorial claim is backed by working code. - -#### Chunk 090 — EnvironmentOverrideProvider: EIP__ Environment Variable Convention - -| Field | Value | -|-------|-------| -| Status | `not-started` | -| Tutorial | 42 — Configuration (line 121) | -| Claim | "The `EnvironmentOverrideProvider` reads environment variables using the convention `EIP__Key__SubKey` (double underscore as separator). Environment variables take precedence over store values." | -| Current State | `EnvironmentOverrideProvider` only does cascading resolution from the `IConfigurationStore`. It never reads `System.Environment.GetEnvironmentVariable()`. | -| Implementation | In `ResolveAsync`, before falling back to the store, check `Environment.GetEnvironmentVariable($"EIP__{key.Replace(":", "__")}")`. If found, return a synthetic `ConfigurationEntry` with that value. Add `ResolveManyAsync` override similarly. Add unit tests using environment variable injection. | -| Files | `src/Configuration/EnvironmentOverrideProvider.cs`, `tests/UnitTests/EnvironmentOverrideProviderTests.cs` | - -#### Chunk 092 — Kustomize Base Directory Structure - -| Field | Value | -|-------|-------| -| Status | `done` | -| Tutorial | 43 — Kubernetes Deployment (lines 91-104) | -| Claim | Tutorial shows flat `base/` with `deployment.yaml` and `service.yaml`. | -| Current State | Actual structure has `base/admin-api/` and `base/openclaw-web/` subdirectories. | -| Implementation | Updated tutorial 43 to match the actual directory structure (service-specific subdirectories, namespace.yaml, prod PDB files). | -| Files | `tutorials/43-kubernetes-deployment.md` | - ## Next Chunk -Phase 22 complete — all 13 chunks (080-092) done. 
+(none) --- diff --git a/EnterpriseIntegrationPlatform/tests/UnitTests/ScatterGathererTests.cs b/EnterpriseIntegrationPlatform/tests/UnitTests/ScatterGathererTests.cs new file mode 100644 index 0000000..3a9d17a --- /dev/null +++ b/EnterpriseIntegrationPlatform/tests/UnitTests/ScatterGathererTests.cs @@ -0,0 +1,376 @@ +using EnterpriseIntegrationPlatform.Contracts; +using EnterpriseIntegrationPlatform.Ingestion; +using EnterpriseIntegrationPlatform.Processing.ScatterGather; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using NSubstitute; +using NUnit.Framework; + +namespace EnterpriseIntegrationPlatform.Tests.Unit; + +[TestFixture] +public class ScatterGathererTests +{ + private IMessageBrokerProducer _producer = null!; + private ILogger> _logger = null!; + + private ScatterGatherer CreateSut(int timeoutMs = 5_000, int maxRecipients = 50) + { + var options = Options.Create(new ScatterGatherOptions + { + TimeoutMs = timeoutMs, + MaxRecipients = maxRecipients, + }); + + return new ScatterGatherer(_producer, options, _logger); + } + + [SetUp] + public void SetUp() + { + _producer = Substitute.For(); + _logger = Substitute.For>>(); + } + + // ── Constructor guards ─────────────────────────────────────────── + + [Test] + public void Ctor_NullProducer_ThrowsArgumentNullException() + { + Assert.Throws(() => + new ScatterGatherer( + null!, + Options.Create(new ScatterGatherOptions()), + _logger)); + } + + [Test] + public void Ctor_NullOptions_ThrowsArgumentNullException() + { + Assert.Throws(() => + new ScatterGatherer( + _producer, + null!, + _logger)); + } + + [Test] + public void Ctor_NullLogger_ThrowsArgumentNullException() + { + Assert.Throws(() => + new ScatterGatherer( + _producer, + Options.Create(new ScatterGatherOptions()), + null!)); + } + + // ── ScatterGatherAsync ─────────────────────────────────────────── + + [Test] + public void ScatterGatherAsync_NullRequest_ThrowsArgumentNullException() + { + var sut = CreateSut(); + + Assert.ThrowsAsync(() => + sut.ScatterGatherAsync(null!)); + } + + [Test] + public async Task ScatterGatherAsync_EmptyRecipients_ReturnsEmptyResultNotTimedOut() + { + var sut = CreateSut(); + var request = new ScatterRequest(Guid.NewGuid(), "payload", []); + + var result = await sut.ScatterGatherAsync(request); + + Assert.That(result.Responses, Is.Empty); + Assert.That(result.TimedOut, Is.False); + Assert.That(result.Duration, Is.EqualTo(TimeSpan.Zero)); + Assert.That(result.CorrelationId, Is.EqualTo(request.CorrelationId)); + } + + [Test] + public async Task ScatterGatherAsync_NullRecipients_ReturnsEmptyResultNotTimedOut() + { + var sut = CreateSut(); + var request = new ScatterRequest(Guid.NewGuid(), "payload", null!); + + var result = await sut.ScatterGatherAsync(request); + + Assert.That(result.Responses, Is.Empty); + Assert.That(result.TimedOut, Is.False); + } + + [Test] + public void ScatterGatherAsync_ExceedsMaxRecipients_ThrowsArgumentException() + { + var sut = CreateSut(maxRecipients: 2); + var request = new ScatterRequest( + Guid.NewGuid(), + "payload", + ["topic-a", "topic-b", "topic-c"]); + + var ex = Assert.ThrowsAsync(() => + sut.ScatterGatherAsync(request)); + + Assert.That(ex!.Message, Does.Contain("3")); + Assert.That(ex.Message, Does.Contain("2")); + } + + [Test] + public void ScatterGatherAsync_DuplicateCorrelationId_ThrowsInvalidOperationException() + { + var sut = CreateSut(timeoutMs: 60_000); + var correlationId = 
Guid.NewGuid(); + + // First request: block the producer so the operation stays active + _producer.PublishAsync(Arg.Any>(), Arg.Any(), Arg.Any()) + .Returns(new TaskCompletionSource().Task); // never completes + + var cts = new CancellationTokenSource(); + var request1 = new ScatterRequest(correlationId, "p1", ["topic-a"]); + var firstTask = sut.ScatterGatherAsync(request1, cts.Token); + + // Second request with same correlationId + var request2 = new ScatterRequest(correlationId, "p2", ["topic-b"]); + + Assert.ThrowsAsync(() => + sut.ScatterGatherAsync(request2)); + + cts.Cancel(); + } + + [Test] + public async Task ScatterGatherAsync_AllRecipientsRespond_ReturnsAllResponsesNotTimedOut() + { + var sut = CreateSut(timeoutMs: 5_000); + var correlationId = Guid.NewGuid(); + var recipients = new[] { "topic-a", "topic-b" }; + var request = new ScatterRequest(correlationId, "payload", recipients); + + // When scatter publishes, submit responses immediately + _producer + .When(p => p.PublishAsync(Arg.Any>(), Arg.Any(), Arg.Any())) + .Do(_ => + { + // Submit both responses after scatter completes + Task.Run(async () => + { + await Task.Delay(50); + await sut.SubmitResponseAsync(correlationId, + new GatherResponse("topic-a", "resp-a", DateTimeOffset.UtcNow, true, null)); + await sut.SubmitResponseAsync(correlationId, + new GatherResponse("topic-b", "resp-b", DateTimeOffset.UtcNow, true, null)); + }); + }); + + var result = await sut.ScatterGatherAsync(request); + + Assert.That(result.Responses, Has.Count.EqualTo(2)); + Assert.That(result.TimedOut, Is.False); + Assert.That(result.CorrelationId, Is.EqualTo(correlationId)); + Assert.That(result.Duration, Is.GreaterThan(TimeSpan.Zero)); + } + + [Test] + public async Task ScatterGatherAsync_PublishesToAllRecipients_VerifiesPublishCalls() + { + var sut = CreateSut(timeoutMs: 200); + var correlationId = Guid.NewGuid(); + var recipients = new[] { "topic-a", "topic-b", "topic-c" }; + var request = new ScatterRequest(correlationId, "payload", recipients); + + _producer.PublishAsync(Arg.Any>(), Arg.Any(), Arg.Any()) + .Returns(Task.CompletedTask); + + await sut.ScatterGatherAsync(request); + + await _producer.Received(3) + .PublishAsync(Arg.Any>(), Arg.Any(), Arg.Any()); + + foreach (var topic in recipients) + { + await _producer.Received(1) + .PublishAsync(Arg.Any>(), topic, Arg.Any()); + } + } + + [Test] + public async Task ScatterGatherAsync_Timeout_ReturnsPartialResultsWithTimedOutTrue() + { + var sut = CreateSut(timeoutMs: 200); + var correlationId = Guid.NewGuid(); + var request = new ScatterRequest(correlationId, "payload", ["topic-a", "topic-b"]); + + // Submit only one response (on the first publish only) + var submitted = 0; + _producer + .When(p => p.PublishAsync(Arg.Any>(), Arg.Any(), Arg.Any())) + .Do(_ => + { + if (Interlocked.Increment(ref submitted) == 1) + { + Task.Run(async () => + { + await Task.Delay(50); + await sut.SubmitResponseAsync(correlationId, + new GatherResponse("topic-a", "resp-a", DateTimeOffset.UtcNow, true, null)); + }); + } + }); + + var result = await sut.ScatterGatherAsync(request); + + Assert.That(result.Responses, Has.Count.EqualTo(1)); + Assert.That(result.TimedOut, Is.True); + } + + // ── SubmitResponseAsync ────────────────────────────────────────── + + [Test] + public void SubmitResponseAsync_NullResponse_ThrowsArgumentNullException() + { + var sut = CreateSut(); + + Assert.ThrowsAsync(() => + sut.SubmitResponseAsync(Guid.NewGuid(), null!)); + } + + [Test] + public async Task 
SubmitResponseAsync_UnknownCorrelationId_ReturnsFalse() + { + var sut = CreateSut(); + var response = new GatherResponse("topic-a", "resp", DateTimeOffset.UtcNow, true, null); + + var accepted = await sut.SubmitResponseAsync(Guid.NewGuid(), response); + + Assert.That(accepted, Is.False); + } + + // ── ScatterGatherResult ────────────────────────────────────────── + + [Test] + public void ScatterGatherResult_RecordProperties_RetainValues() + { + var id = Guid.NewGuid(); + var responses = new List> + { + new("topic-a", "resp-a", DateTimeOffset.UtcNow, true, null), + }; + var duration = TimeSpan.FromMilliseconds(123); + + var result = new ScatterGatherResult(id, responses, TimedOut: true, duration); + + Assert.That(result.CorrelationId, Is.EqualTo(id)); + Assert.That(result.Responses, Has.Count.EqualTo(1)); + Assert.That(result.TimedOut, Is.True); + Assert.That(result.Duration, Is.EqualTo(duration)); + } + + // ── GatherResponse ─────────────────────────────────────────────── + + [Test] + public void GatherResponse_RecordProperties_RetainValues() + { + var now = DateTimeOffset.UtcNow; + var response = new GatherResponse("topic-a", "resp", now, false, "error msg"); + + Assert.That(response.Recipient, Is.EqualTo("topic-a")); + Assert.That(response.Payload, Is.EqualTo("resp")); + Assert.That(response.ReceivedAt, Is.EqualTo(now)); + Assert.That(response.IsSuccess, Is.False); + Assert.That(response.ErrorMessage, Is.EqualTo("error msg")); + } + + // ── ScatterRequest ─────────────────────────────────────────────── + + [Test] + public void ScatterRequest_RecordProperties_RetainValues() + { + var id = Guid.NewGuid(); + var recipients = new[] { "a", "b" }; + var request = new ScatterRequest(id, "payload", recipients); + + Assert.That(request.CorrelationId, Is.EqualTo(id)); + Assert.That(request.Payload, Is.EqualTo("payload")); + Assert.That(request.Recipients, Is.EquivalentTo(recipients)); + } +} + +[TestFixture] +public class ScatterGatherOptionsTests +{ + [Test] + public void TimeoutMs_Default_Is30000() + { + var options = new ScatterGatherOptions(); + Assert.That(options.TimeoutMs, Is.EqualTo(30_000)); + } + + [Test] + public void MaxRecipients_Default_Is50() + { + var options = new ScatterGatherOptions(); + Assert.That(options.MaxRecipients, Is.EqualTo(50)); + } + + [Test] + public void Properties_WhenSet_RetainValues() + { + var options = new ScatterGatherOptions + { + TimeoutMs = 5_000, + MaxRecipients = 10, + }; + + Assert.That(options.TimeoutMs, Is.EqualTo(5_000)); + Assert.That(options.MaxRecipients, Is.EqualTo(10)); + } +} + +[TestFixture] +public class ScatterGatherServiceExtensionsTests +{ + [Test] + public void AddScatterGather_NullServices_ThrowsArgumentNullException() + { + var config = new ConfigurationBuilder().Build(); + + Assert.Throws(() => + ScatterGatherServiceExtensions.AddScatterGather(null!, config)); + } + + [Test] + public void AddScatterGather_NullConfiguration_ThrowsArgumentNullException() + { + var services = new ServiceCollection(); + + Assert.Throws(() => + services.AddScatterGather(null!)); + } + + [Test] + public void AddScatterGather_ValidArgs_RegistersServices() + { + var services = new ServiceCollection(); + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["ScatterGather:TimeoutMs"] = "10000", + ["ScatterGather:MaxRecipients"] = "25", + }) + .Build(); + + services.AddSingleton(Substitute.For()); + services.AddLogging(); + services.AddScatterGather(config); + + var sp = services.BuildServiceProvider(); + var instance = 
sp.GetService>(); + + Assert.That(instance, Is.Not.Null); + Assert.That(instance, Is.InstanceOf>()); + } +} diff --git a/EnterpriseIntegrationPlatform/tutorials/01-introduction.md b/EnterpriseIntegrationPlatform/tutorials/01-introduction.md index e00561a..f62f59b 100644 --- a/EnterpriseIntegrationPlatform/tutorials/01-introduction.md +++ b/EnterpriseIntegrationPlatform/tutorials/01-introduction.md @@ -152,13 +152,49 @@ By the end of this course, you'll understand how to: --- -## Exercises +## Lab -1. **Explore the EIP website**: Visit [enterpriseintegrationpatterns.com](https://www.enterpriseintegrationpatterns.com/patterns/messaging/toc.html) and browse the pattern catalog. Pick three patterns and think about where you've seen them (or could use them) in your own work. +**Objective:** Map EIP pattern categories to concrete platform components and trace how the Pipes and Filters architecture enables scalable message processing. -2. **Review the architecture**: Read [`docs/architecture-overview.md`](../docs/architecture-overview.md) and identify how the platform's data flow maps to the Pipes and Filters pattern. +### Step 1: Map Patterns to Projects -3. **Count the patterns**: Look at [`docs/eip-mapping.md`](../docs/eip-mapping.md) and count how many of the 65 EIP patterns are implemented in the platform. +Open [`docs/eip-mapping.md`](../docs/eip-mapping.md). For each of the following EIP categories, identify the `src/` project that implements it and the primary interface it exposes: + +| Category | Project | Interface | +|----------|---------|-----------| +| Message Construction | `src/Contracts/` | ? | +| Content-Based Router | `src/Processing.Routing/` | ? | +| Message Translator | `src/Processing.Translator/` | ? | +| Splitter | `src/Processing.Splitter/` | ? | +| Dead Letter Channel | `src/Processing.DeadLetter/` | ? | + +### Step 2: Trace the Pipes and Filters Chain + +Open [`docs/architecture-overview.md`](../docs/architecture-overview.md) and trace how a single message flows through the platform: Ingress → Broker → Workflow → Activities → Connectors. For each stage, write down which EIP pattern it implements and how the platform guarantees **atomicity** (hint: look at Temporal workflows and Ack/Nack). + +### Step 3: Evaluate Scalability Points + +Identify three places in the architecture where **horizontal scaling** is possible without code changes. Consider: broker partitions, Competing Consumers (`src/Processing.CompetingConsumers/`), and workflow workers. For each, explain what happens to in-flight messages when a new instance is added. + +## Exam + +1. Which integration style does the EIP book recommend for loosely coupled, asynchronous communication between systems? + - A) File Transfer + - B) Shared Database + - C) Messaging + - D) Remote Procedure Invocation + +2. In the Pipes and Filters pattern, what property must each filter maintain to allow independent scaling? + - A) Global mutable state shared across filters + - B) Stateless processing with all context carried in the message envelope + - C) Direct method calls to the next filter in the chain + - D) A persistent database connection for every filter + +3. How does the platform guarantee **zero message loss** when a processing step fails mid-pipeline? 
+ - A) Messages are stored in memory and retried indefinitely + - B) Temporal workflows provide durable execution with saga compensation — either all steps complete or compensating actions roll back committed work + - C) The broker automatically resends messages every 5 seconds + - D) Failed messages are silently discarded to avoid blocking the pipeline --- diff --git a/EnterpriseIntegrationPlatform/tutorials/02-environment-setup.md b/EnterpriseIntegrationPlatform/tutorials/02-environment-setup.md index 831f898..3aaf44d 100644 --- a/EnterpriseIntegrationPlatform/tutorials/02-environment-setup.md +++ b/EnterpriseIntegrationPlatform/tutorials/02-environment-setup.md @@ -76,7 +76,7 @@ This downloads all NuGet packages defined in `Directory.Packages.props` (central dotnet build EnterpriseIntegrationPlatform.sln ``` -A clean build should complete with **0 errors**. The solution contains 44+ projects — this takes 30–60 seconds on first build. +A clean build should complete with **0 errors**. The solution contains many projects — this takes 30–60 seconds on first build. ### Step 4: Run the Tests @@ -86,14 +86,14 @@ dotnet test EnterpriseIntegrationPlatform.sln The test suite includes: -| Test Project | Count | Description | -|-------------|-------|-------------| -| UnitTests | 1,100+ | Fast, isolated tests for every component | -| ContractTests | 58 | Contract verification between services | -| WorkflowTests | 29 | Temporal workflow behavior tests | -| IntegrationTests | 17 | Testcontainers-based tests with real infrastructure | -| PlaywrightTests | 13 | End-to-end browser tests for OpenClaw UI | -| LoadTests | 10 | Performance and throughput benchmarks | +| Test Project | Description | +|-------------|-------------| +| UnitTests | Fast, isolated tests for every component (most numerous) | +| ContractTests | Contract verification between services | +| WorkflowTests | Temporal workflow behavior tests | +| IntegrationTests | Testcontainers-based tests with real infrastructure | +| PlaywrightTests | End-to-end browser tests for OpenClaw UI | +| LoadTests | Performance and throughput benchmarks | > **Note:** IntegrationTests and PlaywrightTests require Docker to be running. @@ -144,7 +144,7 @@ The dashboard at `https://localhost:15888` (or the URL shown in console output) ``` EnterpriseIntegrationPlatform/ -├── src/ # Source code (44+ projects) +├── src/ # Source code │ ├── AppHost/ # .NET Aspire orchestrator │ ├── ServiceDefaults/ # Shared OpenTelemetry & health checks │ ├── Contracts/ # IntegrationEnvelope & shared interfaces @@ -246,13 +246,63 @@ You need `Microsoft.NETCore.App 10.x.x` and `Microsoft.AspNetCore.App 10.x.x`. --- -## Exercises +## Lab -1. **Explore the solution**: Open the `.sln` file in your IDE and browse the project list. Count how many `Processing.*` projects exist. +**Objective:** Build the solution, launch the Aspire orchestrator, and explore how the platform's service topology implements the EIP Messaging Gateway and Control Bus patterns. -2. **Read the tests**: Open `tests/UnitTests/` and browse the test namespaces. Each namespace corresponds to a `src/` project. +### Step 1: Build and Launch -3. **Explore Aspire**: Launch the AppHost and click through the Aspire dashboard. Find the health check endpoints for each service. +Open a terminal in the repository root and execute: + +```bash +dotnet restore EnterpriseIntegrationPlatform.sln +dotnet build EnterpriseIntegrationPlatform.sln +``` + +Confirm the build succeeds with zero errors and zero warnings. 
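+
+Before moving on, it can help to sanity-check the toolchain and run the fastest slice of the test suite. The commands below are a minimal optional sketch, not a required lab step: `dotnet --list-runtimes` is the standard .NET CLI way to list installed runtimes, and the `tests/UnitTests/` path is assumed from the repository layout shown earlier — adjust it if your checkout differs.
+
+```bash
+# List installed runtimes — you should see Microsoft.NETCore.App 10.x
+# and Microsoft.AspNetCore.App 10.x among the entries
+dotnet --list-runtimes
+
+# Run only the unit tests: fast, isolated, and no Docker required
+dotnet test tests/UnitTests
+```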
+ +### Step 2: Explore the Aspire Service Topology + +Start the orchestrator: + +```bash +cd src/AppHost +dotnet run +``` + +Open the Aspire dashboard URL printed in the console. Identify each service and classify it by EIP role: + +| Service | EIP Role | +|---------|----------| +| Gateway.Api | Messaging Gateway — single entry point for external systems | +| Admin.Api | Control Bus — runtime administration and monitoring | +| OpenClaw.Web | ? (identify its role) | + +Click each resource's health endpoint. Explain why health checks are essential for **scalability** — what happens when a load balancer cannot determine service health? + +### Step 3: Trace a Message Path Through Services + +Using the Aspire dashboard's **Traces** tab, identify the OpenTelemetry spans created when a message enters the Gateway. Draw the message flow: Gateway → Broker → Workflow → Activities → Connector. For each hop, note which EIP pattern is being applied (e.g., Gateway = Messaging Gateway, Broker = Message Channel, Workflow = Process Manager). + +## Exam + +1. In the EIP Messaging Gateway pattern, what is the gateway's primary responsibility? + - A) Transform message payloads between formats + - B) Provide a single entry point that encapsulates messaging-specific logic and shields external systems from internal broker details + - C) Store messages permanently in a database + - D) Route messages based on content inspection + +2. Why does the platform use .NET Aspire to orchestrate services rather than starting each service manually? + - A) Aspire encrypts all inter-service communication automatically + - B) Aspire ensures services start in dependency order with shared configuration, health checks, and observability — critical for a distributed integration platform's operational reliability + - C) Manual startup is not supported by .NET 10 + - D) Aspire compiles all services into a single executable + +3. How does the Control Bus pattern (implemented by Admin.Api) support **operational scalability**? + - A) It routes business messages to faster consumers + - B) It provides centralized runtime management — feature flags, DLQ resubmission, and health monitoring — without modifying or redeploying processing pipelines + - C) It increases the number of broker partitions automatically + - D) It caches all messages in memory for faster retrieval --- diff --git a/EnterpriseIntegrationPlatform/tutorials/03-first-message.md b/EnterpriseIntegrationPlatform/tutorials/03-first-message.md index e8f6d85..96951f4 100644 --- a/EnterpriseIntegrationPlatform/tutorials/03-first-message.md +++ b/EnterpriseIntegrationPlatform/tutorials/03-first-message.md @@ -266,16 +266,62 @@ public class IntegrationEnvelopeTests --- -## Exercises +## Lab -1. **Trace a CorrelationId**: Imagine you publish a message, it gets split into 5 parts, each part gets transformed, and then they're aggregated back together. Which field ensures they all stay linked? What would `CausationId` be set to on each split message? +**Objective:** Create an `IntegrationEnvelope`, publish it to a Message Channel, and trace the Correlation Identifier through a publish-subscribe round-trip. -2. **Choose the Intent**: For each scenario, pick the correct `MessageIntent`: - - "Process this payment" → ? - - "Here is the quarterly report" → ? - - "A new customer registered" → ? +### Step 1: Create and Inspect an Integration Envelope -3. **Broker independence**: Why does the platform use `IMessageBrokerProducer` instead of calling Kafka/NATS directly? 
What happens when you switch from NATS to Pulsar? +Using the static factory method, create an envelope and inspect the EIP Message pattern fields it populates automatically: + +```csharp +var envelope = IntegrationEnvelope.Create( + payload: "{\"orderId\": 42, \"amount\": 99.95}", + source: "OrderService", + messageType: "order.created"); +``` + +Verify: `MessageId` is a non-empty `Guid` (Message Identity), `CorrelationId` is generated (Correlation Identifier pattern), `Timestamp` is UTC (for ordering and expiration), and `Priority` defaults to `Normal`. + +### Step 2: Trace the Message Lifecycle + +Draw the 8-step message lifecycle from the tutorial on paper or whiteboard: + +``` +CREATE → PUBLISH → PERSIST → CONSUME → WORKFLOW → ACTIVITIES → ACK/NACK → OBSERVE +``` + +For each step, identify: (a) which EIP pattern applies, (b) where **atomicity** is enforced (hint: PERSIST ensures durability, WORKFLOW ensures all-or-nothing), and (c) which step enables **scalability** through parallel processing (hint: CONSUME with consumer groups). + +### Step 3: Design a Multi-Consumer Topology + +Imagine you need both an **analytics service** and a **billing service** to receive `order.created` messages. Design the consumer group configuration: + +- Analytics: consumer group = `"analytics-processors"` (receives every message) +- Billing: consumer group = `"billing-processors"` (receives every message) +- Within billing, 3 instances share the load + +Explain which EIP patterns are at play: **Publish-Subscribe Channel** (different groups) vs. **Competing Consumers** (same group, multiple instances). Why does this design scale without code changes? + +## Exam + +1. What is the purpose of the `CorrelationId` field on `IntegrationEnvelope`? + - A) It uniquely identifies a single message in the broker's storage + - B) It links all messages that belong to the same logical business transaction, even across splits, transformations, and aggregations + - C) It stores the consumer group name for load balancing + - D) It provides the encryption key for message payloads + +2. Which `MessageIntent` value should be assigned to a message that instructs a downstream service to perform an action (e.g., "process this payment")? + - A) `MessageIntent.Event` + - B) `MessageIntent.Document` + - C) `MessageIntent.Command` + - D) There is no distinction — all messages are treated identically + +3. How does the broker abstraction (`IMessageBrokerProducer` / `IMessageBrokerConsumer`) support **atomic processing** in the message lifecycle? + - A) It encrypts every message before publishing + - B) It ensures the message is durably persisted in the broker before returning from `PublishAsync`, so the message survives producer crashes + - C) It compresses the payload to reduce latency + - D) It creates a database transaction around the publish call --- diff --git a/EnterpriseIntegrationPlatform/tutorials/04-integration-envelope.md b/EnterpriseIntegrationPlatform/tutorials/04-integration-envelope.md index f38d256..e119054 100644 --- a/EnterpriseIntegrationPlatform/tutorials/04-integration-envelope.md +++ b/EnterpriseIntegrationPlatform/tutorials/04-integration-envelope.md @@ -215,13 +215,45 @@ All five envelopes share the same `CorrelationId`. This lets you: --- -## Exercises +## Lab -1. **Design an envelope**: You receive an XML invoice from a partner via SFTP. Design the `IntegrationEnvelope` fields — what would `Source`, `MessageType`, `Intent`, and key metadata entries be? 
+**Objective:** Build causation chains and sequenced message sets that demonstrate how the Envelope Wrapper pattern preserves **atomicity** and **traceability** across a multi-step integration pipeline. -2. **Trace the chain**: A message arrives, gets validated, split into 3 parts, each part is transformed, and all 3 are aggregated. How many envelopes are created total? Draw the `CausationId` chain. +### Step 1: Build a Causation Chain (Message Lineage) -3. **Expiration scenario**: A message has `ExpiresAt = now + 5 minutes`. Processing takes 6 minutes. What happens? Which component handles this? +Write code that simulates a three-step processing pipeline. Create an original envelope with `IntegrationEnvelope.Create()`. Then create a second envelope (transformation result) using a `with` expression — set its `CausationId` to the first envelope's `MessageId` and keep the same `CorrelationId`. Create a third envelope whose `CausationId` is the second. Verify all three share the same `CorrelationId` but have distinct `MessageId` values. + +This lineage is essential for **atomicity**: if step 3 fails, the saga compensation engine uses the `CausationId` chain to identify and roll back exactly the right upstream steps. + +### Step 2: Model a Splitter Output with Sequencing + +Create three envelopes representing a Splitter's output. Use `with` expressions to set `SequenceNumber` (0, 1, 2) and `TotalCount` (3). Also set `ExpiresAt` on one envelope to 5 minutes from now, and on another to a time in the past. Verify `IsExpired` returns the correct value. + +Explain why the **Message Expiration** pattern is critical for scalability: in a high-throughput system, stale messages must be routed to the Dead Letter Queue rather than consuming resources processing outdated data. + +### Step 3: Design an Atomicity Scenario + +Imagine an order message is split into 3 line-item messages. Line-item 2 fails delivery. Using the envelope fields (`CorrelationId`, `CausationId`, `SequenceNumber`, `TotalCount`), describe how the platform can: (a) identify all 3 messages as belonging to the same operation, (b) determine which specific message failed, and (c) trigger compensation for line-items 1 and 3 that already succeeded. + +## Exam + +1. Why is `IntegrationEnvelope` defined as a C# `record` rather than a `class`? + - A) Records are faster to serialize than classes + - B) Records provide immutability via `with` expressions, ensuring envelopes are never accidentally mutated during concurrent processing — critical for thread-safe scalability + - C) The .NET runtime requires records for generic types + - D) Records automatically encrypt their properties + +2. In a causation chain where message A is split into messages B₁, B₂, and B₃, what value should the `CausationId` of each split message contain? + - A) Its own `MessageId` + - B) The `CorrelationId` of message A + - C) The `MessageId` of message A — the parent that caused the split + - D) A new randomly generated `Guid` + +3. How does the `IsExpired` check contribute to the platform's **zero message loss** guarantee? 
+ - A) Expired messages are silently dropped to save resources + - B) Expired messages are routed to the Dead Letter Queue with reason "expired", ensuring they are never silently lost but also don't consume processing capacity for stale data + - C) The broker automatically deletes expired messages + - D) `IsExpired` prevents messages from being published in the first place --- diff --git a/EnterpriseIntegrationPlatform/tutorials/05-message-brokers.md b/EnterpriseIntegrationPlatform/tutorials/05-message-brokers.md index f5af7ea..8439690 100644 --- a/EnterpriseIntegrationPlatform/tutorials/05-message-brokers.md +++ b/EnterpriseIntegrationPlatform/tutorials/05-message-brokers.md @@ -232,16 +232,59 @@ Is this task delivery (process and acknowledge)? --- -## Exercises +## Lab -1. **HOL blocking scenario**: You have 1,000 recipients. Recipient 500 has a message that takes 60 seconds to process. With Kafka (10 partitions), how many other recipients are potentially blocked? With NATS queue groups? +**Objective:** Design a broker topic hierarchy for a multi-tenant system and analyze how different broker architectures affect **scalability** and **message ordering guarantees**. -2. **Topic design**: Design the topic hierarchy for a system that processes invoices, payments, and refunds across 3 regions (US, EU, APAC). Use NATS subject hierarchy. +### Step 1: Design a Multi-Region Topic Hierarchy -3. **Broker selection**: For each scenario, choose the best broker: - - Real-time analytics dashboard consuming all order events - - Processing customer onboarding requests (each takes 5-30 seconds) - - Audit trail that must be retained for 7 years +Design a NATS subject hierarchy for a multi-region e-commerce system with: orders, payments, and refunds across three regions (US, EU, APAC). Use NATS conventions (`.` for levels, `*` for single-level wildcard, `>` for multi-level wildcard): + +``` +eip.{region}.{domain}.{event} +Example: eip.us.orders.created +``` + +Write subscriber patterns for: (a) all events in EU: `eip.eu.>`, (b) all order events globally: `eip.*.orders.*`, (c) only payment completions in APAC: `eip.apac.payments.completed`. + +Explain how this hierarchy enables **horizontal scalability** — new regions can be added without changing existing subscribers. + +### Step 2: Compare Broker Scalability Characteristics + +Create a comparison table for Kafka, NATS JetStream, and Pulsar: + +| Characteristic | Kafka | NATS JetStream | Pulsar | +|---------------|-------|----------------|--------| +| Ordering guarantee | Per-partition | Per-subject | Per-key (Key_Shared) | +| HOL blocking risk | ? | ? | ? | +| Multi-tenant isolation | ? | ? | ? | +| Scale-out mechanism | ? | ? | ? | + +For each cell, explain the implication for a platform processing 10,000 messages/second from 50 tenants. + +### Step 3: Design for Atomicity Across Broker Switches + +The platform uses `IMessageBrokerProducer` / `IMessageBrokerConsumer` to abstract the broker. Describe a scenario where switching from NATS to Kafka for a specific message type would change the **atomicity** guarantees (hint: Kafka's transactional producer vs. NATS at-least-once). What compensating design would the platform need? + +## Exam + +1. What is head-of-line (HOL) blocking and why is it a **scalability** problem? 
+ - A) HOL blocking occurs when a slow message in a partition delays all subsequent messages; NATS queue groups avoid it because any available consumer can pick up any message + - B) HOL blocking is a network-layer issue that all brokers handle identically + - C) HOL blocking only affects messages with `MessagePriority.Low` + - D) HOL blocking means messages are delivered out of order + +2. Why does the platform define `IMessageBrokerProducer` and `IMessageBrokerConsumer` as abstractions rather than coding directly against a specific broker SDK? + - A) The broker SDKs do not support .NET 10 + - B) It allows the broker implementation to be swapped at deployment time without changing application code — enabling different scalability and atomicity trade-offs per workload + - C) Abstractions are required by the C# compiler for async methods + - D) Each broker uses a different serialization format + +3. When would you choose Apache Pulsar's Key_Shared subscription over Kafka's partition-based consumption for **multi-tenant scalability**? + - A) When you need strict global order across all keys + - B) When you want per-key ordering without cross-key head-of-line blocking — one tenant's slow processing should not affect others + - C) When your messages do not have any key + - D) When you require messages to be stored for less than 24 hours --- diff --git a/EnterpriseIntegrationPlatform/tutorials/06-messaging-channels.md b/EnterpriseIntegrationPlatform/tutorials/06-messaging-channels.md index df2f487..ae841b6 100644 --- a/EnterpriseIntegrationPlatform/tutorials/06-messaging-channels.md +++ b/EnterpriseIntegrationPlatform/tutorials/06-messaging-channels.md @@ -246,17 +246,61 @@ Here's how a typical message flow uses multiple channel types: --- -## Exercises +## Lab -1. **Channel selection**: For each scenario, choose the channel type: - - Processing incoming purchase orders (one processor per order) - - Notifying 5 different systems when a shipment is dispatched - - Handling messages that arrive as XML or JSON from different partners - - Quarantining messages with missing required fields +**Objective:** Classify messaging scenarios by channel type and design a channel topology that ensures **atomic delivery** and **scalable fan-out**. -2. **Bridge design**: Your company uses Kafka for everything but wants to add NATS for new microservices. Design a Messaging Bridge that keeps both systems in sync. +### Step 1: Map Scenarios to Channel Types -3. **Dead Letter Channel**: How does the Invalid Message Channel relate to the Dead Letter Channel? When would you use each? +For each scenario, identify the correct EIP channel pattern and the platform class that implements it: + +| Scenario | Channel Pattern | Platform Class | +|----------|----------------|----------------| +| Processing purchase orders (one processor per order) | ? | `PointToPointChannel` or `PublishSubscribeChannel`? | +| Notifying 5 systems when a shipment is dispatched | ? | ? | +| Handling messages in XML or JSON from different partners | ? | ? | +| Quarantining messages with missing required fields | ? | ? | + +Open `src/Processing.Channels/` and verify your answers against the actual implementations. + +### Step 2: Design a Messaging Bridge for Broker Migration + +Your company uses Kafka for all integrations but wants to add NATS for new microservices. 
Using the `MessagingBridge` class in `src/Processing.Channels/`, design a bridge configuration that: + +- Reads from Kafka topic `legacy.orders.created` +- Publishes to NATS subject `eip.orders.created` +- Preserves the `CorrelationId` and all `Metadata` across the bridge + +Draw the message flow and identify where **atomicity** could be lost (hint: what if the bridge crashes after reading from Kafka but before publishing to NATS?). How does the platform's Ack/Nack pattern mitigate this? + +### Step 3: Evaluate Scalability of Channel Patterns + +Compare Point-to-Point and Publish-Subscribe channels under high load: + +- Point-to-Point with 3 competing consumers processing 10,000 messages/second +- Pub-Sub with 5 subscriber groups, each with 2 consumers + +For each, explain: How does adding more consumers affect throughput? What happens to in-flight messages? Where is the bottleneck? + +## Exam + +1. In the EIP Messaging Bridge pattern, what is the bridge's primary responsibility? + - A) Transform message payloads between XML and JSON + - B) Connect two separate messaging systems while preserving message identity and metadata, enabling gradual broker migration without changing producers or consumers + - C) Compress messages to reduce broker storage requirements + - D) Route messages based on their content type header + +2. How does the Invalid Message Channel pattern contribute to **zero message loss**? + - A) Invalid messages are silently discarded to avoid poisoning downstream consumers + - B) Messages that cannot be parsed or violate schema rules are routed to a dedicated channel for inspection and reprocessing, ensuring they are never lost + - C) Invalid messages are automatically reformatted and retried + - D) The broker rejects invalid messages at the protocol level + +3. What is the key **scalability** difference between a Point-to-Point channel and a Publish-Subscribe channel? + - A) Point-to-Point channels cannot have multiple consumers + - B) In Point-to-Point, adding consumers distributes load (Competing Consumers); in Pub-Sub, adding subscriber groups creates independent copies of every message for parallel processing + - C) Publish-Subscribe channels are always faster than Point-to-Point + - D) Point-to-Point channels require Kafka while Pub-Sub requires NATS --- diff --git a/EnterpriseIntegrationPlatform/tutorials/07-temporal-workflows.md b/EnterpriseIntegrationPlatform/tutorials/07-temporal-workflows.md index f4c07f4..15b6a71 100644 --- a/EnterpriseIntegrationPlatform/tutorials/07-temporal-workflows.md +++ b/EnterpriseIntegrationPlatform/tutorials/07-temporal-workflows.md @@ -345,13 +345,60 @@ public class IntegrationPipelineWorkflowTests --- -## Exercises +## Lab -1. **Failure scenario**: A workflow has 4 steps. Step 3 fails. What does Temporal do? What happens if the worker crashes during Step 3's retry? +**Objective:** Trace how Temporal workflows enforce **atomic processing** with saga compensation, and design a failure recovery strategy for a multi-step integration pipeline. -2. **Saga compensation**: Design compensation for: (1) create customer record, (2) provision email account, (3) send welcome email. What compensates each step? +### Step 1: Trace a Failure Recovery Path -3. **Ack/Nack design**: An order processing workflow has 5 steps. Step 4 (warehouse check) says "out of stock." Should this be a Nack? What information should the Nack carry? +A workflow has 4 steps: Validate → Transform → Route → Deliver. 
Step 3 (Route) fails after Step 2 has already committed its result. Open `src/Workflow.Temporal/` and trace the code path: + +1. What does Temporal do when Step 3 throws an exception? (hint: retry policy) +2. If all retries are exhausted, how does the `AtomicPipelineWorkflow` trigger saga compensation? +3. What does `SagaCompensationActivities.CompensateStepAsync` do for Steps 1 and 2? + +Draw the timeline showing: original steps executed, failure point, compensation steps in reverse order. + +### Step 2: Design Compensation for a Business Scenario + +Design saga compensation for an order fulfilment workflow: + +| Step | Action | Compensation | +|------|--------|-------------| +| 1 | Create customer record in CRM | ? | +| 2 | Reserve inventory in warehouse | ? | +| 3 | Charge payment via gateway | ? | +| 4 | Send confirmation email | ? | + +For each compensation, identify: Is it idempotent? What happens if the compensation itself fails? How does the `CorrelationId` link the original action to its compensation? + +### Step 3: Evaluate Scalability of Workflow Workers + +Temporal workers poll task queues for workflow and activity tasks. Consider a scenario with 100 concurrent integrations: + +- How many workflow workers should you run? What happens when you add more? +- What is the relationship between worker count and **throughput**? +- Why does Temporal's durable execution model prevent duplicate processing even when workers scale horizontally? + +## Exam + +1. What happens when a Temporal workflow worker crashes in the middle of executing an activity? + - A) The message is lost permanently + - B) Another worker picks up the activity from the last checkpoint — Temporal's event history ensures exactly-once execution semantics with durable state + - C) The entire workflow restarts from Step 1 + - D) The broker automatically retries the message + +2. In the Saga Compensation pattern, why must compensation steps execute in **reverse order**? + - A) Reverse order is faster for the runtime to schedule + - B) Later steps may depend on earlier steps' state — compensating in reverse ensures each rollback sees a consistent state from the steps that preceded it + - C) The EIP book mandates reverse order for all patterns + - D) Temporal only supports reverse-order execution + +3. How does Temporal's durable execution model ensure **atomicity** across a multi-step integration pipeline? + - A) It wraps all steps in a database transaction + - B) It persists each step's completion in an event history — if a worker fails, another worker replays the history and resumes from the exact point of failure, never re-executing completed steps + - C) It locks the message broker partition until all steps complete + - D) It copies messages to a backup queue before processing --- diff --git a/EnterpriseIntegrationPlatform/tutorials/08-activities-pipeline.md b/EnterpriseIntegrationPlatform/tutorials/08-activities-pipeline.md index 6755cec..89280fb 100644 --- a/EnterpriseIntegrationPlatform/tutorials/08-activities-pipeline.md +++ b/EnterpriseIntegrationPlatform/tutorials/08-activities-pipeline.md @@ -296,13 +296,64 @@ public class PersistenceActivityTests --- -## Exercises +## Lab -1. **Design a pipeline**: You receive XML invoices via SFTP. Design the activity sequence: what activities do you need? In what order? +**Objective:** Design an activity pipeline for a real integration scenario, analyze failure modes, and identify where the Pipes and Filters pattern enables **independent scaling** of each stage. -2. 
**Failure handling**: Activity 3 (Transform) fails with a transient error. What happens? What if it fails with a permanent error (invalid schema)? +### Step 1: Design a Pipeline for XML Invoice Processing -3. **Extend the pipeline**: You need to add a "content enrichment" step that looks up customer data from a CRM API. Where in the activity chain would you add it? What interface would the activity use? +You receive XML invoices via SFTP. Design the complete activity sequence using the platform's activity classes: + +| Step | Activity | Class | Purpose | +|------|----------|-------|---------| +| 1 | Validate | `IntegrationActivities.ValidateMessageAsync` | Schema + payload checks | +| 2 | ? | ? | Sanitize input (XSS, SQL injection) | +| 3 | ? | ? | Transform XML → canonical JSON | +| 4 | ? | ? | Enrich with customer data from CRM | +| 5 | ? | ? | Route to correct downstream system | +| 6 | ? | ? | Deliver via HTTP connector | +| 7 | ? | ? | Persist to Cassandra | +| 8 | ? | ? | Send Ack/Nack notification | + +Open `src/Activities/` and `src/Workflow.Temporal/Activities/` to find the actual activity classes. + +### Step 2: Analyze Failure Modes and Atomicity + +For your pipeline above, analyze what happens at each failure point: + +- Step 3 fails with a **transient** error (network timeout) — what retry policy applies? +- Step 3 fails with a **permanent** error (invalid XML schema) — where does the message go? +- Step 6 fails after Step 7 already persisted — what compensation is needed? + +Explain how the Ack/Nack pattern at Step 8 ensures the originating system knows the final outcome, preserving **end-to-end atomicity**. + +### Step 3: Evaluate Per-Stage Scalability + +The Pipes and Filters pattern allows each activity to scale independently. For your pipeline: + +- Which step is likely the bottleneck under high load? (hint: external API calls) +- How would you scale Step 4 (CRM enrichment) without affecting Steps 1-3? +- What is the advantage of Temporal's activity-level retry over retrying the entire pipeline? + +## Exam + +1. In the Pipes and Filters pattern, what property must each filter (activity) maintain to allow **independent scaling**? + - A) All filters must share a single database connection + - B) Each filter processes the message using only the data in the envelope — no shared mutable state between filters — so multiple instances can run in parallel + - C) Filters must execute in a single thread to ensure ordering + - D) Each filter must cache results for the next filter + +2. Why does the platform split processing into separate activities (Validate, Transform, Route, Deliver) rather than a single monolithic handler? + - A) .NET requires separate classes for each async operation + - B) Separate activities enable independent retry policies, individual scaling, and granular saga compensation — a failure in Transform doesn't require re-running Validate + - C) Temporal cannot execute more than one method per workflow + - D) Separate activities reduce the total number of code lines + +3. What happens when an activity fails with a permanent error (e.g., invalid schema) in this platform? 
+ - A) The workflow retries indefinitely until the message becomes valid + - B) The message is routed to the Dead Letter Queue with the failure reason, a Nack notification is sent to the originating system, and the workflow terminates cleanly + - C) The activity silently drops the message + - D) The Temporal worker crashes and restarts --- diff --git a/EnterpriseIntegrationPlatform/tutorials/09-content-based-router.md b/EnterpriseIntegrationPlatform/tutorials/09-content-based-router.md index 896cb93..071b25c 100644 --- a/EnterpriseIntegrationPlatform/tutorials/09-content-based-router.md +++ b/EnterpriseIntegrationPlatform/tutorials/09-content-based-router.md @@ -97,13 +97,58 @@ The router publishes to the selected topic via the broker producer **before** ac --- -## Exercises +## Lab -1. You have three routing rules with priorities 10, 5, and 1. A message matches rules at priorities 5 and 1. Which topic does the message go to and why? +**Objective:** Configure routing rules with priorities, trace how the Content-Based Router dispatches messages, and analyze routing **scalability** under high-throughput conditions. -2. A new requirement says messages with `Payload.customer.tier = "platinum"` must go to `priority-processing`. Write the `RoutingRule` record for this requirement. +### Step 1: Configure a Multi-Rule Routing Table -3. Why is pre-compiling regex patterns important for a high-throughput router? What happens if you skip compilation? +Open `src/Processing.Routing/ContentBasedRouter.cs`. Create a routing configuration for an e-commerce platform: + +| Priority | Field | Operator | Value | Output Topic | +|----------|-------|----------|-------|-------------| +| 1 | `Payload.customer.tier` | Equals | `"platinum"` | `priority-processing` | +| 5 | `MessageType` | Equals | `"OrderCreated"` | `orders.standard` | +| 10 | `MessageType` | Matches | `"Return.*"` | `returns.processing` | +| 100 | (default) | — | — | `general.inbox` | + +A message arrives with `MessageType = "OrderCreated"` and `Payload.customer.tier = "platinum"`. Which topic does it route to? Explain how priority ordering ensures deterministic routing. + +### Step 2: Trace the Routing Decision Path + +Using the `RoutingDecision` record, trace the router's decision path for a message that matches rules at priorities 1 and 5. Open the router implementation and identify: + +- How does the router evaluate rules? (sequential scan vs. sorted by priority?) +- Does evaluation stop at the first match, or are all rules evaluated? +- What `RoutingDecision` is returned — does it include the matched rule for auditing? + +### Step 3: Design for Routing Scalability + +Consider a Content-Based Router processing 50,000 messages/second with 200 routing rules: + +- What is the computational cost per message? (hint: O(n) for n rules) +- How does pre-compiling regex patterns (`RoutingOperator.Matches`) improve throughput? +- If you need to route to different brokers (Kafka for audit, NATS for real-time), how would the router's output topic abstraction enable this without code changes? + +## Exam + +1. You have routing rules with priorities 10, 5, and 1. A message matches rules at priorities 5 and 1. Which topic receives the message? + - A) Both topics receive the message (fan-out) + - B) Priority 1 — the router selects the lowest priority number (highest precedence) among matches + - C) Priority 10 — the router always uses the first rule defined + - D) Priority 5 — the router stops at the first match in definition order + +2. 
How does the Content-Based Router pattern support **atomic message routing**? + - A) It copies the message to all matching topics simultaneously + - B) Each message is routed to exactly one output topic — the routing decision is deterministic and idempotent, so replaying the same message always produces the same routing outcome + - C) It wraps the routing decision in a database transaction + - D) The router buffers messages until a batch is complete + +3. Why is pre-compiling regex patterns critical for **routing scalability** at high throughput? + - A) Pre-compilation reduces memory allocation per evaluation — without it, each message creates and discards regex objects, causing GC pressure that degrades throughput under load + - B) Pre-compilation is required by the .NET regex API + - C) Pre-compilation allows patterns to match across multiple lines + - D) Pre-compilation enables case-insensitive matching --- diff --git a/EnterpriseIntegrationPlatform/tutorials/10-message-filter.md b/EnterpriseIntegrationPlatform/tutorials/10-message-filter.md index d6e25ee..0d4ac42 100644 --- a/EnterpriseIntegrationPlatform/tutorials/10-message-filter.md +++ b/EnterpriseIntegrationPlatform/tutorials/10-message-filter.md @@ -64,6 +64,7 @@ public sealed class MessageFilterOptions public RuleLogicOperator Logic { get; init; } = RuleLogicOperator.And; public required string OutputTopic { get; init; } public string? DiscardTopic { get; init; } + public bool RequireDiscardTopic { get; init; } } ``` @@ -95,13 +96,71 @@ The platform enforces **no silent drops** in production deployments. When a `Dis --- -## Exercises +## Lab -1. Write a `MessageFilterOptions` configuration that passes only messages where `MessageType = "OrderCreated"` AND `Payload.total > 100`. Specify a `DiscardTopic`. +**Objective:** Configure message filter rules, analyze the no-silent-drop guarantee with `RequireDiscardTopic`, and design a filter topology for **scalable** multi-stage message processing. -2. A message fails all conditions but no `DiscardTopic` is configured. What happens? How would you change the design to prevent silent drops entirely? +### Step 1: Configure a Filter with Discard Routing -3. Compare the Message Filter to the Content-Based Router. When would you use a filter instead of a router? +Write a `MessageFilterOptions` configuration that passes only messages where `MessageType = "OrderCreated"` AND `Payload.total > 100`: + +```csharp +var options = new MessageFilterOptions +{ + Conditions = [ + new RuleCondition { FieldName = "MessageType", Operator = RuleConditionOperator.Equals, Value = "OrderCreated" }, + new RuleCondition { FieldName = "Payload.total", Operator = RuleConditionOperator.GreaterThan, Value = "100" } + ], + Logic = RuleLogicOperator.And, + OutputTopic = "high-value-orders", + DiscardTopic = "filtered-out.orders", + RequireDiscardTopic = true +}; +``` + +Explain what happens when `RequireDiscardTopic = true` and no `DiscardTopic` is configured — how does this enforce **zero message loss**? + +### Step 2: Trace the Filter's Atomicity Guarantee + +Open `src/Processing.Routing/MessageFilter.cs`. Trace the code path for a message that fails all conditions: + +1. The filter evaluates conditions → all fail → `MessageFilterResult.Passed = false` +2. With `DiscardTopic` set → message is published to the discard topic +3. With `DiscardTopic` null and `RequireDiscardTopic = true` → what exception is thrown? 
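+
+As a rough mental model of the three branches traced above, the pass/discard decision can be sketched as follows. This is an illustrative simplification, not the actual `MessageFilter` source — the real behaviour, including the exact exception type in the third branch, is what Step 2 asks you to confirm in `src/Processing.Routing/MessageFilter.cs`. The `PublishAsync(envelope, topic, cancellationToken)` shape mirrors how `IMessageBrokerProducer` is used elsewhere in this course.
+
+```csharp
+// Illustrative sketch only — simplified from the behaviour described in this tutorial,
+// not the platform's actual implementation.
+public static class FilterDecisionSketch
+{
+    public static async Task RouteAsync(
+        IntegrationEnvelope envelope,   // the message being filtered
+        bool passed,                    // result of evaluating options.Conditions with options.Logic
+        MessageFilterOptions options,
+        IMessageBrokerProducer producer,
+        CancellationToken ct)
+    {
+        if (passed)
+        {
+            // 1. Conditions pass → deliver to the configured output topic.
+            await producer.PublishAsync(envelope, options.OutputTopic, ct);
+        }
+        else if (options.DiscardTopic is not null)
+        {
+            // 2. Conditions fail, discard topic configured → route explicitly, never drop silently.
+            await producer.PublishAsync(envelope, options.DiscardTopic, ct);
+        }
+        else if (options.RequireDiscardTopic)
+        {
+            // 3. Conditions fail, no discard topic, silent drops forbidden →
+            //    a configuration exception is thrown (Step 2 asks you to find the exact type).
+            throw new InvalidOperationException(
+                "A DiscardTopic must be configured when RequireDiscardTopic is true.");
+        }
+        // else: conditions fail, no discard topic, RequireDiscardTopic = false →
+        //       the message is logged and dropped (see Exam question 1).
+    }
+}
+```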
+ +Draw the decision tree and explain how this guarantees every message is either delivered to `OutputTopic` or explicitly routed to `DiscardTopic` — never silently dropped. + +### Step 3: Design a Multi-Stage Filter Pipeline + +Design a pipeline with three cascading filters for an insurance claims system: + +| Stage | Filter Criteria | Output | Discard | +|-------|----------------|--------|---------| +| 1 | Claim amount > $0 and valid policy number | `claims.validated` | `claims.invalid` | +| 2 | Claim type is "auto" or "home" | `claims.supported` | `claims.unsupported` | +| 3 | Claim amount < $50,000 (auto-approve threshold) | `claims.auto-approve` | `claims.manual-review` | + +How does each filter's **discard topic** become a different team's input? How does this design scale — can each filter stage run independently with its own consumer group? + +## Exam + +1. A message fails all filter conditions but no `DiscardTopic` is configured and `RequireDiscardTopic = false`. What happens? + - A) The filter throws an `InvalidOperationException` + - B) The message is silently dropped — the filter logs a warning but takes no further action + - C) The message is automatically routed to the Dead Letter Queue + - D) The filter retries evaluation with relaxed conditions + +2. How does the Message Filter differ from the Content-Based Router in the EIP pattern catalog? + - A) They are identical patterns with different names + - B) The Router selects one of many output channels based on content; the Filter has a binary decision — pass or discard — making it simpler and more efficient for yes/no criteria + - C) The Filter can route to multiple topics simultaneously + - D) The Router only works with XML messages + +3. Why is `RequireDiscardTopic` essential for **production atomicity** in enterprise integration? + - A) It improves message throughput by forcing batch processing + - B) It prevents silent message loss — in production, every message must be accounted for, and throwing an exception forces the team to configure a discard destination before deployment + - C) It enables faster regex evaluation + - D) It is required by the NATS JetStream protocol --- diff --git a/EnterpriseIntegrationPlatform/tutorials/11-dynamic-router.md b/EnterpriseIntegrationPlatform/tutorials/11-dynamic-router.md index fa01011..5c00a0f 100644 --- a/EnterpriseIntegrationPlatform/tutorials/11-dynamic-router.md +++ b/EnterpriseIntegrationPlatform/tutorials/11-dynamic-router.md @@ -104,13 +104,59 @@ Routing decisions are deterministic for a given routing-table snapshot. If the p --- -## Exercises +## Lab -1. Participant D registers `conditionKey = "invoices"` with destination `"invoice-processing"`. A message arrives with `MessageType = "invoices"`. Trace the routing path. +**Objective:** Trace how the Dynamic Router updates its routing table at runtime, analyze the EIP pattern's role in **scalable** integration topologies, and design a consistent routing strategy for distributed deployments. -2. What happens if Participant D unregisters and a new message with `conditionKey = "invoices"` arrives before any other participant registers for that key? +### Step 1: Trace a Dynamic Registration Flow -3. How would you make the routing table consistent across 5 router replicas? Describe a broker-based approach. +Open `src/Processing.Routing/DynamicRouter.cs`. A new participant registers with `conditionKey = "invoices"` and destination `"invoice-processing"`. Then a message arrives with `MessageType = "invoices"`. 
Trace the code path: + +1. How does `RegisterAsync` store the mapping? +2. How does `RouteAsync` look up the destination? +3. What `RoutingDecision` is returned — does it include the matched condition for auditing? + +Now: Participant unregisters. A new message with the same key arrives. What happens? Where does the message go? + +### Step 2: Design for Multi-Replica Consistency + +You have 5 Dynamic Router replicas behind a load balancer. Participant D registers on Replica 1, but Replica 3 doesn't know about it. Design a solution using the platform's broker infrastructure: + +- Publish registration events to a `routing.registrations` topic +- Each replica subscribes and updates its local table +- How does this use the **Publish-Subscribe Channel** pattern to keep all replicas consistent? +- What happens to messages during the propagation delay? Is this an **atomicity** concern? + +### Step 3: Compare Dynamic Router Scalability vs. Content-Based Router + +| Aspect | Content-Based Router | Dynamic Router | +|--------|---------------------|---------------| +| Rule source | Static configuration | Runtime registrations | +| Adding new routes | ? | ? | +| Scalability model | ? | ? | +| Consistency across replicas | ? | ? | + +When would you choose a Dynamic Router over a Content-Based Router in a multi-tenant SaaS platform? + +## Exam + +1. What EIP pattern does the Dynamic Router implement that the Content-Based Router does not? + - A) Message Filter with discard + - B) A self-updating routing table where downstream participants register and unregister their interests at runtime, enabling topology changes without redeploying the router + - C) Priority-based message queuing + - D) Batch message processing + +2. In a horizontally scaled deployment with multiple router instances, what is the main **consistency** challenge? + - A) All routers must share a single-threaded execution context + - B) Registration changes on one instance must propagate to all others — during propagation, different instances may route the same message to different destinations + - C) Dynamic routers cannot be scaled horizontally + - D) Each router instance requires its own broker connection + +3. How does the Dynamic Router pattern support **scalable** integration topology changes? + - A) It requires a full system restart to add new routes + - B) New services register their routing interests at startup — the router begins directing matching messages to them immediately, with no configuration changes or redeployments needed + - C) It pre-allocates routes for all possible message types + - D) It uses a database trigger to detect new services --- diff --git a/EnterpriseIntegrationPlatform/tutorials/12-recipient-list.md b/EnterpriseIntegrationPlatform/tutorials/12-recipient-list.md index d7ff7e6..eabc382 100644 --- a/EnterpriseIntegrationPlatform/tutorials/12-recipient-list.md +++ b/EnterpriseIntegrationPlatform/tutorials/12-recipient-list.md @@ -92,13 +92,62 @@ This ensures either all recipients get the message or the source is redelivered. --- -## Exercises +## Lab -1. A message matches two rules contributing destinations `["audit", "billing", "audit"]`. What does `RecipientListResult` report for `ResolvedCount` and `DuplicatesRemoved`? +**Objective:** Analyze how the Recipient List pattern enables **scalable fan-out** to multiple destinations, design duplicate-safe publishing, and measure the performance impact of parallel vs. sequential delivery. -2. 
Design a metadata-based recipient list where the sender specifies destinations in `Metadata["recipients"] = "topic-a,topic-b"`. What are the trade-offs vs. rule-based resolution? +### Step 1: Trace a Recipient List Resolution -3. With 10 recipients and one slow destination (3 s latency), how does parallel publishing help compared to sequential publishing? +A message matches two routing rules that produce destinations `["audit", "billing", "audit"]`. Open `src/Processing.Routing/RecipientListRouter.cs` and trace: + +1. How are duplicate destinations handled? What does `RecipientListResult.DuplicatesRemoved` report? +2. What is the final `ResolvedCount`? +3. How does the router publish to each destination — sequentially or in parallel? + +### Step 2: Design a Metadata-Driven Recipient List + +Some integration scenarios require the **sender** to specify recipients dynamically via envelope metadata: + +```csharp +envelope.Metadata["recipients"] = "audit,billing,compliance"; +``` + +Design this approach and compare trade-offs: + +| Approach | Pros | Cons | +|----------|------|------| +| Rule-based (server-side) | Centralized control, auditable | ? | +| Metadata-based (sender-specified) | ? | Sender must know all destinations | + +Which approach provides better **atomicity** guarantees? (hint: what if the sender specifies a non-existent topic?) + +### Step 3: Analyze Fan-Out Scalability + +With 10 recipients and one slow destination (3-second latency): + +- How does parallel publishing (platform's default) compare to sequential publishing? +- What is the total latency for parallel vs. sequential? (hint: parallel ≈ max latency, sequential ≈ sum) +- If the slow destination fails, should the message be Ack'd or Nack'd for the other 9 successful deliveries? Design your atomicity strategy. + +## Exam + +1. A Recipient List resolves 5 destinations. Publishing to destination 3 fails. What should the platform do to maintain **atomicity**? + - A) Silently skip destination 3 and Ack the remaining 4 + - B) Log the failure and track partial delivery — the message enters a compensable state where the failed destination can be retried independently without re-publishing to the successful 4 + - C) Retry all 5 destinations from the beginning + - D) Route the entire message to the Dead Letter Queue + +2. Why does the Recipient List remove duplicate destinations before publishing? + - A) Duplicates are not supported by the NATS protocol + - B) Publishing the same message to the same topic multiple times creates duplicate processing downstream — de-duplication ensures **idempotent fan-out** at the routing layer + - C) Duplicate topics cause build errors + - D) The broker ignores duplicate publishes automatically + +3. How does parallel publishing to multiple recipients improve **throughput scalability**? 
+ - A) It reduces the total message size + - B) Total fan-out latency equals the slowest recipient (not the sum of all) — this is critical when scaling to dozens of recipients, as sequential publishing would create unacceptable pipeline latency + - C) Parallel publishing uses less memory than sequential + - D) The broker handles parallelism internally regardless of how the producer publishes --- diff --git a/EnterpriseIntegrationPlatform/tutorials/13-routing-slip.md b/EnterpriseIntegrationPlatform/tutorials/13-routing-slip.md index 849f939..56be40e 100644 --- a/EnterpriseIntegrationPlatform/tutorials/13-routing-slip.md +++ b/EnterpriseIntegrationPlatform/tutorials/13-routing-slip.md @@ -111,13 +111,72 @@ The routing slip is stored in the envelope's `Metadata` dictionary as serialised --- -## Exercises +## Lab -1. Build a `RoutingSlip` with steps: Validate → Transform → Deliver. The Transform step needs a parameter `"targetFormat" = "XML"`. Write the C# construction code. +**Objective:** Build a Routing Slip, trace failure recovery with partial completion, and compare the Routing Slip pattern's **scalability** against Process Manager workflows. -2. A message has completed Validate and Transform but crashes during Deliver. What does the `RemainingSlip` look like when the message is redelivered? +### Step 1: Build a Routing Slip with Parameters -3. Compare a routing slip to a Temporal workflow pipeline. When would you choose a slip over a workflow? +Write C# code to construct a `RoutingSlip` with three steps: + +```csharp +var slip = new RoutingSlip([ + new RoutingSlipStep("Validate", new Dictionary()), + new RoutingSlipStep("Transform", new Dictionary + { + ["targetFormat"] = "XML", + ["schemaVersion"] = "2.0" + }), + new RoutingSlipStep("Deliver", new Dictionary + { + ["endpoint"] = "https://partner.example.com/api/orders" + }) +]); +``` + +Open `src/Processing.Routing/RoutingSlip.cs` and verify the record structure. How does each step carry its own parameters? Why is this important for **atomicity** — each step is self-contained with all the data it needs. + +### Step 2: Trace a Partial-Completion Recovery + +A message has completed Validate and Transform but the worker crashes during Deliver. The message is redelivered with the slip attached: + +1. What does `RemainingSlip` contain? (hint: only Deliver remains) +2. How does the platform know which steps already completed? +3. Are Validate and Transform re-executed? Why or why not? + +Draw the recovery timeline and explain how the Routing Slip pattern achieves **idempotent resume** — crashed messages resume from exactly where they left off. + +### Step 3: Compare Routing Slip vs. Temporal Workflow + +| Aspect | Routing Slip | Temporal Workflow (Process Manager) | +|--------|-------------|-------------------------------------| +| State persistence | In the message itself | In Temporal's event history | +| Dynamic step addition | ? | ? | +| Compensation support | ? | ? | +| Scalability | ? | ? | +| Best for | ? | ? | + +When would you choose a Routing Slip over a full Temporal workflow? Consider: simple linear pipelines vs. complex branching logic. + +## Exam + +1. A Routing Slip message has completed steps 1-3 of 5. The worker crashes. What happens on redelivery? 
+ - A) All 5 steps execute from the beginning + - B) The slip indicates steps 1-3 are complete — only steps 4-5 are in `RemainingSlip`, so processing resumes from step 4 without re-executing completed work + - C) The message is routed to the Dead Letter Queue + - D) A new slip is created with all 5 steps + +2. Why does the Routing Slip pattern carry processing state **inside the message** rather than in an external store? + - A) External stores are too slow for message processing + - B) The message is self-contained — any processor can pick it up and resume, enabling **horizontal scaling** without shared state coordination between consumers + - C) The message broker requires all state in the payload + - D) External stores don't support key-value parameters + +3. What is the key **scalability** advantage of a Routing Slip over a centralized Process Manager? + - A) Routing slips are faster to serialize + - B) No central coordinator is needed — each step independently reads the slip and forwards to the next, so the pattern scales linearly with more processors and has no single-point-of-failure bottleneck + - C) Process Managers cannot run on multiple machines + - D) Routing slips support more data formats --- diff --git a/EnterpriseIntegrationPlatform/tutorials/14-process-manager.md b/EnterpriseIntegrationPlatform/tutorials/14-process-manager.md index 5e2c041..d91233e 100644 --- a/EnterpriseIntegrationPlatform/tutorials/14-process-manager.md +++ b/EnterpriseIntegrationPlatform/tutorials/14-process-manager.md @@ -103,11 +103,10 @@ public sealed class SagaCompensationActivities public async Task CompensateStepAsync(Guid correlationId, string stepName) { await _logging.LogAsync(correlationId, stepName, $"CompensationStarted:{stepName}"); - var result = await _compensation.CompensateAsync(correlationId, stepName); - await _logging.LogAsync(correlationId, stepName, result - ? $"CompensationSucceeded:{stepName}" - : $"CompensationFailed:{stepName}"); - return result; + var success = await _compensationService.CompensateAsync(correlationId, stepName); + var stage = success ? $"CompensationSucceeded:{stepName}" : $"CompensationFailed:{stepName}"; + await _logging.LogAsync(correlationId, stepName, stage); + return success; } } ``` @@ -126,13 +125,67 @@ The `AtomicPipelineWorkflow` implements full **saga compensation**. Completed st --- -## Exercises +## Lab -1. A workflow has steps: Persist → Validate → Transform → Deliver. Transform succeeds but Deliver fails. List the compensation steps in execution order. +**Objective:** Trace the Process Manager's orchestration of multi-step workflows with saga compensation, and analyze how centralized coordination enables **atomic** all-or-nothing processing. -2. What is the key difference between a Process Manager and a Routing Slip? When would you choose one over the other? +### Step 1: Trace a Compensation Sequence -3. A compensation activity (`CompensateStepAsync`) itself fails. What does Temporal do? What does the platform log? +A workflow has steps: Persist → Validate → Transform → Deliver. Transform succeeds but Deliver fails after all retries. Open `src/Workflow.Temporal/AtomicPipelineWorkflow.cs` and trace: + +1. Which steps need compensation? (only steps that committed work) +2. In what order do compensation steps execute? (hint: reverse) +3. What does `SagaCompensationActivities.CompensateStepAsync` do for each step? 
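+
+Before filling in the compensation table below, here is a minimal sketch of the reverse-order unwind — illustrative only, not the actual `AtomicPipelineWorkflow` (which drives these calls through Temporal activities); `ExecuteStepAsync` and the field names are assumptions:
+
+```csharp
+// Sketch only — why compensation runs in reverse: only committed steps are pushed,
+// and the most recently committed step is undone first.
+var compensations = new Stack<string>();
+
+try
+{
+    foreach (var step in new[] { "Persist", "Validate", "Transform", "Deliver" })
+    {
+        await ExecuteStepAsync(correlationId, step); // hypothetical activity call
+        compensations.Push(step);                    // only steps that committed work are compensable
+    }
+}
+catch (Exception)
+{
+    // Deliver failed before being pushed, so the stack unwinds Transform → Validate → Persist.
+    while (compensations.Count > 0)
+    {
+        var step = compensations.Pop();
+        var success = await _compensationActivities.CompensateStepAsync(correlationId, step);
+        if (!success)
+        {
+            _logger.LogError("Compensation for {Step} failed; saga is only partially compensated.", step);
+        }
+    }
+    throw;
+}
+```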
+ +List the compensation sequence: + +| Order | Compensating | Original Step | +|-------|-------------|---------------| +| 1 | Undo Transform | Transform | +| 2 | ? | ? | +| 3 | ? | ? | + +### Step 2: Handle Compensation Failures + +The compensation for "Persist" itself fails. Open `src/Workflow.Temporal/Activities/SagaCompensationActivities.cs` and answer: + +- What does the `CompensateStepAsync` method return when compensation fails? +- Does Temporal retry the compensation? With what policy? +- What is logged? How does the operations team know that manual intervention is required? + +Design an alerting strategy for compensation failures — this is the **atomicity boundary** of the system. + +### Step 3: Compare Process Manager vs. Routing Slip + +| Aspect | Process Manager | Routing Slip | +|--------|----------------|-------------| +| Coordination | Centralized (Temporal) | Decentralized (in-message) | +| Compensation | Full saga support | Limited / manual | +| Visibility | Full execution history | ? | +| Scalability bottleneck | Temporal server | ? | +| Best for | Complex branching, compensation | ? | + +When would a Process Manager's centralized coordination be worth the **scalability** trade-off vs. a Routing Slip? + +## Exam + +1. In a Process Manager with saga compensation, why must compensation steps execute in **reverse order**? + - A) It's a convention with no technical reason + - B) Later steps may depend on earlier steps' committed state — reverse-order compensation ensures each rollback sees the state from the steps that preceded it, maintaining consistency + - C) Temporal only supports reverse execution + - D) Reverse order is faster for the scheduler + +2. A compensation step itself fails. What is the correct platform behavior for maintaining **atomicity**? + - A) Silently ignore the failure and mark the saga as complete + - B) Log the failure, mark the saga as partially compensated, and alert the operations team — some atomicity violations require human intervention when automatic compensation is impossible + - C) Restart the entire original workflow from Step 1 + - D) Route the compensation failure to the Dead Letter Queue and retry indefinitely + +3. What is the key advantage of the Process Manager pattern over the Routing Slip for **enterprise-grade atomicity**? + - A) Process Managers are faster for simple linear pipelines + - B) The Process Manager maintains a durable execution history with full saga compensation — if any step fails, all committed work can be rolled back to restore consistency + - C) Process Managers don't require a message broker + - D) Routing Slips cannot carry parameters --- diff --git a/EnterpriseIntegrationPlatform/tutorials/15-message-translator.md b/EnterpriseIntegrationPlatform/tutorials/15-message-translator.md index f23ac2c..7ad6dd3 100644 --- a/EnterpriseIntegrationPlatform/tutorials/15-message-translator.md +++ b/EnterpriseIntegrationPlatform/tutorials/15-message-translator.md @@ -97,13 +97,67 @@ The translator publishes the translated envelope to the target topic **before** --- -## Exercises +## Lab -1. Write a `FieldMapping` list that maps `{ "first_name": "Alice", "last_name": "Smith" }` to `{ "fullName": "Alice Smith", "source": "CRM" }`. Hint: one mapping uses `StaticValue`. +**Objective:** Build field mappings for cross-system data transformation, analyze how the Message Translator pattern preserves message **atomicity** through immutable transformations, and design a multi-format translation strategy. -2. 
When would you use `FuncPayloadTransform` vs `JsonFieldMappingTransform`? Give an example of each. +### Step 1: Build a Field Mapping Configuration -3. A translator receives a JSON message but the target system expects XML. Which platform components would you combine to achieve this? +Write a `FieldMapping` list that transforms this input: + +```json +{ "first_name": "Alice", "last_name": "Smith", "email": "alice@example.com" } +``` + +Into this output: + +```json +{ "fullName": "Alice Smith", "contactEmail": "alice@example.com", "source": "CRM" } +``` + +Identify: which mapping uses `SourceField`, which uses `StaticValue`, and how would you combine `first_name` + `last_name` into `fullName`? Open `src/Processing.Translator/JsonFieldMappingTransform.cs` to verify the mapping mechanics. + +### Step 2: Trace Immutability Through Translation + +Open `src/Processing.Translator/MessageTranslator.cs`. When a message is translated: + +1. Is the original `IntegrationEnvelope` mutated, or is a new envelope created? +2. How does the `CausationId` of the translated message link back to the original? +3. If translation fails (e.g., missing required field), what happens to the original message? + +Explain why **immutable transformation** is critical for atomicity: if translation fails, the original message is untouched and can be retried or routed to the DLQ. + +### Step 3: Design a Multi-Format Translation Pipeline + +A partner sends data in XML, but your downstream systems expect JSON. Another partner sends CSV. Design a translation strategy: + +| Source Format | Translator Step | Output | +|--------------|----------------|--------| +| XML → JSON | `XmlToJsonStep` | Canonical JSON | +| CSV → JSON | Custom `IPayloadTransform` | Canonical JSON | +| JSON → Canonical | `JsonFieldMappingTransform` | Normalized envelope | + +How does the **Canonical Data Model** (Tutorial 17 — Normalizer) relate to the Message Translator? Why is normalizing to a canonical format essential for **scalability** — what happens when you add a 5th source format? + +## Exam + +1. Why does the Message Translator create a **new envelope** rather than modifying the original? + - A) .NET records are always immutable + - B) Immutable transformation preserves the original for retry, DLQ routing, and audit — if translation fails, the untouched original maintains atomicity of the processing pipeline + - C) The broker rejects modified messages + - D) Creating new envelopes uses less memory + +2. When would you use `FuncPayloadTransform` (code-based) vs. `JsonFieldMappingTransform` (configuration-based)? + - A) They are interchangeable + - B) `JsonFieldMappingTransform` for simple field renaming/mapping that non-developers can configure; `FuncPayloadTransform` for complex logic like format conversion, calculations, or API enrichment that requires code + - C) `FuncPayloadTransform` is faster in all cases + - D) `JsonFieldMappingTransform` only works with XML + +3. How does the Canonical Data Model concept support **integration scalability**? 
+ - A) It reduces message size for faster transport + - B) All message sources translate to one canonical format — adding a new source system requires only one new translator, not N translators for N downstream consumers + - C) Canonical models encrypt data for security + - D) It eliminates the need for a message broker --- diff --git a/EnterpriseIntegrationPlatform/tutorials/16-transform-pipeline.md b/EnterpriseIntegrationPlatform/tutorials/16-transform-pipeline.md index cddecea..945eb55 100644 --- a/EnterpriseIntegrationPlatform/tutorials/16-transform-pipeline.md +++ b/EnterpriseIntegrationPlatform/tutorials/16-transform-pipeline.md @@ -108,13 +108,59 @@ The pipeline is **all-or-nothing** within a single invocation. If any step throw --- -## Exercises +## Lab -1. Design a 3-step pipeline that: (a) converts XML to JSON, (b) applies a regex to redact email addresses, (c) filters to keep only `$.order.id` and `$.order.total`. List the steps in order. +**Objective:** Design a multi-step transform pipeline, trace how immutable `TransformContext` preserves **atomicity** through each stage, and analyze pipeline **scalability** under failure conditions. -2. After step 2 of 4 the pipeline fails. What is `StepsApplied`? What happens to the source message? +### Step 1: Design a Transform Pipeline -3. Why does `TransformContext` use `WithPayload` instead of mutable setters? What concurrency benefit does this provide? +Design a 3-step pipeline for PCI-compliant order processing: + +| Step | Transform | Class | Purpose | +|------|-----------|-------|---------| +| 1 | XML → JSON | `XmlToJsonStep` | Convert partner XML to canonical JSON | +| 2 | Redact PII | `RegexReplaceStep` | Mask email addresses with `***@***` | +| 3 | Filter fields | `JsonPathFilterStep` | Keep only `$.order.id` and `$.order.total` | + +Open `src/Processing.Transformer/` and verify each step class exists. Write the `TransformOptions` configuration for this pipeline. + +### Step 2: Trace Failure Recovery with StepsApplied + +After step 2 of 4, the pipeline fails (e.g., `JsonPathFilterStep` encounters malformed JSON): + +1. What is `TransformPipelineResult.StepsApplied`? (answer: 2) +2. Is the original source message modified? (hint: `TransformContext.WithPayload` creates copies) +3. How does the pipeline decide whether to retry vs. route to DLQ? + +Explain why `TransformContext` uses `WithPayload` (immutable updates) instead of mutable setters — what **concurrency** benefit does this provide when multiple messages are being transformed in parallel? + +### Step 3: Evaluate Pipeline Scalability + +A pipeline processes 10,000 messages/second. Step 2 (regex redaction) is 5x slower than the other steps: + +- Can you scale Step 2 independently? (hint: in Temporal, each step is an activity) +- What happens to pipeline throughput if you add a 4th step? +- How does the Pipes and Filters architecture prevent a slow step from blocking the entire system? + +## Exam + +1. Why does `TransformContext` use `WithPayload` (immutable copy) instead of mutating the payload in place? + - A) Mutable payloads are not supported by .NET records + - B) Immutable context ensures that if a later step fails, earlier step results are preserved — enabling safe retry and parallel processing without data corruption from shared mutable state + - C) `WithPayload` is faster than direct mutation + - D) The broker requires immutable messages + +2. A transform pipeline has 5 steps. Step 3 fails permanently. What should happen for **atomic** message processing? 
+ - A) Steps 1-2 results are discarded and the original message is routed to the DLQ with failure context, preserving full traceability + - B) Steps 4-5 execute with partial data + - C) The pipeline retries all 5 steps from the beginning + - D) The message is silently dropped + +3. How does the Transform Pipeline pattern support **horizontal scalability**? + - A) All steps must run on the same machine + - B) Each step is an independent filter — Temporal can distribute steps across workers, and slow steps can be scaled by adding more activity workers without affecting other steps + - C) The pipeline pre-allocates resources for all steps + - D) Scalability is limited by the fastest step --- diff --git a/EnterpriseIntegrationPlatform/tutorials/17-normalizer.md b/EnterpriseIntegrationPlatform/tutorials/17-normalizer.md index f52eb39..3afca55 100644 --- a/EnterpriseIntegrationPlatform/tutorials/17-normalizer.md +++ b/EnterpriseIntegrationPlatform/tutorials/17-normalizer.md @@ -95,13 +95,66 @@ Normalization happens **before** any downstream processing. If normalization fai --- -## Exercises +## Lab -1. A partner sends CSV with `|` as delimiter and no header row. Write the `NormalizerOptions` configuration for this case. +**Objective:** Configure the Normalizer for multi-format input handling, analyze how the Canonical Data Model pattern enables **scalable** integration with diverse source systems, and design normalization strategies for edge cases. -2. A payload arrives with `contentType = "application/json"` but contains invalid JSON. What happens when `StrictContentType = true`? What about `false`? +### Step 1: Configure a CSV Normalizer -3. Why does the platform choose JSON as the canonical format rather than XML or a binary format like Protocol Buffers? +A partner sends CSV files with `|` as delimiter and no header row. Open `src/Processing.Normalizer/` and configure `NormalizerOptions`: + +```csharp +var options = new NormalizerOptions +{ + CsvDelimiter = '|', + CsvHasHeader = false, + CsvColumnNames = ["orderId", "customerId", "amount", "currency"], + StrictContentType = true +}; +``` + +Trace what happens when: (a) a valid CSV arrives, (b) JSON arrives with `contentType = "text/csv"` but `StrictContentType = true`. + +### Step 2: Map the Canonical Data Model + +The platform normalizes all inputs to JSON. Draw a diagram showing 4 source systems and how they funnel through the Normalizer: + +``` +Partner A (XML) ─────┐ +Partner B (CSV) ─────┤ +Partner C (JSON) ────┼──→ Normalizer ──→ Canonical JSON ──→ Router ──→ N consumers +Internal API (JSON) ─┘ +``` + +How many translators are needed for 4 sources and 6 consumers? With a canonical model: **4** (one per source). Without: **24** (4×6). This is the **scalability** argument for normalization. + +### Step 3: Handle Format Detection Failures + +A payload arrives with `contentType = "application/json"` but contains invalid JSON. Analyze: + +- What happens when `StrictContentType = true`? (exception → DLQ) +- What happens when `StrictContentType = false`? (format sniffing attempt) +- Why is strict mode recommended for production **atomicity** — what risks does lenient mode introduce? + +## Exam + +1. Why does the platform normalize all messages to a **Canonical Data Model** (JSON)? 
+ - A) JSON is faster to parse than all other formats + - B) A single canonical format means adding a new source system requires only one new translator — not one for every downstream consumer — making the integration platform scale linearly with the number of systems + - C) JSON is required by the NATS protocol + - D) The .NET runtime only supports JSON serialization + +2. What is the risk of setting `StrictContentType = false` in a production environment? + - A) No risk — lenient mode is always preferred + - B) A message could be misinterpreted — e.g., XML interpreted as JSON due to format sniffing — leading to corrupt data flowing through the pipeline undetected, violating **data atomicity** + - C) Lenient mode disables all content validation + - D) Strict mode is slower than lenient mode + +3. How does the Normalizer pattern reduce **integration complexity** when scaling from 5 to 50 connected systems? + - A) It doesn't — complexity grows equally regardless + - B) Without normalization, N sources × M consumers = N×M translators; with normalization, only N + M translators are needed — this is the difference between O(N²) and O(N) scaling + - C) The Normalizer caches all messages, reducing duplicate processing + - D) The Normalizer compresses messages to reduce broker storage --- diff --git a/EnterpriseIntegrationPlatform/tutorials/18-content-enricher.md b/EnterpriseIntegrationPlatform/tutorials/18-content-enricher.md index 01ec82c..c4e2f1a 100644 --- a/EnterpriseIntegrationPlatform/tutorials/18-content-enricher.md +++ b/EnterpriseIntegrationPlatform/tutorials/18-content-enricher.md @@ -88,13 +88,60 @@ Enrichment is **not idempotent by default** if the external data changes between --- -## Exercises +## Lab -1. An order message `{ "orderId": 42 }` needs enrichment with customer data from `GET /api/customers/{customerId}`. But the order message doesn't contain `customerId` — only `orderId`. How would you design a two-step enrichment? +**Objective:** Design enrichment strategies using external data sources, analyze **atomicity** when enrichment depends on external service availability, and evaluate caching for **scalable** enrichment. -2. The external HTTP service is down. What happens to messages waiting for enrichment? How does the retry policy interact with the enricher? +### Step 1: Design a Two-Step Enrichment -3. Compare the Content Enricher to the Content Filter (Tutorial 19). How are they complementary? +An order message `{ "orderId": 42 }` needs customer data, but only contains `orderId` — not `customerId`. Design the enrichment flow: + +1. Step 1: Look up `customerId` from `GET /api/orders/42` → returns `{ "customerId": "CUST-7" }` +2. Step 2: Enrich with customer data from `GET /api/customers/CUST-7` → returns `{ "name": "Alice", "tier": "gold" }` + +Open `src/Processing.Enricher/ContentEnricher.cs` and identify how the enricher merges external data into the envelope. Does it mutate the original or create a new enriched envelope? + +### Step 2: Analyze Enrichment Failure Atomicity + +The external HTTP service is down during enrichment. Trace what happens: + +1. Does the enricher retry? What retry policy applies? +2. If all retries fail, where does the message go? +3. Is the original message preserved untouched for retry later? + +Now consider: the enricher calls two services. Service A succeeds but Service B fails. Is the partial enrichment from Service A committed? How does this affect **atomicity**? Design a strategy: should partial enrichment be discarded or preserved? 
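+
+A minimal sketch of one way to keep enrichment all-or-nothing: gather every external lookup first, and only then build a new envelope. The DTO shapes, the CRM routes, and the `Metadata` merge below are assumptions for illustration, not the platform's actual `ContentEnricher` code:
+
+```csharp
+// Sketch only — partial enrichment is never committed: any failed lookup throws
+// before a new envelope is created, leaving the original intact for retry or DLQ routing.
+private sealed record OrderLookup(string CustomerId);
+private sealed record CustomerLookup(string Name, string Tier);
+
+public async Task<IntegrationEnvelope> EnrichAsync(IntegrationEnvelope original, int orderId, CancellationToken ct)
+{
+    var order = await _crmClient.GetFromJsonAsync<OrderLookup>($"/api/orders/{orderId}", ct)
+                ?? throw new InvalidOperationException($"Order {orderId} not found.");
+    var customer = await _crmClient.GetFromJsonAsync<CustomerLookup>($"/api/customers/{order.CustomerId}", ct)
+                   ?? throw new InvalidOperationException($"Customer {order.CustomerId} not found.");
+
+    // Both lookups succeeded — now, and only now, produce the enriched copy.
+    return original with
+    {
+        Metadata = new Dictionary<string, string>(original.Metadata)
+        {
+            ["customerName"] = customer.Name,
+            ["customerTier"] = customer.Tier,
+        },
+    };
+}
+```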
+ +### Step 3: Design a Caching Strategy for Scalability + +At 10,000 messages/second, each enrichment requires an HTTP call to an external CRM. Without caching, that's 10,000 HTTP calls/second. Design a caching strategy: + +| Cache Level | TTL | Hit Rate | Scalability Impact | +|-------------|-----|----------|-------------------| +| In-memory (per-worker) | 60s | ~80% | Reduces to 2,000 calls/second | +| Distributed (Redis) | 5min | ~95% | Reduces to 500 calls/second | +| Database fallback | 1hr | ~99% | ? | + +Open `src/Processing.Enricher/` and check if the platform implements caching. How does cache invalidation interact with message **consistency**? + +## Exam + +1. The Content Enricher calls an external service that is temporarily unavailable. What is the correct **atomic** behavior? + - A) Skip enrichment and forward the message without the additional data + - B) Preserve the original message, retry according to policy, and if all retries fail route to the DLQ — the message is never forwarded with missing enrichment data + - C) Cache the last known good response and use it + - D) Block all messages until the external service recovers + +2. How does caching in the Content Enricher improve **scalability** without sacrificing data accuracy? + - A) Caching eliminates the need for external services entirely + - B) Frequently accessed enrichment data (e.g., customer records) is cached with a TTL — this reduces external API calls by 80-95% while ensuring data freshness through time-based expiration + - C) The cache stores messages, not enrichment data + - D) Caching is only useful for batch processing + +3. How are the Content Enricher and Content Filter (Tutorial 19) **complementary** in a pipeline? + - A) They do the same thing in reverse order + - B) The Enricher adds data from external sources, then the Filter removes fields not needed downstream — together they ensure each consumer receives exactly the data it needs, no more and no less + - C) The Filter must always run before the Enricher + - D) They cannot be used in the same pipeline --- diff --git a/EnterpriseIntegrationPlatform/tutorials/19-content-filter.md b/EnterpriseIntegrationPlatform/tutorials/19-content-filter.md index a43a1bd..f0f7260 100644 --- a/EnterpriseIntegrationPlatform/tutorials/19-content-filter.md +++ b/EnterpriseIntegrationPlatform/tutorials/19-content-filter.md @@ -79,13 +79,61 @@ Filtering is a **pure, deterministic function** — the same input and keep-path --- -## Exercises +## Lab -1. A message has fields `order.id`, `order.items[]`, `customer.email`, `customer.phone`, `audit.createdBy`. You only need `order.id` and `customer.email`. Write the `keepPaths` list and describe the resulting JSON structure. +**Objective:** Apply the Content Filter pattern to remove unnecessary data, analyze data minimization for **security** and **scalability**, and design a filter-then-route pipeline. -2. A keep-path `customer.address.zipCode` is specified but the message doesn't have an `address` field. What happens? +### Step 1: Configure a Content Filter -3. Design a pipeline that first enriches a message (Tutorial 18) and then filters it. Why is this order important? +A message has fields: `order.id`, `order.items[]`, `customer.email`, `customer.phone`, `customer.ssn`, `audit.createdBy`. You need only `order.id` and `customer.email` for the downstream billing system. 
Write the `keepPaths` configuration: + +```csharp +var keepPaths = new[] { "order.id", "customer.email" }; +``` + +Open `src/Processing.Transformer/JsonPathFilterStep.cs` and trace: What happens to `customer.ssn`? What happens if `keepPaths` references a field that doesn't exist in the message (e.g., `customer.address.zipCode`)? + +### Step 2: Design for Security and Data Minimization + +The Content Filter is a key tool for **data minimization** (GDPR, PCI-DSS). Design a pipeline: + +| Consumer | Allowed Fields | Filtered Fields | +|----------|---------------|----------------| +| Billing | `order.id`, `customer.email`, `order.total` | PII, items, audit | +| Analytics | `order.id`, `order.items[]`, `order.total` | All customer PII | +| Audit | All fields | None (full record) | + +How does the Content Filter ensure that the billing system **never** receives `customer.ssn`? Why is this an **atomicity** concern — what happens if the filter is accidentally misconfigured? + +### Step 3: Design an Enrich-Then-Filter Pipeline + +Explain why the order matters: first Enrich (Tutorial 18) then Filter. Draw a pipeline: + +``` +Raw message → Content Enricher (add customer data) → Content Filter (remove sensitive fields) → Route to consumer +``` + +If you reverse the order (filter first, then enrich), what goes wrong? How does the pipeline order preserve both data completeness and data minimization? + +## Exam + +1. A keep-path references a field that doesn't exist in the message. What should the Content Filter do? + - A) Throw an exception and route to DLQ + - B) Silently omit the missing field from the output — the filter operates on what's present, producing a valid subset without failing, which supports graceful handling of schema variations + - C) Add the field with a null value + - D) Block the message until the field is available + +2. Why is the Content Filter critical for **PCI-DSS and GDPR compliance** in enterprise integration? + - A) It encrypts sensitive fields automatically + - B) It ensures each downstream consumer receives only the data it needs — preventing over-exposure of PII and cardholder data by stripping unauthorized fields before routing + - C) It logs all sensitive data access for audit + - D) It replaces sensitive data with synthetic values + +3. In a high-throughput pipeline, how does content filtering improve **scalability**? + - A) Filtering doesn't affect performance + - B) Removing unnecessary fields reduces message size — smaller messages mean lower broker storage costs, faster serialization, and reduced network bandwidth across the entire downstream processing chain + - C) Filtering enables parallel processing + - D) Filtered messages skip the routing step --- diff --git a/EnterpriseIntegrationPlatform/tutorials/20-splitter.md b/EnterpriseIntegrationPlatform/tutorials/20-splitter.md index d968a4a..849888a 100644 --- a/EnterpriseIntegrationPlatform/tutorials/20-splitter.md +++ b/EnterpriseIntegrationPlatform/tutorials/20-splitter.md @@ -98,13 +98,61 @@ All split items are published to the target topic before the source message is A --- -## Exercises +## Lab -1. A message `{ "orders": [{ "id": 1 }, { "id": 2 }, { "id": 3 }] }` is split using `JsonArraySplitStrategy` with `ArrayPropertyName = "orders"`. How many envelopes are in `SplitResult.SplitEnvelopes`? What is `ItemCount`? 
+**Objective:** Split composite messages into individual items, trace how `SequenceNumber` and `TotalCount` enable the Aggregator to reassemble split messages, and analyze **atomicity** when a split item fails. -2. After splitting, item at sequence 1 is lost due to a downstream failure. How does the Aggregator (Tutorial 21) detect this? +### Step 1: Split a Composite Message -3. Why does `JsonArraySplitStrategy` clone each element with `JsonSerializer.SerializeToElement`? What would happen if it didn't? +A message `{ "orders": [{ "id": 1, "total": 50 }, { "id": 2, "total": 150 }, { "id": 3, "total": 75 }] }` is split using `JsonArraySplitStrategy` with `ArrayPropertyName = "orders"`. Open `src/Processing.Splitter/` and trace: + +1. How many envelopes are in `SplitResult.SplitEnvelopes`? +2. What is `ItemCount`? +3. What `SequenceNumber` and `TotalCount` does each split envelope carry? +4. Do all split envelopes share the same `CorrelationId` as the original? + +### Step 2: Trace Atomicity When a Split Item Fails + +After splitting, the 3 items are processed independently. Item 2 (sequence 1) fails delivery: + +| Item | SequenceNumber | Status | +|------|---------------|--------| +| `{ "id": 1 }` | 0 | ✅ Delivered | +| `{ "id": 2 }` | 1 | ❌ Failed | +| `{ "id": 3 }` | 2 | ✅ Delivered | + +Questions: +- How does the Aggregator (Tutorial 21) detect that item 2 is missing? (hint: `TotalCount = 3` but only 2 arrived) +- Should the Aggregator wait indefinitely or timeout? What timeout strategy preserves **atomicity**? +- Should items 1 and 3 be rolled back (saga compensation), or should only item 2 be retried? + +### Step 3: Evaluate Splitter Scalability + +Splitting a message with 1,000 items creates 1,000 individual messages. Analyze: + +- Each split message is independently processed — what parallelism level is achievable? +- What is the memory impact of cloning 1,000 JSON elements? (hint: `JsonSerializer.SerializeToElement` creates deep copies) +- Why does `JsonArraySplitStrategy` clone each element rather than using references? What **concurrency** bug would occur without cloning? + +## Exam + +1. After splitting, why does each split envelope carry `SequenceNumber` and `TotalCount`? + - A) For sorting messages alphabetically + - B) These fields enable the downstream Aggregator to detect missing items and reassemble the complete set — without them, the Aggregator cannot determine when all pieces have arrived or which pieces are missing + - C) The broker requires sequence numbers for storage + - D) They are used for message deduplication + +2. Why does the Splitter clone each array element rather than using references to the original? + - A) .NET doesn't support object references in records + - B) Cloning ensures each split message is independently serializable and processable — without cloning, concurrent modifications by downstream consumers could corrupt the shared source data, violating processing **atomicity** + - C) Cloning is faster than referencing + - D) The broker serializer requires cloned objects + +3. A batch message with 100 items is split. Item 47 fails after items 1-46 and 48-100 succeed. What is the **scalable** recovery strategy? 
+ - A) Retry all 100 items from the beginning + - B) Retry only item 47 using its `CorrelationId` and `SequenceNumber` — the other 99 items are already committed and don't need reprocessing, enabling efficient partial recovery + - C) Route all 100 items to the Dead Letter Queue + - D) Wait for item 47 to auto-heal --- diff --git a/EnterpriseIntegrationPlatform/tutorials/21-aggregator.md b/EnterpriseIntegrationPlatform/tutorials/21-aggregator.md index 59edcb1..98cd381 100644 --- a/EnterpriseIntegrationPlatform/tutorials/21-aggregator.md +++ b/EnterpriseIntegrationPlatform/tutorials/21-aggregator.md @@ -113,13 +113,56 @@ Each `AggregateAsync` call atomically adds the item to the store and checks comp --- -## Exercises +## Lab -1. A Splitter produces 5 items with `TotalCount = 5`. After receiving items 0, 1, 2, 3, what does `AggregateResult.ReceivedCount` return? What is `IsComplete`? +**Objective:** Trace the Aggregator's completion logic, design timeout strategies, and analyze how **idempotent** aggregation ensures **atomic** reassembly of split messages. -2. Design a `TimeoutCompletionStrategy` that completes a group if 30 seconds pass since the first item arrived. What challenges does this introduce? +### Step 1: Trace Aggregation Completion -3. Why must the `IMessageAggregateStore` be idempotent on `MessageId`? What happens without idempotency if a message is redelivered? +A Splitter produces 5 items with `TotalCount = 5`. Items arrive out of order: 3, 0, 4, 1, 2. Open `src/Processing.Aggregator/MessageAggregator.cs` and trace: + +1. After receiving items 0, 1, 2, 3 — what does `AggregateResult.ReceivedCount` return? What is `IsComplete`? +2. When item 4 arrives, how does the Aggregator know the group is complete? +3. What `CorrelationId` links all 5 items to the same aggregate group? + +### Step 2: Design a Timeout Completion Strategy + +Not all split items may arrive (e.g., item 2 fails permanently). Design a timeout strategy: + +- After 30 seconds from the first item, complete the aggregate with whatever has arrived +- Mark the result as `IsPartial = true` +- Route the partial aggregate to a `review.incomplete-batches` topic + +What **atomicity** decision must you make: should a partial aggregate be considered "successful" or should it trigger compensation for already-delivered items? + +### Step 3: Analyze Idempotent Aggregation + +A message with `SequenceNumber = 2` is delivered twice (broker redelivery). Without idempotency: + +- The aggregate would count 6 items instead of 5 +- `IsComplete` would never be true (6 > 5) or would fire prematurely + +Open `src/Processing.Aggregator/` and verify: How does `IMessageAggregateStore` handle duplicate `MessageId`s? Why is idempotency critical for **scalable** at-least-once delivery systems? + +## Exam + +1. A Splitter produces 5 items. The Aggregator receives items 0, 1, 3, 4 but item 2 never arrives. What should happen after the timeout? + - A) Wait indefinitely — the aggregate must be complete + - B) Complete with 4 items, mark as partial, and route for manual review — a timeout prevents indefinite resource consumption while preserving the received work for inspection + - C) Discard all 4 received items + - D) Re-request item 2 from the Splitter + +2. Why must the Aggregator's store be **idempotent** on `MessageId`? 
+ - A) Idempotency is required by the NUnit testing framework + - B) In at-least-once delivery systems, duplicate messages are expected — without idempotency, the aggregate count would be corrupted, potentially triggering premature completion or preventing completion entirely + - C) Idempotency improves serialization performance + - D) The broker guarantees exactly-once delivery, so idempotency is unnecessary + +3. How does the Splitter-Aggregator pair maintain **end-to-end atomicity** for a batch message? + - A) The Splitter and Aggregator share a database transaction + - B) The `CorrelationId` links all split items; `SequenceNumber` and `TotalCount` enable the Aggregator to verify completeness — only when all items succeed (or timeout triggers) is the aggregate result committed or compensated + - C) The broker ensures all items are delivered simultaneously + - D) Each split item is independently atomic — there is no end-to-end guarantee --- diff --git a/EnterpriseIntegrationPlatform/tutorials/22-scatter-gather.md b/EnterpriseIntegrationPlatform/tutorials/22-scatter-gather.md index 36160f6..e7493f4 100644 --- a/EnterpriseIntegrationPlatform/tutorials/22-scatter-gather.md +++ b/EnterpriseIntegrationPlatform/tutorials/22-scatter-gather.md @@ -95,13 +95,67 @@ Scatter-Gather has **best-effort semantics** within the timeout window. If a rec --- -## Exercises +## Lab -1. You scatter a pricing request to 3 suppliers with a 5-second timeout. Supplier A responds in 1 s, Supplier B in 3 s, Supplier C never responds. What does `ScatterGatherResult` look like? +**Objective:** Trace the Scatter-Gather pattern's parallel request-response flow, analyze timeout behavior for **partial results**, and design a "best-of-N" selection strategy. -2. How would you implement a "best of N" strategy where you take the lowest price from all responses received within the timeout? +### Step 1: Trace a Scatter-Gather with Timeout -3. Compare Scatter-Gather to calling each service sequentially. What is the latency difference with 3 services averaging 2 seconds each? +You scatter a pricing request to 3 suppliers with `TimeoutMs = 5000`: + +| Supplier | Response Time | Price | +|----------|--------------|-------| +| A | 1 second | $120 | +| B | 3 seconds | $95 | +| C | Never responds | — | + +Open `src/Processing.ScatterGather/ScatterGatherer.cs` and trace: + +1. How does `ScatterGatherResult.Responses` look? (2 responses) +2. Is `TimedOut = true`? (yes — only 2 of 3 responded) +3. What is `Duration`? (≈5 seconds — the timeout) + +### Step 2: Design a "Best-of-N" Selection Strategy + +Using the partial results above, implement a selection strategy that picks the lowest price: + +``` +1. Scatter to all suppliers (parallel) +2. Gather responses until timeout +3. From gathered responses, select the one with lowest price +4. If no responses arrived, route to DLQ with reason "no-supplier-response" +``` + +What is the **atomicity** guarantee? The selected best price must be committed as a single decision — if the commit fails, no supplier should be charged. + +### Step 3: Compare Scatter-Gather Latency vs. Sequential Calls + +| Approach | 3 services × 2s avg | 10 services × 2s avg | +|----------|---------------------|----------------------| +| Sequential | 6 seconds total | 20 seconds total | +| Scatter-Gather | ~2 seconds (parallel) | ~2 seconds (parallel) | + +How does the Scatter-Gather pattern enable **scalable** multi-supplier/multi-service integration? What happens to latency as you add more recipients? + +## Exam + +1. 
A Scatter-Gather operation sends to 5 recipients with a 3-second timeout. Only 3 respond in time. What does the result indicate? + - A) Failure — all recipients must respond + - B) `TimedOut = true` with 3 responses — the caller receives partial results and can decide how to proceed based on business logic (e.g., select best from available) + - C) The operation retries the 2 missing recipients + - D) The 3 responses are discarded and the operation fails + +2. How does the Scatter-Gather pattern improve **integration scalability** compared to sequential service calls? + - A) It uses less memory per request + - B) Latency equals the slowest responder (or timeout), not the sum of all — adding more recipients doesn't increase total latency, enabling efficient multi-source integration at scale + - C) It reduces the number of network connections + - D) Sequential calls are always faster for small numbers of recipients + +3. What **atomicity** consideration arises when the Scatter-Gather selects one response from many? + - A) All responses must be stored permanently + - B) The selected response must be committed atomically — if the downstream commit fails, no side effects from the selection (e.g., supplier charges) should be applied, requiring compensation for any tentative reservations + - C) Non-selected responses are automatically compensated + - D) The broker handles selection atomicity --- diff --git a/EnterpriseIntegrationPlatform/tutorials/23-request-reply.md b/EnterpriseIntegrationPlatform/tutorials/23-request-reply.md index 92d9016..bf94df3 100644 --- a/EnterpriseIntegrationPlatform/tutorials/23-request-reply.md +++ b/EnterpriseIntegrationPlatform/tutorials/23-request-reply.md @@ -95,13 +95,57 @@ The request is published to the request topic and the correlator subscribes to t --- -## Exercises +## Lab -1. A request is sent with `TimeoutMs = 5000`. The responder takes 7 seconds. What does `RequestReplyResult` look like? +**Objective:** Trace the Request-Reply correlation mechanism, analyze timeout behavior, and design for **scalable** request-reply across distributed services. -2. Two requesters send requests with different `CorrelationId` values to the same request topic. How does each requester get the correct reply? +### Step 1: Trace Request-Reply Correlation -3. Why does the correlator subscribe to the reply topic **before** publishing the request? What race condition does this prevent? +A request is sent with `TimeoutMs = 5000`. The responder takes 7 seconds. Open `src/Processing.RequestReply/RequestReplyCorrelator.cs` and trace: + +1. What does `RequestReplyResult` look like? (`TimedOut = true`, no response) +2. If the responder takes 3 seconds, what does the result contain? +3. How does the `CorrelationId` in the request envelope match to the response? + +Now: Two requesters send requests with different `CorrelationId` values to the same request topic. How does each requester receive its own correct reply? + +### Step 2: Prevent the Subscribe-Before-Publish Race Condition + +The correlator subscribes to the reply topic **before** publishing the request. Explain: + +1. What race condition occurs if you publish first, then subscribe? +2. How does pre-subscribing ensure the reply is never lost? +3. Draw the timeline: Subscribe → Publish → Responder processes → Reply arrives → Correlator matches + +This is an **atomicity** concern: without pre-subscription, fast responders could publish replies before the requester is listening, causing permanent message loss. 
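+
+A minimal sketch of the subscribe-before-publish ordering with `CorrelationId` matching — illustrative only; the broker subscription API, the `RequestReplyResult` shape, and the disposal pattern here are assumptions rather than the actual `RequestReplyCorrelator` code:
+
+```csharp
+// Sketch only — subscribing before publishing closes the race window described above.
+public async Task<RequestReplyResult> SendAsync(IntegrationEnvelope request, TimeSpan timeout, CancellationToken ct)
+{
+    var tcs = new TaskCompletionSource<IntegrationEnvelope>(TaskCreationOptions.RunContinuationsAsynchronously);
+
+    // 1. Subscribe FIRST, so even a sub-millisecond responder cannot reply before we are listening.
+    await using var subscription = await _broker.SubscribeAsync(_options.ReplyTopic, reply =>
+    {
+        if (reply.CorrelationId == request.CorrelationId)   // accept only our own reply
+        {
+            tcs.TrySetResult(reply);
+        }
+    }, ct);
+
+    // 2. Only then publish the request.
+    await _producer.PublishAsync(_options.RequestTopic, request, ct);
+
+    // 3. Wait for the correlated reply, or give up at the timeout so resources are released.
+    var completed = await Task.WhenAny(tcs.Task, Task.Delay(timeout, ct));
+    return completed == tcs.Task
+        ? new RequestReplyResult { Response = await tcs.Task, TimedOut = false }
+        : new RequestReplyResult { Response = null, TimedOut = true };
+}
+```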
+ +### Step 3: Design for Request-Reply Scalability + +At high throughput, many concurrent request-reply operations share the same reply topic: + +- How does the correlator isolate concurrent requests? (hint: `CorrelationId` matching) +- What happens if 1,000 requests are in flight simultaneously? Memory implications? +- How does the `TimeoutMs` prevent resource leaks from requests that never receive replies? + +## Exam + +1. Why does the Request-Reply correlator subscribe to the reply topic **before** publishing the request? + - A) Subscribing is faster than publishing + - B) A fast responder could publish the reply before the requester is listening — pre-subscribing eliminates this race condition, ensuring the reply is never lost even with sub-millisecond response times + - C) The broker requires subscriptions before publishes + - D) Pre-subscribing reduces network latency + +2. How does the `CorrelationId` enable **scalable** request-reply with many concurrent requests on the same topic? + - A) The broker routes replies based on `CorrelationId` automatically + - B) Each requester filters incoming replies by `CorrelationId` — only the matching reply is accepted, allowing thousands of concurrent request-reply operations to share a single reply topic without interference + - C) `CorrelationId` is used for message encryption + - D) Each request must use a unique reply topic + +3. What resource **scalability** concern does the timeout address in request-reply? + - A) Timeouts improve message throughput + - B) Without timeouts, requests that never receive replies would hold resources (memory, channel subscriptions) indefinitely — the timeout ensures cleanup even when responders fail, preventing memory leaks under sustained load + - C) Timeouts are only needed for testing + - D) The broker automatically cleans up timed-out requests --- diff --git a/EnterpriseIntegrationPlatform/tutorials/24-retry-framework.md b/EnterpriseIntegrationPlatform/tutorials/24-retry-framework.md index e5e5b9d..33d35cb 100644 --- a/EnterpriseIntegrationPlatform/tutorials/24-retry-framework.md +++ b/EnterpriseIntegrationPlatform/tutorials/24-retry-framework.md @@ -122,13 +122,71 @@ When all retry attempts are exhausted (`IsSucceeded = false`), the message shoul --- -## Exercises +## Lab -1. With `MaxAttempts = 4`, `InitialDelayMs = 500`, `BackoffMultiplier = 2.0`, and `UseJitter = false`, calculate the delay before each retry attempt. +**Objective:** Calculate exponential backoff delays, analyze why jitter is critical for **scalable** retry under thundering-herd conditions, and design a retry classification strategy. -2. Why is jitter important? Describe a scenario where 100 consumers without jitter cause problems for a recovering database. +### Step 1: Calculate Backoff Delays -3. A `JsonException` during deserialization is not retryable. How would you detect this and short-circuit to the DLQ? +With `MaxAttempts = 4`, `InitialDelayMs = 500`, `BackoffMultiplier = 2.0`, and `UseJitter = false`, calculate the delay before each retry: + +| Attempt | Delay Formula | Delay | +|---------|--------------|-------| +| 1 (first retry) | 500 × 2⁰ | 500ms | +| 2 | 500 × 2¹ | ? | +| 3 | 500 × 2² | ? | +| 4 | 500 × 2³ | ? | + +What is the total maximum wait time across all retries? Open `src/Processing.Retry/ExponentialBackoffRetryPolicy.cs` to verify the formula. + +### Step 2: Analyze the Thundering Herd Problem + +100 consumers lose connection to a database. All retry at the same exponential intervals (no jitter). 
Draw what happens: + +``` +t=0s: [100 consumers all fail] +t=500ms: [100 consumers all retry simultaneously] → database overwhelmed again +t=1000ms: [100 consumers all retry simultaneously] → database overwhelmed again +``` + +Now add jitter: each consumer randomizes its delay within ±50%. Explain: +- How does jitter spread the retry load over time? +- Why is this critical for **system-level scalability** during recovery? +- What is the relationship between jitter and the database's recovery time? + +### Step 3: Design Retry Classification + +Not all errors are retryable. Design a classification strategy: + +| Error Type | Retryable? | Action | +|-----------|-----------|--------| +| HTTP 503 (Service Unavailable) | Yes | Exponential backoff | +| HTTP 400 (Bad Request) | No | Immediate DLQ | +| `JsonException` (deserialization) | No | Immediate DLQ | +| `TimeoutException` (network) | Yes | ? | +| Schema validation failure | No | ? | + +Why is fast-failing non-retryable errors critical for **pipeline throughput**? What happens if you retry a `JsonException` 4 times before giving up? + +## Exam + +1. With `InitialDelayMs = 1000` and `BackoffMultiplier = 2.0`, what is the delay before the 4th retry attempt? + - A) 4000ms + - B) 8000ms — the delay doubles each attempt: 1000, 2000, 4000, 8000 + - C) 3000ms + - D) 16000ms + +2. Why is jitter critical for **scalable** retry strategies in distributed systems? + - A) Jitter makes retries faster + - B) Without jitter, all consumers retry at identical intervals — creating synchronized spikes that can overwhelm the recovering service; jitter spreads retries over time, enabling gradual recovery + - C) Jitter is only needed for testing + - D) The broker requires jitter in retry delays + +3. Why should non-retryable errors (e.g., `JsonException`) be routed to the DLQ immediately instead of retried? + - A) Non-retryable errors are rare and don't matter + - B) Retrying a permanent error wastes processing capacity and delays handling of valid messages — fast-failing to DLQ preserves pipeline **throughput** and enables rapid human intervention + - C) The DLQ can fix the error automatically + - D) Non-retryable errors eventually succeed after enough retries --- diff --git a/EnterpriseIntegrationPlatform/tutorials/25-dead-letter-queue.md b/EnterpriseIntegrationPlatform/tutorials/25-dead-letter-queue.md index 53be478..1db53c0 100644 --- a/EnterpriseIntegrationPlatform/tutorials/25-dead-letter-queue.md +++ b/EnterpriseIntegrationPlatform/tutorials/25-dead-letter-queue.md @@ -108,7 +108,8 @@ public sealed class MessageExpirationChecker : IMessageExpirationChecker await _deadLetterPublisher.PublishAsync( envelope, DeadLetterReason.MessageExpired, - $"Message expired at {envelope.ExpiresAt.Value:O}.", 0, cancellationToken); + $"Message expired at {envelope.ExpiresAt.Value:O}. Current time: {now:O}.", + 0, cancellationToken); return true; } } @@ -130,13 +131,64 @@ Dead-lettering is the **last resort** — it runs only after all retries are exh --- -## Exercises +## Lab -1. A message fails validation (`DeadLetterReason.ValidationFailed`). An operator fixes the schema and wants to reprocess it. Describe the replay flow through the Admin API. +**Objective:** Trace the Dead Letter Queue lifecycle from failure to replay, analyze how the DLQ preserves **zero message loss atomicity**, and design an operational replay workflow. -2. A message has `ExpiresAt = 2024-01-15T10:00:00Z` and the current time is `2024-01-15T10:00:01Z`. 
Trace the path through `MessageExpirationChecker` and `IDeadLetterPublisher`. +### Step 1: Trace an Expired Message to the DLQ -3. Why does the platform preserve the **complete original envelope** in `DeadLetterEnvelope` rather than just the error details? What operational benefit does this provide? +A message has `ExpiresAt = 2024-01-15T10:00:00Z` and the current time is `2024-01-15T10:00:01Z`. Open `src/Processing.DeadLetter/MessageExpirationChecker.cs` and trace: + +1. `CheckAndRouteIfExpiredAsync` detects expiration — what `DeadLetterReason` is used? +2. What information is logged? (hint: expiry time and current time) +3. Where does the complete original envelope end up? + +Verify that the **entire original envelope** is preserved in `DeadLetterEnvelope` — not just error details. + +### Step 2: Design an Operational Replay Workflow + +A message fails validation (`DeadLetterReason.ValidationFailed`). An operator fixes the downstream schema. Design the replay flow: + +``` +1. Operator queries DLQ via Admin API: GET /api/deadletter?reason=ValidationFailed +2. Operator reviews the original envelope and error details +3. Operator triggers replay: POST /api/deadletter/{id}/replay +4. Platform re-publishes the original envelope to its original topic +5. Message re-enters the pipeline from the beginning +``` + +What **atomicity** guarantees must the replay provide? (hint: replay must either fully re-publish or fail cleanly — no partial replays) + +### Step 3: Categorize DLQ Reasons and Operational Response + +| DLQ Reason | Cause | Operational Response | Can Auto-Replay? | +|-----------|-------|---------------------|-------------------| +| `MessageExpired` | TTL exceeded | Review TTL settings | No — stale data | +| `ValidationFailed` | Schema mismatch | Fix schema → replay | Yes | +| `MaxRetriesExceeded` | Transient failures | Investigate root cause → replay | Maybe | +| `PermanentFailure` | Non-retryable error | Manual intervention | No | + +Why is preserving the complete original envelope critical for DLQ operations? What would an operator lose if only the error message was stored? + +## Exam + +1. Why does the platform preserve the **complete original envelope** in the Dead Letter Queue? + - A) It's a storage requirement of the broker + - B) The original envelope enables accurate replay — operators can inspect the exact payload, metadata, and headers that caused the failure, and re-publish it unchanged for reprocessing after fixing the root cause + - C) The envelope is needed for deduplication + - D) Only the error details are stored + +2. How does the DLQ pattern ensure **zero message loss** in the integration platform? + - A) The DLQ stores messages in memory for fast retrieval + - B) Every message that cannot be processed successfully — whether due to expiration, validation failure, or exhausted retries — is routed to the DLQ rather than being silently dropped, ensuring nothing is ever lost + - C) The broker prevents message deletion + - D) Messages are automatically retried from the DLQ every minute + +3. What **atomicity** guarantee must a DLQ replay operation provide? 
+ - A) The replay can be partial — some fields are replayed while others are skipped + - B) The replay must either fully re-publish the original message to its target topic or fail cleanly — partial replays could cause duplicate processing or data corruption + - C) The DLQ entry must be deleted before replay + - D) Replay is only possible within 24 hours of the original failure --- diff --git a/EnterpriseIntegrationPlatform/tutorials/26-message-replay.md b/EnterpriseIntegrationPlatform/tutorials/26-message-replay.md index 842ad39..da9eb9c 100644 --- a/EnterpriseIntegrationPlatform/tutorials/26-message-replay.md +++ b/EnterpriseIntegrationPlatform/tutorials/26-message-replay.md @@ -107,13 +107,68 @@ Replay re-publishes messages to the **same ingress topic** they originally enter --- -## Exercises +## Lab -1. An operator discovers a bug in the content enricher that corrupted messages between 09:00 and 09:30 UTC. Write the `ReplayFilter` to target only those messages using `FromTimestamp` and `ToTimestamp`. +**Objective:** Design a message replay operation for a production incident, analyze how the `ReplayId` header prevents duplicate processing, and evaluate replay store **scalability** requirements. -2. Why does the platform inject a `ReplayId` header instead of simply re-publishing the original message unchanged? What problems could occur without it? +### Step 1: Design a Time-Window Replay -3. Describe what a production `IMessageReplayStore` implementation would need to handle 10 million messages per day efficiently. +An operator discovers a bug in the content enricher that corrupted messages between 09:00 and 09:30 UTC on January 15th. Write the `ReplayFilter`: + +```csharp +var filter = new ReplayFilter +{ + FromTimestamp = DateTimeOffset.Parse("2024-01-15T09:00:00Z"), + ToTimestamp = DateTimeOffset.Parse("2024-01-15T09:30:00Z"), + Topic = "eip.orders.enriched" +}; +``` + +Open `src/Processing.Replay/MessageReplayer.cs` and trace: How does the replayer iterate over stored messages? What happens to messages that don't match the filter? + +### Step 2: Analyze the ReplayId Header for Atomicity + +The platform injects a `ReplayId` header into replayed messages. Explain why: + +1. Without `ReplayId` — downstream consumers process the message as if it's new → **duplicate side effects** (e.g., double billing) +2. With `ReplayId` — consumers can detect replays and apply **idempotent** processing +3. How does `ReplayId` interact with `MessageId`? (the original `MessageId` is preserved for correlation) + +Design a consumer that checks for `ReplayId` and skips already-processed messages using a deduplication store. + +### Step 3: Evaluate Replay Store Scalability + +A production system processes 10 million messages/day. Design the replay store requirements: + +| Requirement | Value | Justification | +|------------|-------|---------------| +| Storage per message | ~2KB (envelope) | Full envelope for accurate replay | +| Daily storage | ~20GB | 10M × 2KB | +| Retention period | 30 days | Regulatory and operational needs | +| Total storage | ~600GB | 30 × 20GB | +| Query performance | < 100ms for time-range | Fast incident response | + +What storage technology would you recommend? (hint: time-series databases, object storage with indexing) + +## Exam + +1. Why does the platform inject a `ReplayId` header instead of re-publishing the original message unchanged? 
+ - A) `ReplayId` improves serialization performance + - B) Without `ReplayId`, downstream consumers cannot distinguish replayed messages from new ones — leading to duplicate side effects like double billing; the header enables idempotent replay processing + - C) The broker requires unique headers for each publish + - D) `ReplayId` replaces the original `MessageId` + +2. What **atomicity** guarantee must a replay operation provide? + - A) All replayed messages must succeed or the entire replay is rolled back + - B) Each replayed message is independently atomic — if message 500 of 1000 fails, the first 499 are committed and 500+ can be retried; the `ReplayId` prevents duplicates from the successful ones + - C) Replay operations are fire-and-forget with no guarantees + - D) The entire replay must complete within a single database transaction + +3. How does time-range filtering in replay operations support **operational scalability**? + - A) Time filtering is faster than content filtering + - B) Operators can target a precise incident window instead of replaying all messages — this minimizes unnecessary reprocessing and downstream load during recovery + - C) Time ranges are required by the message broker + - D) Filtering has no impact on replay performance --- diff --git a/EnterpriseIntegrationPlatform/tutorials/27-resequencer.md b/EnterpriseIntegrationPlatform/tutorials/27-resequencer.md index ae511a8..e350754 100644 --- a/EnterpriseIntegrationPlatform/tutorials/27-resequencer.md +++ b/EnterpriseIntegrationPlatform/tutorials/27-resequencer.md @@ -91,13 +91,59 @@ Messages are **Acked only after successful release** to the downstream topic. If --- -## Exercises +## Lab -1. Three messages arrive for `CorrelationId = "order-42"` in this order: #3, #1, #2. Trace the calls to `Accept` and describe the return value for each call. +**Objective:** Trace the Resequencer's buffering and release logic, analyze ordering guarantees for **atomic** batch processing, and design for partition-aware scaling. -2. A sequence has messages #1, #2, #4 buffered and `ReleaseTimeout` fires. Describe what `ReleaseOnTimeout` returns and what happens to the gap at #3. +### Step 1: Trace Out-of-Order Arrival -3. Why must all messages for a `CorrelationId` be routed to the same resequencer instance? What broker feature enables this? +Three messages arrive for `CorrelationId = "order-42"` in this order: #3, #1, #2. Open `src/Processing.Resequencer/` and trace each `Accept` call: + +| Arrival | SequenceNumber | Buffered? | Released? | Why? | +|---------|---------------|-----------|-----------|------| +| 1st | 3 | Yes | No | Waiting for #1 | +| 2nd | 1 | — | Released: #1 | Next expected | +| 3rd | 2 | — | Released: #2, then #3 | Completes the sequence | + +Verify your trace against the actual implementation. + +### Step 2: Handle Gaps with Timeout + +A sequence has messages #1, #2, #4 buffered, but #3 never arrives. After `ReleaseTimeout` fires: + +1. What does `ReleaseOnTimeout` return? (hint: #1 and #2 are released, #4 is released with a gap marker) +2. Is the gap reported for downstream awareness? +3. How does this design prevent indefinite buffering — critical for **system scalability** under high message volumes? + +Design an alerting strategy for gap detection: when should the operations team be notified? + +### Step 3: Partition-Aware Resequencing + +All messages for a `CorrelationId` must be routed to the same resequencer instance. Explain: + +- What broker feature enables this? 
(hint: Kafka partition keys, NATS subject-based routing) +- What happens if messages for the same `CorrelationId` land on different resequencer instances? +- How does partition-key routing enable **horizontal scaling** — each instance handles a subset of `CorrelationId`s independently? + +## Exam + +1. Why must all messages for a `CorrelationId` be routed to the **same** resequencer instance? + - A) Any instance can resequence any `CorrelationId` + - B) The resequencer maintains an ordered buffer per `CorrelationId` — if messages are split across instances, no single instance has the complete picture to determine correct ordering + - C) The broker automatically routes messages to the correct instance + - D) Resequencing doesn't require instance affinity + +2. How does the `ReleaseTimeout` prevent unbounded resource consumption? + - A) It deletes messages older than the timeout + - B) Without a timeout, a missing sequence number would cause all subsequent messages to buffer indefinitely — the timeout releases buffered messages with gap markers, preventing memory growth proportional to undelivered messages + - C) Timeouts are only needed in development + - D) The timeout reduces message processing latency + +3. How does partition-key routing enable **scalable** resequencing? + - A) All messages go to a single instance for global ordering + - B) Each resequencer instance handles a subset of `CorrelationId`s — adding instances distributes the load linearly, with no cross-instance coordination needed for ordering within each group + - C) Partition keys are only used for Kafka + - D) Routing is handled by the resequencer itself --- diff --git a/EnterpriseIntegrationPlatform/tutorials/28-competing-consumers.md b/EnterpriseIntegrationPlatform/tutorials/28-competing-consumers.md index ddc6781..a8ac327 100644 --- a/EnterpriseIntegrationPlatform/tutorials/28-competing-consumers.md +++ b/EnterpriseIntegrationPlatform/tutorials/28-competing-consumers.md @@ -48,23 +48,72 @@ public sealed class CompetingConsumerOrchestrator : BackgroundService { while (!stoppingToken.IsCancellationRequested) { - var lagInfo = await _lagMonitor.GetLagAsync( - _options.TargetTopic, _options.ConsumerGroup, stoppingToken); + try + { + await EvaluateAndScaleAsync(stoppingToken); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + catch (Exception ex) + { + _logger.LogError(ex, "Error during competing consumer orchestration cycle"); + } + + await Task.Delay( + TimeSpan.FromMilliseconds(_options.CooldownMs), _timeProvider, stoppingToken); + } + } + + internal async Task EvaluateAndScaleAsync(CancellationToken cancellationToken) + { + var lagInfo = await _lagMonitor.GetLagAsync( + _options.TargetTopic, _options.ConsumerGroup, cancellationToken); - if (lagInfo.CurrentLag >= _options.ScaleUpThreshold) - await _scaler.ScaleAsync( - Math.Min(_scaler.CurrentCount + 1, _options.MaxConsumers), stoppingToken); - else if (lagInfo.CurrentLag <= _options.ScaleDownThreshold - && _scaler.CurrentCount > _options.MinConsumers) - await _scaler.ScaleAsync(_scaler.CurrentCount - 1, stoppingToken); + var currentCount = _scaler.CurrentCount; + var now = _timeProvider.GetUtcNow(); + var cooldown = TimeSpan.FromMilliseconds(_options.CooldownMs); - await Task.Delay(_options.CooldownMs, stoppingToken); + if (lagInfo.CurrentLag >= _options.ScaleUpThreshold) + { + if (currentCount >= _options.MaxConsumers) + { + _backpressure.Signal(); // signal backpressure when at capacity + return; + } + + 
_backpressure.Release(); + if ((now - _lastScaleTime) < cooldown) return; // cooldown guard + + var desired = Math.Min(currentCount + 1, _options.MaxConsumers); + await _scaler.ScaleAsync(desired, cancellationToken); + _lastScaleTime = now; + } + else if (lagInfo.CurrentLag <= _options.ScaleDownThreshold) + { + _backpressure.Release(); + if (currentCount <= _options.MinConsumers) return; + if (_backpressure.IsBackpressured) return; // pause scale-down under backpressure + if ((now - _lastScaleTime) < cooldown) return; + + var desired = Math.Max(currentCount - 1, _options.MinConsumers); + await _scaler.ScaleAsync(desired, cancellationToken); + _lastScaleTime = now; + } + else + { + _backpressure.Release(); } } } ``` -The orchestrator runs as a hosted `BackgroundService`. On each evaluation cycle it reads the consumer lag via `GetLagAsync`, compares against thresholds, and calls `ScaleAsync` with the desired consumer count. +The orchestrator runs as a hosted `BackgroundService`. On each evaluation cycle it reads the consumer lag via `GetLagAsync`, compares against thresholds, and calls `ScaleAsync` with the desired consumer count. Key features: + +- **Backpressure signaling** — when at max capacity, signals backpressure to upstream producers +- **Cooldown guard** — prevents scaling flapping with a configurable cooldown period +- **Backpressure-aware scale-down** — won't scale down while backpressure is active ### IConsumerLagMonitor @@ -144,13 +193,65 @@ Each consumer processes messages independently and Acks them individually. If a --- -## Exercises +## Lab + +**Objective:** Trace the auto-scaling orchestrator with backpressure signaling, analyze cooldown to prevent scaling flap, and design a production backpressure integration. + +### Step 1: Trace the Scaling Decision Path + +A topic has 8 partitions, `MaxConsumers = 12`, and current consumer lag is 5,000. Open `src/Processing.CompetingConsumers/CompetingConsumerOrchestrator.cs` and trace `EvaluateAndScaleAsync`: + +1. Lag exceeds `ScaleUpThreshold` → what happens if current consumers = 8? +2. Lag exceeds threshold but `currentCount >= MaxConsumers` → what signal is emitted? +3. After scaling up, what prevents another scale-up in the next cycle? (hint: cooldown) + +Now: with `MaxConsumers = 12` and 8 Kafka partitions, what happens when the orchestrator scales to 9 consumers? (hint: one consumer will be idle — Kafka can't assign more consumers than partitions) + +### Step 2: Analyze Cooldown for Scaling Stability + +Consumer lag oscillates between 900 and 1100 with `ScaleUpThreshold = 1000`. Without cooldown: + +``` +Cycle 1: lag=1100 → scale up (3→4) +Cycle 2: lag=900 → scale down (4→3) +Cycle 3: lag=1100 → scale up (3→4) +... flapping forever +``` + +How does `CooldownMs` break this cycle? What is the relationship between cooldown duration and scaling stability? What value would you set for a production system? + +### Step 3: Design a Backpressure Integration + +When the consumer pool is at maximum capacity and lag keeps growing, the orchestrator signals backpressure. Design a system-wide response: + +| Component | Backpressure Action | +|-----------|-------------------| +| Gateway API | Return HTTP 429 to upstream senders | +| Ingestion producers | Pause or slow message publishing | +| Dashboard (OpenClaw) | Show backpressure warning to operators | +| Monitoring (OpenTelemetry) | Emit backpressure metrics and alerts | + +How does backpressure prevent **cascade failures** in a scalable system? What happens without it? + +## Exam -1. 
A topic has 8 partitions and `MaxConsumers = 12`. What happens when the orchestrator tries to scale beyond 8 consumers? Why is `MaxConsumers` still useful? +1. A topic has 8 partitions and the orchestrator scales to 12 consumers. What happens? + - A) All 12 consumers share the 8 partitions equally + - B) 8 consumers each get 1 partition; 4 consumers are idle — Kafka cannot assign more consumers than partitions in a consumer group; `MaxConsumers` should be set to match partition count + - C) The broker creates 4 additional partitions automatically + - D) The extra consumers process from a different topic -2. Consumer lag oscillates between 900 and 1100 with `ScaleUpThreshold = 1000`. Without `CooldownMs`, what behavior would you observe? How does cooldown fix it? +2. Why is cooldown critical for **scalable** auto-scaling? + - A) Cooldown reduces memory usage + - B) Without cooldown, oscillating lag near the threshold causes rapid scale-up/scale-down flapping — cooldown ensures each scaling decision has time to take effect before the next evaluation, preventing resource waste and instability + - C) Cooldown is only needed during maintenance windows + - D) The broker enforces cooldown automatically -3. Design an `IBackpressureSignal` integration that returns HTTP 429 from the Gateway API when backpressure is active. +3. How does backpressure signaling maintain **system-level atomicity** under overload? + - A) Backpressure drops excess messages to protect the system + - B) Backpressure slows or pauses upstream producers — this prevents message accumulation that would exceed processing capacity, ensuring every accepted message can be processed atomically rather than overwhelming the pipeline + - C) Backpressure increases consumer count beyond the maximum + - D) Backpressure is only relevant for batch processing --- diff --git a/EnterpriseIntegrationPlatform/tutorials/29-throttle-rate-limiting.md b/EnterpriseIntegrationPlatform/tutorials/29-throttle-rate-limiting.md index 5e45b88..2eb2052 100644 --- a/EnterpriseIntegrationPlatform/tutorials/29-throttle-rate-limiting.md +++ b/EnterpriseIntegrationPlatform/tutorials/29-throttle-rate-limiting.md @@ -150,13 +150,65 @@ When `AcquireAsync` delays a message, the message remains **uncommitted** — no --- -## Exercises +## Lab -1. Design a `ThrottlePolicy` that allows a partner system to send 50 messages/second with bursts up to 200. Which `ThrottlePartitionKey` fields would you set? +**Objective:** Design throttle policies for multi-tenant rate limiting, trace the token bucket algorithm, and analyze why per-tenant throttling is essential for **fair scalability**. -2. The `TokenBucketThrottle` has 0 available tokens and a `MaxWait` of 5 seconds. A message arrives. Describe the sequence of events. +### Step 1: Design a Multi-Tenant Throttle Policy -3. Explain why the platform uses per-`TenantId` throttling by default for multi-tenant deployments rather than a single global throttle. +Design a `ThrottlePolicy` for a partner system: + +| Parameter | Value | Purpose | +|-----------|-------|---------| +| Rate | 50 messages/second | Sustained throughput | +| Burst | 200 messages | Peak absorption | +| `PartitionKey.TenantId` | `"partner-x"` | Per-tenant isolation | +| `MaxWait` | 5 seconds | Max time to wait for a token | + +Open `src/Processing.Throttle/` and verify: How does the `TokenBucketThrottle` implement this? What happens when all 200 burst tokens are consumed? 
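+
+Before tracing token exhaustion in Step 2, it may help to see the algorithm in isolation. The sketch below is a hypothetical, simplified stand-in (the class `TokenBucketSketch` and its members are illustrative names, not the platform's `TokenBucketThrottle` API). It assumes a bucket that starts full, refills continuously at the sustained rate, and rejects callers that cannot obtain a token within `MaxWait`:
+
+```csharp
+// Hypothetical sketch of the token-bucket idea; compare with the real
+// implementation in src/Processing.Throttle/ before relying on details.
+public sealed class TokenBucketSketch
+{
+    private readonly double _ratePerSecond;   // sustained rate, e.g. 50 tokens/s
+    private readonly double _capacity;        // burst allowance, e.g. 200 tokens
+    private readonly TimeSpan _maxWait;       // give up after this long, e.g. 5 s
+    private readonly object _gate = new();
+    private double _tokens;
+    private DateTimeOffset _lastRefill;
+
+    public TokenBucketSketch(double ratePerSecond, double capacity, TimeSpan maxWait)
+    {
+        _ratePerSecond = ratePerSecond;
+        _capacity = capacity;
+        _maxWait = maxWait;
+        _tokens = capacity;                    // start full so an initial burst is absorbed
+        _lastRefill = DateTimeOffset.UtcNow;
+    }
+
+    // Returns true if a token was acquired within MaxWait; false means the
+    // caller should reject the message and signal backpressure upstream.
+    public async Task<bool> AcquireAsync(CancellationToken cancellationToken = default)
+    {
+        var deadline = DateTimeOffset.UtcNow + _maxWait;
+
+        while (true)
+        {
+            lock (_gate)
+            {
+                Refill();
+                if (_tokens >= 1)
+                {
+                    _tokens -= 1;
+                    return true;
+                }
+            }
+
+            if (DateTimeOffset.UtcNow >= deadline)
+                return false;                  // MaxWait exceeded: do not queue forever
+
+            // Wait roughly one replenishment interval (20 ms at 50 tokens/s) and retry.
+            await Task.Delay(TimeSpan.FromSeconds(1 / _ratePerSecond), cancellationToken);
+        }
+    }
+
+    private void Refill()
+    {
+        var now = DateTimeOffset.UtcNow;
+        var elapsed = (now - _lastRefill).TotalSeconds;
+        _lastRefill = now;
+
+        // Replenish proportionally to elapsed time, never exceeding the burst capacity.
+        _tokens = Math.Min(_capacity, _tokens + elapsed * _ratePerSecond);
+    }
+}
+```
+
+With `new TokenBucketSketch(50, 200, TimeSpan.FromSeconds(5))`, the `Math.Min` cap plays the role of the burst limit from the table above, and the deadline check keeps each wait bounded, which is the job `MaxWait` performs in the platform's throttle.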
+ +### Step 2: Trace Token Exhaustion + +The `TokenBucketThrottle` has 0 available tokens and `MaxWait = 5s`. A message arrives: + +1. The throttle checks: 0 tokens available +2. Waits for token replenishment (50 tokens/second = 1 token every 20ms) +3. After ~20ms, 1 token becomes available → message proceeds +4. If no token is available after 5 seconds → what happens? + +What is the maximum queuing depth during a sustained burst? How does `MaxWait` prevent unbounded queue growth? + +### Step 3: Analyze Per-Tenant vs. Global Throttling + +Why does the platform use per-`TenantId` throttling by default? + +| Scenario | Global Throttle | Per-Tenant Throttle | +|----------|----------------|-------------------| +| Tenant A sends 10,000 msg/s | Blocks Tenant B too | Only Tenant A is throttled | +| Tenant B sends 10 msg/s | May be blocked by A | Always gets through | +| Fair resource allocation | No guarantee | Each tenant gets its quota | + +How does per-tenant throttling prevent the **noisy neighbor** problem? Why is this critical for **multi-tenant scalability**? + +## Exam + +1. A token bucket with rate=100/s and burst=500 receives 600 messages in 1 second. What happens? + - A) All 600 messages are processed immediately + - B) The first 500 are processed from the burst allowance; the remaining 100 wait for token replenishment at 100/s — after 1 second, all 600 have been processed; messages beyond capacity wait up to `MaxWait` before being rejected + - C) All 600 messages are rejected + - D) The burst limit is increased automatically + +2. Why is per-tenant throttling essential for **multi-tenant scalability**? + - A) Per-tenant throttling uses less memory + - B) Without per-tenant isolation, one tenant's traffic spike would exhaust the global rate limit and block all other tenants — the noisy neighbor problem; per-tenant throttling ensures fair resource allocation + - C) The broker requires per-tenant configuration + - D) Global throttling is always preferable for simplicity + +3. What happens when a message exceeds the `MaxWait` timeout in the throttle? + - A) The message is processed anyway + - B) The message is rejected with an appropriate error — this prevents unbounded queue growth and provides backpressure to the upstream sender, maintaining system stability under sustained overload + - C) The throttle increases its rate automatically + - D) The message is routed to a different topic --- diff --git a/EnterpriseIntegrationPlatform/tutorials/30-rule-engine.md b/EnterpriseIntegrationPlatform/tutorials/30-rule-engine.md index f748d8e..aa55bfb 100644 --- a/EnterpriseIntegrationPlatform/tutorials/30-rule-engine.md +++ b/EnterpriseIntegrationPlatform/tutorials/30-rule-engine.md @@ -141,13 +141,70 @@ Rule evaluation happens **within the pipeline transaction**. If the selected act --- -## Exercises +## Lab -1. Write a `BusinessRule` that routes all messages from source `"PartnerX"` with `MessageType` containing `"order"` to topic `"orders-priority"`. +**Objective:** Write business rules with conditions and logic operators, trace priority-based evaluation, and analyze rule caching for **scalable** high-throughput routing decisions. -2. A rule has `LogicOperator = RuleLogicOperator.Or` with two conditions. Explain how evaluation differs from `And`. +### Step 1: Write a Priority-Based Business Rule -3. Why does the platform evaluate rules in priority order and stop at the first match rather than evaluating all rules? 
+Write a `BusinessRule` that routes all messages from source `"PartnerX"` with `MessageType` containing `"order"` to topic `"orders-priority"`: + +```csharp +var rule = new BusinessRule +{ + Name = "PartnerX-Orders", + Priority = 1, + LogicOperator = RuleLogicOperator.And, + Conditions = [ + new RuleCondition { FieldName = "Source", Operator = RuleConditionOperator.Equals, Value = "PartnerX" }, + new RuleCondition { FieldName = "MessageType", Operator = RuleConditionOperator.Contains, Value = "order" } + ], + OutputTopic = "orders-priority" +}; +``` + +Open `src/RuleEngine/BusinessRuleEngine.cs` and trace: How does `And` vs. `Or` logic change the evaluation? + +### Step 2: Trace Priority-Based Evaluation + +Rules are evaluated in priority order (lowest number = highest priority): + +| Priority | Rule | Conditions | +|----------|------|-----------| +| 1 | Premium orders | Source = "PartnerX" AND Type contains "order" | +| 5 | All orders | Type contains "order" | +| 10 | Default | Always matches | + +A message from `PartnerX` with type `"order.created"` matches rules at priorities 1 and 5. Which rule wins? Why does the engine stop at the first match? (hint: deterministic routing for **atomicity**) + +### Step 3: Design Rule Caching for Scalability + +At 50,000 messages/second with 100 rules, each message evaluates up to 100 conditions. Design a caching strategy: + +- Rules change infrequently (hourly) but messages arrive constantly +- How does the platform cache compiled rules? (Open `src/RuleEngine/` to check) +- What is the cache invalidation strategy when rules are updated? +- What is the performance difference between cached vs. uncached rule evaluation? + +## Exam + +1. A rule engine has 3 rules with priorities 1, 5, 10. A message matches rules at priorities 5 and 10. Which rule is applied? + - A) Both rules are applied (fan-out) + - B) Priority 5 — the engine evaluates in priority order and stops at the first match, ensuring deterministic and **atomic** routing to exactly one destination + - C) Priority 10 — the last match wins + - D) The engine randomly selects one + +2. Why does the rule engine use `And`/`Or` logic operators for conditions? + - A) They're required by the .NET compiler + - B) `And` requires all conditions to match (strict targeting); `Or` requires any condition to match (broad targeting) — this enables both precise and flexible routing rules for different business scenarios + - C) Logic operators improve serialization performance + - D) They're equivalent — both produce the same result + +3. How does rule caching improve **throughput scalability**? + - A) Caching stores message results, not rules + - B) Compiled rules are cached in memory — avoiding repeated parsing and compilation of rule definitions for every message; since rules change infrequently but messages arrive at high volume, caching amortizes the compilation cost over millions of evaluations + - C) Caching is only useful during testing + - D) Rules are too small to benefit from caching --- diff --git a/EnterpriseIntegrationPlatform/tutorials/31-event-sourcing.md b/EnterpriseIntegrationPlatform/tutorials/31-event-sourcing.md index 5fc5891..4ca2670 100644 --- a/EnterpriseIntegrationPlatform/tutorials/31-event-sourcing.md +++ b/EnterpriseIntegrationPlatform/tutorials/31-event-sourcing.md @@ -148,13 +148,63 @@ Optimistic concurrency ensures **consistency without locks**. The `expectedVersi --- -## Exercises +## Lab -1. An aggregate has 10,000 events. 
Without snapshots, what is the cost of reconstructing current state? With a snapshot at version 9,900? +**Objective:** Analyze event sourcing's append-only model for **audit-complete atomicity**, trace optimistic concurrency conflict resolution, and design snapshot strategies for **scalable** aggregate reconstruction. -2. Two commands arrive simultaneously for the same stream at version 5. Both expect version 5. Trace the optimistic concurrency flow. +### Step 1: Calculate Aggregate Reconstruction Cost -3. Use `TemporalQuery.ReplayToPointInTimeAsync` to reconstruct an order aggregate's state as of yesterday at noon. What parameters do you need to supply? +An aggregate has 10,000 events. Compare reconstruction approaches: + +| Approach | Events Replayed | Cost | Time (est.) | +|----------|----------------|------|-------------| +| Full replay (no snapshots) | 10,000 | High CPU + memory | ~100ms | +| Snapshot at version 9,900 | 100 | Low | ~1ms | +| Snapshot at version 9,999 | 1 | Minimal | ~0.1ms | + +Open `src/EventSourcing/` and trace: How does the event store load a snapshot, then replay only subsequent events? What is the **scalability** trade-off between snapshot frequency and storage cost? + +### Step 2: Trace Optimistic Concurrency Conflict + +Two commands arrive simultaneously for the same stream at version 5. Both expect version 5: + +``` +Command A: Append event at version 5 → succeeds (stream now at version 6) +Command B: Append event at version 5 → CONFLICT (expected 5, actual 6) +``` + +Trace the conflict resolution: +1. What exception is thrown? +2. Does Command B retry? With what strategy? +3. How does optimistic concurrency ensure **atomic** state transitions without distributed locks? + +### Step 3: Design a Temporal Query for Audit + +Use `TemporalQuery.ReplayToPointInTimeAsync` to reconstruct an order aggregate's state as of yesterday at noon: + +- What parameters do you supply? (stream ID, point-in-time) +- How does this differ from loading current state? +- Why is this capability essential for **regulatory compliance** and audit trails? + +## Exam + +1. Why does event sourcing use an append-only log rather than mutable state updates? + - A) Append-only is faster for write operations + - B) Every state change is permanently recorded as an immutable event — this provides a complete audit trail, enables temporal queries (reconstructing past state), and guarantees **atomic** state transitions through optimistic concurrency + - C) Databases don't support mutable updates + - D) Append-only reduces storage costs + +2. How does optimistic concurrency prevent **atomicity** violations in concurrent event sourcing? + - A) It uses distributed locks to prevent concurrent access + - B) Each append specifies the expected version — if another command modified the stream first, the version mismatch is detected and the second command fails cleanly, ensuring only one writer succeeds per state transition + - C) Events are automatically merged when conflicts occur + - D) The event store queues concurrent commands + +3. How do snapshots improve **aggregate reconstruction scalability**? 
+ - A) Snapshots reduce the number of events stored + - B) A snapshot captures aggregate state at a point in time — reconstruction replays only events after the snapshot instead of the entire history, reducing reconstruction time from O(N) to O(recent events) + - C) Snapshots are required by the event store + - D) Snapshots improve write performance --- diff --git a/EnterpriseIntegrationPlatform/tutorials/32-multi-tenancy.md b/EnterpriseIntegrationPlatform/tutorials/32-multi-tenancy.md index 39d16a2..fc2a5e2 100644 --- a/EnterpriseIntegrationPlatform/tutorials/32-multi-tenancy.md +++ b/EnterpriseIntegrationPlatform/tutorials/32-multi-tenancy.md @@ -61,10 +61,10 @@ public sealed class TenantContext public string? TenantName { get; init; } public bool IsResolved { get; init; } - public static TenantContext Anonymous => new() + public static readonly TenantContext Anonymous = new() { TenantId = "anonymous", - IsResolved = false + IsResolved = false, }; } ``` @@ -136,13 +136,62 @@ The isolation guard runs **before any processing** — a cross-tenant message is --- -## Exercises +## Lab -1. A message arrives with `X-Tenant-Id: tenant-a` but the JWT claim says `tenant-b`. How should the resolver handle this conflict? +**Objective:** Trace tenant resolution and isolation enforcement, design the onboarding resource provisioning pipeline, and analyze why tenant isolation is non-negotiable for **multi-tenant scalability**. -2. Describe the self-service flow when a new tenant onboards: what resources are provisioned and in what order? +### Step 1: Resolve a Tenant Identity Conflict -3. Why is `TenantIsolationException` non-retryable? Under what circumstances could a cross-tenant message be legitimate? +A message arrives with `X-Tenant-Id: tenant-a` but the JWT claim says `tenant-b`. Open `src/MultiTenancy/` and trace: + +1. How does the tenant resolver prioritize these conflicting signals? +2. Should the resolver trust the header or the JWT? (hint: JWT is cryptographically signed) +3. What exception is thrown for the conflict? Is it retryable? + +Design a resolution policy: When is a conflict legitimate (e.g., admin impersonation) vs. a security violation? + +### Step 2: Design the Onboarding Pipeline + +When a new tenant onboards, trace the self-service provisioning flow: + +| Step | Resource | Class | Atomic? | +|------|----------|-------|---------| +| 1 | Create tenant record | `InMemoryTenantOnboardingService` | Yes | +| 2 | Provision broker namespace | `InMemoryBrokerNamespaceProvisioner` | Yes | +| 3 | Set quota limits | `InMemoryTenantQuotaManager` | Yes | +| 4 | Initialize configuration | ConfigurationStore | Yes | + +If Step 3 fails, what compensation is needed for Steps 1-2? How does this relate to the Saga pattern? + +### Step 3: Analyze Tenant Isolation for Scalability + +| Without Isolation | With Isolation | +|------------------|----------------| +| Tenant A's traffic spike affects Tenant B | Each tenant has dedicated queues and quotas | +| One tenant's DLQ overflow blocks all tenants | Isolated DLQ per tenant | +| Security breach in one tenant exposes all | `TenantIsolationGuard` prevents cross-tenant access | + +Why is `TenantIsolationException` non-retryable? Under what circumstances could a cross-tenant message be legitimate? + +## Exam + +1. Why must tenant resolution trust JWT claims over HTTP headers? 
+ - A) HTTP headers are faster to parse + - B) JWTs are cryptographically signed and cannot be forged by the caller — headers can be spoofed; trusting unsigned headers would allow any caller to impersonate any tenant, violating isolation + - C) The broker requires JWT tokens + - D) Headers don't support tenant identifiers + +2. Why is `TenantIsolationException` non-retryable? + - A) Retries would succeed with different credentials + - B) A cross-tenant access attempt is a security violation — retrying won't change the tenant identity; it must be investigated as a potential breach, not automatically retried + - C) The exception is transient and self-healing + - D) Non-retryable exceptions are faster to process + +3. How does per-tenant resource provisioning enable **horizontal scalability**? + - A) All tenants share a single resource pool + - B) Each tenant gets isolated broker namespaces and quotas — adding tenants doesn't affect existing tenants' performance, and each tenant's resources can be independently scaled based on their usage patterns + - C) Resource provisioning is only needed for premium tenants + - D) The broker automatically provisions resources --- diff --git a/EnterpriseIntegrationPlatform/tutorials/33-security.md b/EnterpriseIntegrationPlatform/tutorials/33-security.md index 18c4bda..b693a05 100644 --- a/EnterpriseIntegrationPlatform/tutorials/33-security.md +++ b/EnterpriseIntegrationPlatform/tutorials/33-security.md @@ -145,13 +145,63 @@ Sanitization runs **before the message is Acked**. Callers can use `IsClean` to --- -## Exercises +## Lab -1. A payload contains `` embedded in a JSON string value. Describe how the sanitizer detects and removes it while preserving valid JSON structure. +**Objective:** Trace the input sanitization pipeline, analyze how defense-in-depth protects **message atomicity** from injection attacks, and evaluate secret management for **scalable** multi-environment deployments. -2. Why does the platform use a separate `IPayloadSizeGuard` instead of checking size inside `IInputSanitizer`? +### Step 1: Trace XSS Sanitization -3. Compare `AzureKeyVaultSecretProvider` and `VaultSecretProvider`. When would you choose one over the other? +A payload contains `` embedded in a JSON string value. Open `src/Security/InputSanitizer.cs` and trace: + +1. How does the sanitizer detect the `